From 3343afb09273185b6621a0bce00c926b9f5b945a Mon Sep 17 00:00:00 2001 From: Amit Date: Sat, 24 Oct 2015 14:31:34 -0700 Subject: [PATCH] Remove debug info generation option for CUDA compilation in debug flavor builds and also enable fast-math optimizations. These changes have been done to eliminate differences in GPU results for the E2E tests between debug and release flavors. Setting environment variable CNTK_CUDA_DEVICE_DEBUGINFO=1 will enable debug info generation. The baselines for all E2E tests have also been updated in accordance with this change. --- Makefile | 6 +- Math/Math/CNTKMathCUDA.vcxproj | 7 +- .../SinglePrecision/baseline.gpu.txt | 2608 ++++---- .../SinglePrecision/baseline.windows.gpu.txt | 2449 ++++++++ .../baseline.gpu.txt | 1144 +++- .../baseline.windows.gpu.txt | 1146 +++- .../DiscriminativePreTraining/testcases.yml | 6 +- .../ParallelNoQuantization/baseline.gpu.txt | 924 ++- .../baseline.windows.gpu.txt | 973 ++- .../LSTM/FullUtterance/baseline.gpu.txt | 5469 +++++++++------- .../FullUtterance/baseline.windows.gpu.txt | 5494 ++++++++++------- Tests/Speech/LSTM/Truncated/baseline.gpu.txt | 5128 +++++++++------ .../LSTM/Truncated/baseline.windows.gpu.txt | 5157 ++++++++++------ Tests/Speech/QuickE2E/baseline.gpu.txt | 1238 +++- .../Speech/QuickE2E/baseline.windows.gpu.txt | 1214 +++- 15 files changed, 22337 insertions(+), 10626 deletions(-) create mode 100644 Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.windows.gpu.txt diff --git a/Makefile b/Makefile index 62147fd9c..6f88951b5 100644 --- a/Makefile +++ b/Makefile @@ -151,7 +151,7 @@ ifeq ("$(BUILDTYPE)","debug") CXXFLAGS += -g CPPFLAGS += -D_DEBUG - CUFLAGS += -O0 -G -lineinfo $(GENCODE_FLAGS) + CUFLAGS += -O0 -use_fast_math -lineinfo $(GENCODE_FLAGS) endif ifeq ("$(BUILDTYPE)","release") @@ -165,6 +165,10 @@ ifeq ("$(BUILDTYPE)","release") CUFLAGS += -O3 -use_fast_math -lineinfo $(GENCODE_FLAGS) endif +ifdef CNTK_CUDA_DEVICE_DEBUGINFO + CUFLAGS += -G +endif + #######
OBJDIR:= $(BUILD_TOP)/.build diff --git a/Math/Math/CNTKMathCUDA.vcxproj b/Math/Math/CNTKMathCUDA.vcxproj index d1deb47b0..283caa2cf 100644 --- a/Math/Math/CNTKMathCUDA.vcxproj +++ b/Math/Math/CNTKMathCUDA.vcxproj @@ -85,6 +85,9 @@ true /WX $(CudaCodeGen) + true + false + true xcopy /D /I /Y "$(CudaPath)\bin\cudart64_*.dll" $(OutputPath) @@ -114,8 +117,6 @@ true - true - false false @@ -191,4 +192,4 @@ - \ No newline at end of file + diff --git a/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.gpu.txt b/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.gpu.txt index 1c7050785..294329581 100644 --- a/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.gpu.txt +++ b/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.gpu.txt @@ -1,4 +1,4 @@ -=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 4 E:\NetScale\CNTK\git_repos\public_master\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +=== Running mpiexec -n 4 /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPIWrapper: initializing MPI MPIWrapper: initializing MPI MPIWrapper: initializing MPI @@ -8,46 +8,40 @@ ping [requestnodes (before change)]: 4 nodes pinging each other ping [requestnodes (before change)]: 4 nodes pinging each other ping [requestnodes (before change)]: 4 nodes pinging each other ping [requestnodes (before change)]: all 4 nodes responded -ping [requestnodes (before change)]: all 4 nodes responded -ping [requestnodes (before change)]: all 4 nodes responded requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other ping [requestnodes (before change)]: all 4 nodes responded requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating) ping [requestnodes (after change)]: 4 nodes pinging each other -ping [requestnodes (after change)]: 4 nodes pinging each other -requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) -ping [requestnodes (after change)]: 4 nodes pinging each other -ping [requestnodes (after change)]: 4 nodes pinging each other -ping [requestnodes (after change)]: all 4 nodes responded -ping [requestnodes (after change)]: all 4 nodes responded -ping [requestnodes (after change)]: all 4 nodes responded -mpihelper: we 
are cog 1 in a gearbox of 4 ping [requestnodes (after change)]: all 4 nodes responded mpihelper: we are cog 3 in a gearbox of 4 -mpihelper: we are cog 0 in a gearbox of 4 ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 1 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded mpihelper: we are cog 2 in a gearbox of 4 ping [mpihelper]: 4 nodes pinging each other -ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 0 in a gearbox of 4 ping [mpihelper]: 4 nodes pinging each other ping [mpihelper]: all 4 nodes responded -ping [mpihelper]: all 4 nodes responded -ping [mpihelper]: all 4 nodes responded -ping [mpihelper]: all 4 nodes responded -MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank0 -MPI Rank 0: ------------------------------------------------------------------- -MPI Rank 0: Build info: -MPI Rank 0: -MPI Rank 0: Built time: Aug 25 2015 17:44:46 -MPI Rank 0: Last modified date: Mon Aug 24 16:38:42 2015 -MPI Rank 0: Built by amitaga on Amitaga-Win-DT3 -MPI Rank 0: Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\ -MPI Rank 0: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 -MPI Rank 0: ------------------------------------------------------------------- -MPI Rank 0: running on Amitaga-Win-DT3 at 2015/08/26 01:48:43 -MPI Rank 0: command line options: -MPI Rank 0: configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config 
RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +Redirecting stderr to file /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank0 +Redirecting stderr to file /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1 +Redirecting stderr to file /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2 +Redirecting stderr to file /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3 +MPI Rank 0: running on localhost at 2015/10/24 12:44:53 +MPI Rank 0: command line: +MPI Rank 0: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 0: MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 0: deviceId=$DeviceId$ @@ -75,12 +69,11 @@ MPI Rank 0: minibatchSize=25 MPI Rank 0: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 0: momentumPerMB=0.9 MPI Rank 0: dropoutRate=0.0 -MPI Rank 0: maxEpochs=10 +MPI Rank 0: maxEpochs=4 MPI Rank 0: ParallelTrain=[ MPI Rank 0: parallelizationMethod=DataParallelSGD MPI Rank 0: DataParallelSGD=[ MPI Rank 0: gradientBits=1 -MPI Rank 0: parallelizationStartEpoch=1 MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] @@ -102,12 +95,13 @@ MPI Rank 0: labelMappingFile=$DataDir$/SimpleMapping.txt MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] -MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu -MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 0: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu +MPI Rank 0: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data +MPI Rank 0: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
MPI Rank 0: DeviceId=0 -MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 0: precision=float MPI Rank 0: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 0: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 0: MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 0: @@ -118,7 +112,7 @@ MPI Rank 0: precision=float MPI Rank 0: parallelTrain=true MPI Rank 0: SimpleMultiGPU=[ MPI Rank 0: action=train -MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 0: modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn MPI Rank 0: deviceId=0 MPI Rank 0: traceLevel=1 MPI Rank 0: SimpleNetworkBuilder=[ @@ -137,18 +131,17 @@ MPI Rank 0: minibatchSize=25 MPI Rank 0: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 0: momentumPerMB=0.9 MPI Rank 0: dropoutRate=0.0 -MPI Rank 0: maxEpochs=10 +MPI Rank 0: maxEpochs=4 MPI Rank 0: ParallelTrain=[ MPI Rank 0: parallelizationMethod=DataParallelSGD MPI Rank 0: DataParallelSGD=[ MPI Rank 0: gradientBits=1 -MPI Rank 0: parallelizationStartEpoch=1 MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: reader=[ MPI Rank 0: readerType=UCIFastReader -MPI Rank 0: file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 0: file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt MPI Rank 0: miniBatchMode=Partial MPI Rank 0: randomize=None MPI Rank 0: verbosity=1 @@ -160,29 +153,31 @@ MPI Rank 0: labels=[ MPI Rank 0: start=2 MPI Rank 0: dim=1 MPI Rank 0: labelDim=2 -MPI Rank 0: labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI 
Rank 0: labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] -MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu -MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 0: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu +MPI Rank 0: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data +MPI Rank 0: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. MPI Rank 0: DeviceId=0 -MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 0: precision=float MPI Rank 0: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 0: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 0: MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 0: MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 0: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU -MPI Rank 0: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 0: configparameters: SimpleMultiGPU.config:ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
+MPI Rank 0: configparameters: SimpleMultiGPU.config:DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data MPI Rank 0: configparameters: SimpleMultiGPU.config:deviceId=0 MPI Rank 0: configparameters: SimpleMultiGPU.config:parallelTrain=true MPI Rank 0: configparameters: SimpleMultiGPU.config:precision=float -MPI Rank 0: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 0: configparameters: SimpleMultiGPU.config:RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu MPI Rank 0: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[ MPI Rank 0: action=train -MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 0: modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn MPI Rank 0: deviceId=0 MPI Rank 0: traceLevel=1 MPI Rank 0: SimpleNetworkBuilder=[ @@ -201,18 +196,17 @@ MPI Rank 0: minibatchSize=25 MPI Rank 0: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 0: momentumPerMB=0.9 MPI Rank 0: dropoutRate=0.0 -MPI Rank 0: maxEpochs=10 +MPI Rank 0: maxEpochs=4 MPI Rank 0: ParallelTrain=[ MPI Rank 0: parallelizationMethod=DataParallelSGD MPI Rank 0: DataParallelSGD=[ MPI Rank 0: gradientBits=1 -MPI Rank 0: parallelizationStartEpoch=1 MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: reader=[ MPI Rank 0: readerType=UCIFastReader -MPI Rank 0: file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 0: file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt MPI Rank 0: miniBatchMode=Partial MPI Rank 0: randomize=None MPI Rank 0: verbosity=1 @@ -224,45 +218,100 @@ MPI Rank 0: labels=[ MPI Rank 0: start=2 MPI Rank 0: dim=1 MPI Rank 0: labelDim=2 -MPI Rank 0: 
labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 0: labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] MPI Rank 0: -MPI Rank 0: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 0: configparameters: SimpleMultiGPU.config:stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 0: command: SimpleMultiGPU MPI Rank 0: precision = float +MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 0: CNTKCommandTrainInfo: SimpleMultiGPU : 4 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +MPI Rank 0: CNTKCommandTrainBegin: SimpleMultiGPU MPI Rank 0: SimpleNetworkBuilder Using GPU 0 -MPI Rank 0: reading uci file E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 0: reading uci file /home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt +MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 0: GetTrainCriterionNodes ... MPI Rank 0: GetEvalCriterionNodes ... MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node CrossEntropyWithSoftmax +MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. 
MPI Rank 0: -MPI Rank 0: Validating --> labels = InputValue -MPI Rank 0: Validating --> W2 = LearnableParameter -MPI Rank 0: Validating --> W1 = LearnableParameter -MPI Rank 0: Validating --> W0 = LearnableParameter -MPI Rank 0: Validating --> features = InputValue -MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, 3]) -MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, 3]) -MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 3]) -MPI Rank 0: Validating --> B0 = LearnableParameter -MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, 3], B0[50, 1]) -MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, 3]) -MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 3]) -MPI Rank 0: Validating --> B1 = LearnableParameter -MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, 3], B1[50, 1]) -MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, 3]) -MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 3]) -MPI Rank 0: Validating --> B2 = LearnableParameter -MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, 3], B2[2, 1]) -MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, 3], HLast[2, 3]) +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) 
-> [2, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Precomputing --> 3 PreCompute nodes found. 
MPI Rank 0: -MPI Rank 0: Found 3 PreCompute nodes MPI Rank 0: NodeName: InvStdOfFeatures MPI Rank 0: NodeName: MeanOfFeatures MPI Rank 0: NodeName: Prior @@ -273,250 +322,320 @@ MPI Rank 0: starting epoch 0 at record count 0, and file position 0 MPI Rank 0: already there from last epoch MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node InvStdOfFeatures +MPI Rank 0: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 0: -MPI Rank 0: Validating --> features = InputValue -MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, 25]) +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: Validating for node InvStdOfFeatures, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 0: MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node MeanOfFeatures +MPI Rank 0: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 0: -MPI Rank 0: Validating --> features = InputValue -MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, 25]) +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: Validating for node MeanOfFeatures, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 0: MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node Prior +MPI Rank 0: Validating for node Prior. 
2 nodes to process in pass 1. MPI Rank 0: -MPI Rank 0: Validating --> labels = InputValue -MPI Rank 0: Validating --> Prior = Mean(labels[2, 25]) +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: Validating for node Prior, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. +MPI Rank 0: +MPI Rank 0: Precomputing --> Completed. MPI Rank 0: MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. -MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.020000 momentum = 0.900001 +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 MPI Rank 0: starting epoch 0 at record count 0, and file position 0 MPI Rank 0: already there from last epoch MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 0: -MPI Rank 0: -MPI Rank 0: Validating node EvalErrorPrediction -MPI Rank 0: -MPI Rank 0: Validating --> labels = InputValue -MPI Rank 0: Validating --> W2 = LearnableParameter -MPI Rank 0: Validating --> W1 = LearnableParameter -MPI Rank 0: Validating --> W0 = LearnableParameter -MPI Rank 0: Validating --> features = InputValue -MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, 6]) -MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, 6]) -MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 6], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 6]) -MPI Rank 0: Validating --> B0 = LearnableParameter -MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, 6], B0[50, 1]) -MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, 6]) -MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 6]) -MPI Rank 0: Validating --> B1 = LearnableParameter -MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, 6], B1[50, 1]) -MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, 6]) -MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 6]) -MPI Rank 0: Validating --> B2 = LearnableParameter -MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, 6], B2[2, 1]) -MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, 6], HLast[2, 6]) -MPI Rank 0: -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70007980; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.19902s; TotalTimePerSample = 0.79607ms; SamplesPerSecond = 1256 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71514523; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.15552s; TotalTimePerSample = 0.62210ms; SamplesPerSecond = 1607 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.72945595; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.14888s; TotalTimePerSample = 0.59550ms; SamplesPerSecond = 1679 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70079005; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.14477s; TotalTimePerSample = 0.57906ms; SamplesPerSecond = 1726 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70605618; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.14227s; TotalTimePerSample = 0.56910ms; SamplesPerSecond = 1757 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71572435; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13676s; TotalTimePerSample = 0.54705ms; SamplesPerSecond = 1827 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72149903; EvalErr[0]PerSample = 0.47999999; TotalTime = 0.13631s; TotalTimePerSample = 0.54524ms; SamplesPerSecond = 1834 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.79845655; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13451s; TotalTimePerSample = 0.53804ms; SamplesPerSecond = 1858 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69665188; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.13043s; TotalTimePerSample = 0.52173ms; SamplesPerSecond = 1916 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70723337; EvalErr[0]PerSample = 0.49200001; TotalTime = 0.12788s; TotalTimePerSample = 0.51150ms; SamplesPerSecond = 1955 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71420360; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12629s; TotalTimePerSample = 0.50518ms; SamplesPerSecond = 1979 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 111- 120 of 
-171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69535255; EvalErr[0]PerSample = 0.43599999; TotalTime = 0.12558s; TotalTimePerSample = 0.50232ms; SamplesPerSecond = 1990 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70078611; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12260s; TotalTimePerSample = 0.49041ms; SamplesPerSecond = 2039 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71857810; EvalErr[0]PerSample = 0.54799998; TotalTime = 0.12292s; TotalTimePerSample = 0.49170ms; SamplesPerSecond = 2033 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72088283; EvalErr[0]PerSample = 0.48800001; TotalTime = 0.12233s; TotalTimePerSample = 0.48931ms; SamplesPerSecond = 2043 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71798825; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12124s; TotalTimePerSample = 0.48494ms; SamplesPerSecond = 2062 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.74162209; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12328s; TotalTimePerSample = 0.49313ms; SamplesPerSecond = 2027 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71835059; EvalErr[0]PerSample = 0.51599997; TotalTime = 0.12341s; TotalTimePerSample = 0.49363ms; SamplesPerSecond = 2025 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71529394; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12333s; TotalTimePerSample = 0.49334ms; SamplesPerSecond = 2027 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71727639; EvalErr[0]PerSample = 0.53200001; TotalTime = 0.12472s; TotalTimePerSample = 0.49886ms; 
SamplesPerSecond = 2004 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71745312; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12361s; TotalTimePerSample = 0.49445ms; SamplesPerSecond = 2022 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72088087; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12197s; TotalTimePerSample = 0.48789ms; SamplesPerSecond = 2049 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72006541; EvalErr[0]PerSample = 0.50800002; TotalTime = 0.12266s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71275192; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12162s; TotalTimePerSample = 0.48650ms; SamplesPerSecond = 2055 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69645119; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12120s; TotalTimePerSample = 0.48480ms; SamplesPerSecond = 2062 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70129883; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12090s; TotalTimePerSample = 0.48360ms; SamplesPerSecond = 2067 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70768166; EvalErr[0]PerSample = 0.54400003; TotalTime = 0.12161s; TotalTimePerSample = 0.48645ms; SamplesPerSecond = 2055 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69744140; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12268s; TotalTimePerSample = 0.49071ms; SamplesPerSecond = 2037 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69266015; EvalErr[0]PerSample = 
0.44800001; TotalTime = 0.12182s; TotalTimePerSample = 0.48726ms; SamplesPerSecond = 2052 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69347072; EvalErr[0]PerSample = 0.49599999; TotalTime = 0.12233s; TotalTimePerSample = 0.48930ms; SamplesPerSecond = 2043 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69257420; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.12287s; TotalTimePerSample = 0.49149ms; SamplesPerSecond = 2034 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.68625975; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12417s; TotalTimePerSample = 0.49666ms; SamplesPerSecond = 2013 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69064063; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.12340s; TotalTimePerSample = 0.49358ms; SamplesPerSecond = 2025 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70192385; EvalErr[0]PerSample = 0.46000001; TotalTime = 0.12176s; TotalTimePerSample = 0.48704ms; SamplesPerSecond = 2053 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69058985; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.12238s; TotalTimePerSample = 0.48950ms; SamplesPerSecond = 2042 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.67041212; EvalErr[0]PerSample = 0.39199999; TotalTime = 0.12185s; TotalTimePerSample = 0.48742ms; SamplesPerSecond = 2051 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.65914255; EvalErr[0]PerSample = 0.35600001; TotalTime = 0.12263s; TotalTimePerSample = 0.49050ms; SamplesPerSecond = 2038 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen 
= 250; TrainLossPerSample = 0.63919920; EvalErr[0]PerSample = 0.36399999; TotalTime = 0.12265s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.61294138; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12143s; TotalTimePerSample = 0.48572ms; SamplesPerSecond = 2058 -MPI Rank 0: Epoch[ 1 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.55255663; EvalErr[0]PerSample = 0.18799999; TotalTime = 0.12034s; TotalTimePerSample = 0.48136ms; SamplesPerSecond = 2077 -MPI Rank 0: Finished Epoch[ 1 of 10]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.47350001; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.254118 -MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 0: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal. 
+MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 1- 10 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12781s; TotalTimePerSample = 0.51124ms; SamplesPerSecond = 1956 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 11- 20 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12492s; TotalTimePerSample = 0.49968ms; SamplesPerSecond = 2001 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 21- 30 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12485s; TotalTimePerSample = 0.49942ms; SamplesPerSecond = 2002 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 31- 40 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49886ms; SamplesPerSecond = 2004 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 41- 50 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70605616; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12435s; TotalTimePerSample = 0.49740ms; SamplesPerSecond = 2010 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 51- 60 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12461s; TotalTimePerSample = 0.49844ms; SamplesPerSecond = 2006 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 61- 70 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.12464s; TotalTimePerSample = 0.49856ms; SamplesPerSecond = 2005 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 71- 80 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.79845605; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12423s; TotalTimePerSample = 0.49693ms; SamplesPerSecond = 2012 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 81- 90 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12443s; TotalTimePerSample = 0.49770ms; SamplesPerSecond = 2009 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 
91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50033ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71420344; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12484s; TotalTimePerSample = 0.49937ms; SamplesPerSecond = 2002 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12513s; TotalTimePerSample = 0.50053ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71857915; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.12473s; TotalTimePerSample = 0.49892ms; SamplesPerSecond = 2004 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.12486s; TotalTimePerSample = 0.49946ms; SamplesPerSecond = 2002 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71798839; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12445s; TotalTimePerSample = 0.49778ms; SamplesPerSecond = 2008 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71835127; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.12465s; TotalTimePerSample = 0.49860ms; SamplesPerSecond = 2005 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: 
SamplesSeen = 250; TrainLossPerSample = 0.71529462; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12445s; TotalTimePerSample = 0.49780ms; SamplesPerSecond = 2008 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71727656; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12439s; TotalTimePerSample = 0.49756ms; SamplesPerSecond = 2009 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12479s; TotalTimePerSample = 0.49914ms; SamplesPerSecond = 2003 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088397; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12444s; TotalTimePerSample = 0.49776ms; SamplesPerSecond = 2008 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72006808; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12512s; TotalTimePerSample = 0.50050ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12514s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12465s; TotalTimePerSample = 0.49861ms; SamplesPerSecond = 2005 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70129697; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12492s; TotalTimePerSample = 0.49967ms; SamplesPerSecond = 2001 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12508s; TotalTimePerSample = 0.50031ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; 
TrainLossPerSample = 0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12539s; TotalTimePerSample = 0.50154ms; SamplesPerSecond = 1993 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69266186; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.12547s; TotalTimePerSample = 0.50188ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12492s; TotalTimePerSample = 0.49970ms; SamplesPerSecond = 2001 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69257410; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12545s; TotalTimePerSample = 0.50181ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12518s; TotalTimePerSample = 0.50072ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12542s; TotalTimePerSample = 0.50169ms; SamplesPerSecond = 1993 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70192153; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12517s; TotalTimePerSample = 0.50067ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12544s; TotalTimePerSample = 0.50177ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.67041489; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12585s; TotalTimePerSample = 0.50340ms; SamplesPerSecond = 1986 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample 
= 0.65913971; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12536s; TotalTimePerSample = 0.50142ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.63919874; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.12517s; TotalTimePerSample = 0.50069ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.61293878; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50085ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.55255340; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.12492s; TotalTimePerSample = 0.49968ms; SamplesPerSecond = 2001 +MPI Rank 0: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.008554 +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 0: starting epoch 1 at record count 10000, and file position 0 MPI Rank 0: already there from last epoch MPI Rank 0: MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.50775200; EvalErr[0]PerSample = 0.23999999; TotalTime = 0.12819s; TotalTimePerSample = 0.51276ms; SamplesPerSecond = 1950 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.43389454; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12188s; TotalTimePerSample = 0.48751ms; SamplesPerSecond = 2051 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.36675408; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12077s; TotalTimePerSample = 0.48307ms; SamplesPerSecond = 2070 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.33769274; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12255s; TotalTimePerSample = 0.49020ms; SamplesPerSecond = 2039 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.30321363; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12133s; TotalTimePerSample = 0.48531ms; SamplesPerSecond = 2060 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.29576379; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12056s; TotalTimePerSample = 0.48225ms; SamplesPerSecond = 2073 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.24924731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.24632569; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12329s; TotalTimePerSample = 0.49315ms; SamplesPerSecond = 2027 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20943311; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12289s; 
TotalTimePerSample = 0.49156ms; SamplesPerSecond = 2034 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19116065; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12192s; TotalTimePerSample = 0.48767ms; SamplesPerSecond = 2050 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17923315; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48972ms; SamplesPerSecond = 2041 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17075513; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12160s; TotalTimePerSample = 0.48640ms; SamplesPerSecond = 2055 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14442432; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12216s; TotalTimePerSample = 0.48864ms; SamplesPerSecond = 2046 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17753857; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12283s; TotalTimePerSample = 0.49131ms; SamplesPerSecond = 2035 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15087914; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12183s; TotalTimePerSample = 0.48732ms; SamplesPerSecond = 2052 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19252978; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12265s; TotalTimePerSample = 0.49059ms; SamplesPerSecond = 2038 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17830664; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12267s; TotalTimePerSample = 0.49070ms; SamplesPerSecond = 2037 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.15115429; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12254s; TotalTimePerSample = 0.49017ms; SamplesPerSecond = 2040 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19135889; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12210s; TotalTimePerSample = 0.48840ms; SamplesPerSecond = 2047 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.21491407; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12306s; TotalTimePerSample = 0.49222ms; SamplesPerSecond = 2031 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18682373; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12157s; TotalTimePerSample = 0.48627ms; SamplesPerSecond = 2056 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18483251; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12138s; TotalTimePerSample = 0.48552ms; SamplesPerSecond = 2059 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14684522; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15322119; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12308s; TotalTimePerSample = 0.49230ms; SamplesPerSecond = 2031 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19882520; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12374s; TotalTimePerSample = 0.49496ms; SamplesPerSecond = 2020 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13683788; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48972ms; SamplesPerSecond = 2041 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 
261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18621191; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12193s; TotalTimePerSample = 0.48772ms; SamplesPerSecond = 2050 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19408056; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12169s; TotalTimePerSample = 0.48674ms; SamplesPerSecond = 2054 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17298096; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12219s; TotalTimePerSample = 0.48878ms; SamplesPerSecond = 2045 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13265137; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12193s; TotalTimePerSample = 0.48773ms; SamplesPerSecond = 2050 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17627051; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12157s; TotalTimePerSample = 0.48630ms; SamplesPerSecond = 2056 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12734570; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12409s; TotalTimePerSample = 0.49636ms; SamplesPerSecond = 2014 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15108399; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12116s; TotalTimePerSample = 0.48465ms; SamplesPerSecond = 2063 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19729199; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12857373; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12203s; TotalTimePerSample = 0.48814ms; 
SamplesPerSecond = 2048 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13867822; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12274s; TotalTimePerSample = 0.49096ms; SamplesPerSecond = 2036 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12786084; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12172s; TotalTimePerSample = 0.48690ms; SamplesPerSecond = 2053 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16643262; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20440333; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12238s; TotalTimePerSample = 0.48952ms; SamplesPerSecond = 2042 -MPI Rank 0: Epoch[ 2 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14566259; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12230s; TotalTimePerSample = 0.48921ms; SamplesPerSecond = 2044 -MPI Rank 0: Finished Epoch[ 2 of 10]: [Training Set] TrainLossPerSample = 0.20373113; EvalErrPerSample = 0.082699999; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.92998 -MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.50774607; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.12561s; TotalTimePerSample = 0.50245ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.43388910; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12527s; TotalTimePerSample = 0.50106ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.36674852; EvalErr[0]PerSample = 
0.08800000; TotalTime = 0.12554s; TotalTimePerSample = 0.50217ms; SamplesPerSecond = 1991 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.33768746; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12523s; TotalTimePerSample = 0.50093ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.30320932; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.29576032; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12563s; TotalTimePerSample = 0.50252ms; SamplesPerSecond = 1989 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24924483; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12499s; TotalTimePerSample = 0.49998ms; SamplesPerSecond = 2000 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24632409; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12555s; TotalTimePerSample = 0.50219ms; SamplesPerSecond = 1991 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20943152; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12526s; TotalTimePerSample = 0.50102ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19115992; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12537s; TotalTimePerSample = 0.50147ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17923227; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50020ms; SamplesPerSecond = 1999 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17075420; EvalErr[0]PerSample = 0.06400000; TotalTime = 
0.12524s; TotalTimePerSample = 0.50097ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14442369; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17753818; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12514s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15087853; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50084ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19253021; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50135ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12527s; TotalTimePerSample = 0.50109ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12481s; TotalTimePerSample = 0.49925ms; SamplesPerSecond = 2002 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12524s; TotalTimePerSample = 0.50094ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12494s; TotalTimePerSample = 0.49976ms; SamplesPerSecond = 2000 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12540s; 
TotalTimePerSample = 0.50160ms; SamplesPerSecond = 1993 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12495s; TotalTimePerSample = 0.49979ms; SamplesPerSecond = 2000 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14684503; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12495s; TotalTimePerSample = 0.49981ms; SamplesPerSecond = 2000 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15322116; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12531s; TotalTimePerSample = 0.50122ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50092ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13683832; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12499s; TotalTimePerSample = 0.49994ms; SamplesPerSecond = 2000 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18621189; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12490s; TotalTimePerSample = 0.49962ms; SamplesPerSecond = 2001 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19408050; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12603s; TotalTimePerSample = 0.50412ms; SamplesPerSecond = 1983 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12562s; 
TotalTimePerSample = 0.50249ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50063ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12563s; TotalTimePerSample = 0.50254ms; SamplesPerSecond = 1989 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15108452; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12518s; TotalTimePerSample = 0.50071ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19729185; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12573s; TotalTimePerSample = 0.50293ms; SamplesPerSecond = 1988 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12857333; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12511s; TotalTimePerSample = 0.50046ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12548s; TotalTimePerSample = 0.50194ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12786051; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12555s; TotalTimePerSample = 0.50219ms; SamplesPerSecond = 1991 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12504s; TotalTimePerSample = 0.50016ms; SamplesPerSecond = 1999 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20440408; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12571s; 
TotalTimePerSample = 0.50285ms; SamplesPerSecond = 1988 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566237; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12526s; TotalTimePerSample = 0.50105ms; SamplesPerSecond = 1995 +MPI Rank 0: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373026; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.013055 +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 0: starting epoch 2 at record count 20000, and file position 0 MPI Rank 0: already there from last epoch MPI Rank 0: MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32). -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12590086; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12915s; TotalTimePerSample = 0.51660ms; SamplesPerSecond = 1935 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17780226; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12396s; TotalTimePerSample = 0.49586ms; SamplesPerSecond = 2016 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14417633; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12281s; TotalTimePerSample = 0.49125ms; SamplesPerSecond = 2035 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15796880; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12283s; TotalTimePerSample = 0.49131ms; SamplesPerSecond = 2035 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17002991; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12303s; TotalTimePerSample = 0.49212ms; SamplesPerSecond = 2032 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; 
TrainLossPerSample = 0.18262109; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12230s; TotalTimePerSample = 0.48918ms; SamplesPerSecond = 2044 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14643688; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12105s; TotalTimePerSample = 0.48420ms; SamplesPerSecond = 2065 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18030518; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12216s; TotalTimePerSample = 0.48862ms; SamplesPerSecond = 2046 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15846142; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12216s; TotalTimePerSample = 0.48865ms; SamplesPerSecond = 2046 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14486536; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12135s; TotalTimePerSample = 0.48540ms; SamplesPerSecond = 2060 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13469091; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12150s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12149s; TotalTimePerSample = 0.48594ms; SamplesPerSecond = 2057 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.11641297; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12268s; TotalTimePerSample = 0.49073ms; SamplesPerSecond = 2037 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16786633; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12217s; TotalTimePerSample = 0.48868ms; SamplesPerSecond = 2046 -MPI Rank 0: Epoch[ 3 of 
10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12811548; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17257836; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12253s; TotalTimePerSample = 0.49013ms; SamplesPerSecond = 2040 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17623682; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12012s; TotalTimePerSample = 0.48046ms; SamplesPerSecond = 2081 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14121118; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12139s; TotalTimePerSample = 0.48557ms; SamplesPerSecond = 2059 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19243409; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12070s; TotalTimePerSample = 0.48279ms; SamplesPerSecond = 2071 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20908155; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12150s; TotalTimePerSample = 0.48600ms; SamplesPerSecond = 2057 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18472095; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12150s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18185547; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12188s; TotalTimePerSample = 0.48750ms; SamplesPerSecond = 2051 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14074194; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12174s; TotalTimePerSample 
= 0.48698ms; SamplesPerSecond = 2053 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14871632; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12170s; TotalTimePerSample = 0.48680ms; SamplesPerSecond = 2054 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20299682; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12216s; TotalTimePerSample = 0.48864ms; SamplesPerSecond = 2046 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12852076; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12055s; TotalTimePerSample = 0.48221ms; SamplesPerSecond = 2073 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18660498; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12199s; TotalTimePerSample = 0.48796ms; SamplesPerSecond = 2049 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19576025; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12240s; TotalTimePerSample = 0.48961ms; SamplesPerSecond = 2042 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16667627; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12526172; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12060s; TotalTimePerSample = 0.48238ms; SamplesPerSecond = 2073 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17391992; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12106s; TotalTimePerSample = 0.48423ms; SamplesPerSecond = 2065 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12281641; 
EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48570ms; SamplesPerSecond = 2058 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14759424; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12211s; TotalTimePerSample = 0.48846ms; SamplesPerSecond = 2047 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19801368; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12210s; TotalTimePerSample = 0.48840ms; SamplesPerSecond = 2047 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12593359; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48971ms; SamplesPerSecond = 2042 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13756640; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12192s; TotalTimePerSample = 0.48767ms; SamplesPerSecond = 2050 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48976ms; SamplesPerSecond = 2041 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16654395; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12274s; TotalTimePerSample = 0.49097ms; SamplesPerSecond = 2036 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20658936; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12091s; TotalTimePerSample = 0.48365ms; SamplesPerSecond = 2067 -MPI Rank 0: Epoch[ 3 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14583300; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48834ms; SamplesPerSecond = 2047 -MPI Rank 0: Finished Epoch[ 3 of 10]: [Training Set] 
TrainLossPerSample = 0.15948617; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.919512 -MPI Rank 0: Starting Epoch 4: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12528s; TotalTimePerSample = 0.50112ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17780230; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12502s; TotalTimePerSample = 0.50008ms; SamplesPerSecond = 1999 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12568s; TotalTimePerSample = 0.50270ms; SamplesPerSecond = 1989 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15796896; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12536s; TotalTimePerSample = 0.50144ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17003000; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12538s; TotalTimePerSample = 0.50150ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50135ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14643695; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12565s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12593s; TotalTimePerSample = 0.50374ms; SamplesPerSecond = 1985 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 81- 
90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12554s; TotalTimePerSample = 0.50218ms; SamplesPerSecond = 1991 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12554s; TotalTimePerSample = 0.50218ms; SamplesPerSecond = 1991 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13469094; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12553s; TotalTimePerSample = 0.50213ms; SamplesPerSecond = 1991 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12528s; TotalTimePerSample = 0.50113ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12520s; TotalTimePerSample = 0.50080ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12589s; TotalTimePerSample = 0.50356ms; SamplesPerSecond = 1985 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12542s; TotalTimePerSample = 0.50168ms; SamplesPerSecond = 1993 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12521s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19243442; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12554s; TotalTimePerSample = 0.50215ms; SamplesPerSecond = 1991 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12517s; TotalTimePerSample = 0.50070ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12558s; TotalTimePerSample = 0.50231ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18185536; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12517s; TotalTimePerSample = 0.50067ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14074204; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12509s; TotalTimePerSample = 0.50034ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20299705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12563s; TotalTimePerSample = 0.50250ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12852037; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.18660440; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12532s; TotalTimePerSample = 0.50126ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12488s; TotalTimePerSample = 0.49953ms; SamplesPerSecond = 2001 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12595s; TotalTimePerSample = 0.50380ms; SamplesPerSecond = 1984 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12526168; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12516s; TotalTimePerSample = 0.50065ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17392133; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12557s; TotalTimePerSample = 0.50228ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12509s; TotalTimePerSample = 0.50035ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12529s; TotalTimePerSample = 0.50114ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12507s; TotalTimePerSample = 0.50028ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50136ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50136ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16654368; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12549s; TotalTimePerSample = 0.50197ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20658950; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12566s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989 +MPI Rank 0: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.016251 +MPI Rank 0: Starting Epoch 4: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 0: starting epoch 3 at record count 30000, and file position 0 MPI Rank 0: already there from last epoch MPI Rank 0: MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12758s; TotalTimePerSample = 0.51030ms; SamplesPerSecond = 1959 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12300s; TotalTimePerSample = 0.49200ms; SamplesPerSecond = 2032 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14239721; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12166s; TotalTimePerSample = 0.48662ms; SamplesPerSecond = 2054 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15630139; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12203s; TotalTimePerSample = 0.48814ms; SamplesPerSecond = 2048 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16935523; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12092s; TotalTimePerSample = 0.48370ms; SamplesPerSecond = 2067 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18198816; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12086s; TotalTimePerSample = 0.48344ms; SamplesPerSecond = 2068 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14475952; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12254s; TotalTimePerSample = 0.49015ms; SamplesPerSecond = 2040 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18021594; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12209s; TotalTimePerSample = 0.48835ms; SamplesPerSecond = 2047 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15849304; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12486s; 
TotalTimePerSample = 0.49944ms; SamplesPerSecond = 2002 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14474402; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48872ms; SamplesPerSecond = 2046 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13362928; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12226s; TotalTimePerSample = 0.48906ms; SamplesPerSecond = 2044 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13708325; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12182s; TotalTimePerSample = 0.48729ms; SamplesPerSecond = 2052 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.11569763; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12245s; TotalTimePerSample = 0.48979ms; SamplesPerSecond = 2041 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16892321; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12148s; TotalTimePerSample = 0.48593ms; SamplesPerSecond = 2057 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12752125; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12131s; TotalTimePerSample = 0.48522ms; SamplesPerSecond = 2060 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17100880; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12211s; TotalTimePerSample = 0.48845ms; SamplesPerSecond = 2047 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17660449; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12219s; TotalTimePerSample = 0.48878ms; SamplesPerSecond = 2045 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.14105836; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12170s; TotalTimePerSample = 0.48681ms; SamplesPerSecond = 2054 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19333544; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12124s; TotalTimePerSample = 0.48496ms; SamplesPerSecond = 2062 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20859498; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12071s; TotalTimePerSample = 0.48282ms; SamplesPerSecond = 2071 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18499707; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12381s; TotalTimePerSample = 0.49524ms; SamplesPerSecond = 2019 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18152441; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12164s; TotalTimePerSample = 0.48656ms; SamplesPerSecond = 2055 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14037134; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12261s; TotalTimePerSample = 0.49044ms; SamplesPerSecond = 2038 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14866894; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12391s; TotalTimePerSample = 0.49566ms; SamplesPerSecond = 2017 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20347705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12231s; TotalTimePerSample = 0.48922ms; SamplesPerSecond = 2044 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12815039; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12248s; TotalTimePerSample = 0.48991ms; SamplesPerSecond = 2041 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 
261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18672803; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12289s; TotalTimePerSample = 0.49156ms; SamplesPerSecond = 2034 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19552930; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16452637; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12195s; TotalTimePerSample = 0.48782ms; SamplesPerSecond = 2049 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12461865; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12028s; TotalTimePerSample = 0.48113ms; SamplesPerSecond = 2078 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17285107; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12185s; TotalTimePerSample = 0.48742ms; SamplesPerSecond = 2051 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12253613; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12355s; TotalTimePerSample = 0.49419ms; SamplesPerSecond = 2023 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14723291; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12238s; TotalTimePerSample = 0.48954ms; SamplesPerSecond = 2042 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19789551; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12366s; TotalTimePerSample = 0.49465ms; SamplesPerSecond = 2021 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12575878; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12262s; TotalTimePerSample = 0.49046ms; 
SamplesPerSecond = 2038 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13745947; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12227s; TotalTimePerSample = 0.48906ms; SamplesPerSecond = 2044 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12839746; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12155s; TotalTimePerSample = 0.48621ms; SamplesPerSecond = 2056 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16647315; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20679444; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12144s; TotalTimePerSample = 0.48576ms; SamplesPerSecond = 2058 -MPI Rank 0: Epoch[ 4 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14585204; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12219s; TotalTimePerSample = 0.48876ms; SamplesPerSecond = 2045 -MPI Rank 0: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15914927; EvalErrPerSample = 0.076700002; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.927898 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50158ms; SamplesPerSecond = 1993 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18070514; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12543s; TotalTimePerSample = 0.50172ms; SamplesPerSecond = 1993 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14239731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 
1989 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12541s; TotalTimePerSample = 0.50163ms; SamplesPerSecond = 1993 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12543s; TotalTimePerSample = 0.50174ms; SamplesPerSecond = 1993 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12546s; TotalTimePerSample = 0.50182ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12534s; TotalTimePerSample = 0.50134ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18021602; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12470s; TotalTimePerSample = 0.49880ms; SamplesPerSecond = 2004 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12530s; TotalTimePerSample = 0.50120ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14474426; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12567s; TotalTimePerSample = 0.50270ms; SamplesPerSecond = 1989 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12452s; TotalTimePerSample = 0.49810ms; SamplesPerSecond = 2007 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13708300; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12525s; TotalTimePerSample = 0.50101ms; SamplesPerSecond = 1995 +MPI Rank 0: 
Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11569776; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12514s; TotalTimePerSample = 0.50057ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16892330; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12558s; TotalTimePerSample = 0.50232ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12752163; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12515s; TotalTimePerSample = 0.50060ms; SamplesPerSecond = 1997 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 1989 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12500s; TotalTimePerSample = 0.50000ms; SamplesPerSecond = 1999 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14105804; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12543s; TotalTimePerSample = 0.50173ms; SamplesPerSecond = 1993 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19333553; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12536s; TotalTimePerSample = 0.50145ms; SamplesPerSecond = 1994 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20859525; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12520s; TotalTimePerSample = 0.50081ms; SamplesPerSecond = 1996 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12599s; TotalTimePerSample = 0.50396ms; SamplesPerSecond = 1984 +MPI Rank 0: Epoch[ 4 
of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50186ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14037158; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12545s; TotalTimePerSample = 0.50178ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50189ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20347748; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50235ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12815013; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12526s; TotalTimePerSample = 0.50102ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12549s; TotalTimePerSample = 0.50196ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19552989; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12496s; TotalTimePerSample = 0.49984ms; SamplesPerSecond = 2000 +MPI Rank 0: Epoch[ 4 of 
4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12577s; TotalTimePerSample = 0.50309ms; SamplesPerSecond = 1987 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14723333; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12547s; TotalTimePerSample = 0.50189ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12617s; TotalTimePerSample = 0.50469ms; SamplesPerSecond = 1981 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12609s; TotalTimePerSample = 0.50438ms; SamplesPerSecond = 1982 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12558s; TotalTimePerSample = 0.50230ms; SamplesPerSecond = 1990 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12498s; TotalTimePerSample = 0.49990ms; SamplesPerSecond = 2000 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12548s; TotalTimePerSample = 0.50191ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12528s; TotalTimePerSample = 0.50111ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 4 of 
4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12570s; TotalTimePerSample = 0.50281ms; SamplesPerSecond = 1988 +MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.018144 +MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU MPI Rank 0: COMPLETED MPI Rank 0: ~MPIWrapper -MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1 -MPI Rank 1: ------------------------------------------------------------------- -MPI Rank 1: Build info: -MPI Rank 1: -MPI Rank 1: Built time: Aug 25 2015 17:44:46 -MPI Rank 1: Last modified date: Mon Aug 24 16:38:42 2015 -MPI Rank 1: Built by amitaga on Amitaga-Win-DT3 -MPI Rank 1: Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\ -MPI Rank 1: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 -MPI Rank 1: ------------------------------------------------------------------- -MPI Rank 1: running on Amitaga-Win-DT3 at 2015/08/26 01:48:43 -MPI Rank 1: command line options: -MPI Rank 1: configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 1: running on localhost at 2015/10/24 12:44:54 +MPI Rank 1: command line: +MPI Rank 1: /home/mluser/src/cplx_master/build/debug/bin/cntk 
configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 1: MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 1: deviceId=$DeviceId$ @@ -544,12 +663,11 @@ MPI Rank 1: minibatchSize=25 MPI Rank 1: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 1: momentumPerMB=0.9 MPI Rank 1: dropoutRate=0.0 -MPI Rank 1: maxEpochs=10 +MPI Rank 1: maxEpochs=4 MPI Rank 1: ParallelTrain=[ MPI Rank 1: parallelizationMethod=DataParallelSGD MPI Rank 1: DataParallelSGD=[ MPI Rank 1: gradientBits=1 -MPI Rank 1: parallelizationStartEpoch=1 MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] @@ -571,12 +689,13 @@ MPI Rank 1: labelMappingFile=$DataDir$/SimpleMapping.txt MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] -MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu -MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 1: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu +MPI Rank 1: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data +MPI Rank 1: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
MPI Rank 1: DeviceId=0 -MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 1: precision=float MPI Rank 1: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 1: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 1: MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 1: @@ -587,7 +706,7 @@ MPI Rank 1: precision=float MPI Rank 1: parallelTrain=true MPI Rank 1: SimpleMultiGPU=[ MPI Rank 1: action=train -MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 1: modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn MPI Rank 1: deviceId=0 MPI Rank 1: traceLevel=1 MPI Rank 1: SimpleNetworkBuilder=[ @@ -606,18 +725,17 @@ MPI Rank 1: minibatchSize=25 MPI Rank 1: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 1: momentumPerMB=0.9 MPI Rank 1: dropoutRate=0.0 -MPI Rank 1: maxEpochs=10 +MPI Rank 1: maxEpochs=4 MPI Rank 1: ParallelTrain=[ MPI Rank 1: parallelizationMethod=DataParallelSGD MPI Rank 1: DataParallelSGD=[ MPI Rank 1: gradientBits=1 -MPI Rank 1: parallelizationStartEpoch=1 MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: reader=[ MPI Rank 1: readerType=UCIFastReader -MPI Rank 1: file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 1: file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt MPI Rank 1: miniBatchMode=Partial MPI Rank 1: randomize=None MPI Rank 1: verbosity=1 @@ -629,29 +747,31 @@ MPI Rank 1: labels=[ MPI Rank 1: start=2 MPI Rank 1: dim=1 MPI Rank 1: labelDim=2 -MPI Rank 1: labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI 
Rank 1: labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] -MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu -MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 1: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu +MPI Rank 1: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data +MPI Rank 1: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. MPI Rank 1: DeviceId=0 -MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 1: precision=float MPI Rank 1: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 1: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 1: MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 1: MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 1: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU -MPI Rank 1: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 1: configparameters: SimpleMultiGPU.config:ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
+MPI Rank 1: configparameters: SimpleMultiGPU.config:DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data MPI Rank 1: configparameters: SimpleMultiGPU.config:deviceId=0 MPI Rank 1: configparameters: SimpleMultiGPU.config:parallelTrain=true MPI Rank 1: configparameters: SimpleMultiGPU.config:precision=float -MPI Rank 1: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 1: configparameters: SimpleMultiGPU.config:RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu MPI Rank 1: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[ MPI Rank 1: action=train -MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 1: modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn MPI Rank 1: deviceId=0 MPI Rank 1: traceLevel=1 MPI Rank 1: SimpleNetworkBuilder=[ @@ -670,18 +790,17 @@ MPI Rank 1: minibatchSize=25 MPI Rank 1: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 1: momentumPerMB=0.9 MPI Rank 1: dropoutRate=0.0 -MPI Rank 1: maxEpochs=10 +MPI Rank 1: maxEpochs=4 MPI Rank 1: ParallelTrain=[ MPI Rank 1: parallelizationMethod=DataParallelSGD MPI Rank 1: DataParallelSGD=[ MPI Rank 1: gradientBits=1 -MPI Rank 1: parallelizationStartEpoch=1 MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: reader=[ MPI Rank 1: readerType=UCIFastReader -MPI Rank 1: file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 1: file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt MPI Rank 1: miniBatchMode=Partial MPI Rank 1: randomize=None MPI Rank 1: verbosity=1 @@ -693,45 +812,100 @@ MPI Rank 1: labels=[ MPI Rank 1: start=2 MPI Rank 1: dim=1 MPI Rank 1: labelDim=2 -MPI Rank 1: 
labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 1: labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] MPI Rank 1: -MPI Rank 1: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 1: configparameters: SimpleMultiGPU.config:stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 1: command: SimpleMultiGPU MPI Rank 1: precision = float +MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 1: CNTKCommandTrainInfo: SimpleMultiGPU : 4 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +MPI Rank 1: CNTKCommandTrainBegin: SimpleMultiGPU MPI Rank 1: SimpleNetworkBuilder Using GPU 0 -MPI Rank 1: reading uci file E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 1: reading uci file /home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt +MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 1: GetTrainCriterionNodes ... MPI Rank 1: GetEvalCriterionNodes ... MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node CrossEntropyWithSoftmax +MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. 
MPI Rank 1: -MPI Rank 1: Validating --> labels = InputValue -MPI Rank 1: Validating --> W2 = LearnableParameter -MPI Rank 1: Validating --> W1 = LearnableParameter -MPI Rank 1: Validating --> W0 = LearnableParameter -MPI Rank 1: Validating --> features = InputValue -MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, 3]) -MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, 3]) -MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 3]) -MPI Rank 1: Validating --> B0 = LearnableParameter -MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, 3], B0[50, 1]) -MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, 3]) -MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 3]) -MPI Rank 1: Validating --> B1 = LearnableParameter -MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, 3], B1[50, 1]) -MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, 3]) -MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 3]) -MPI Rank 1: Validating --> B2 = LearnableParameter -MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, 3], B2[2, 1]) -MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, 3], HLast[2, 3]) +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) 
-> [2, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Precomputing --> 3 PreCompute nodes found. 
MPI Rank 1: -MPI Rank 1: Found 3 PreCompute nodes MPI Rank 1: NodeName: InvStdOfFeatures MPI Rank 1: NodeName: MeanOfFeatures MPI Rank 1: NodeName: Prior @@ -742,250 +916,320 @@ MPI Rank 1: starting epoch 0 at record count 0, and file position 0 MPI Rank 1: already there from last epoch MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node InvStdOfFeatures +MPI Rank 1: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 1: -MPI Rank 1: Validating --> features = InputValue -MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, 25]) +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: Validating for node InvStdOfFeatures, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 1: MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node MeanOfFeatures +MPI Rank 1: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 1: -MPI Rank 1: Validating --> features = InputValue -MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, 25]) +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: Validating for node MeanOfFeatures, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 1: MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node Prior +MPI Rank 1: Validating for node Prior. 
2 nodes to process in pass 1. MPI Rank 1: -MPI Rank 1: Validating --> labels = InputValue -MPI Rank 1: Validating --> Prior = Mean(labels[2, 25]) +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: Validating for node Prior, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. +MPI Rank 1: +MPI Rank 1: Precomputing --> Completed. MPI Rank 1: MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. -MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.020000 momentum = 0.900001 +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 MPI Rank 1: starting epoch 0 at record count 0, and file position 0 MPI Rank 1: already there from last epoch MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 1: -MPI Rank 1: -MPI Rank 1: Validating node EvalErrorPrediction -MPI Rank 1: -MPI Rank 1: Validating --> labels = InputValue -MPI Rank 1: Validating --> W2 = LearnableParameter -MPI Rank 1: Validating --> W1 = LearnableParameter -MPI Rank 1: Validating --> W0 = LearnableParameter -MPI Rank 1: Validating --> features = InputValue -MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, 6]) -MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, 6]) -MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 6], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 6]) -MPI Rank 1: Validating --> B0 = LearnableParameter -MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, 6], B0[50, 1]) -MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, 6]) -MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 6]) -MPI Rank 1: Validating --> B1 = LearnableParameter -MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, 6], B1[50, 1]) -MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, 6]) -MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 6]) -MPI Rank 1: Validating --> B2 = LearnableParameter -MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, 6], B2[2, 1]) -MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, 6], HLast[2, 6]) -MPI Rank 1: -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70007980; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.19936s; TotalTimePerSample = 0.79746ms; SamplesPerSecond = 1253 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71514523; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.15552s; TotalTimePerSample = 0.62208ms; SamplesPerSecond = 1607 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.72945595; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.14887s; TotalTimePerSample = 0.59550ms; SamplesPerSecond = 1679 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70079005; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.14476s; TotalTimePerSample = 0.57905ms; SamplesPerSecond = 1726 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70605618; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.14228s; TotalTimePerSample = 0.56912ms; SamplesPerSecond = 1757 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71572435; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13675s; TotalTimePerSample = 0.54699ms; SamplesPerSecond = 1828 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72149903; EvalErr[0]PerSample = 0.47999999; TotalTime = 0.13631s; TotalTimePerSample = 0.54524ms; SamplesPerSecond = 1834 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.79845655; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13435s; TotalTimePerSample = 0.53738ms; SamplesPerSecond = 1860 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69665188; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.13044s; TotalTimePerSample = 0.52174ms; SamplesPerSecond = 1916 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70723337; EvalErr[0]PerSample = 0.49200001; TotalTime = 0.12786s; TotalTimePerSample = 0.51146ms; SamplesPerSecond = 1955 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71420360; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12631s; TotalTimePerSample = 0.50524ms; SamplesPerSecond = 1979 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 111- 120 of 
-171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69535255; EvalErr[0]PerSample = 0.43599999; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70078611; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12261s; TotalTimePerSample = 0.49046ms; SamplesPerSecond = 2038 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71857810; EvalErr[0]PerSample = 0.54799998; TotalTime = 0.12293s; TotalTimePerSample = 0.49171ms; SamplesPerSecond = 2033 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72088283; EvalErr[0]PerSample = 0.48800001; TotalTime = 0.12233s; TotalTimePerSample = 0.48930ms; SamplesPerSecond = 2043 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71798825; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12123s; TotalTimePerSample = 0.48494ms; SamplesPerSecond = 2062 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.74162209; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12330s; TotalTimePerSample = 0.49320ms; SamplesPerSecond = 2027 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71835059; EvalErr[0]PerSample = 0.51599997; TotalTime = 0.12340s; TotalTimePerSample = 0.49358ms; SamplesPerSecond = 2025 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71529394; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12334s; TotalTimePerSample = 0.49336ms; SamplesPerSecond = 2026 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71727639; EvalErr[0]PerSample = 0.53200001; TotalTime = 0.12473s; TotalTimePerSample = 0.49890ms; 
SamplesPerSecond = 2004 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71745312; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12362s; TotalTimePerSample = 0.49447ms; SamplesPerSecond = 2022 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72088087; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12197s; TotalTimePerSample = 0.48789ms; SamplesPerSecond = 2049 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72006541; EvalErr[0]PerSample = 0.50800002; TotalTime = 0.12266s; TotalTimePerSample = 0.49064ms; SamplesPerSecond = 2038 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71275192; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12162s; TotalTimePerSample = 0.48650ms; SamplesPerSecond = 2055 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69645119; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12122s; TotalTimePerSample = 0.48486ms; SamplesPerSecond = 2062 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70129883; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12089s; TotalTimePerSample = 0.48356ms; SamplesPerSecond = 2068 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70768166; EvalErr[0]PerSample = 0.54400003; TotalTime = 0.12174s; TotalTimePerSample = 0.48697ms; SamplesPerSecond = 2053 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69744140; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69266015; EvalErr[0]PerSample = 
0.44800001; TotalTime = 0.12181s; TotalTimePerSample = 0.48722ms; SamplesPerSecond = 2052 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69347072; EvalErr[0]PerSample = 0.49599999; TotalTime = 0.12233s; TotalTimePerSample = 0.48930ms; SamplesPerSecond = 2043 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69257420; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.12288s; TotalTimePerSample = 0.49154ms; SamplesPerSecond = 2034 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.68625975; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12417s; TotalTimePerSample = 0.49669ms; SamplesPerSecond = 2013 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69064063; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.12339s; TotalTimePerSample = 0.49354ms; SamplesPerSecond = 2026 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70192385; EvalErr[0]PerSample = 0.46000001; TotalTime = 0.12176s; TotalTimePerSample = 0.48704ms; SamplesPerSecond = 2053 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69058985; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.12238s; TotalTimePerSample = 0.48952ms; SamplesPerSecond = 2042 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.67041212; EvalErr[0]PerSample = 0.39199999; TotalTime = 0.12186s; TotalTimePerSample = 0.48744ms; SamplesPerSecond = 2051 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.65914255; EvalErr[0]PerSample = 0.35600001; TotalTime = 0.12264s; TotalTimePerSample = 0.49054ms; SamplesPerSecond = 2038 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen 
= 250; TrainLossPerSample = 0.63919920; EvalErr[0]PerSample = 0.36399999; TotalTime = 0.12265s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.61294138; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12143s; TotalTimePerSample = 0.48574ms; SamplesPerSecond = 2058 -MPI Rank 1: Epoch[ 1 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.55255663; EvalErr[0]PerSample = 0.18799999; TotalTime = 0.12034s; TotalTimePerSample = 0.48136ms; SamplesPerSecond = 2077 -MPI Rank 1: Finished Epoch[ 1 of 10]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.47350001; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.253972 -MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 1: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal. 
+MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 1- 10 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12718s; TotalTimePerSample = 0.50872ms; SamplesPerSecond = 1965 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 11- 20 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12478s; TotalTimePerSample = 0.49911ms; SamplesPerSecond = 2003 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 21- 30 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12484s; TotalTimePerSample = 0.49937ms; SamplesPerSecond = 2002 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 31- 40 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12471s; TotalTimePerSample = 0.49883ms; SamplesPerSecond = 2004 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 41- 50 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70605616; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12439s; TotalTimePerSample = 0.49754ms; SamplesPerSecond = 2009 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 51- 60 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12468s; TotalTimePerSample = 0.49870ms; SamplesPerSecond = 2005 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 61- 70 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.12451s; TotalTimePerSample = 0.49804ms; SamplesPerSecond = 2007 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 71- 80 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.79845605; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12436s; TotalTimePerSample = 0.49745ms; SamplesPerSecond = 2010 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 81- 90 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12441s; TotalTimePerSample = 0.49766ms; SamplesPerSecond = 2009 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 
91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50020ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71420344; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12494s; TotalTimePerSample = 0.49974ms; SamplesPerSecond = 2001 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12504s; TotalTimePerSample = 0.50015ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12504s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71857915; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.12470s; TotalTimePerSample = 0.49879ms; SamplesPerSecond = 2004 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.12489s; TotalTimePerSample = 0.49955ms; SamplesPerSecond = 2001 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71798839; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12496s; TotalTimePerSample = 0.49986ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12439s; TotalTimePerSample = 0.49757ms; SamplesPerSecond = 2009 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71835127; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.12480s; TotalTimePerSample = 0.49921ms; SamplesPerSecond = 2003 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: 
SamplesSeen = 250; TrainLossPerSample = 0.71529462; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12442s; TotalTimePerSample = 0.49767ms; SamplesPerSecond = 2009 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71727656; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12438s; TotalTimePerSample = 0.49752ms; SamplesPerSecond = 2009 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12477s; TotalTimePerSample = 0.49908ms; SamplesPerSecond = 2003 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088397; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12431s; TotalTimePerSample = 0.49724ms; SamplesPerSecond = 2011 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72006808; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12485s; TotalTimePerSample = 0.49941ms; SamplesPerSecond = 2002 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50034ms; SamplesPerSecond = 1998 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12464s; TotalTimePerSample = 0.49854ms; SamplesPerSecond = 2005 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70129697; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12495s; TotalTimePerSample = 0.49980ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12487s; TotalTimePerSample = 0.49948ms; SamplesPerSecond = 2002 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; 
TrainLossPerSample = 0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12557s; TotalTimePerSample = 0.50229ms; SamplesPerSecond = 1990 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69266186; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.12540s; TotalTimePerSample = 0.50162ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12498s; TotalTimePerSample = 0.49991ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69257410; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12543s; TotalTimePerSample = 0.50172ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12516s; TotalTimePerSample = 0.50066ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12542s; TotalTimePerSample = 0.50169ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70192153; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12511s; TotalTimePerSample = 0.50042ms; SamplesPerSecond = 1998 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50205ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.67041489; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12580s; TotalTimePerSample = 0.50319ms; SamplesPerSecond = 1987 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample 
= 0.65913971; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50137ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.63919874; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.12514s; TotalTimePerSample = 0.50057ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.61293878; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12522s; TotalTimePerSample = 0.50088ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.55255340; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.12484s; TotalTimePerSample = 0.49937ms; SamplesPerSecond = 2002 +MPI Rank 1: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.008953 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 1: starting epoch 1 at record count 10000, and file position 0 MPI Rank 1: already there from last epoch MPI Rank 1: MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.50775200; EvalErr[0]PerSample = 0.23999999; TotalTime = 0.12819s; TotalTimePerSample = 0.51274ms; SamplesPerSecond = 1950 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.43389454; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12189s; TotalTimePerSample = 0.48755ms; SamplesPerSecond = 2051 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.36675408; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12077s; TotalTimePerSample = 0.48309ms; SamplesPerSecond = 2070 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.33769274; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12255s; TotalTimePerSample = 0.49019ms; SamplesPerSecond = 2040 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.30321363; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12133s; TotalTimePerSample = 0.48531ms; SamplesPerSecond = 2060 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.29576379; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12054s; TotalTimePerSample = 0.48217ms; SamplesPerSecond = 2073 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.24924731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.24632569; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12330s; TotalTimePerSample = 0.49319ms; SamplesPerSecond = 2027 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20943311; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12290s; 
TotalTimePerSample = 0.49159ms; SamplesPerSecond = 2034 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19116065; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12191s; TotalTimePerSample = 0.48764ms; SamplesPerSecond = 2050 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17923315; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48973ms; SamplesPerSecond = 2041 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17075513; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12160s; TotalTimePerSample = 0.48641ms; SamplesPerSecond = 2055 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14442432; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12216s; TotalTimePerSample = 0.48864ms; SamplesPerSecond = 2046 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17753857; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12282s; TotalTimePerSample = 0.49126ms; SamplesPerSecond = 2035 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15087914; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12184s; TotalTimePerSample = 0.48735ms; SamplesPerSecond = 2051 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19252978; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12266s; TotalTimePerSample = 0.49063ms; SamplesPerSecond = 2038 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17830664; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12268s; TotalTimePerSample = 0.49071ms; SamplesPerSecond = 2037 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.15115429; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12258s; TotalTimePerSample = 0.49032ms; SamplesPerSecond = 2039 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19135889; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12208s; TotalTimePerSample = 0.48832ms; SamplesPerSecond = 2047 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.21491407; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12305s; TotalTimePerSample = 0.49221ms; SamplesPerSecond = 2031 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18682373; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12157s; TotalTimePerSample = 0.48628ms; SamplesPerSecond = 2056 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18483251; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12136s; TotalTimePerSample = 0.48545ms; SamplesPerSecond = 2059 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14684522; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15322119; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12309s; TotalTimePerSample = 0.49234ms; SamplesPerSecond = 2031 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19882520; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12375s; TotalTimePerSample = 0.49499ms; SamplesPerSecond = 2020 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13683788; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48973ms; SamplesPerSecond = 2041 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 
261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18621191; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12194s; TotalTimePerSample = 0.48776ms; SamplesPerSecond = 2050 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19408056; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12169s; TotalTimePerSample = 0.48674ms; SamplesPerSecond = 2054 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17298096; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12219s; TotalTimePerSample = 0.48877ms; SamplesPerSecond = 2045 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13265137; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12194s; TotalTimePerSample = 0.48774ms; SamplesPerSecond = 2050 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17627051; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12157s; TotalTimePerSample = 0.48627ms; SamplesPerSecond = 2056 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12734570; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12411s; TotalTimePerSample = 0.49643ms; SamplesPerSecond = 2014 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15108399; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12117s; TotalTimePerSample = 0.48467ms; SamplesPerSecond = 2063 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19729199; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12269s; TotalTimePerSample = 0.49077ms; SamplesPerSecond = 2037 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12857373; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12203s; TotalTimePerSample = 0.48814ms; 
SamplesPerSecond = 2048 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13867822; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12273s; TotalTimePerSample = 0.49094ms; SamplesPerSecond = 2036 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12786084; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12171s; TotalTimePerSample = 0.48684ms; SamplesPerSecond = 2054 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16643262; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12108s; TotalTimePerSample = 0.48434ms; SamplesPerSecond = 2064 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20440333; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12239s; TotalTimePerSample = 0.48954ms; SamplesPerSecond = 2042 -MPI Rank 1: Epoch[ 2 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14566259; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12218s; TotalTimePerSample = 0.48873ms; SamplesPerSecond = 2046 -MPI Rank 1: Finished Epoch[ 2 of 10]: [Training Set] TrainLossPerSample = 0.20373113; EvalErrPerSample = 0.082699999; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.929569 -MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.50774607; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.12601s; TotalTimePerSample = 0.50405ms; SamplesPerSecond = 1983 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.43388910; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12544s; TotalTimePerSample = 0.50174ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.36674852; EvalErr[0]PerSample = 
0.08800000; TotalTime = 0.12539s; TotalTimePerSample = 0.50155ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.33768746; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12522s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.30320932; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12500s; TotalTimePerSample = 0.49999ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.29576032; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12561s; TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24924483; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12491s; TotalTimePerSample = 0.49963ms; SamplesPerSecond = 2001 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24632409; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12565s; TotalTimePerSample = 0.50259ms; SamplesPerSecond = 1989 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20943152; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50078ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19115992; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12536s; TotalTimePerSample = 0.50143ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17923227; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17075420; EvalErr[0]PerSample = 0.06400000; TotalTime = 
0.12523s; TotalTimePerSample = 0.50092ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14442369; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12530s; TotalTimePerSample = 0.50118ms; SamplesPerSecond = 1995 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17753818; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15087853; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19253021; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12531s; TotalTimePerSample = 0.50123ms; SamplesPerSecond = 1995 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12532s; TotalTimePerSample = 0.50130ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12474s; TotalTimePerSample = 0.49895ms; SamplesPerSecond = 2004 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12524s; TotalTimePerSample = 0.50095ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12497s; TotalTimePerSample = 0.49987ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12528s; 
TotalTimePerSample = 0.50113ms; SamplesPerSecond = 1995 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12501s; TotalTimePerSample = 0.50002ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14684503; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12495s; TotalTimePerSample = 0.49978ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15322116; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12527s; TotalTimePerSample = 0.50109ms; SamplesPerSecond = 1995 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12518s; TotalTimePerSample = 0.50072ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13683832; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12507s; TotalTimePerSample = 0.50028ms; SamplesPerSecond = 1998 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18621189; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12489s; TotalTimePerSample = 0.49957ms; SamplesPerSecond = 2001 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19408050; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12602s; TotalTimePerSample = 0.50407ms; SamplesPerSecond = 1983 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50063ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12561s; 
TotalTimePerSample = 0.50245ms; SamplesPerSecond = 1990 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50038ms; SamplesPerSecond = 1998 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12580s; TotalTimePerSample = 0.50321ms; SamplesPerSecond = 1987 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15108452; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12484s; TotalTimePerSample = 0.49936ms; SamplesPerSecond = 2002 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19729185; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12592s; TotalTimePerSample = 0.50368ms; SamplesPerSecond = 1985 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12857333; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50020ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12550s; TotalTimePerSample = 0.50201ms; SamplesPerSecond = 1992 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12786051; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12550s; TotalTimePerSample = 0.50202ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12504s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20440408; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12576s; 
TotalTimePerSample = 0.50303ms; SamplesPerSecond = 1987 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566237; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12516s; TotalTimePerSample = 0.50066ms; SamplesPerSecond = 1997 +MPI Rank 1: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373026; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.013461 +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 1: starting epoch 2 at record count 20000, and file position 0 MPI Rank 1: already there from last epoch MPI Rank 1: MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32). -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12590086; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12914s; TotalTimePerSample = 0.51656ms; SamplesPerSecond = 1935 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17780226; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12397s; TotalTimePerSample = 0.49587ms; SamplesPerSecond = 2016 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14417633; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12282s; TotalTimePerSample = 0.49128ms; SamplesPerSecond = 2035 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15796880; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12283s; TotalTimePerSample = 0.49133ms; SamplesPerSecond = 2035 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17002991; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12300s; TotalTimePerSample = 0.49200ms; SamplesPerSecond = 2032 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; 
TrainLossPerSample = 0.18262109; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12231s; TotalTimePerSample = 0.48924ms; SamplesPerSecond = 2044 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14643688; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12105s; TotalTimePerSample = 0.48419ms; SamplesPerSecond = 2065 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18030518; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12216s; TotalTimePerSample = 0.48862ms; SamplesPerSecond = 2046 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15846142; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12217s; TotalTimePerSample = 0.48869ms; SamplesPerSecond = 2046 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14486536; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12135s; TotalTimePerSample = 0.48540ms; SamplesPerSecond = 2060 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13469091; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12151s; TotalTimePerSample = 0.48603ms; SamplesPerSecond = 2057 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12148s; TotalTimePerSample = 0.48593ms; SamplesPerSecond = 2057 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.11641297; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12268s; TotalTimePerSample = 0.49074ms; SamplesPerSecond = 2037 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16786633; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12218s; TotalTimePerSample = 0.48871ms; SamplesPerSecond = 2046 -MPI Rank 1: Epoch[ 3 of 
10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12811548; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17257836; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12252s; TotalTimePerSample = 0.49008ms; SamplesPerSecond = 2040 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17623682; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12011s; TotalTimePerSample = 0.48046ms; SamplesPerSecond = 2081 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14121118; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12138s; TotalTimePerSample = 0.48552ms; SamplesPerSecond = 2059 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19243409; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12070s; TotalTimePerSample = 0.48282ms; SamplesPerSecond = 2071 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20908155; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12151s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18472095; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12151s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18185547; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12187s; TotalTimePerSample = 0.48750ms; SamplesPerSecond = 2051 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14074194; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12175s; TotalTimePerSample 
= 0.48701ms; SamplesPerSecond = 2053 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14871632; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12169s; TotalTimePerSample = 0.48676ms; SamplesPerSecond = 2054 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20299682; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12217s; TotalTimePerSample = 0.48868ms; SamplesPerSecond = 2046 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12852076; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12056s; TotalTimePerSample = 0.48223ms; SamplesPerSecond = 2073 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18660498; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12199s; TotalTimePerSample = 0.48794ms; SamplesPerSecond = 2049 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19576025; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12241s; TotalTimePerSample = 0.48964ms; SamplesPerSecond = 2042 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16667627; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12267s; TotalTimePerSample = 0.49067ms; SamplesPerSecond = 2038 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12526172; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12060s; TotalTimePerSample = 0.48239ms; SamplesPerSecond = 2073 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17391992; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12147s; TotalTimePerSample = 0.48588ms; SamplesPerSecond = 2058 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12281641; 
EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12143s; TotalTimePerSample = 0.48574ms; SamplesPerSecond = 2058 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14759424; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19801368; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12211s; TotalTimePerSample = 0.48843ms; SamplesPerSecond = 2047 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12593359; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12245s; TotalTimePerSample = 0.48980ms; SamplesPerSecond = 2041 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13756640; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12191s; TotalTimePerSample = 0.48763ms; SamplesPerSecond = 2050 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48977ms; SamplesPerSecond = 2041 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16654395; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12275s; TotalTimePerSample = 0.49099ms; SamplesPerSecond = 2036 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20658936; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12092s; TotalTimePerSample = 0.48370ms; SamplesPerSecond = 2067 -MPI Rank 1: Epoch[ 3 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14583300; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047 -MPI Rank 1: Finished Epoch[ 3 of 10]: [Training Set] 
TrainLossPerSample = 0.15948617; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.919364 -MPI Rank 1: Starting Epoch 4: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12568s; TotalTimePerSample = 0.50274ms; SamplesPerSecond = 1989 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17780230; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12485s; TotalTimePerSample = 0.49939ms; SamplesPerSecond = 2002 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12581s; TotalTimePerSample = 0.50323ms; SamplesPerSecond = 1987 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15796896; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12536s; TotalTimePerSample = 0.50145ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17003000; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50160ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50136ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14643695; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50237ms; SamplesPerSecond = 1990 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12591s; TotalTimePerSample = 0.50362ms; SamplesPerSecond = 1985 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 81- 
90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50235ms; SamplesPerSecond = 1990 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12553s; TotalTimePerSample = 0.50213ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13469094; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12535s; TotalTimePerSample = 0.50142ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12529s; TotalTimePerSample = 0.50116ms; SamplesPerSecond = 1995 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12531s; TotalTimePerSample = 0.50123ms; SamplesPerSecond = 1995 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12510s; TotalTimePerSample = 0.50041ms; SamplesPerSecond = 1998 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12515s; TotalTimePerSample = 0.50060ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12592s; TotalTimePerSample = 0.50366ms; SamplesPerSecond = 1985 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12521s; TotalTimePerSample = 0.50083ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12537s; TotalTimePerSample = 0.50146ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19243442; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50205ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12523s; TotalTimePerSample = 0.50090ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50206ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18185536; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12499s; TotalTimePerSample = 0.49998ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14074204; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12514s; TotalTimePerSample = 0.50055ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12501s; TotalTimePerSample = 0.50005ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20299705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12576s; TotalTimePerSample = 0.50303ms; SamplesPerSecond = 1987 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12852037; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50085ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.18660440; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50149ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12482s; TotalTimePerSample = 0.49928ms; SamplesPerSecond = 2002 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12597s; TotalTimePerSample = 0.50387ms; SamplesPerSecond = 1984 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12526168; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12510s; TotalTimePerSample = 0.50040ms; SamplesPerSecond = 1998 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17392133; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12561s; TotalTimePerSample = 0.50245ms; SamplesPerSecond = 1990 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12501s; TotalTimePerSample = 0.50003ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12533s; TotalTimePerSample = 0.50131ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12529s; TotalTimePerSample = 0.50116ms; SamplesPerSecond = 1995 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16654368; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12546s; TotalTimePerSample = 0.50186ms; SamplesPerSecond = 1992 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20658950; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50140ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12562s; TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990 +MPI Rank 1: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.016666 +MPI Rank 1: Starting Epoch 4: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 1: starting epoch 3 at record count 30000, and file position 0 MPI Rank 1: already there from last epoch MPI Rank 1: MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12759s; TotalTimePerSample = 0.51034ms; SamplesPerSecond = 1959 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12299s; TotalTimePerSample = 0.49196ms; SamplesPerSecond = 2032 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14239721; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12169s; TotalTimePerSample = 0.48675ms; SamplesPerSecond = 2054 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15630139; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12204s; TotalTimePerSample = 0.48816ms; SamplesPerSecond = 2048 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16935523; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12093s; TotalTimePerSample = 0.48370ms; SamplesPerSecond = 2067 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18198816; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12085s; TotalTimePerSample = 0.48342ms; SamplesPerSecond = 2068 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14475952; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12254s; TotalTimePerSample = 0.49016ms; SamplesPerSecond = 2040 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18021594; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12209s; TotalTimePerSample = 0.48835ms; SamplesPerSecond = 2047 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15849304; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12486s; 
TotalTimePerSample = 0.49944ms; SamplesPerSecond = 2002 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14474402; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48872ms; SamplesPerSecond = 2046 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13362928; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12227s; TotalTimePerSample = 0.48908ms; SamplesPerSecond = 2044 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13708325; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12183s; TotalTimePerSample = 0.48730ms; SamplesPerSecond = 2052 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.11569763; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12245s; TotalTimePerSample = 0.48978ms; SamplesPerSecond = 2041 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16892321; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12149s; TotalTimePerSample = 0.48596ms; SamplesPerSecond = 2057 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12752125; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12130s; TotalTimePerSample = 0.48520ms; SamplesPerSecond = 2061 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17100880; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17660449; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12219s; TotalTimePerSample = 0.48877ms; SamplesPerSecond = 2045 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.14105836; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12170s; TotalTimePerSample = 0.48678ms; SamplesPerSecond = 2054 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19333544; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12125s; TotalTimePerSample = 0.48498ms; SamplesPerSecond = 2061 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20859498; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12070s; TotalTimePerSample = 0.48282ms; SamplesPerSecond = 2071 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18499707; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12381s; TotalTimePerSample = 0.49523ms; SamplesPerSecond = 2019 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18152441; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12165s; TotalTimePerSample = 0.48659ms; SamplesPerSecond = 2055 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14037134; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12260s; TotalTimePerSample = 0.49042ms; SamplesPerSecond = 2039 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14866894; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12392s; TotalTimePerSample = 0.49567ms; SamplesPerSecond = 2017 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20347705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12232s; TotalTimePerSample = 0.48928ms; SamplesPerSecond = 2043 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12815039; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12246s; TotalTimePerSample = 0.48984ms; SamplesPerSecond = 2041 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 
261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18672803; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12290s; TotalTimePerSample = 0.49159ms; SamplesPerSecond = 2034 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19552930; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16452637; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12196s; TotalTimePerSample = 0.48784ms; SamplesPerSecond = 2049 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12461865; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12028s; TotalTimePerSample = 0.48112ms; SamplesPerSecond = 2078 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17285107; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12185s; TotalTimePerSample = 0.48740ms; SamplesPerSecond = 2051 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12253613; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12355s; TotalTimePerSample = 0.49418ms; SamplesPerSecond = 2023 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14723291; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12238s; TotalTimePerSample = 0.48953ms; SamplesPerSecond = 2042 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19789551; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12335s; TotalTimePerSample = 0.49340ms; SamplesPerSecond = 2026 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12575878; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12337s; TotalTimePerSample = 0.49350ms; 
SamplesPerSecond = 2026 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13745947; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12227s; TotalTimePerSample = 0.48908ms; SamplesPerSecond = 2044 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12839746; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12155s; TotalTimePerSample = 0.48619ms; SamplesPerSecond = 2056 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16647315; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12210s; TotalTimePerSample = 0.48839ms; SamplesPerSecond = 2047 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20679444; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12144s; TotalTimePerSample = 0.48577ms; SamplesPerSecond = 2058 -MPI Rank 1: Epoch[ 4 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14585204; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12220s; TotalTimePerSample = 0.48881ms; SamplesPerSecond = 2045 -MPI Rank 1: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15914927; EvalErrPerSample = 0.076700002; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.927435 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12584s; TotalTimePerSample = 0.50334ms; SamplesPerSecond = 1986 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18070514; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50147ms; SamplesPerSecond = 1994 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14239731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12565s; TotalTimePerSample = 0.50259ms; SamplesPerSecond = 
1989 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12544s; TotalTimePerSample = 0.50177ms; SamplesPerSecond = 1992 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12539s; TotalTimePerSample = 0.50154ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12549s; TotalTimePerSample = 0.50194ms; SamplesPerSecond = 1992 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12513s; TotalTimePerSample = 0.50050ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18021602; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12488s; TotalTimePerSample = 0.49954ms; SamplesPerSecond = 2001 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12525s; TotalTimePerSample = 0.50099ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14474426; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12565s; TotalTimePerSample = 0.50260ms; SamplesPerSecond = 1989 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12456s; TotalTimePerSample = 0.49826ms; SamplesPerSecond = 2007 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13708300; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50082ms; SamplesPerSecond = 1996 +MPI Rank 1: 
Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11569776; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12513s; TotalTimePerSample = 0.50052ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16892330; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12555s; TotalTimePerSample = 0.50222ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12752163; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12519s; TotalTimePerSample = 0.50076ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12561s; TotalTimePerSample = 0.50244ms; SamplesPerSecond = 1990 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12497s; TotalTimePerSample = 0.49989ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14105804; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12543s; TotalTimePerSample = 0.50174ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19333553; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50159ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20859525; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12518s; TotalTimePerSample = 0.50071ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12594s; TotalTimePerSample = 0.50376ms; SamplesPerSecond = 1985 +MPI Rank 1: Epoch[ 4 
of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12548s; TotalTimePerSample = 0.50193ms; SamplesPerSecond = 1992 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14037158; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12541s; TotalTimePerSample = 0.50164ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12546s; TotalTimePerSample = 0.50184ms; SamplesPerSecond = 1992 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20347748; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12561s; TotalTimePerSample = 0.50244ms; SamplesPerSecond = 1990 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12815013; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50085ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12565s; TotalTimePerSample = 0.50258ms; SamplesPerSecond = 1989 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19552989; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12497s; TotalTimePerSample = 0.49988ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12554s; TotalTimePerSample = 0.50214ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12499s; TotalTimePerSample = 0.49995ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 4 of 
4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12571s; TotalTimePerSample = 0.50283ms; SamplesPerSecond = 1988 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12562s; TotalTimePerSample = 0.50250ms; SamplesPerSecond = 1990 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14723333; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12541s; TotalTimePerSample = 0.50164ms; SamplesPerSecond = 1993 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12618s; TotalTimePerSample = 0.50470ms; SamplesPerSecond = 1981 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12608s; TotalTimePerSample = 0.50433ms; SamplesPerSecond = 1982 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12556s; TotalTimePerSample = 0.50225ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12497s; TotalTimePerSample = 0.49987ms; SamplesPerSecond = 2000 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12550s; TotalTimePerSample = 0.50201ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 4 of 
4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 1989 +MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.01855 +MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU MPI Rank 1: COMPLETED MPI Rank 1: ~MPIWrapper -MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2 -MPI Rank 2: ------------------------------------------------------------------- -MPI Rank 2: Build info: -MPI Rank 2: -MPI Rank 2: Built time: Aug 25 2015 17:44:46 -MPI Rank 2: Last modified date: Mon Aug 24 16:38:42 2015 -MPI Rank 2: Built by amitaga on Amitaga-Win-DT3 -MPI Rank 2: Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\ -MPI Rank 2: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 -MPI Rank 2: ------------------------------------------------------------------- -MPI Rank 2: running on Amitaga-Win-DT3 at 2015/08/26 01:48:44 -MPI Rank 2: command line options: -MPI Rank 2: configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 2: running on localhost at 2015/10/24 12:44:54 +MPI Rank 2: command line: +MPI Rank 2: /home/mluser/src/cplx_master/build/debug/bin/cntk 
configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 2: MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 2: deviceId=$DeviceId$ @@ -1013,12 +1257,11 @@ MPI Rank 2: minibatchSize=25 MPI Rank 2: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 2: momentumPerMB=0.9 MPI Rank 2: dropoutRate=0.0 -MPI Rank 2: maxEpochs=10 +MPI Rank 2: maxEpochs=4 MPI Rank 2: ParallelTrain=[ MPI Rank 2: parallelizationMethod=DataParallelSGD MPI Rank 2: DataParallelSGD=[ MPI Rank 2: gradientBits=1 -MPI Rank 2: parallelizationStartEpoch=1 MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] @@ -1040,12 +1283,13 @@ MPI Rank 2: labelMappingFile=$DataDir$/SimpleMapping.txt MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] -MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu -MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 2: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu +MPI Rank 2: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data +MPI Rank 2: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
MPI Rank 2: DeviceId=0 -MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 2: precision=float MPI Rank 2: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 2: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 2: MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 2: @@ -1056,7 +1300,7 @@ MPI Rank 2: precision=float MPI Rank 2: parallelTrain=true MPI Rank 2: SimpleMultiGPU=[ MPI Rank 2: action=train -MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 2: modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn MPI Rank 2: deviceId=0 MPI Rank 2: traceLevel=1 MPI Rank 2: SimpleNetworkBuilder=[ @@ -1075,18 +1319,17 @@ MPI Rank 2: minibatchSize=25 MPI Rank 2: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 2: momentumPerMB=0.9 MPI Rank 2: dropoutRate=0.0 -MPI Rank 2: maxEpochs=10 +MPI Rank 2: maxEpochs=4 MPI Rank 2: ParallelTrain=[ MPI Rank 2: parallelizationMethod=DataParallelSGD MPI Rank 2: DataParallelSGD=[ MPI Rank 2: gradientBits=1 -MPI Rank 2: parallelizationStartEpoch=1 MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: reader=[ MPI Rank 2: readerType=UCIFastReader -MPI Rank 2: file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 2: file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt MPI Rank 2: miniBatchMode=Partial MPI Rank 2: randomize=None MPI Rank 2: verbosity=1 @@ -1098,29 +1341,31 @@ MPI Rank 2: labels=[ MPI Rank 2: start=2 MPI Rank 2: dim=1 MPI Rank 2: labelDim=2 -MPI Rank 2: labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt 
+MPI Rank 2: labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] -MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu -MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 2: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu +MPI Rank 2: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data +MPI Rank 2: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. MPI Rank 2: DeviceId=0 -MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 2: precision=float MPI Rank 2: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 2: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 2: MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 2: MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 2: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU -MPI Rank 2: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 2: configparameters: SimpleMultiGPU.config:ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
+MPI Rank 2: configparameters: SimpleMultiGPU.config:DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data MPI Rank 2: configparameters: SimpleMultiGPU.config:deviceId=0 MPI Rank 2: configparameters: SimpleMultiGPU.config:parallelTrain=true MPI Rank 2: configparameters: SimpleMultiGPU.config:precision=float -MPI Rank 2: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 2: configparameters: SimpleMultiGPU.config:RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu MPI Rank 2: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[ MPI Rank 2: action=train -MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 2: modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn MPI Rank 2: deviceId=0 MPI Rank 2: traceLevel=1 MPI Rank 2: SimpleNetworkBuilder=[ @@ -1139,18 +1384,17 @@ MPI Rank 2: minibatchSize=25 MPI Rank 2: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 2: momentumPerMB=0.9 MPI Rank 2: dropoutRate=0.0 -MPI Rank 2: maxEpochs=10 +MPI Rank 2: maxEpochs=4 MPI Rank 2: ParallelTrain=[ MPI Rank 2: parallelizationMethod=DataParallelSGD MPI Rank 2: DataParallelSGD=[ MPI Rank 2: gradientBits=1 -MPI Rank 2: parallelizationStartEpoch=1 MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: reader=[ MPI Rank 2: readerType=UCIFastReader -MPI Rank 2: file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 2: file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt MPI Rank 2: miniBatchMode=Partial MPI Rank 2: randomize=None MPI Rank 2: verbosity=1 @@ -1162,45 +1406,100 @@ MPI Rank 2: labels=[ MPI Rank 2: start=2 MPI Rank 2: dim=1 MPI Rank 2: labelDim=2 -MPI Rank 2: 
labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 2: labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] MPI Rank 2: -MPI Rank 2: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 2: configparameters: SimpleMultiGPU.config:stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 2: command: SimpleMultiGPU MPI Rank 2: precision = float +MPI Rank 2: CNTKModelPath: /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 2: CNTKCommandTrainInfo: SimpleMultiGPU : 4 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +MPI Rank 2: CNTKCommandTrainBegin: SimpleMultiGPU MPI Rank 2: SimpleNetworkBuilder Using GPU 0 -MPI Rank 2: reading uci file E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 2: reading uci file /home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt +MPI Rank 2: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 2: GetTrainCriterionNodes ... MPI Rank 2: GetEvalCriterionNodes ... MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node CrossEntropyWithSoftmax +MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. 
MPI Rank 2: -MPI Rank 2: Validating --> labels = InputValue -MPI Rank 2: Validating --> W2 = LearnableParameter -MPI Rank 2: Validating --> W1 = LearnableParameter -MPI Rank 2: Validating --> W0 = LearnableParameter -MPI Rank 2: Validating --> features = InputValue -MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, 3]) -MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, 3]) -MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 3]) -MPI Rank 2: Validating --> B0 = LearnableParameter -MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, 3], B0[50, 1]) -MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, 3]) -MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 3]) -MPI Rank 2: Validating --> B1 = LearnableParameter -MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, 3], B1[50, 1]) -MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, 3]) -MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 3]) -MPI Rank 2: Validating --> B2 = LearnableParameter -MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, 3], B2[2, 1]) -MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, 3], HLast[2, 3]) +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) 
-> [2, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Precomputing --> 3 PreCompute nodes found. 
MPI Rank 2: -MPI Rank 2: Found 3 PreCompute nodes MPI Rank 2: NodeName: InvStdOfFeatures MPI Rank 2: NodeName: MeanOfFeatures MPI Rank 2: NodeName: Prior @@ -1211,250 +1510,320 @@ MPI Rank 2: starting epoch 0 at record count 0, and file position 0 MPI Rank 2: already there from last epoch MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node InvStdOfFeatures +MPI Rank 2: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 2: -MPI Rank 2: Validating --> features = InputValue -MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, 25]) +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: Validating for node InvStdOfFeatures, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 2: MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node MeanOfFeatures +MPI Rank 2: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 2: -MPI Rank 2: Validating --> features = InputValue -MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, 25]) +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: Validating for node MeanOfFeatures, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 2: MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node Prior +MPI Rank 2: Validating for node Prior. 
2 nodes to process in pass 1. MPI Rank 2: -MPI Rank 2: Validating --> labels = InputValue -MPI Rank 2: Validating --> Prior = Mean(labels[2, 25]) +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: Validating for node Prior, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. +MPI Rank 2: +MPI Rank 2: Precomputing --> Completed. MPI Rank 2: MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. -MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.020000 momentum = 0.900001 +MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 MPI Rank 2: starting epoch 0 at record count 0, and file position 0 MPI Rank 2: already there from last epoch MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 2: -MPI Rank 2: -MPI Rank 2: Validating node EvalErrorPrediction -MPI Rank 2: -MPI Rank 2: Validating --> labels = InputValue -MPI Rank 2: Validating --> W2 = LearnableParameter -MPI Rank 2: Validating --> W1 = LearnableParameter -MPI Rank 2: Validating --> W0 = LearnableParameter -MPI Rank 2: Validating --> features = InputValue -MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, 6]) -MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, 6]) -MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 6], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 6]) -MPI Rank 2: Validating --> B0 = LearnableParameter -MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, 6], B0[50, 1]) -MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, 6]) -MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 6]) -MPI Rank 2: Validating --> B1 = LearnableParameter -MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, 6], B1[50, 1]) -MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, 6]) -MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 6]) -MPI Rank 2: Validating --> B2 = LearnableParameter -MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, 6], B2[2, 1]) -MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, 6], HLast[2, 6]) -MPI Rank 2: -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70007980; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.19882s; TotalTimePerSample = 0.79529ms; SamplesPerSecond = 1257 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71514523; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.15550s; TotalTimePerSample = 0.62202ms; SamplesPerSecond = 1607 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.72945595; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.14886s; TotalTimePerSample = 0.59545ms; SamplesPerSecond = 1679 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70079005; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.14476s; TotalTimePerSample = 0.57903ms; SamplesPerSecond = 1727 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70605618; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.14226s; TotalTimePerSample = 0.56903ms; SamplesPerSecond = 1757 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71572435; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13675s; TotalTimePerSample = 0.54700ms; SamplesPerSecond = 1828 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72149903; EvalErr[0]PerSample = 0.47999999; TotalTime = 0.13630s; TotalTimePerSample = 0.54521ms; SamplesPerSecond = 1834 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.79845655; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13448s; TotalTimePerSample = 0.53790ms; SamplesPerSecond = 1859 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69665188; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.13043s; TotalTimePerSample = 0.52172ms; SamplesPerSecond = 1916 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70723337; EvalErr[0]PerSample = 0.49200001; TotalTime = 0.12787s; TotalTimePerSample = 0.51147ms; SamplesPerSecond = 1955 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71420360; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12629s; TotalTimePerSample = 0.50515ms; SamplesPerSecond = 1979 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 111- 120 of 
-171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69535255; EvalErr[0]PerSample = 0.43599999; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70078611; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12260s; TotalTimePerSample = 0.49040ms; SamplesPerSecond = 2039 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71857810; EvalErr[0]PerSample = 0.54799998; TotalTime = 0.12291s; TotalTimePerSample = 0.49165ms; SamplesPerSecond = 2033 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72088283; EvalErr[0]PerSample = 0.48800001; TotalTime = 0.12232s; TotalTimePerSample = 0.48928ms; SamplesPerSecond = 2043 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71798825; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12123s; TotalTimePerSample = 0.48490ms; SamplesPerSecond = 2062 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.74162209; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12328s; TotalTimePerSample = 0.49314ms; SamplesPerSecond = 2027 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71835059; EvalErr[0]PerSample = 0.51599997; TotalTime = 0.12340s; TotalTimePerSample = 0.49358ms; SamplesPerSecond = 2025 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71529394; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12333s; TotalTimePerSample = 0.49333ms; SamplesPerSecond = 2027 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71727639; EvalErr[0]PerSample = 0.53200001; TotalTime = 0.12471s; TotalTimePerSample = 0.49885ms; 
SamplesPerSecond = 2004 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71745312; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12360s; TotalTimePerSample = 0.49439ms; SamplesPerSecond = 2022 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72088087; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12196s; TotalTimePerSample = 0.48784ms; SamplesPerSecond = 2049 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72006541; EvalErr[0]PerSample = 0.50800002; TotalTime = 0.12265s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71275192; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12161s; TotalTimePerSample = 0.48644ms; SamplesPerSecond = 2055 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69645119; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12121s; TotalTimePerSample = 0.48486ms; SamplesPerSecond = 2062 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70129883; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12089s; TotalTimePerSample = 0.48356ms; SamplesPerSecond = 2067 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70768166; EvalErr[0]PerSample = 0.54400003; TotalTime = 0.12175s; TotalTimePerSample = 0.48699ms; SamplesPerSecond = 2053 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69744140; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69266015; EvalErr[0]PerSample = 
0.44800001; TotalTime = 0.12179s; TotalTimePerSample = 0.48714ms; SamplesPerSecond = 2052 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69347072; EvalErr[0]PerSample = 0.49599999; TotalTime = 0.12233s; TotalTimePerSample = 0.48930ms; SamplesPerSecond = 2043 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69257420; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.12287s; TotalTimePerSample = 0.49146ms; SamplesPerSecond = 2034 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.68625975; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12417s; TotalTimePerSample = 0.49666ms; SamplesPerSecond = 2013 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69064063; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.12339s; TotalTimePerSample = 0.49356ms; SamplesPerSecond = 2026 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70192385; EvalErr[0]PerSample = 0.46000001; TotalTime = 0.12176s; TotalTimePerSample = 0.48705ms; SamplesPerSecond = 2053 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69058985; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.12238s; TotalTimePerSample = 0.48951ms; SamplesPerSecond = 2042 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.67041212; EvalErr[0]PerSample = 0.39199999; TotalTime = 0.12185s; TotalTimePerSample = 0.48741ms; SamplesPerSecond = 2051 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.65914255; EvalErr[0]PerSample = 0.35600001; TotalTime = 0.12262s; TotalTimePerSample = 0.49048ms; SamplesPerSecond = 2038 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen 
= 250; TrainLossPerSample = 0.63919920; EvalErr[0]PerSample = 0.36399999; TotalTime = 0.12265s; TotalTimePerSample = 0.49060ms; SamplesPerSecond = 2038 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.61294138; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48570ms; SamplesPerSecond = 2058 -MPI Rank 2: Epoch[ 1 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.55255663; EvalErr[0]PerSample = 0.18799999; TotalTime = 0.12034s; TotalTimePerSample = 0.48138ms; SamplesPerSecond = 2077 -MPI Rank 2: Finished Epoch[ 1 of 10]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.47350001; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.253381 -MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 2: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal. 
+MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 1- 10 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12806s; TotalTimePerSample = 0.51223ms; SamplesPerSecond = 1952 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 11- 20 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12488s; TotalTimePerSample = 0.49952ms; SamplesPerSecond = 2001 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 21- 30 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12485s; TotalTimePerSample = 0.49942ms; SamplesPerSecond = 2002 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 31- 40 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49886ms; SamplesPerSecond = 2004 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 41- 50 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70605616; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12438s; TotalTimePerSample = 0.49751ms; SamplesPerSecond = 2010 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 51- 60 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12463s; TotalTimePerSample = 0.49853ms; SamplesPerSecond = 2005 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 61- 70 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.12459s; TotalTimePerSample = 0.49836ms; SamplesPerSecond = 2006 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 71- 80 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.79845605; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12428s; TotalTimePerSample = 0.49713ms; SamplesPerSecond = 2011 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 81- 90 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12443s; TotalTimePerSample = 0.49771ms; SamplesPerSecond = 2009 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 
91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71420344; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12487s; TotalTimePerSample = 0.49946ms; SamplesPerSecond = 2002 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12511s; TotalTimePerSample = 0.50045ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71857915; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.12473s; TotalTimePerSample = 0.49891ms; SamplesPerSecond = 2004 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.12486s; TotalTimePerSample = 0.49944ms; SamplesPerSecond = 2002 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71798839; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12506s; TotalTimePerSample = 0.50025ms; SamplesPerSecond = 1999 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12442s; TotalTimePerSample = 0.49768ms; SamplesPerSecond = 2009 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71835127; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.12471s; TotalTimePerSample = 0.49883ms; SamplesPerSecond = 2004 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: 
SamplesSeen = 250; TrainLossPerSample = 0.71529462; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12444s; TotalTimePerSample = 0.49777ms; SamplesPerSecond = 2008 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71727656; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12439s; TotalTimePerSample = 0.49756ms; SamplesPerSecond = 2009 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12478s; TotalTimePerSample = 0.49913ms; SamplesPerSecond = 2003 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088397; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12447s; TotalTimePerSample = 0.49788ms; SamplesPerSecond = 2008 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72006808; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12491s; TotalTimePerSample = 0.49964ms; SamplesPerSecond = 2001 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12463s; TotalTimePerSample = 0.49852ms; SamplesPerSecond = 2005 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70129697; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12496s; TotalTimePerSample = 0.49985ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12507s; TotalTimePerSample = 0.50029ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; 
TrainLossPerSample = 0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12537s; TotalTimePerSample = 0.50148ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69266186; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.12541s; TotalTimePerSample = 0.50163ms; SamplesPerSecond = 1993 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12475s; TotalTimePerSample = 0.49900ms; SamplesPerSecond = 2004 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69257410; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12566s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12519s; TotalTimePerSample = 0.50077ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70192153; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50135ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12528s; TotalTimePerSample = 0.50111ms; SamplesPerSecond = 1995 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.67041489; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12603s; TotalTimePerSample = 0.50413ms; SamplesPerSecond = 1983 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample 
= 0.65913971; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50140ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.63919874; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.12492s; TotalTimePerSample = 0.49966ms; SamplesPerSecond = 2001 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.61293878; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12545s; TotalTimePerSample = 0.50180ms; SamplesPerSecond = 1992 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.55255340; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.12484s; TotalTimePerSample = 0.49934ms; SamplesPerSecond = 2002 +MPI Rank 2: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.008988 +MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 2: starting epoch 1 at record count 10000, and file position 0 MPI Rank 2: already there from last epoch MPI Rank 2: MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.50775200; EvalErr[0]PerSample = 0.23999999; TotalTime = 0.12817s; TotalTimePerSample = 0.51266ms; SamplesPerSecond = 1950 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.43389454; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12188s; TotalTimePerSample = 0.48752ms; SamplesPerSecond = 2051 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.36675408; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12076s; TotalTimePerSample = 0.48305ms; SamplesPerSecond = 2070 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.33769274; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12255s; TotalTimePerSample = 0.49018ms; SamplesPerSecond = 2040 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.30321363; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12132s; TotalTimePerSample = 0.48528ms; SamplesPerSecond = 2060 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.29576379; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12054s; TotalTimePerSample = 0.48217ms; SamplesPerSecond = 2073 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.24924731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.24632569; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12329s; TotalTimePerSample = 0.49317ms; SamplesPerSecond = 2027 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20943311; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12288s; 
TotalTimePerSample = 0.49151ms; SamplesPerSecond = 2034 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19116065; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12191s; TotalTimePerSample = 0.48763ms; SamplesPerSecond = 2050 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17923315; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48971ms; SamplesPerSecond = 2042 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17075513; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12159s; TotalTimePerSample = 0.48638ms; SamplesPerSecond = 2056 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14442432; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12214s; TotalTimePerSample = 0.48854ms; SamplesPerSecond = 2046 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17753857; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12283s; TotalTimePerSample = 0.49132ms; SamplesPerSecond = 2035 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15087914; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12183s; TotalTimePerSample = 0.48733ms; SamplesPerSecond = 2052 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19252978; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12265s; TotalTimePerSample = 0.49059ms; SamplesPerSecond = 2038 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17830664; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49066ms; SamplesPerSecond = 2038 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.15115429; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12254s; TotalTimePerSample = 0.49015ms; SamplesPerSecond = 2040 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19135889; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12209s; TotalTimePerSample = 0.48838ms; SamplesPerSecond = 2047 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.21491407; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12304s; TotalTimePerSample = 0.49215ms; SamplesPerSecond = 2031 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18682373; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12157s; TotalTimePerSample = 0.48626ms; SamplesPerSecond = 2056 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18483251; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12136s; TotalTimePerSample = 0.48544ms; SamplesPerSecond = 2059 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14684522; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15322119; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12308s; TotalTimePerSample = 0.49232ms; SamplesPerSecond = 2031 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19882520; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12375s; TotalTimePerSample = 0.49501ms; SamplesPerSecond = 2020 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13683788; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 
261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18621191; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12193s; TotalTimePerSample = 0.48772ms; SamplesPerSecond = 2050 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19408056; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12169s; TotalTimePerSample = 0.48674ms; SamplesPerSecond = 2054 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17298096; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48873ms; SamplesPerSecond = 2046 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13265137; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12193s; TotalTimePerSample = 0.48772ms; SamplesPerSecond = 2050 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17627051; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12157s; TotalTimePerSample = 0.48628ms; SamplesPerSecond = 2056 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12734570; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12410s; TotalTimePerSample = 0.49638ms; SamplesPerSecond = 2014 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15108399; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12118s; TotalTimePerSample = 0.48473ms; SamplesPerSecond = 2063 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19729199; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12267s; TotalTimePerSample = 0.49067ms; SamplesPerSecond = 2038 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12857373; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12203s; TotalTimePerSample = 0.48812ms; 
SamplesPerSecond = 2048 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13867822; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12274s; TotalTimePerSample = 0.49095ms; SamplesPerSecond = 2036 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12786084; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12170s; TotalTimePerSample = 0.48682ms; SamplesPerSecond = 2054 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16643262; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20440333; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12238s; TotalTimePerSample = 0.48951ms; SamplesPerSecond = 2042 -MPI Rank 2: Epoch[ 2 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14566259; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044 -MPI Rank 2: Finished Epoch[ 2 of 10]: [Training Set] TrainLossPerSample = 0.20373113; EvalErrPerSample = 0.082699999; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.929244 -MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.50774607; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.12606s; TotalTimePerSample = 0.50425ms; SamplesPerSecond = 1983 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.43388910; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.36674852; EvalErr[0]PerSample = 
0.08800000; TotalTime = 0.12573s; TotalTimePerSample = 0.50294ms; SamplesPerSecond = 1988 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.33768746; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12522s; TotalTimePerSample = 0.50089ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.30320932; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12500s; TotalTimePerSample = 0.50001ms; SamplesPerSecond = 1999 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.29576032; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12562s; TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24924483; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12481s; TotalTimePerSample = 0.49925ms; SamplesPerSecond = 2003 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24632409; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12575s; TotalTimePerSample = 0.50299ms; SamplesPerSecond = 1988 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20943152; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50078ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19115992; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12536s; TotalTimePerSample = 0.50144ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17923227; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17075420; EvalErr[0]PerSample = 0.06400000; TotalTime = 
0.12523s; TotalTimePerSample = 0.50093ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14442369; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12530s; TotalTimePerSample = 0.50120ms; SamplesPerSecond = 1995 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17753818; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12512s; TotalTimePerSample = 0.50048ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15087853; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50084ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19253021; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12531s; TotalTimePerSample = 0.50125ms; SamplesPerSecond = 1995 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12510s; TotalTimePerSample = 0.50039ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12497s; TotalTimePerSample = 0.49987ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12524s; TotalTimePerSample = 0.50096ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12497s; TotalTimePerSample = 0.49989ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12528s; 
TotalTimePerSample = 0.50113ms; SamplesPerSecond = 1995 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12500s; TotalTimePerSample = 0.49998ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14684503; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12496s; TotalTimePerSample = 0.49986ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15322116; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12528s; TotalTimePerSample = 0.50110ms; SamplesPerSecond = 1995 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12508s; TotalTimePerSample = 0.50033ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13683832; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50066ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18621189; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12489s; TotalTimePerSample = 0.49958ms; SamplesPerSecond = 2001 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19408050; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12602s; TotalTimePerSample = 0.50408ms; SamplesPerSecond = 1983 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50064ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12561s; 
TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12564s; TotalTimePerSample = 0.50257ms; SamplesPerSecond = 1989 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15108452; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12521s; TotalTimePerSample = 0.50085ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19729185; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12571s; TotalTimePerSample = 0.50283ms; SamplesPerSecond = 1988 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12857333; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50021ms; SamplesPerSecond = 1999 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12550s; TotalTimePerSample = 0.50201ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12786051; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50204ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12505s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20440408; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12575s; 
TotalTimePerSample = 0.50302ms; SamplesPerSecond = 1988 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566237; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12517s; TotalTimePerSample = 0.50067ms; SamplesPerSecond = 1997 +MPI Rank 2: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373026; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.013494 +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 2: starting epoch 2 at record count 20000, and file position 0 MPI Rank 2: already there from last epoch MPI Rank 2: MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32). -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12590086; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12913s; TotalTimePerSample = 0.51653ms; SamplesPerSecond = 1935 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17780226; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12397s; TotalTimePerSample = 0.49587ms; SamplesPerSecond = 2016 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14417633; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12281s; TotalTimePerSample = 0.49122ms; SamplesPerSecond = 2035 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15796880; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12283s; TotalTimePerSample = 0.49132ms; SamplesPerSecond = 2035 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17002991; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12299s; TotalTimePerSample = 0.49197ms; SamplesPerSecond = 2032 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; 
TrainLossPerSample = 0.18262109; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12231s; TotalTimePerSample = 0.48926ms; SamplesPerSecond = 2043 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14643688; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12104s; TotalTimePerSample = 0.48418ms; SamplesPerSecond = 2065 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18030518; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12215s; TotalTimePerSample = 0.48860ms; SamplesPerSecond = 2046 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15846142; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12214s; TotalTimePerSample = 0.48858ms; SamplesPerSecond = 2046 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14486536; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12133s; TotalTimePerSample = 0.48530ms; SamplesPerSecond = 2060 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13469091; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12151s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12147s; TotalTimePerSample = 0.48589ms; SamplesPerSecond = 2058 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.11641297; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16786633; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12218s; TotalTimePerSample = 0.48872ms; SamplesPerSecond = 2046 -MPI Rank 2: Epoch[ 3 of 
10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12811548; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12211s; TotalTimePerSample = 0.48843ms; SamplesPerSecond = 2047 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17257836; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12252s; TotalTimePerSample = 0.49008ms; SamplesPerSecond = 2040 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17623682; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12011s; TotalTimePerSample = 0.48045ms; SamplesPerSecond = 2081 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14121118; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12143s; TotalTimePerSample = 0.48572ms; SamplesPerSecond = 2058 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19243409; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12070s; TotalTimePerSample = 0.48280ms; SamplesPerSecond = 2071 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20908155; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12150s; TotalTimePerSample = 0.48598ms; SamplesPerSecond = 2057 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18472095; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12149s; TotalTimePerSample = 0.48598ms; SamplesPerSecond = 2057 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18185547; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12188s; TotalTimePerSample = 0.48752ms; SamplesPerSecond = 2051 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14074194; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12174s; TotalTimePerSample 
= 0.48697ms; SamplesPerSecond = 2053 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14871632; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12189s; TotalTimePerSample = 0.48758ms; SamplesPerSecond = 2050 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20299682; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12199s; TotalTimePerSample = 0.48796ms; SamplesPerSecond = 2049 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12852076; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12055s; TotalTimePerSample = 0.48218ms; SamplesPerSecond = 2073 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18660498; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12198s; TotalTimePerSample = 0.48793ms; SamplesPerSecond = 2049 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19576025; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12240s; TotalTimePerSample = 0.48961ms; SamplesPerSecond = 2042 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16667627; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12526172; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12059s; TotalTimePerSample = 0.48236ms; SamplesPerSecond = 2073 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17391992; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12146s; TotalTimePerSample = 0.48585ms; SamplesPerSecond = 2058 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12281641; 
EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48568ms; SamplesPerSecond = 2058 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14759424; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12211s; TotalTimePerSample = 0.48842ms; SamplesPerSecond = 2047 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19801368; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12210s; TotalTimePerSample = 0.48840ms; SamplesPerSecond = 2047 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12593359; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12244s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13756640; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12191s; TotalTimePerSample = 0.48764ms; SamplesPerSecond = 2050 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16654395; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12275s; TotalTimePerSample = 0.49099ms; SamplesPerSecond = 2036 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20658936; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12092s; TotalTimePerSample = 0.48367ms; SamplesPerSecond = 2067 -MPI Rank 2: Epoch[ 3 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14583300; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12208s; TotalTimePerSample = 0.48833ms; SamplesPerSecond = 2047 -MPI Rank 2: Finished Epoch[ 3 of 10]: [Training Set] 
TrainLossPerSample = 0.15948617; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.919216 -MPI Rank 2: Starting Epoch 4: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12574s; TotalTimePerSample = 0.50295ms; SamplesPerSecond = 1988 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17780230; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12505s; TotalTimePerSample = 0.50020ms; SamplesPerSecond = 1999 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12561s; TotalTimePerSample = 0.50243ms; SamplesPerSecond = 1990 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15796896; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12537s; TotalTimePerSample = 0.50146ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17003000; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50161ms; SamplesPerSecond = 1993 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12511s; TotalTimePerSample = 0.50045ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14643695; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12582s; TotalTimePerSample = 0.50330ms; SamplesPerSecond = 1986 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12591s; TotalTimePerSample = 0.50363ms; SamplesPerSecond = 1985 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 81- 
90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50235ms; SamplesPerSecond = 1990 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12554s; TotalTimePerSample = 0.50214ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13469094; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12556s; TotalTimePerSample = 0.50224ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12509s; TotalTimePerSample = 0.50035ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12531s; TotalTimePerSample = 0.50124ms; SamplesPerSecond = 1995 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12510s; TotalTimePerSample = 0.50040ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12516s; TotalTimePerSample = 0.50063ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12592s; TotalTimePerSample = 0.50368ms; SamplesPerSecond = 1985 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12542s; TotalTimePerSample = 0.50166ms; SamplesPerSecond = 1993 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12516s; TotalTimePerSample = 0.50064ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19243442; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12552s; TotalTimePerSample = 0.50206ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12500s; TotalTimePerSample = 0.49999ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12575s; TotalTimePerSample = 0.50299ms; SamplesPerSecond = 1988 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18185536; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12514s; TotalTimePerSample = 0.50057ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14074204; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12507s; TotalTimePerSample = 0.50029ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12522s; TotalTimePerSample = 0.50087ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20299705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12556s; TotalTimePerSample = 0.50223ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12852037; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12522s; TotalTimePerSample = 0.50087ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.18660440; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12514s; TotalTimePerSample = 0.50057ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12505s; TotalTimePerSample = 0.50022ms; SamplesPerSecond = 1999 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12596s; TotalTimePerSample = 0.50386ms; SamplesPerSecond = 1984 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12526168; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17392133; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12561s; TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12485s; TotalTimePerSample = 0.49940ms; SamplesPerSecond = 2002 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12524s; TotalTimePerSample = 0.50097ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12533s; TotalTimePerSample = 0.50132ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12529s; TotalTimePerSample = 0.50118ms; SamplesPerSecond = 1995 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16654368; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50188ms; SamplesPerSecond = 1992 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20658950; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50142ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12561s; TotalTimePerSample = 0.50244ms; SamplesPerSecond = 1990 +MPI Rank 2: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.016697 +MPI Rank 2: Starting Epoch 4: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 2: starting epoch 3 at record count 30000, and file position 0 MPI Rank 2: already there from last epoch MPI Rank 2: MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12757s; TotalTimePerSample = 0.51028ms; SamplesPerSecond = 1959 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12299s; TotalTimePerSample = 0.49195ms; SamplesPerSecond = 2032 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14239721; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12165s; TotalTimePerSample = 0.48662ms; SamplesPerSecond = 2054 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15630139; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12202s; TotalTimePerSample = 0.48810ms; SamplesPerSecond = 2048 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16935523; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12091s; TotalTimePerSample = 0.48366ms; SamplesPerSecond = 2067 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18198816; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12087s; TotalTimePerSample = 0.48349ms; SamplesPerSecond = 2068 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14475952; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12254s; TotalTimePerSample = 0.49015ms; SamplesPerSecond = 2040 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18021594; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12208s; TotalTimePerSample = 0.48833ms; SamplesPerSecond = 2047 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15849304; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12485s; 
TotalTimePerSample = 0.49942ms; SamplesPerSecond = 2002 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14474402; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48871ms; SamplesPerSecond = 2046 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13362928; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12226s; TotalTimePerSample = 0.48905ms; SamplesPerSecond = 2044 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13708325; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12181s; TotalTimePerSample = 0.48724ms; SamplesPerSecond = 2052 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.11569763; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12244s; TotalTimePerSample = 0.48975ms; SamplesPerSecond = 2041 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16892321; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12148s; TotalTimePerSample = 0.48594ms; SamplesPerSecond = 2057 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12752125; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12131s; TotalTimePerSample = 0.48522ms; SamplesPerSecond = 2060 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17100880; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17660449; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12219s; TotalTimePerSample = 0.48876ms; SamplesPerSecond = 2045 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.14105836; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12168s; TotalTimePerSample = 0.48672ms; SamplesPerSecond = 2054 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19333544; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12124s; TotalTimePerSample = 0.48494ms; SamplesPerSecond = 2062 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20859498; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12071s; TotalTimePerSample = 0.48283ms; SamplesPerSecond = 2071 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18499707; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12379s; TotalTimePerSample = 0.49516ms; SamplesPerSecond = 2019 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18152441; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12164s; TotalTimePerSample = 0.48655ms; SamplesPerSecond = 2055 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14037134; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12260s; TotalTimePerSample = 0.49039ms; SamplesPerSecond = 2039 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14866894; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12391s; TotalTimePerSample = 0.49563ms; SamplesPerSecond = 2017 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20347705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12230s; TotalTimePerSample = 0.48922ms; SamplesPerSecond = 2044 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12815039; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12247s; TotalTimePerSample = 0.48989ms; SamplesPerSecond = 2041 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 
261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18672803; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12289s; TotalTimePerSample = 0.49155ms; SamplesPerSecond = 2034 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19552930; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12108s; TotalTimePerSample = 0.48431ms; SamplesPerSecond = 2064 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16452637; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12196s; TotalTimePerSample = 0.48783ms; SamplesPerSecond = 2049 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12461865; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12027s; TotalTimePerSample = 0.48108ms; SamplesPerSecond = 2078 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17285107; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12184s; TotalTimePerSample = 0.48738ms; SamplesPerSecond = 2051 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12253613; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12353s; TotalTimePerSample = 0.49410ms; SamplesPerSecond = 2023 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14723291; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12238s; TotalTimePerSample = 0.48952ms; SamplesPerSecond = 2042 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19789551; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12335s; TotalTimePerSample = 0.49339ms; SamplesPerSecond = 2026 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12575878; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12336s; TotalTimePerSample = 0.49344ms; 
SamplesPerSecond = 2026 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13745947; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12227s; TotalTimePerSample = 0.48906ms; SamplesPerSecond = 2044 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12839746; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12153s; TotalTimePerSample = 0.48612ms; SamplesPerSecond = 2057 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16647315; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12210s; TotalTimePerSample = 0.48838ms; SamplesPerSecond = 2047 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20679444; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12144s; TotalTimePerSample = 0.48576ms; SamplesPerSecond = 2058 -MPI Rank 2: Epoch[ 4 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14585204; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12219s; TotalTimePerSample = 0.48878ms; SamplesPerSecond = 2045 -MPI Rank 2: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15914927; EvalErrPerSample = 0.076700002; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.9272 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12588s; TotalTimePerSample = 0.50353ms; SamplesPerSecond = 1985 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18070514; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50147ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14239731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12565s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 
1989 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12544s; TotalTimePerSample = 0.50175ms; SamplesPerSecond = 1993 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12540s; TotalTimePerSample = 0.50158ms; SamplesPerSecond = 1993 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12549s; TotalTimePerSample = 0.50196ms; SamplesPerSecond = 1992 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12533s; TotalTimePerSample = 0.50132ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18021602; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12467s; TotalTimePerSample = 0.49869ms; SamplesPerSecond = 2005 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12526s; TotalTimePerSample = 0.50102ms; SamplesPerSecond = 1995 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14474426; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12565s; TotalTimePerSample = 0.50260ms; SamplesPerSecond = 1989 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12457s; TotalTimePerSample = 0.49827ms; SamplesPerSecond = 2006 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13708300; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12521s; TotalTimePerSample = 0.50084ms; SamplesPerSecond = 1996 +MPI Rank 2: 
Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11569776; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12513s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16892330; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12556s; TotalTimePerSample = 0.50224ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12752163; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12519s; TotalTimePerSample = 0.50077ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12561s; TotalTimePerSample = 0.50244ms; SamplesPerSecond = 1990 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12498s; TotalTimePerSample = 0.49991ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14105804; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12544s; TotalTimePerSample = 0.50175ms; SamplesPerSecond = 1993 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19333553; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20859525; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12541s; TotalTimePerSample = 0.50164ms; SamplesPerSecond = 1993 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12594s; TotalTimePerSample = 0.50377ms; SamplesPerSecond = 1985 +MPI Rank 2: Epoch[ 4 
of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50148ms; SamplesPerSecond = 1994 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14037158; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12552s; TotalTimePerSample = 0.50210ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12546s; TotalTimePerSample = 0.50185ms; SamplesPerSecond = 1992 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20347748; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12561s; TotalTimePerSample = 0.50245ms; SamplesPerSecond = 1990 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12815013; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12522s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12548s; TotalTimePerSample = 0.50193ms; SamplesPerSecond = 1992 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19552989; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12513s; TotalTimePerSample = 0.50053ms; SamplesPerSecond = 1997 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12554s; TotalTimePerSample = 0.50216ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12499s; TotalTimePerSample = 0.49995ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 4 of 
4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12571s; TotalTimePerSample = 0.50285ms; SamplesPerSecond = 1988 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12563s; TotalTimePerSample = 0.50251ms; SamplesPerSecond = 1990 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14723333; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12541s; TotalTimePerSample = 0.50165ms; SamplesPerSecond = 1993 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12616s; TotalTimePerSample = 0.50464ms; SamplesPerSecond = 1981 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12608s; TotalTimePerSample = 0.50434ms; SamplesPerSecond = 1982 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12557s; TotalTimePerSample = 0.50227ms; SamplesPerSecond = 1990 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12497s; TotalTimePerSample = 0.49987ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12551s; TotalTimePerSample = 0.50202ms; SamplesPerSecond = 1991 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996 +MPI Rank 2: Epoch[ 4 of 
4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989 +MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.018583 +MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU MPI Rank 2: COMPLETED MPI Rank 2: ~MPIWrapper -MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3 -MPI Rank 3: ------------------------------------------------------------------- -MPI Rank 3: Build info: -MPI Rank 3: -MPI Rank 3: Built time: Aug 25 2015 17:44:46 -MPI Rank 3: Last modified date: Mon Aug 24 16:38:42 2015 -MPI Rank 3: Built by amitaga on Amitaga-Win-DT3 -MPI Rank 3: Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\ -MPI Rank 3: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 -MPI Rank 3: ------------------------------------------------------------------- -MPI Rank 3: running on Amitaga-Win-DT3 at 2015/08/26 01:48:44 -MPI Rank 3: command line options: -MPI Rank 3: configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 3: running on localhost at 2015/10/24 12:44:55 +MPI Rank 3: command line: +MPI Rank 3: /home/mluser/src/cplx_master/build/debug/bin/cntk 
configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 3: MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 3: deviceId=$DeviceId$ @@ -1482,12 +1851,11 @@ MPI Rank 3: minibatchSize=25 MPI Rank 3: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 3: momentumPerMB=0.9 MPI Rank 3: dropoutRate=0.0 -MPI Rank 3: maxEpochs=10 +MPI Rank 3: maxEpochs=4 MPI Rank 3: ParallelTrain=[ MPI Rank 3: parallelizationMethod=DataParallelSGD MPI Rank 3: DataParallelSGD=[ MPI Rank 3: gradientBits=1 -MPI Rank 3: parallelizationStartEpoch=1 MPI Rank 3: ] MPI Rank 3: ] MPI Rank 3: ] @@ -1509,12 +1877,13 @@ MPI Rank 3: labelMappingFile=$DataDir$/SimpleMapping.txt MPI Rank 3: ] MPI Rank 3: ] MPI Rank 3: ] -MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu -MPI Rank 3: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 3: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu +MPI Rank 3: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data +MPI Rank 3: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
MPI Rank 3: DeviceId=0 -MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 3: precision=float MPI Rank 3: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 3: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 3: MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 3: @@ -1525,7 +1894,7 @@ MPI Rank 3: precision=float MPI Rank 3: parallelTrain=true MPI Rank 3: SimpleMultiGPU=[ MPI Rank 3: action=train -MPI Rank 3: modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 3: modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn MPI Rank 3: deviceId=0 MPI Rank 3: traceLevel=1 MPI Rank 3: SimpleNetworkBuilder=[ @@ -1544,18 +1913,17 @@ MPI Rank 3: minibatchSize=25 MPI Rank 3: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 3: momentumPerMB=0.9 MPI Rank 3: dropoutRate=0.0 -MPI Rank 3: maxEpochs=10 +MPI Rank 3: maxEpochs=4 MPI Rank 3: ParallelTrain=[ MPI Rank 3: parallelizationMethod=DataParallelSGD MPI Rank 3: DataParallelSGD=[ MPI Rank 3: gradientBits=1 -MPI Rank 3: parallelizationStartEpoch=1 MPI Rank 3: ] MPI Rank 3: ] MPI Rank 3: ] MPI Rank 3: reader=[ MPI Rank 3: readerType=UCIFastReader -MPI Rank 3: file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 3: file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt MPI Rank 3: miniBatchMode=Partial MPI Rank 3: randomize=None MPI Rank 3: verbosity=1 @@ -1567,29 +1935,31 @@ MPI Rank 3: labels=[ MPI Rank 3: start=2 MPI Rank 3: dim=1 MPI Rank 3: labelDim=2 -MPI Rank 3: labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt 
+MPI Rank 3: labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt MPI Rank 3: ] MPI Rank 3: ] MPI Rank 3: ] -MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu -MPI Rank 3: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 3: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu +MPI Rank 3: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data +MPI Rank 3: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. MPI Rank 3: DeviceId=0 -MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 3: precision=float MPI Rank 3: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 3: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 3: MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 3: MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 3: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU -MPI Rank 3: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data +MPI Rank 3: configparameters: SimpleMultiGPU.config:ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. 
+MPI Rank 3: configparameters: SimpleMultiGPU.config:DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data MPI Rank 3: configparameters: SimpleMultiGPU.config:deviceId=0 MPI Rank 3: configparameters: SimpleMultiGPU.config:parallelTrain=true MPI Rank 3: configparameters: SimpleMultiGPU.config:precision=float -MPI Rank 3: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 3: configparameters: SimpleMultiGPU.config:RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu MPI Rank 3: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[ MPI Rank 3: action=train -MPI Rank 3: modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 3: modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn MPI Rank 3: deviceId=0 MPI Rank 3: traceLevel=1 MPI Rank 3: SimpleNetworkBuilder=[ @@ -1608,18 +1978,17 @@ MPI Rank 3: minibatchSize=25 MPI Rank 3: learningRatesPerMB=0.5:0.2*20:0.1 MPI Rank 3: momentumPerMB=0.9 MPI Rank 3: dropoutRate=0.0 -MPI Rank 3: maxEpochs=10 +MPI Rank 3: maxEpochs=4 MPI Rank 3: ParallelTrain=[ MPI Rank 3: parallelizationMethod=DataParallelSGD MPI Rank 3: DataParallelSGD=[ MPI Rank 3: gradientBits=1 -MPI Rank 3: parallelizationStartEpoch=1 MPI Rank 3: ] MPI Rank 3: ] MPI Rank 3: ] MPI Rank 3: reader=[ MPI Rank 3: readerType=UCIFastReader -MPI Rank 3: file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 3: file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt MPI Rank 3: miniBatchMode=Partial MPI Rank 3: randomize=None MPI Rank 3: verbosity=1 @@ -1631,45 +2000,100 @@ MPI Rank 3: labels=[ MPI Rank 3: start=2 MPI Rank 3: dim=1 MPI Rank 3: labelDim=2 -MPI Rank 3: 
labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 3: labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt MPI Rank 3: ] MPI Rank 3: ] MPI Rank 3: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] MPI Rank 3: -MPI Rank 3: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 3: configparameters: SimpleMultiGPU.config:stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 3: command: SimpleMultiGPU MPI Rank 3: precision = float +MPI Rank 3: CNTKModelPath: /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 3: CNTKCommandTrainInfo: SimpleMultiGPU : 4 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +MPI Rank 3: CNTKCommandTrainBegin: SimpleMultiGPU MPI Rank 3: SimpleNetworkBuilder Using GPU 0 -MPI Rank 3: reading uci file E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 3: reading uci file /home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt +MPI Rank 3: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 3: GetTrainCriterionNodes ... MPI Rank 3: GetEvalCriterionNodes ... MPI Rank 3: MPI Rank 3: -MPI Rank 3: Validating node CrossEntropyWithSoftmax +MPI Rank 3: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. 
MPI Rank 3: -MPI Rank 3: Validating --> labels = InputValue -MPI Rank 3: Validating --> W2 = LearnableParameter -MPI Rank 3: Validating --> W1 = LearnableParameter -MPI Rank 3: Validating --> W0 = LearnableParameter -MPI Rank 3: Validating --> features = InputValue -MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, 3]) -MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, 3]) -MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 3]) -MPI Rank 3: Validating --> B0 = LearnableParameter -MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, 3], B0[50, 1]) -MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, 3]) -MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 3]) -MPI Rank 3: Validating --> B1 = LearnableParameter -MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, 3], B1[50, 1]) -MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, 3]) -MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 3]) -MPI Rank 3: Validating --> B2 = LearnableParameter -MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, 3], B2[2, 1]) -MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, 3], HLast[2, 3]) +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) 
-> [2, MBSize 3] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Precomputing --> 3 PreCompute nodes found. 
MPI Rank 3: -MPI Rank 3: Found 3 PreCompute nodes MPI Rank 3: NodeName: InvStdOfFeatures MPI Rank 3: NodeName: MeanOfFeatures MPI Rank 3: NodeName: Prior @@ -1680,234 +2104,314 @@ MPI Rank 3: starting epoch 0 at record count 0, and file position 0 MPI Rank 3: already there from last epoch MPI Rank 3: MPI Rank 3: -MPI Rank 3: Validating node InvStdOfFeatures +MPI Rank 3: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 3: -MPI Rank 3: Validating --> features = InputValue -MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, 25]) +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: Validating for node InvStdOfFeatures, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 3: MPI Rank 3: MPI Rank 3: -MPI Rank 3: Validating node MeanOfFeatures +MPI Rank 3: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 3: -MPI Rank 3: Validating --> features = InputValue -MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, 25]) +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: Validating for node MeanOfFeatures, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 3: MPI Rank 3: MPI Rank 3: -MPI Rank 3: Validating node Prior +MPI Rank 3: Validating for node Prior. 
2 nodes to process in pass 1. MPI Rank 3: -MPI Rank 3: Validating --> labels = InputValue -MPI Rank 3: Validating --> Prior = Mean(labels[2, 25]) +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: Validating for node Prior, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. +MPI Rank 3: +MPI Rank 3: Precomputing --> Completed. MPI Rank 3: MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. -MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.020000 momentum = 0.900001 +MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 MPI Rank 3: starting epoch 0 at record count 0, and file position 0 MPI Rank 3: already there from last epoch MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 3: -MPI Rank 3: -MPI Rank 3: Validating node EvalErrorPrediction -MPI Rank 3: -MPI Rank 3: Validating --> labels = InputValue -MPI Rank 3: Validating --> W2 = LearnableParameter -MPI Rank 3: Validating --> W1 = LearnableParameter -MPI Rank 3: Validating --> W0 = LearnableParameter -MPI Rank 3: Validating --> features = InputValue -MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, 7]) -MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, 7]) -MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 7], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 7]) -MPI Rank 3: Validating --> B0 = LearnableParameter -MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, 7], B0[50, 1]) -MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, 7]) -MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 7]) -MPI Rank 3: Validating --> B1 = LearnableParameter -MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, 7], B1[50, 1]) -MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, 7]) -MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 7]) -MPI Rank 3: Validating --> B2 = LearnableParameter -MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, 7], B2[2, 1]) -MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, 7], HLast[2, 7]) -MPI Rank 3: -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70007980; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.20117s; TotalTimePerSample = 0.80470ms; SamplesPerSecond = 1242 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71514523; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.15541s; TotalTimePerSample = 0.62162ms; SamplesPerSecond = 1608 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.72945595; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.14893s; TotalTimePerSample = 0.59571ms; SamplesPerSecond = 1678 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70079005; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.14465s; TotalTimePerSample = 0.57860ms; SamplesPerSecond = 1728 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70605618; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.14226s; TotalTimePerSample = 0.56906ms; SamplesPerSecond = 1757 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71572435; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13673s; TotalTimePerSample = 0.54692ms; SamplesPerSecond = 1828 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72149903; EvalErr[0]PerSample = 0.47999999; TotalTime = 0.13632s; TotalTimePerSample = 0.54528ms; SamplesPerSecond = 1833 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.79845655; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13450s; TotalTimePerSample = 0.53800ms; SamplesPerSecond = 1858 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69665188; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.13040s; TotalTimePerSample = 0.52161ms; SamplesPerSecond = 1917 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70723337; EvalErr[0]PerSample = 0.49200001; TotalTime = 0.12784s; TotalTimePerSample = 0.51137ms; SamplesPerSecond = 1955 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71420360; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12616s; TotalTimePerSample = 0.50466ms; SamplesPerSecond = 1981 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 111- 120 of 
-171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69535255; EvalErr[0]PerSample = 0.43599999; TotalTime = 0.12550s; TotalTimePerSample = 0.50198ms; SamplesPerSecond = 1992 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70078611; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12245s; TotalTimePerSample = 0.48982ms; SamplesPerSecond = 2041 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71857810; EvalErr[0]PerSample = 0.54799998; TotalTime = 0.12284s; TotalTimePerSample = 0.49136ms; SamplesPerSecond = 2035 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72088283; EvalErr[0]PerSample = 0.48800001; TotalTime = 0.12223s; TotalTimePerSample = 0.48893ms; SamplesPerSecond = 2045 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71798825; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12113s; TotalTimePerSample = 0.48453ms; SamplesPerSecond = 2063 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.74162209; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12325s; TotalTimePerSample = 0.49300ms; SamplesPerSecond = 2028 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71835059; EvalErr[0]PerSample = 0.51599997; TotalTime = 0.12335s; TotalTimePerSample = 0.49339ms; SamplesPerSecond = 2026 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71529394; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12333s; TotalTimePerSample = 0.49334ms; SamplesPerSecond = 2027 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71727639; EvalErr[0]PerSample = 0.53200001; TotalTime = 0.12471s; TotalTimePerSample = 0.49884ms; 
SamplesPerSecond = 2004 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71745312; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12359s; TotalTimePerSample = 0.49437ms; SamplesPerSecond = 2022 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72088087; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12196s; TotalTimePerSample = 0.48784ms; SamplesPerSecond = 2049 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.72006541; EvalErr[0]PerSample = 0.50800002; TotalTime = 0.12266s; TotalTimePerSample = 0.49064ms; SamplesPerSecond = 2038 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.71275192; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12160s; TotalTimePerSample = 0.48640ms; SamplesPerSecond = 2055 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69645119; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12120s; TotalTimePerSample = 0.48479ms; SamplesPerSecond = 2062 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70129883; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12089s; TotalTimePerSample = 0.48356ms; SamplesPerSecond = 2067 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70768166; EvalErr[0]PerSample = 0.54400003; TotalTime = 0.12162s; TotalTimePerSample = 0.48648ms; SamplesPerSecond = 2055 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69744140; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12267s; TotalTimePerSample = 0.49068ms; SamplesPerSecond = 2037 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69266015; EvalErr[0]PerSample = 
0.44800001; TotalTime = 0.12178s; TotalTimePerSample = 0.48714ms; SamplesPerSecond = 2052 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69347072; EvalErr[0]PerSample = 0.49599999; TotalTime = 0.12231s; TotalTimePerSample = 0.48926ms; SamplesPerSecond = 2043 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69257420; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.12288s; TotalTimePerSample = 0.49150ms; SamplesPerSecond = 2034 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.68625975; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12415s; TotalTimePerSample = 0.49658ms; SamplesPerSecond = 2013 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69064063; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.12340s; TotalTimePerSample = 0.49359ms; SamplesPerSecond = 2025 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.70192385; EvalErr[0]PerSample = 0.46000001; TotalTime = 0.12176s; TotalTimePerSample = 0.48703ms; SamplesPerSecond = 2053 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.69058985; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.12237s; TotalTimePerSample = 0.48949ms; SamplesPerSecond = 2042 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.67041212; EvalErr[0]PerSample = 0.39199999; TotalTime = 0.12186s; TotalTimePerSample = 0.48742ms; SamplesPerSecond = 2051 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.65914255; EvalErr[0]PerSample = 0.35600001; TotalTime = 0.12261s; TotalTimePerSample = 0.49044ms; SamplesPerSecond = 2038 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen 
= 250; TrainLossPerSample = 0.63919920; EvalErr[0]PerSample = 0.36399999; TotalTime = 0.12265s; TotalTimePerSample = 0.49060ms; SamplesPerSecond = 2038 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.61294138; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48568ms; SamplesPerSecond = 2058 -MPI Rank 3: Epoch[ 1 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.55255663; EvalErr[0]PerSample = 0.18799999; TotalTime = 0.12034s; TotalTimePerSample = 0.48134ms; SamplesPerSecond = 2077 -MPI Rank 3: Finished Epoch[ 1 of 10]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.47350001; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.253729 -MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 3: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal. 
+MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 1- 10 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12756s; TotalTimePerSample = 0.51025ms; SamplesPerSecond = 1959 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 11- 20 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12451s; TotalTimePerSample = 0.49804ms; SamplesPerSecond = 2007 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 21- 30 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12484s; TotalTimePerSample = 0.49937ms; SamplesPerSecond = 2002 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 31- 40 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49888ms; SamplesPerSecond = 2004 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 41- 50 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70605616; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12464s; TotalTimePerSample = 0.49857ms; SamplesPerSecond = 2005 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 51- 60 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12454s; TotalTimePerSample = 0.49814ms; SamplesPerSecond = 2007 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 61- 70 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.12437s; TotalTimePerSample = 0.49750ms; SamplesPerSecond = 2010 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 71- 80 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.79845605; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12464s; TotalTimePerSample = 0.49854ms; SamplesPerSecond = 2005 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 81- 90 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12442s; TotalTimePerSample = 0.49766ms; SamplesPerSecond = 2009 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 
91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12491s; TotalTimePerSample = 0.49965ms; SamplesPerSecond = 2001 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71420344; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12493s; TotalTimePerSample = 0.49973ms; SamplesPerSecond = 2001 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12517s; TotalTimePerSample = 0.50070ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12504s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71857915; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.12456s; TotalTimePerSample = 0.49823ms; SamplesPerSecond = 2007 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.12477s; TotalTimePerSample = 0.49910ms; SamplesPerSecond = 2003 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71798839; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12412s; TotalTimePerSample = 0.49647ms; SamplesPerSecond = 2014 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71835127; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.12483s; TotalTimePerSample = 0.49930ms; SamplesPerSecond = 2002 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: 
SamplesSeen = 250; TrainLossPerSample = 0.71529462; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12466s; TotalTimePerSample = 0.49865ms; SamplesPerSecond = 2005 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71727656; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12438s; TotalTimePerSample = 0.49752ms; SamplesPerSecond = 2009 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12477s; TotalTimePerSample = 0.49908ms; SamplesPerSecond = 2003 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088397; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12416s; TotalTimePerSample = 0.49663ms; SamplesPerSecond = 2013 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72006808; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12445s; TotalTimePerSample = 0.49778ms; SamplesPerSecond = 2008 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12513s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12463s; TotalTimePerSample = 0.49852ms; SamplesPerSecond = 2005 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70129697; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12494s; TotalTimePerSample = 0.49976ms; SamplesPerSecond = 2000 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12514s; TotalTimePerSample = 0.50055ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; 
TrainLossPerSample = 0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12533s; TotalTimePerSample = 0.50132ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69266186; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.12545s; TotalTimePerSample = 0.50178ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12500s; TotalTimePerSample = 0.50000ms; SamplesPerSecond = 1999 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69257410; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50159ms; SamplesPerSecond = 1993 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12518s; TotalTimePerSample = 0.50071ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12548s; TotalTimePerSample = 0.50190ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70192153; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12511s; TotalTimePerSample = 0.50045ms; SamplesPerSecond = 1998 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12552s; TotalTimePerSample = 0.50208ms; SamplesPerSecond = 1991 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.67041489; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12577s; TotalTimePerSample = 0.50310ms; SamplesPerSecond = 1987 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample 
= 0.65913971; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12536s; TotalTimePerSample = 0.50142ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.63919874; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.12523s; TotalTimePerSample = 0.50094ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.61293878; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12515s; TotalTimePerSample = 0.50062ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.55255340; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.12492s; TotalTimePerSample = 0.49970ms; SamplesPerSecond = 2001 +MPI Rank 3: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.008585 +MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 3: starting epoch 1 at record count 10000, and file position 0 MPI Rank 3: already there from last epoch MPI Rank 3: MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.50775200; EvalErr[0]PerSample = 0.23999999; TotalTime = 0.12816s; TotalTimePerSample = 0.51265ms; SamplesPerSecond = 1950 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.43389454; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12188s; TotalTimePerSample = 0.48751ms; SamplesPerSecond = 2051 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.36675408; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12077s; TotalTimePerSample = 0.48307ms; SamplesPerSecond = 2070 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.33769274; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12254s; TotalTimePerSample = 0.49018ms; SamplesPerSecond = 2040 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.30321363; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12133s; TotalTimePerSample = 0.48531ms; SamplesPerSecond = 2060 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.29576379; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12053s; TotalTimePerSample = 0.48212ms; SamplesPerSecond = 2074 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.24924731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.24632569; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12328s; TotalTimePerSample = 0.49313ms; SamplesPerSecond = 2027 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20943311; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12287s; 
TotalTimePerSample = 0.49148ms; SamplesPerSecond = 2034 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19116065; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12193s; TotalTimePerSample = 0.48770ms; SamplesPerSecond = 2050 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17923315; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17075513; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12160s; TotalTimePerSample = 0.48638ms; SamplesPerSecond = 2055 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14442432; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12214s; TotalTimePerSample = 0.48854ms; SamplesPerSecond = 2046 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17753857; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12281s; TotalTimePerSample = 0.49123ms; SamplesPerSecond = 2035 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15087914; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12183s; TotalTimePerSample = 0.48733ms; SamplesPerSecond = 2052 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19252978; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12266s; TotalTimePerSample = 0.49063ms; SamplesPerSecond = 2038 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17830664; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49063ms; SamplesPerSecond = 2038 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.15115429; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12254s; TotalTimePerSample = 0.49016ms; SamplesPerSecond = 2040 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19135889; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12208s; TotalTimePerSample = 0.48830ms; SamplesPerSecond = 2047 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.21491407; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12304s; TotalTimePerSample = 0.49215ms; SamplesPerSecond = 2031 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18682373; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12156s; TotalTimePerSample = 0.48625ms; SamplesPerSecond = 2056 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18483251; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12136s; TotalTimePerSample = 0.48543ms; SamplesPerSecond = 2060 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14684522; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15322119; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12307s; TotalTimePerSample = 0.49228ms; SamplesPerSecond = 2031 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19882520; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12373s; TotalTimePerSample = 0.49490ms; SamplesPerSecond = 2020 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13683788; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12241s; TotalTimePerSample = 0.48964ms; SamplesPerSecond = 2042 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 
261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18621191; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12195s; TotalTimePerSample = 0.48782ms; SamplesPerSecond = 2049 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19408056; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12169s; TotalTimePerSample = 0.48674ms; SamplesPerSecond = 2054 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17298096; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12217s; TotalTimePerSample = 0.48868ms; SamplesPerSecond = 2046 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13265137; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12193s; TotalTimePerSample = 0.48771ms; SamplesPerSecond = 2050 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17627051; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12156s; TotalTimePerSample = 0.48626ms; SamplesPerSecond = 2056 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12734570; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12410s; TotalTimePerSample = 0.49638ms; SamplesPerSecond = 2014 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15108399; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12115s; TotalTimePerSample = 0.48460ms; SamplesPerSecond = 2063 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19729199; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12857373; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12203s; TotalTimePerSample = 0.48812ms; 
SamplesPerSecond = 2048 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13867822; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12273s; TotalTimePerSample = 0.49092ms; SamplesPerSecond = 2037 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12786084; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12172s; TotalTimePerSample = 0.48688ms; SamplesPerSecond = 2053 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16643262; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12109s; TotalTimePerSample = 0.48436ms; SamplesPerSecond = 2064 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20440333; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12237s; TotalTimePerSample = 0.48948ms; SamplesPerSecond = 2042 -MPI Rank 3: Epoch[ 2 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14566259; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12233s; TotalTimePerSample = 0.48931ms; SamplesPerSecond = 2043 -MPI Rank 3: Finished Epoch[ 2 of 10]: [Training Set] TrainLossPerSample = 0.20373113; EvalErrPerSample = 0.082699999; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.929802 -MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.50774607; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.12567s; TotalTimePerSample = 0.50268ms; SamplesPerSecond = 1989 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.43388910; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12532s; TotalTimePerSample = 0.50129ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.36674852; EvalErr[0]PerSample = 
0.08800000; TotalTime = 0.12549s; TotalTimePerSample = 0.50195ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.33768746; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12523s; TotalTimePerSample = 0.50094ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.30320932; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12505s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.29576032; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12563s; TotalTimePerSample = 0.50252ms; SamplesPerSecond = 1989 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24924483; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12490s; TotalTimePerSample = 0.49960ms; SamplesPerSecond = 2001 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24632409; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12564s; TotalTimePerSample = 0.50257ms; SamplesPerSecond = 1989 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20943152; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12525s; TotalTimePerSample = 0.50102ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19115992; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12537s; TotalTimePerSample = 0.50149ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17923227; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17075420; EvalErr[0]PerSample = 0.06400000; TotalTime = 
0.12524s; TotalTimePerSample = 0.50098ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14442369; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17753818; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12514s; TotalTimePerSample = 0.50056ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15087853; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50083ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19253021; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50136ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12533s; TotalTimePerSample = 0.50130ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12476s; TotalTimePerSample = 0.49904ms; SamplesPerSecond = 2003 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12523s; TotalTimePerSample = 0.50093ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12494s; TotalTimePerSample = 0.49976ms; SamplesPerSecond = 2000 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12558s; 
TotalTimePerSample = 0.50232ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12477s; TotalTimePerSample = 0.49909ms; SamplesPerSecond = 2003 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14684503; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12463s; TotalTimePerSample = 0.49854ms; SamplesPerSecond = 2005 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15322116; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12531s; TotalTimePerSample = 0.50124ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12527s; TotalTimePerSample = 0.50109ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13683832; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12494s; TotalTimePerSample = 0.49975ms; SamplesPerSecond = 2000 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18621189; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12491s; TotalTimePerSample = 0.49962ms; SamplesPerSecond = 2001 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19408050; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12603s; TotalTimePerSample = 0.50412ms; SamplesPerSecond = 1983 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12517s; TotalTimePerSample = 0.50067ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12562s; 
TotalTimePerSample = 0.50249ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50065ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12563s; TotalTimePerSample = 0.50253ms; SamplesPerSecond = 1989 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15108452; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12524s; TotalTimePerSample = 0.50094ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19729185; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12567s; TotalTimePerSample = 0.50269ms; SamplesPerSecond = 1989 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12857333; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12512s; TotalTimePerSample = 0.50048ms; SamplesPerSecond = 1998 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12546s; TotalTimePerSample = 0.50183ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12786051; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12557s; TotalTimePerSample = 0.50230ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12504s; TotalTimePerSample = 0.50015ms; SamplesPerSecond = 1999 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20440408; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12589s; 
TotalTimePerSample = 0.50354ms; SamplesPerSecond = 1985 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566237; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998 +MPI Rank 3: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373026; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.013093 +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 3: starting epoch 2 at record count 20000, and file position 0 MPI Rank 3: already there from last epoch MPI Rank 3: MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32). -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12590086; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12908s; TotalTimePerSample = 0.51633ms; SamplesPerSecond = 1936 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17780226; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12396s; TotalTimePerSample = 0.49584ms; SamplesPerSecond = 2016 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14417633; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12283s; TotalTimePerSample = 0.49131ms; SamplesPerSecond = 2035 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15796880; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12282s; TotalTimePerSample = 0.49129ms; SamplesPerSecond = 2035 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17002991; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12299s; TotalTimePerSample = 0.49194ms; SamplesPerSecond = 2032 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; 
TrainLossPerSample = 0.18262109; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14643688; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12102s; TotalTimePerSample = 0.48407ms; SamplesPerSecond = 2065 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18030518; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12217s; TotalTimePerSample = 0.48868ms; SamplesPerSecond = 2046 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15846142; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12213s; TotalTimePerSample = 0.48853ms; SamplesPerSecond = 2046 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14486536; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12131s; TotalTimePerSample = 0.48526ms; SamplesPerSecond = 2060 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13469091; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12150s; TotalTimePerSample = 0.48601ms; SamplesPerSecond = 2057 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12147s; TotalTimePerSample = 0.48588ms; SamplesPerSecond = 2058 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.11641297; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12266s; TotalTimePerSample = 0.49064ms; SamplesPerSecond = 2038 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16786633; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12217s; TotalTimePerSample = 0.48867ms; SamplesPerSecond = 2046 -MPI Rank 3: Epoch[ 3 of 
10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12811548; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17257836; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12251s; TotalTimePerSample = 0.49005ms; SamplesPerSecond = 2040 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17623682; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12008s; TotalTimePerSample = 0.48034ms; SamplesPerSecond = 2081 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14121118; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12156s; TotalTimePerSample = 0.48624ms; SamplesPerSecond = 2056 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19243409; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12071s; TotalTimePerSample = 0.48282ms; SamplesPerSecond = 2071 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20908155; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12149s; TotalTimePerSample = 0.48598ms; SamplesPerSecond = 2057 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18472095; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12149s; TotalTimePerSample = 0.48597ms; SamplesPerSecond = 2057 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18185547; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12187s; TotalTimePerSample = 0.48748ms; SamplesPerSecond = 2051 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14074194; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12174s; TotalTimePerSample 
= 0.48697ms; SamplesPerSecond = 2053 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14871632; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12167s; TotalTimePerSample = 0.48668ms; SamplesPerSecond = 2054 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20299682; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12216s; TotalTimePerSample = 0.48864ms; SamplesPerSecond = 2046 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12852076; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12055s; TotalTimePerSample = 0.48219ms; SamplesPerSecond = 2073 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18660498; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12199s; TotalTimePerSample = 0.48796ms; SamplesPerSecond = 2049 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19576025; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12239s; TotalTimePerSample = 0.48957ms; SamplesPerSecond = 2042 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16667627; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49065ms; SamplesPerSecond = 2038 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12526172; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12058s; TotalTimePerSample = 0.48234ms; SamplesPerSecond = 2073 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17391992; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12146s; TotalTimePerSample = 0.48584ms; SamplesPerSecond = 2058 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12281641; 
EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12141s; TotalTimePerSample = 0.48564ms; SamplesPerSecond = 2059 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14759424; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12210s; TotalTimePerSample = 0.48838ms; SamplesPerSecond = 2047 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19801368; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12210s; TotalTimePerSample = 0.48840ms; SamplesPerSecond = 2047 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12593359; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12242s; TotalTimePerSample = 0.48967ms; SamplesPerSecond = 2042 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13756640; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12193s; TotalTimePerSample = 0.48772ms; SamplesPerSecond = 2050 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16654395; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12273s; TotalTimePerSample = 0.49091ms; SamplesPerSecond = 2037 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20658936; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12091s; TotalTimePerSample = 0.48362ms; SamplesPerSecond = 2067 -MPI Rank 3: Epoch[ 3 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14583300; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047 -MPI Rank 3: Finished Epoch[ 3 of 10]: [Training Set] 
TrainLossPerSample = 0.15948617; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.918964 -MPI Rank 3: Starting Epoch 4: learning rate per sample = 0.008000 momentum = 0.900001 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17780230; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12562s; TotalTimePerSample = 0.50248ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15796896; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12536s; TotalTimePerSample = 0.50144ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17003000; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12537s; TotalTimePerSample = 0.50150ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12539s; TotalTimePerSample = 0.50157ms; SamplesPerSecond = 1993 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14643695; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12560s; TotalTimePerSample = 0.50240ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12594s; TotalTimePerSample = 0.50375ms; SamplesPerSecond = 1985 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 81- 
90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12554s; TotalTimePerSample = 0.50217ms; SamplesPerSecond = 1991 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12555s; TotalTimePerSample = 0.50218ms; SamplesPerSecond = 1991 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13469094; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12560s; TotalTimePerSample = 0.50238ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12529s; TotalTimePerSample = 0.50114ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12520s; TotalTimePerSample = 0.50081ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12589s; TotalTimePerSample = 0.50357ms; SamplesPerSecond = 1985 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12548s; TotalTimePerSample = 0.50190ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12516s; TotalTimePerSample = 0.50063ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19243442; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12554s; TotalTimePerSample = 0.50217ms; SamplesPerSecond = 1991 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12522s; TotalTimePerSample = 0.50090ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12552s; TotalTimePerSample = 0.50209ms; SamplesPerSecond = 1991 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18185536; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14074204; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12472s; TotalTimePerSample = 0.49888ms; SamplesPerSecond = 2004 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20299705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12557s; TotalTimePerSample = 0.50228ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12852037; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12523s; TotalTimePerSample = 0.50090ms; SamplesPerSecond = 1996 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.18660440; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50148ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12483s; TotalTimePerSample = 0.49931ms; SamplesPerSecond = 2002 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12610s; TotalTimePerSample = 0.50441ms; SamplesPerSecond = 1982 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12526168; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12498s; TotalTimePerSample = 0.49993ms; SamplesPerSecond = 2000 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17392133; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12559s; TotalTimePerSample = 0.50237ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12513s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12501s; TotalTimePerSample = 0.50004ms; SamplesPerSecond = 1999 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12531s; TotalTimePerSample = 0.50124ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12506s; TotalTimePerSample = 0.50023ms; SamplesPerSecond = 1999 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: 
SamplesSeen = 250; TrainLossPerSample = 0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12535s; TotalTimePerSample = 0.50141ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50137ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16654368; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12550s; TotalTimePerSample = 0.50199ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20658950; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50137ms; SamplesPerSecond = 1994 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12566s; TotalTimePerSample = 0.50263ms; SamplesPerSecond = 1989 +MPI Rank 3: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.016289 +MPI Rank 3: Starting Epoch 4: learning rate per sample = 0.008000 effective momentum = 0.900000 MPI Rank 3: starting epoch 3 at record count 30000, and file position 0 MPI Rank 3: already there from last epoch MPI Rank 3: MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32). 
-MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 1- 10 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12751s; TotalTimePerSample = 0.51002ms; SamplesPerSecond = 1960 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 11- 20 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12299s; TotalTimePerSample = 0.49196ms; SamplesPerSecond = 2032 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 21- 30 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14239721; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12164s; TotalTimePerSample = 0.48655ms; SamplesPerSecond = 2055 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 31- 40 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15630139; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12202s; TotalTimePerSample = 0.48808ms; SamplesPerSecond = 2048 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 41- 50 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16935523; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12092s; TotalTimePerSample = 0.48368ms; SamplesPerSecond = 2067 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 51- 60 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18198816; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12084s; TotalTimePerSample = 0.48334ms; SamplesPerSecond = 2068 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 61- 70 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14475952; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12254s; TotalTimePerSample = 0.49015ms; SamplesPerSecond = 2040 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 71- 80 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18021594; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12209s; TotalTimePerSample = 0.48837ms; SamplesPerSecond = 2047 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 81- 90 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.15849304; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12483s; 
TotalTimePerSample = 0.49934ms; SamplesPerSecond = 2002 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14474402; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48872ms; SamplesPerSecond = 2046 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13362928; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12226s; TotalTimePerSample = 0.48902ms; SamplesPerSecond = 2044 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13708325; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12182s; TotalTimePerSample = 0.48728ms; SamplesPerSecond = 2052 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.11569763; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12244s; TotalTimePerSample = 0.48977ms; SamplesPerSecond = 2041 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16892321; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12146s; TotalTimePerSample = 0.48586ms; SamplesPerSecond = 2058 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12752125; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12127s; TotalTimePerSample = 0.48510ms; SamplesPerSecond = 2061 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17100880; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12211s; TotalTimePerSample = 0.48843ms; SamplesPerSecond = 2047 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17660449; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12218s; TotalTimePerSample = 0.48873ms; SamplesPerSecond = 2046 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 
0.14105836; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12168s; TotalTimePerSample = 0.48673ms; SamplesPerSecond = 2054 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19333544; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12124s; TotalTimePerSample = 0.48496ms; SamplesPerSecond = 2062 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20859498; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12069s; TotalTimePerSample = 0.48276ms; SamplesPerSecond = 2071 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18499707; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12378s; TotalTimePerSample = 0.49513ms; SamplesPerSecond = 2019 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18152441; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12165s; TotalTimePerSample = 0.48661ms; SamplesPerSecond = 2055 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14037134; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12258s; TotalTimePerSample = 0.49032ms; SamplesPerSecond = 2039 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14866894; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12390s; TotalTimePerSample = 0.49561ms; SamplesPerSecond = 2017 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20347705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12229s; TotalTimePerSample = 0.48917ms; SamplesPerSecond = 2044 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12815039; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12245s; TotalTimePerSample = 0.48980ms; SamplesPerSecond = 2041 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 
261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.18672803; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12288s; TotalTimePerSample = 0.49152ms; SamplesPerSecond = 2034 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19552930; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16452637; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12195s; TotalTimePerSample = 0.48780ms; SamplesPerSecond = 2050 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12461865; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12028s; TotalTimePerSample = 0.48110ms; SamplesPerSecond = 2078 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.17285107; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12184s; TotalTimePerSample = 0.48736ms; SamplesPerSecond = 2051 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12253613; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12352s; TotalTimePerSample = 0.49408ms; SamplesPerSecond = 2023 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14723291; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12238s; TotalTimePerSample = 0.48952ms; SamplesPerSecond = 2042 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.19789551; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12333s; TotalTimePerSample = 0.49332ms; SamplesPerSecond = 2027 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12575878; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12335s; TotalTimePerSample = 0.49340ms; 
SamplesPerSecond = 2026 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.13745947; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12227s; TotalTimePerSample = 0.48906ms; SamplesPerSecond = 2044 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.12839746; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12152s; TotalTimePerSample = 0.48608ms; SamplesPerSecond = 2057 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.16647315; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.20679444; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12143s; TotalTimePerSample = 0.48573ms; SamplesPerSecond = 2058 -MPI Rank 3: Epoch[ 4 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample = 0.14585204; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12219s; TotalTimePerSample = 0.48876ms; SamplesPerSecond = 2046 -MPI Rank 3: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15914927; EvalErrPerSample = 0.076700002; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.927711 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12546s; TotalTimePerSample = 0.50182ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18070514; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12545s; TotalTimePerSample = 0.50180ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14239731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12565s; TotalTimePerSample = 0.50258ms; SamplesPerSecond = 
1989 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12556s; TotalTimePerSample = 0.50224ms; SamplesPerSecond = 1991 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12543s; TotalTimePerSample = 0.50174ms; SamplesPerSecond = 1993 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12530s; TotalTimePerSample = 0.50121ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12540s; TotalTimePerSample = 0.50159ms; SamplesPerSecond = 1993 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18021602; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12464s; TotalTimePerSample = 0.49856ms; SamplesPerSecond = 2005 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12530s; TotalTimePerSample = 0.50120ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14474426; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12568s; TotalTimePerSample = 0.50272ms; SamplesPerSecond = 1989 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12452s; TotalTimePerSample = 0.49808ms; SamplesPerSecond = 2007 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13708300; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12525s; TotalTimePerSample = 0.50101ms; SamplesPerSecond = 1995 +MPI Rank 3: 
Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11569776; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12514s; TotalTimePerSample = 0.50058ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16892330; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12752163; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12515s; TotalTimePerSample = 0.50059ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 1989 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12500s; TotalTimePerSample = 0.50002ms; SamplesPerSecond = 1999 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14105804; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12543s; TotalTimePerSample = 0.50172ms; SamplesPerSecond = 1993 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19333553; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12542s; TotalTimePerSample = 0.50168ms; SamplesPerSecond = 1993 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20859525; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12515s; TotalTimePerSample = 0.50059ms; SamplesPerSecond = 1997 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12599s; TotalTimePerSample = 0.50396ms; SamplesPerSecond = 1984 +MPI Rank 3: Epoch[ 4 
of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50188ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14037158; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12508s; TotalTimePerSample = 0.50033ms; SamplesPerSecond = 1998 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50189ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20347748; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50236ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12815013; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12526s; TotalTimePerSample = 0.50104ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12549s; TotalTimePerSample = 0.50197ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19552989; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12511s; TotalTimePerSample = 0.50045ms; SamplesPerSecond = 1998 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12558s; TotalTimePerSample = 0.50232ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12496s; TotalTimePerSample = 0.49984ms; SamplesPerSecond = 2000 +MPI Rank 3: Epoch[ 4 of 
4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12575s; TotalTimePerSample = 0.50300ms; SamplesPerSecond = 1988 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12561s; TotalTimePerSample = 0.50243ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14723333; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12548s; TotalTimePerSample = 0.50190ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12617s; TotalTimePerSample = 0.50467ms; SamplesPerSecond = 1981 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12609s; TotalTimePerSample = 0.50438ms; SamplesPerSecond = 1982 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12558s; TotalTimePerSample = 0.50231ms; SamplesPerSecond = 1990 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12498s; TotalTimePerSample = 0.49992ms; SamplesPerSecond = 2000 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12549s; TotalTimePerSample = 0.50196ms; SamplesPerSecond = 1992 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12527s; TotalTimePerSample = 0.50107ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 4 of 
4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12570s; TotalTimePerSample = 0.50282ms; SamplesPerSecond = 1988 +MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.018182 +MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU MPI Rank 3: COMPLETED MPI Rank 3: ~MPIWrapper diff --git a/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.windows.gpu.txt b/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.windows.gpu.txt new file mode 100644 index 000000000..04e6c6109 --- /dev/null +++ b/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.windows.gpu.txt @@ -0,0 +1,2449 @@ +=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 4 E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes 
responded +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 0 in a gearbox of 4 +mpihelper: we are cog 2 in a gearbox of 4 +mpihelper: we are cog 3 in a gearbox of 4 +mpihelper: we are cog 1 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank0 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Oct 24 2015 13:33:25 +MPI Rank 0: Last modified date: Thu Oct 22 16:00:27 2015 +MPI Rank 0: Built by amitaga on Amitaga-Win-DT3 +MPI 
Rank 0: Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ +MPI Rank 0: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on Amitaga-Win-DT3 at 2015/10/24 21:49:38 +MPI Rank 0: command line: +MPI Rank 0: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: command=SimpleMultiGPU +MPI Rank 0: precision=float +MPI Rank 0: parallelTrain=true +MPI Rank 0: SimpleMultiGPU=[ +MPI Rank 0: action=train +MPI Rank 0: modelPath=$RunDir$/models/Simple.dnn +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: traceLevel=1 +MPI Rank 0: SimpleNetworkBuilder=[ +MPI Rank 0: layerSizes=2:50*2:2 +MPI Rank 0: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 0: evalCriterion=ErrorPrediction +MPI Rank 0: layerTypes=Sigmoid +MPI Rank 0: initValueScale=1.0 +MPI Rank 0: applyMeanVarNorm=true +MPI Rank 0: uniformInit=true +MPI Rank 0: needPrior=true +MPI Rank 0: ] +MPI Rank 0: SGD=[ +MPI Rank 0: epochSize=0 +MPI Rank 0: minibatchSize=25 +MPI Rank 0: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 0: momentumPerMB=0.9 +MPI Rank 0: dropoutRate=0.0 +MPI Rank 0: maxEpochs=4 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: 
parallelizationMethod=DataParallelSGD +MPI Rank 0: DataParallelSGD=[ +MPI Rank 0: gradientBits=1 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader=[ +MPI Rank 0: readerType=UCIFastReader +MPI Rank 0: file=$DataDir$/SimpleDataTrain.txt +MPI Rank 0: miniBatchMode=Partial +MPI Rank 0: randomize=None +MPI Rank 0: verbosity=1 +MPI Rank 0: features=[ +MPI Rank 0: dim=2 +MPI Rank 0: start=0 +MPI Rank 0: ] +MPI Rank 0: labels=[ +MPI Rank 0: start=2 +MPI Rank 0: dim=1 +MPI Rank 0: labelDim=2 +MPI Rank 0: labelMappingFile=$DataDir$/SimpleMapping.txt +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 0: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 0: DeviceId=0 +MPI Rank 0: precision=float +MPI Rank 0: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: deviceId=0 +MPI Rank 0: command=SimpleMultiGPU +MPI Rank 0: precision=float +MPI Rank 0: parallelTrain=true +MPI Rank 0: SimpleMultiGPU=[ +MPI Rank 0: action=train +MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 0: deviceId=0 +MPI Rank 0: traceLevel=1 +MPI Rank 0: SimpleNetworkBuilder=[ +MPI Rank 0: layerSizes=2:50*2:2 +MPI Rank 0: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 0: evalCriterion=ErrorPrediction +MPI Rank 0: layerTypes=Sigmoid +MPI Rank 0: initValueScale=1.0 +MPI 
Rank 0: applyMeanVarNorm=true +MPI Rank 0: uniformInit=true +MPI Rank 0: needPrior=true +MPI Rank 0: ] +MPI Rank 0: SGD=[ +MPI Rank 0: epochSize=0 +MPI Rank 0: minibatchSize=25 +MPI Rank 0: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 0: momentumPerMB=0.9 +MPI Rank 0: dropoutRate=0.0 +MPI Rank 0: maxEpochs=4 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationMethod=DataParallelSGD +MPI Rank 0: DataParallelSGD=[ +MPI Rank 0: gradientBits=1 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader=[ +MPI Rank 0: readerType=UCIFastReader +MPI Rank 0: file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 0: miniBatchMode=Partial +MPI Rank 0: randomize=None +MPI Rank 0: verbosity=1 +MPI Rank 0: features=[ +MPI Rank 0: dim=2 +MPI Rank 0: start=0 +MPI Rank 0: ] +MPI Rank 0: labels=[ +MPI Rank 0: start=2 +MPI Rank 0: dim=1 +MPI Rank 0: labelDim=2 +MPI Rank 0: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 0: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 0: DeviceId=0 +MPI Rank 0: precision=float +MPI Rank 0: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU +MPI Rank 0: configparameters: 
SimpleMultiGPU.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 0: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 0: configparameters: SimpleMultiGPU.config:deviceId=0 +MPI Rank 0: configparameters: SimpleMultiGPU.config:parallelTrain=true +MPI Rank 0: configparameters: SimpleMultiGPU.config:precision=float +MPI Rank 0: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 0: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[ +MPI Rank 0: action=train +MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 0: deviceId=0 +MPI Rank 0: traceLevel=1 +MPI Rank 0: SimpleNetworkBuilder=[ +MPI Rank 0: layerSizes=2:50*2:2 +MPI Rank 0: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 0: evalCriterion=ErrorPrediction +MPI Rank 0: layerTypes=Sigmoid +MPI Rank 0: initValueScale=1.0 +MPI Rank 0: applyMeanVarNorm=true +MPI Rank 0: uniformInit=true +MPI Rank 0: needPrior=true +MPI Rank 0: ] +MPI Rank 0: SGD=[ +MPI Rank 0: epochSize=0 +MPI Rank 0: minibatchSize=25 +MPI Rank 0: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 0: momentumPerMB=0.9 +MPI Rank 0: dropoutRate=0.0 +MPI Rank 0: maxEpochs=4 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationMethod=DataParallelSGD +MPI Rank 0: DataParallelSGD=[ +MPI Rank 0: gradientBits=1 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader=[ +MPI Rank 0: readerType=UCIFastReader +MPI Rank 0: file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 0: miniBatchMode=Partial +MPI Rank 0: randomize=None +MPI Rank 0: verbosity=1 +MPI Rank 0: features=[ +MPI Rank 0: dim=2 +MPI Rank 0: start=0 +MPI Rank 0: ] +MPI Rank 0: labels=[ +MPI Rank 0: start=2 +MPI 
Rank 0: dim=1 +MPI Rank 0: labelDim=2 +MPI Rank 0: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 0: +MPI Rank 0: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: SimpleMultiGPU +MPI Rank 0: precision = float +MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 0: CNTKCommandTrainInfo: SimpleMultiGPU : 4 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +MPI Rank 0: CNTKCommandTrainBegin: SimpleMultiGPU +MPI Rank 0: SimpleNetworkBuilder Using GPU 0 +MPI Rank 0: reading uci file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1 +MPI Rank 0: GetTrainCriterionNodes ... +MPI Rank 0: GetEvalCriterionNodes ... +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Precomputing --> 3 PreCompute nodes found. 
+MPI Rank 0: +MPI Rank 0: NodeName: InvStdOfFeatures +MPI Rank 0: NodeName: MeanOfFeatures +MPI Rank 0: NodeName: Prior +MPI Rank 0: starting at epoch 0 counting lines to determine record count +MPI Rank 0: +MPI Rank 0: 10000 records found +MPI Rank 0: starting epoch 0 at record count 0, and file position 0 +MPI Rank 0: already there from last epoch +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: Validating for node InvStdOfFeatures, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: Validating for node MeanOfFeatures, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node Prior. 2 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: Validating for node Prior. 1 nodes to process in pass 2. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: Validating for node Prior, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 0: +MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Precomputing --> Completed. +MPI Rank 0: +MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 +MPI Rank 0: starting epoch 0 at record count 0, and file position 0 +MPI Rank 0: already there from last epoch +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] 
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B1 = LearnableParameter -> 
[50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, 
MBSize 25]) -> [50, MBSize 25] +MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32). +MPI Rank 0: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal. +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 1- 10 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.21274s; TotalTimePerSample = 0.85096ms; SamplesPerSecond = 1175 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 11- 20 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.17175s; TotalTimePerSample = 0.68700ms; SamplesPerSecond = 1455 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 21- 30 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.16870s; TotalTimePerSample = 0.67482ms; SamplesPerSecond = 1481 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 31- 40 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.16118s; TotalTimePerSample = 0.64471ms; SamplesPerSecond = 1551 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 41- 50 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70605615; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.16159s; TotalTimePerSample = 0.64636ms; SamplesPerSecond = 1547 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 51- 60 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71572398; EvalErr[0]PerSample = 
0.47600000; TotalTime = 0.15234s; TotalTimePerSample = 0.60934ms; SamplesPerSecond = 1641 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 61- 70 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.14166s; TotalTimePerSample = 0.56666ms; SamplesPerSecond = 1764 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 71- 80 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.79845604; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.14173s; TotalTimePerSample = 0.56692ms; SamplesPerSecond = 1763 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 81- 90 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.13864s; TotalTimePerSample = 0.55454ms; SamplesPerSecond = 1803 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12738s; TotalTimePerSample = 0.50953ms; SamplesPerSecond = 1962 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71420345; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.13068s; TotalTimePerSample = 0.52273ms; SamplesPerSecond = 1913 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12783s; TotalTimePerSample = 0.51133ms; SamplesPerSecond = 1955 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12405s; TotalTimePerSample = 0.49618ms; SamplesPerSecond = 2015 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71857916; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.11956s; TotalTimePerSample = 0.47822ms; SamplesPerSecond = 2091 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 
0.11897s; TotalTimePerSample = 0.47589ms; SamplesPerSecond = 2101 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71798840; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.11620s; TotalTimePerSample = 0.46478ms; SamplesPerSecond = 2151 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11800s; TotalTimePerSample = 0.47198ms; SamplesPerSecond = 2118 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71835128; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.11839s; TotalTimePerSample = 0.47358ms; SamplesPerSecond = 2111 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71529463; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12527s; TotalTimePerSample = 0.50107ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71727657; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.11979s; TotalTimePerSample = 0.47917ms; SamplesPerSecond = 2086 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11901s; TotalTimePerSample = 0.47605ms; SamplesPerSecond = 2100 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088398; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11482s; TotalTimePerSample = 0.45926ms; SamplesPerSecond = 2177 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72006809; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12406s; TotalTimePerSample = 0.49625ms; SamplesPerSecond = 2015 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12120s; 
TotalTimePerSample = 0.48478ms; SamplesPerSecond = 2062 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11779s; TotalTimePerSample = 0.47117ms; SamplesPerSecond = 2122 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70129698; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12147s; TotalTimePerSample = 0.48589ms; SamplesPerSecond = 2058 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12140s; TotalTimePerSample = 0.48561ms; SamplesPerSecond = 2059 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.11737s; TotalTimePerSample = 0.46946ms; SamplesPerSecond = 2130 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69266187; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.11798s; TotalTimePerSample = 0.47194ms; SamplesPerSecond = 2118 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12121s; TotalTimePerSample = 0.48485ms; SamplesPerSecond = 2062 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69257409; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12546s; TotalTimePerSample = 0.50186ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12595s; TotalTimePerSample = 0.50380ms; SamplesPerSecond = 1984 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12351s; TotalTimePerSample 
= 0.49405ms; SamplesPerSecond = 2024 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70192154; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12384s; TotalTimePerSample = 0.49536ms; SamplesPerSecond = 2018 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12360s; TotalTimePerSample = 0.49439ms; SamplesPerSecond = 2022 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.67041492; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12239s; TotalTimePerSample = 0.48957ms; SamplesPerSecond = 2042 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.65913973; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12837s; TotalTimePerSample = 0.51346ms; SamplesPerSecond = 1947 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.63919877; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.14480s; TotalTimePerSample = 0.57919ms; SamplesPerSecond = 1726 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.61293883; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.14498s; TotalTimePerSample = 0.57991ms; SamplesPerSecond = 1724 +MPI Rank 0: Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.55255352; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.14514s; TotalTimePerSample = 0.58057ms; SamplesPerSecond = 1722 +MPI Rank 0: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.368447 +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 0: starting epoch 1 at record count 10000, and file position 0 +MPI Rank 0: already there from last epoch +MPI Rank 0: +MPI Rank 0: Starting 
minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32). +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.50774625; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.13805s; TotalTimePerSample = 0.55221ms; SamplesPerSecond = 1810 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.43388927; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.13897s; TotalTimePerSample = 0.55587ms; SamplesPerSecond = 1798 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.36674870; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14035s; TotalTimePerSample = 0.56139ms; SamplesPerSecond = 1781 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.33768765; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.13735s; TotalTimePerSample = 0.54939ms; SamplesPerSecond = 1820 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.30320946; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12889s; TotalTimePerSample = 0.51557ms; SamplesPerSecond = 1939 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.29576043; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.14907s; TotalTimePerSample = 0.59630ms; SamplesPerSecond = 1677 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24924491; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14579s; TotalTimePerSample = 0.58317ms; SamplesPerSecond = 1714 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24632415; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14396s; TotalTimePerSample = 0.57584ms; SamplesPerSecond = 1736 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20943158; EvalErr[0]PerSample = 0.08400000; TotalTime = 
0.13623s; TotalTimePerSample = 0.54492ms; SamplesPerSecond = 1835 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19115996; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13589s; TotalTimePerSample = 0.54355ms; SamplesPerSecond = 1839 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17923231; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13511s; TotalTimePerSample = 0.54042ms; SamplesPerSecond = 1850 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17075422; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13630s; TotalTimePerSample = 0.54521ms; SamplesPerSecond = 1834 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14442371; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.13115s; TotalTimePerSample = 0.52461ms; SamplesPerSecond = 1906 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17753819; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.13713s; TotalTimePerSample = 0.54852ms; SamplesPerSecond = 1823 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15087855; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13776s; TotalTimePerSample = 0.55104ms; SamplesPerSecond = 1814 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19253023; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13519s; TotalTimePerSample = 0.54077ms; SamplesPerSecond = 1849 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14500s; TotalTimePerSample = 0.58000ms; SamplesPerSecond = 1724 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.14318s; 
TotalTimePerSample = 0.57271ms; SamplesPerSecond = 1746 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14350s; TotalTimePerSample = 0.57402ms; SamplesPerSecond = 1742 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14562s; TotalTimePerSample = 0.58247ms; SamplesPerSecond = 1716 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13541s; TotalTimePerSample = 0.54166ms; SamplesPerSecond = 1846 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14328s; TotalTimePerSample = 0.57310ms; SamplesPerSecond = 1744 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14684504; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.14565s; TotalTimePerSample = 0.58259ms; SamplesPerSecond = 1716 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15322115; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14666s; TotalTimePerSample = 0.58663ms; SamplesPerSecond = 1704 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.14043s; TotalTimePerSample = 0.56172ms; SamplesPerSecond = 1780 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13683833; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14034s; TotalTimePerSample = 0.56138ms; SamplesPerSecond = 1781 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18621188; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13422s; 
TotalTimePerSample = 0.53688ms; SamplesPerSecond = 1862 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19408048; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13252s; TotalTimePerSample = 0.53009ms; SamplesPerSecond = 1886 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14529s; TotalTimePerSample = 0.58114ms; SamplesPerSecond = 1720 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.13280s; TotalTimePerSample = 0.53118ms; SamplesPerSecond = 1882 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.13158s; TotalTimePerSample = 0.52633ms; SamplesPerSecond = 1899 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12716s; TotalTimePerSample = 0.50865ms; SamplesPerSecond = 1965 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15108451; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12641s; TotalTimePerSample = 0.50564ms; SamplesPerSecond = 1977 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19729184; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12382s; TotalTimePerSample = 0.49529ms; SamplesPerSecond = 2019 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12857332; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12221s; TotalTimePerSample = 0.48882ms; SamplesPerSecond = 2045 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12107s; 
TotalTimePerSample = 0.48428ms; SamplesPerSecond = 2064 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12786050; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12283s; TotalTimePerSample = 0.49134ms; SamplesPerSecond = 2035 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12578s; TotalTimePerSample = 0.50312ms; SamplesPerSecond = 1987 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20440409; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12240s; TotalTimePerSample = 0.48960ms; SamplesPerSecond = 2042 +MPI Rank 0: Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566238; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12173s; TotalTimePerSample = 0.48692ms; SamplesPerSecond = 2053 +MPI Rank 0: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373029; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.454988 +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 0: starting epoch 2 at record count 20000, and file position 0 +MPI Rank 0: already there from last epoch +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11986s; TotalTimePerSample = 0.47944ms; SamplesPerSecond = 2085 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17780229; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12511s; TotalTimePerSample = 0.50043ms; SamplesPerSecond = 1998 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12349s; TotalTimePerSample = 0.49395ms; SamplesPerSecond = 2024 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15796895; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12131s; TotalTimePerSample = 0.48522ms; SamplesPerSecond = 2060 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17002999; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12305s; TotalTimePerSample = 0.49220ms; SamplesPerSecond = 2031 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11573s; TotalTimePerSample = 0.46292ms; SamplesPerSecond = 2160 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14643694; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11568s; TotalTimePerSample = 0.46271ms; SamplesPerSecond = 2161 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12346s; TotalTimePerSample = 0.49385ms; SamplesPerSecond = 2024 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12240s; TotalTimePerSample = 0.48958ms; SamplesPerSecond = 2042 +MPI Rank 0: Epoch[ 3 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12005s; TotalTimePerSample = 0.48020ms; SamplesPerSecond = 2082 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13469093; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11867s; TotalTimePerSample = 0.47468ms; SamplesPerSecond = 2106 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12647s; TotalTimePerSample = 0.50587ms; SamplesPerSecond = 1976 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12012s; TotalTimePerSample = 0.48047ms; SamplesPerSecond = 2081 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.11872s; TotalTimePerSample = 0.47488ms; SamplesPerSecond = 2105 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12433s; TotalTimePerSample = 0.49730ms; SamplesPerSecond = 2010 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.11777s; TotalTimePerSample = 0.47109ms; SamplesPerSecond = 2122 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12099s; TotalTimePerSample = 0.48397ms; SamplesPerSecond = 2066 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12178s; TotalTimePerSample = 0.48710ms; SamplesPerSecond = 2052 +MPI Rank 0: Epoch[ 3 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19243443; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12140s; TotalTimePerSample = 0.48558ms; SamplesPerSecond = 2059 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12905s; TotalTimePerSample = 0.51618ms; SamplesPerSecond = 1937 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12856s; TotalTimePerSample = 0.51424ms; SamplesPerSecond = 1944 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18185535; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12184s; TotalTimePerSample = 0.48735ms; SamplesPerSecond = 2051 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14074205; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.13076s; TotalTimePerSample = 0.52305ms; SamplesPerSecond = 1911 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11964s; TotalTimePerSample = 0.47857ms; SamplesPerSecond = 2089 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20299704; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12301s; TotalTimePerSample = 0.49202ms; SamplesPerSecond = 2032 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12852038; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12072s; TotalTimePerSample = 0.48289ms; SamplesPerSecond = 2070 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18660439; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12280s; TotalTimePerSample = 0.49120ms; SamplesPerSecond = 2035 +MPI Rank 0: Epoch[ 3 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12056s; TotalTimePerSample = 0.48223ms; SamplesPerSecond = 2073 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12549s; TotalTimePerSample = 0.50197ms; SamplesPerSecond = 1992 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12526169; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12500s; TotalTimePerSample = 0.50000ms; SamplesPerSecond = 2000 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17392131; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12278s; TotalTimePerSample = 0.49111ms; SamplesPerSecond = 2036 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12280s; TotalTimePerSample = 0.49121ms; SamplesPerSecond = 2035 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.11641s; TotalTimePerSample = 0.46564ms; SamplesPerSecond = 2147 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12504s; TotalTimePerSample = 0.50017ms; SamplesPerSecond = 1999 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12272s; TotalTimePerSample = 0.49090ms; SamplesPerSecond = 2037 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12083s; TotalTimePerSample = 0.48331ms; SamplesPerSecond = 2069 +MPI Rank 0: Epoch[ 3 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12856s; TotalTimePerSample = 0.51424ms; SamplesPerSecond = 1944 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16654369; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.13337s; TotalTimePerSample = 0.53348ms; SamplesPerSecond = 1874 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20658951; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12150s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057 +MPI Rank 0: Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.11497s; TotalTimePerSample = 0.45986ms; SamplesPerSecond = 2174 +MPI Rank 0: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.926197 +MPI Rank 0: Starting Epoch 4: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 0: starting epoch 3 at record count 30000, and file position 0 +MPI Rank 0: already there from last epoch +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12371232; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12039s; TotalTimePerSample = 0.48155ms; SamplesPerSecond = 2076 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12069s; TotalTimePerSample = 0.48277ms; SamplesPerSecond = 2071 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14239730; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12713s; TotalTimePerSample = 0.50853ms; SamplesPerSecond = 1966 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.11918s; TotalTimePerSample = 0.47671ms; SamplesPerSecond = 2097 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11887s; TotalTimePerSample = 0.47550ms; SamplesPerSecond = 2103 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12215s; TotalTimePerSample = 0.48859ms; SamplesPerSecond = 2046 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12015s; TotalTimePerSample = 0.48061ms; SamplesPerSecond = 2080 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18021601; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11829s; TotalTimePerSample = 0.47316ms; SamplesPerSecond = 2113 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11851s; TotalTimePerSample = 0.47403ms; SamplesPerSecond = 2109 +MPI Rank 0: Epoch[ 4 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14474425; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12129s; TotalTimePerSample = 0.48518ms; SamplesPerSecond = 2061 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12019s; TotalTimePerSample = 0.48075ms; SamplesPerSecond = 2080 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13708299; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12225s; TotalTimePerSample = 0.48900ms; SamplesPerSecond = 2045 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11569777; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12067s; TotalTimePerSample = 0.48266ms; SamplesPerSecond = 2071 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16892331; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.11980s; TotalTimePerSample = 0.47918ms; SamplesPerSecond = 2086 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12752162; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12049s; TotalTimePerSample = 0.48195ms; SamplesPerSecond = 2074 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12280s; TotalTimePerSample = 0.49119ms; SamplesPerSecond = 2035 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.11920s; TotalTimePerSample = 0.47680ms; SamplesPerSecond = 2097 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14105803; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12263s; TotalTimePerSample = 0.49050ms; SamplesPerSecond = 2038 +MPI Rank 0: Epoch[ 4 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19333552; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12588s; TotalTimePerSample = 0.50350ms; SamplesPerSecond = 1986 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20859524; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14396s; TotalTimePerSample = 0.57582ms; SamplesPerSecond = 1736 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12856s; TotalTimePerSample = 0.51423ms; SamplesPerSecond = 1944 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12753s; TotalTimePerSample = 0.51010ms; SamplesPerSecond = 1960 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14037157; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11991s; TotalTimePerSample = 0.47965ms; SamplesPerSecond = 2084 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12057s; TotalTimePerSample = 0.48229ms; SamplesPerSecond = 2073 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20347747; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.11798s; TotalTimePerSample = 0.47193ms; SamplesPerSecond = 2118 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12815012; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12094s; TotalTimePerSample = 0.48378ms; SamplesPerSecond = 2067 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12091s; TotalTimePerSample = 0.48365ms; SamplesPerSecond = 2067 +MPI Rank 0: Epoch[ 4 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19552990; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12224s; TotalTimePerSample = 0.48896ms; SamplesPerSecond = 2045 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12373s; TotalTimePerSample = 0.49491ms; SamplesPerSecond = 2020 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12109s; TotalTimePerSample = 0.48435ms; SamplesPerSecond = 2064 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12231s; TotalTimePerSample = 0.48923ms; SamplesPerSecond = 2044 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12255s; TotalTimePerSample = 0.49020ms; SamplesPerSecond = 2039 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14723334; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12410s; TotalTimePerSample = 0.49640ms; SamplesPerSecond = 2014 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12264s; TotalTimePerSample = 0.49054ms; SamplesPerSecond = 2038 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12440s; TotalTimePerSample = 0.49760ms; SamplesPerSecond = 2009 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12623s; TotalTimePerSample = 0.50490ms; SamplesPerSecond = 1980 +MPI Rank 0: Epoch[ 4 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12027s; TotalTimePerSample = 0.48109ms; SamplesPerSecond = 2078 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12139s; TotalTimePerSample = 0.48556ms; SamplesPerSecond = 2059 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12487s; TotalTimePerSample = 0.49948ms; SamplesPerSecond = 2002 +MPI Rank 0: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50081ms; SamplesPerSecond = 1996 +MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.931563 +MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Oct 24 2015 13:33:25 +MPI Rank 1: Last modified date: Thu Oct 22 16:00:27 2015 +MPI Rank 1: Built by amitaga on Amitaga-Win-DT3 +MPI Rank 1: Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ +MPI Rank 1: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on Amitaga-Win-DT3 at 2015/10/24 21:49:38 +MPI Rank 1: command line: +MPI Rank 1: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe 
configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: command=SimpleMultiGPU +MPI Rank 1: precision=float +MPI Rank 1: parallelTrain=true +MPI Rank 1: SimpleMultiGPU=[ +MPI Rank 1: action=train +MPI Rank 1: modelPath=$RunDir$/models/Simple.dnn +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: traceLevel=1 +MPI Rank 1: SimpleNetworkBuilder=[ +MPI Rank 1: layerSizes=2:50*2:2 +MPI Rank 1: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 1: evalCriterion=ErrorPrediction +MPI Rank 1: layerTypes=Sigmoid +MPI Rank 1: initValueScale=1.0 +MPI Rank 1: applyMeanVarNorm=true +MPI Rank 1: uniformInit=true +MPI Rank 1: needPrior=true +MPI Rank 1: ] +MPI Rank 1: SGD=[ +MPI Rank 1: epochSize=0 +MPI Rank 1: minibatchSize=25 +MPI Rank 1: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 1: momentumPerMB=0.9 +MPI Rank 1: dropoutRate=0.0 +MPI Rank 1: maxEpochs=4 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationMethod=DataParallelSGD +MPI Rank 1: DataParallelSGD=[ +MPI Rank 1: gradientBits=1 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader=[ +MPI Rank 1: readerType=UCIFastReader +MPI Rank 1: file=$DataDir$/SimpleDataTrain.txt +MPI Rank 1: miniBatchMode=Partial +MPI Rank 1: randomize=None +MPI Rank 1: verbosity=1 +MPI Rank 1: features=[ +MPI Rank 1: dim=2 +MPI Rank 1: start=0 +MPI Rank 1: ] +MPI Rank 1: 
labels=[ +MPI Rank 1: start=2 +MPI Rank 1: dim=1 +MPI Rank 1: labelDim=2 +MPI Rank 1: labelMappingFile=$DataDir$/SimpleMapping.txt +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 1: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 1: DeviceId=0 +MPI Rank 1: precision=float +MPI Rank 1: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: deviceId=0 +MPI Rank 1: command=SimpleMultiGPU +MPI Rank 1: precision=float +MPI Rank 1: parallelTrain=true +MPI Rank 1: SimpleMultiGPU=[ +MPI Rank 1: action=train +MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 1: deviceId=0 +MPI Rank 1: traceLevel=1 +MPI Rank 1: SimpleNetworkBuilder=[ +MPI Rank 1: layerSizes=2:50*2:2 +MPI Rank 1: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 1: evalCriterion=ErrorPrediction +MPI Rank 1: layerTypes=Sigmoid +MPI Rank 1: initValueScale=1.0 +MPI Rank 1: applyMeanVarNorm=true +MPI Rank 1: uniformInit=true +MPI Rank 1: needPrior=true +MPI Rank 1: ] +MPI Rank 1: SGD=[ +MPI Rank 1: epochSize=0 +MPI Rank 1: minibatchSize=25 +MPI Rank 1: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 1: momentumPerMB=0.9 +MPI Rank 1: dropoutRate=0.0 +MPI Rank 1: maxEpochs=4 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationMethod=DataParallelSGD +MPI Rank 1: DataParallelSGD=[ +MPI Rank 
1: gradientBits=1 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader=[ +MPI Rank 1: readerType=UCIFastReader +MPI Rank 1: file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 1: miniBatchMode=Partial +MPI Rank 1: randomize=None +MPI Rank 1: verbosity=1 +MPI Rank 1: features=[ +MPI Rank 1: dim=2 +MPI Rank 1: start=0 +MPI Rank 1: ] +MPI Rank 1: labels=[ +MPI Rank 1: start=2 +MPI Rank 1: dim=1 +MPI Rank 1: labelDim=2 +MPI Rank 1: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 1: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 1: DeviceId=0 +MPI Rank 1: precision=float +MPI Rank 1: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU +MPI Rank 1: configparameters: SimpleMultiGPU.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 1: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 1: configparameters: SimpleMultiGPU.config:deviceId=0 +MPI Rank 1: configparameters: SimpleMultiGPU.config:parallelTrain=true +MPI Rank 1: configparameters: SimpleMultiGPU.config:precision=float +MPI Rank 1: 
configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 1: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[ +MPI Rank 1: action=train +MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 1: deviceId=0 +MPI Rank 1: traceLevel=1 +MPI Rank 1: SimpleNetworkBuilder=[ +MPI Rank 1: layerSizes=2:50*2:2 +MPI Rank 1: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 1: evalCriterion=ErrorPrediction +MPI Rank 1: layerTypes=Sigmoid +MPI Rank 1: initValueScale=1.0 +MPI Rank 1: applyMeanVarNorm=true +MPI Rank 1: uniformInit=true +MPI Rank 1: needPrior=true +MPI Rank 1: ] +MPI Rank 1: SGD=[ +MPI Rank 1: epochSize=0 +MPI Rank 1: minibatchSize=25 +MPI Rank 1: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 1: momentumPerMB=0.9 +MPI Rank 1: dropoutRate=0.0 +MPI Rank 1: maxEpochs=4 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationMethod=DataParallelSGD +MPI Rank 1: DataParallelSGD=[ +MPI Rank 1: gradientBits=1 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader=[ +MPI Rank 1: readerType=UCIFastReader +MPI Rank 1: file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 1: miniBatchMode=Partial +MPI Rank 1: randomize=None +MPI Rank 1: verbosity=1 +MPI Rank 1: features=[ +MPI Rank 1: dim=2 +MPI Rank 1: start=0 +MPI Rank 1: ] +MPI Rank 1: labels=[ +MPI Rank 1: start=2 +MPI Rank 1: dim=1 +MPI Rank 1: labelDim=2 +MPI Rank 1: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 1: +MPI Rank 1: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr 
+MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: SimpleMultiGPU +MPI Rank 1: precision = float +MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 1: CNTKCommandTrainInfo: SimpleMultiGPU : 4 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +MPI Rank 1: CNTKCommandTrainBegin: SimpleMultiGPU +MPI Rank 1: SimpleNetworkBuilder Using GPU 0 +MPI Rank 1: reading uci file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1 +MPI Rank 1: GetTrainCriterionNodes ... +MPI Rank 1: GetEvalCriterionNodes ... +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W1*H1 
= Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = 
Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node CrossEntropyWithSoftmax, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 1: Validating --> 
W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Precomputing --> 3 PreCompute nodes found. +MPI Rank 1: +MPI Rank 1: NodeName: InvStdOfFeatures +MPI Rank 1: NodeName: MeanOfFeatures +MPI Rank 1: NodeName: Prior +MPI Rank 1: starting at epoch 0 counting lines to determine record count +MPI Rank 1: +MPI Rank 1: 10000 records found +MPI Rank 1: starting epoch 0 at record count 0, and file position 0 +MPI Rank 1: already there from last epoch +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: Validating for node InvStdOfFeatures, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: Validating for node MeanOfFeatures, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node Prior. 2 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: Validating for node Prior, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 1: +MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Precomputing --> Completed. +MPI Rank 1: +MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 +MPI Rank 1: starting epoch 0 at record count 0, and file position 0 +MPI Rank 1: already there from last epoch +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 1: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal. +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 1- 10 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.21240s; TotalTimePerSample = 0.84959ms; SamplesPerSecond = 1177 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 11- 20 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.17214s; TotalTimePerSample = 0.68857ms; SamplesPerSecond = 1452 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 21- 30 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.16860s; TotalTimePerSample = 0.67442ms; SamplesPerSecond = 1482 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 31- 40 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.16047s; TotalTimePerSample = 0.64187ms; SamplesPerSecond = 1557 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 41- 50 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70605615; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.16232s; TotalTimePerSample = 0.64926ms; SamplesPerSecond = 1540 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 51- 60 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.15152s; TotalTimePerSample = 0.60609ms; SamplesPerSecond = 1649 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 61- 70 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.14175s; TotalTimePerSample = 0.56700ms; SamplesPerSecond = 1763 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 71- 80 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.79845604; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.14218s; TotalTimePerSample = 0.56872ms; SamplesPerSecond = 1758 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 81- 90 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69665186; 
EvalErr[0]PerSample = 0.46800000; TotalTime = 0.13820s; TotalTimePerSample = 0.55280ms; SamplesPerSecond = 1808 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12737s; TotalTimePerSample = 0.50948ms; SamplesPerSecond = 1962 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71420345; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.13145s; TotalTimePerSample = 0.52581ms; SamplesPerSecond = 1901 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12711s; TotalTimePerSample = 0.50843ms; SamplesPerSecond = 1966 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12442s; TotalTimePerSample = 0.49766ms; SamplesPerSecond = 2009 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71857916; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.11861s; TotalTimePerSample = 0.47442ms; SamplesPerSecond = 2107 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.11955s; TotalTimePerSample = 0.47821ms; SamplesPerSecond = 2091 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71798840; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.11551s; TotalTimePerSample = 0.46204ms; SamplesPerSecond = 2164 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11801s; TotalTimePerSample = 0.47202ms; SamplesPerSecond = 2118 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71835128; EvalErr[0]PerSample 
= 0.51600000; TotalTime = 0.11850s; TotalTimePerSample = 0.47402ms; SamplesPerSecond = 2109 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71529463; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12523s; TotalTimePerSample = 0.50093ms; SamplesPerSecond = 1996 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71727657; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12007s; TotalTimePerSample = 0.48030ms; SamplesPerSecond = 2082 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11898s; TotalTimePerSample = 0.47592ms; SamplesPerSecond = 2101 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088398; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11488s; TotalTimePerSample = 0.45952ms; SamplesPerSecond = 2176 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72006809; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12370s; TotalTimePerSample = 0.49478ms; SamplesPerSecond = 2021 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48566ms; SamplesPerSecond = 2059 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11773s; TotalTimePerSample = 0.47092ms; SamplesPerSecond = 2123 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70129698; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12147s; TotalTimePerSample = 0.48588ms; SamplesPerSecond = 2058 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70768095; EvalErr[0]PerSample = 0.54400000; 
TotalTime = 0.12122s; TotalTimePerSample = 0.48487ms; SamplesPerSecond = 2062 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.11718s; TotalTimePerSample = 0.46871ms; SamplesPerSecond = 2133 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69266187; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.11804s; TotalTimePerSample = 0.47215ms; SamplesPerSecond = 2117 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12116s; TotalTimePerSample = 0.48466ms; SamplesPerSecond = 2063 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69257409; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12555s; TotalTimePerSample = 0.50218ms; SamplesPerSecond = 1991 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12602s; TotalTimePerSample = 0.50407ms; SamplesPerSecond = 1983 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12332s; TotalTimePerSample = 0.49329ms; SamplesPerSecond = 2027 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70192154; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12449s; TotalTimePerSample = 0.49795ms; SamplesPerSecond = 2008 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12387s; TotalTimePerSample = 0.49546ms; SamplesPerSecond = 2018 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.67041492; EvalErr[0]PerSample = 0.39200000; TotalTime = 
0.12148s; TotalTimePerSample = 0.48592ms; SamplesPerSecond = 2057 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.65913973; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12846s; TotalTimePerSample = 0.51382ms; SamplesPerSecond = 1946 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.63919877; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.14472s; TotalTimePerSample = 0.57890ms; SamplesPerSecond = 1727 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.61293883; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.14464s; TotalTimePerSample = 0.57857ms; SamplesPerSecond = 1728 +MPI Rank 1: Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.55255352; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.14544s; TotalTimePerSample = 0.58177ms; SamplesPerSecond = 1718 +MPI Rank 1: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.368273 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 1: starting epoch 1 at record count 10000, and file position 0 +MPI Rank 1: already there from last epoch +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.50774625; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.13816s; TotalTimePerSample = 0.55263ms; SamplesPerSecond = 1809 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.43388927; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.13876s; TotalTimePerSample = 0.55503ms; SamplesPerSecond = 1801 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.36674870; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14014s; TotalTimePerSample = 0.56057ms; SamplesPerSecond = 1783 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.33768765; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.13764s; TotalTimePerSample = 0.55054ms; SamplesPerSecond = 1816 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.30320946; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12862s; TotalTimePerSample = 0.51449ms; SamplesPerSecond = 1943 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.29576043; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.14846s; TotalTimePerSample = 0.59382ms; SamplesPerSecond = 1684 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24924491; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14674s; TotalTimePerSample = 0.58698ms; SamplesPerSecond = 1703 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24632415; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14428s; TotalTimePerSample = 0.57712ms; SamplesPerSecond = 1732 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20943158; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13610s; TotalTimePerSample = 0.54438ms; SamplesPerSecond = 1836 +MPI Rank 1: Epoch[ 2 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19115996; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13467s; TotalTimePerSample = 0.53866ms; SamplesPerSecond = 1856 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17923231; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13753s; TotalTimePerSample = 0.55014ms; SamplesPerSecond = 1817 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17075422; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13578s; TotalTimePerSample = 0.54313ms; SamplesPerSecond = 1841 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14442371; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.13113s; TotalTimePerSample = 0.52451ms; SamplesPerSecond = 1906 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17753819; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.13552s; TotalTimePerSample = 0.54206ms; SamplesPerSecond = 1844 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15087855; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13849s; TotalTimePerSample = 0.55396ms; SamplesPerSecond = 1805 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19253023; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13517s; TotalTimePerSample = 0.54068ms; SamplesPerSecond = 1849 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14493s; TotalTimePerSample = 0.57970ms; SamplesPerSecond = 1725 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.14346s; TotalTimePerSample = 0.57384ms; SamplesPerSecond = 1742 +MPI Rank 1: Epoch[ 2 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14339s; TotalTimePerSample = 0.57354ms; SamplesPerSecond = 1743 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14541s; TotalTimePerSample = 0.58166ms; SamplesPerSecond = 1719 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13568s; TotalTimePerSample = 0.54271ms; SamplesPerSecond = 1842 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14328s; TotalTimePerSample = 0.57313ms; SamplesPerSecond = 1744 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14684504; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.14565s; TotalTimePerSample = 0.58262ms; SamplesPerSecond = 1716 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15322115; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14660s; TotalTimePerSample = 0.58641ms; SamplesPerSecond = 1705 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.14054s; TotalTimePerSample = 0.56215ms; SamplesPerSecond = 1778 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13683833; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14033s; TotalTimePerSample = 0.56131ms; SamplesPerSecond = 1781 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18621188; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13395s; TotalTimePerSample = 0.53582ms; SamplesPerSecond = 1866 +MPI Rank 1: Epoch[ 2 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19408048; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13231s; TotalTimePerSample = 0.52925ms; SamplesPerSecond = 1889 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14681s; TotalTimePerSample = 0.58725ms; SamplesPerSecond = 1702 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.13163s; TotalTimePerSample = 0.52651ms; SamplesPerSecond = 1899 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.13127s; TotalTimePerSample = 0.52509ms; SamplesPerSecond = 1904 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12747s; TotalTimePerSample = 0.50988ms; SamplesPerSecond = 1961 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15108451; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12633s; TotalTimePerSample = 0.50532ms; SamplesPerSecond = 1978 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19729184; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12381s; TotalTimePerSample = 0.49523ms; SamplesPerSecond = 2019 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12857332; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12245s; TotalTimePerSample = 0.48981ms; SamplesPerSecond = 2041 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12077s; TotalTimePerSample = 0.48310ms; SamplesPerSecond = 2069 +MPI Rank 1: Epoch[ 2 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12786050; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12252s; TotalTimePerSample = 0.49008ms; SamplesPerSecond = 2040 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12600s; TotalTimePerSample = 0.50401ms; SamplesPerSecond = 1984 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20440409; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48874ms; SamplesPerSecond = 2046 +MPI Rank 1: Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566238; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12172s; TotalTimePerSample = 0.48689ms; SamplesPerSecond = 2053 +MPI Rank 1: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373029; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.454826 +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 1: starting epoch 2 at record count 20000, and file position 0 +MPI Rank 1: already there from last epoch +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12007s; TotalTimePerSample = 0.48028ms; SamplesPerSecond = 2082 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17780229; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12506s; TotalTimePerSample = 0.50025ms; SamplesPerSecond = 1999 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12289s; TotalTimePerSample = 0.49155ms; SamplesPerSecond = 2034 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15796895; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12196s; TotalTimePerSample = 0.48784ms; SamplesPerSecond = 2049 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17002999; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12311s; TotalTimePerSample = 0.49244ms; SamplesPerSecond = 2030 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11534s; TotalTimePerSample = 0.46136ms; SamplesPerSecond = 2167 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14643694; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11576s; TotalTimePerSample = 0.46302ms; SamplesPerSecond = 2159 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12379s; TotalTimePerSample = 0.49516ms; SamplesPerSecond = 2019 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12204s; TotalTimePerSample = 0.48814ms; SamplesPerSecond = 2048 +MPI Rank 1: Epoch[ 3 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12041s; TotalTimePerSample = 0.48164ms; SamplesPerSecond = 2076 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13469093; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11856s; TotalTimePerSample = 0.47423ms; SamplesPerSecond = 2108 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12682s; TotalTimePerSample = 0.50730ms; SamplesPerSecond = 1971 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11948s; TotalTimePerSample = 0.47792ms; SamplesPerSecond = 2092 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.11879s; TotalTimePerSample = 0.47514ms; SamplesPerSecond = 2104 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12407s; TotalTimePerSample = 0.49630ms; SamplesPerSecond = 2014 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.11792s; TotalTimePerSample = 0.47167ms; SamplesPerSecond = 2120 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12138s; TotalTimePerSample = 0.48552ms; SamplesPerSecond = 2059 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12145s; TotalTimePerSample = 0.48581ms; SamplesPerSecond = 2058 +MPI Rank 1: Epoch[ 3 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19243443; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12131s; TotalTimePerSample = 0.48526ms; SamplesPerSecond = 2060 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12898s; TotalTimePerSample = 0.51590ms; SamplesPerSecond = 1938 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12863s; TotalTimePerSample = 0.51450ms; SamplesPerSecond = 1943 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18185535; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12226s; TotalTimePerSample = 0.48902ms; SamplesPerSecond = 2044 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14074205; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.13032s; TotalTimePerSample = 0.52128ms; SamplesPerSecond = 1918 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12011s; TotalTimePerSample = 0.48045ms; SamplesPerSecond = 2081 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20299704; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12255s; TotalTimePerSample = 0.49020ms; SamplesPerSecond = 2040 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12852038; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12023s; TotalTimePerSample = 0.48093ms; SamplesPerSecond = 2079 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18660439; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12387s; TotalTimePerSample = 0.49548ms; SamplesPerSecond = 2018 +MPI Rank 1: Epoch[ 3 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12145s; TotalTimePerSample = 0.48581ms; SamplesPerSecond = 2058 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12442s; TotalTimePerSample = 0.49768ms; SamplesPerSecond = 2009 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12526169; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49888ms; SamplesPerSecond = 2004 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17392131; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12284s; TotalTimePerSample = 0.49135ms; SamplesPerSecond = 2035 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12305s; TotalTimePerSample = 0.49218ms; SamplesPerSecond = 2031 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.11614s; TotalTimePerSample = 0.46454ms; SamplesPerSecond = 2152 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12518s; TotalTimePerSample = 0.50072ms; SamplesPerSecond = 1997 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12264s; TotalTimePerSample = 0.49056ms; SamplesPerSecond = 2038 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12058s; TotalTimePerSample = 0.48234ms; SamplesPerSecond = 2073 +MPI Rank 1: Epoch[ 3 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12910s; TotalTimePerSample = 0.51639ms; SamplesPerSecond = 1936 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16654369; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.13325s; TotalTimePerSample = 0.53300ms; SamplesPerSecond = 1876 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20658951; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12111s; TotalTimePerSample = 0.48445ms; SamplesPerSecond = 2064 +MPI Rank 1: Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.11517s; TotalTimePerSample = 0.46067ms; SamplesPerSecond = 2170 +MPI Rank 1: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.926445 +MPI Rank 1: Starting Epoch 4: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 1: starting epoch 3 at record count 30000, and file position 0 +MPI Rank 1: already there from last epoch +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12371232; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12017s; TotalTimePerSample = 0.48068ms; SamplesPerSecond = 2080 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12035s; TotalTimePerSample = 0.48139ms; SamplesPerSecond = 2077 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14239730; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12719s; TotalTimePerSample = 0.50874ms; SamplesPerSecond = 1965 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.11979s; TotalTimePerSample = 0.47914ms; SamplesPerSecond = 2087 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11890s; TotalTimePerSample = 0.47562ms; SamplesPerSecond = 2102 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12220s; TotalTimePerSample = 0.48880ms; SamplesPerSecond = 2045 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12005s; TotalTimePerSample = 0.48020ms; SamplesPerSecond = 2082 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18021601; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11799s; TotalTimePerSample = 0.47197ms; SamplesPerSecond = 2118 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11827s; TotalTimePerSample = 0.47310ms; SamplesPerSecond = 2113 +MPI Rank 1: Epoch[ 4 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14474425; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12205s; TotalTimePerSample = 0.48818ms; SamplesPerSecond = 2048 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11950s; TotalTimePerSample = 0.47799ms; SamplesPerSecond = 2092 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13708299; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12274s; TotalTimePerSample = 0.49095ms; SamplesPerSecond = 2036 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11569777; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12041s; TotalTimePerSample = 0.48166ms; SamplesPerSecond = 2076 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16892331; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12018s; TotalTimePerSample = 0.48073ms; SamplesPerSecond = 2080 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12752162; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.11985s; TotalTimePerSample = 0.47938ms; SamplesPerSecond = 2086 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12315s; TotalTimePerSample = 0.49262ms; SamplesPerSecond = 2029 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.11828s; TotalTimePerSample = 0.47312ms; SamplesPerSecond = 2113 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14105803; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12278s; TotalTimePerSample = 0.49111ms; SamplesPerSecond = 2036 +MPI Rank 1: Epoch[ 4 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19333552; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12607s; TotalTimePerSample = 0.50429ms; SamplesPerSecond = 1982 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20859524; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14369s; TotalTimePerSample = 0.57476ms; SamplesPerSecond = 1739 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12830s; TotalTimePerSample = 0.51322ms; SamplesPerSecond = 1948 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12787s; TotalTimePerSample = 0.51148ms; SamplesPerSecond = 1955 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14037157; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11950s; TotalTimePerSample = 0.47798ms; SamplesPerSecond = 2092 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12108s; TotalTimePerSample = 0.48433ms; SamplesPerSecond = 2064 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20347747; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.11742s; TotalTimePerSample = 0.46968ms; SamplesPerSecond = 2129 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12815012; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12145s; TotalTimePerSample = 0.48580ms; SamplesPerSecond = 2058 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12065s; TotalTimePerSample = 0.48258ms; SamplesPerSecond = 2072 +MPI Rank 1: Epoch[ 4 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19552990; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12248s; TotalTimePerSample = 0.48990ms; SamplesPerSecond = 2041 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12369s; TotalTimePerSample = 0.49474ms; SamplesPerSecond = 2021 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12129s; TotalTimePerSample = 0.48516ms; SamplesPerSecond = 2061 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12227s; TotalTimePerSample = 0.48908ms; SamplesPerSecond = 2044 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12242s; TotalTimePerSample = 0.48967ms; SamplesPerSecond = 2042 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14723334; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12409s; TotalTimePerSample = 0.49637ms; SamplesPerSecond = 2014 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12229s; TotalTimePerSample = 0.48915ms; SamplesPerSecond = 2044 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12467s; TotalTimePerSample = 0.49868ms; SamplesPerSecond = 2005 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12625s; TotalTimePerSample = 0.50498ms; SamplesPerSecond = 1980 +MPI Rank 1: Epoch[ 4 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12036s; TotalTimePerSample = 0.48143ms; SamplesPerSecond = 2077 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12133s; TotalTimePerSample = 0.48530ms; SamplesPerSecond = 2060 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12414s; TotalTimePerSample = 0.49655ms; SamplesPerSecond = 2013 +MPI Rank 1: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12625s; TotalTimePerSample = 0.50498ms; SamplesPerSecond = 1980 +MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.931591 +MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper +MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2 +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: Build info: +MPI Rank 2: +MPI Rank 2: Built time: Oct 24 2015 13:33:25 +MPI Rank 2: Last modified date: Thu Oct 22 16:00:27 2015 +MPI Rank 2: Built by amitaga on Amitaga-Win-DT3 +MPI Rank 2: Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ +MPI Rank 2: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: running on Amitaga-Win-DT3 at 2015/10/24 21:49:39 +MPI Rank 2: command line: +MPI Rank 2: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe 
configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: command=SimpleMultiGPU +MPI Rank 2: precision=float +MPI Rank 2: parallelTrain=true +MPI Rank 2: SimpleMultiGPU=[ +MPI Rank 2: action=train +MPI Rank 2: modelPath=$RunDir$/models/Simple.dnn +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: traceLevel=1 +MPI Rank 2: SimpleNetworkBuilder=[ +MPI Rank 2: layerSizes=2:50*2:2 +MPI Rank 2: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 2: evalCriterion=ErrorPrediction +MPI Rank 2: layerTypes=Sigmoid +MPI Rank 2: initValueScale=1.0 +MPI Rank 2: applyMeanVarNorm=true +MPI Rank 2: uniformInit=true +MPI Rank 2: needPrior=true +MPI Rank 2: ] +MPI Rank 2: SGD=[ +MPI Rank 2: epochSize=0 +MPI Rank 2: minibatchSize=25 +MPI Rank 2: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 2: momentumPerMB=0.9 +MPI Rank 2: dropoutRate=0.0 +MPI Rank 2: maxEpochs=4 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationMethod=DataParallelSGD +MPI Rank 2: DataParallelSGD=[ +MPI Rank 2: gradientBits=1 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: reader=[ +MPI Rank 2: readerType=UCIFastReader +MPI Rank 2: file=$DataDir$/SimpleDataTrain.txt +MPI Rank 2: miniBatchMode=Partial +MPI Rank 2: randomize=None +MPI Rank 2: verbosity=1 +MPI Rank 2: features=[ +MPI Rank 2: dim=2 +MPI Rank 2: start=0 +MPI Rank 2: ] +MPI Rank 2: 
labels=[ +MPI Rank 2: start=2 +MPI Rank 2: dim=1 +MPI Rank 2: labelDim=2 +MPI Rank 2: labelMappingFile=$DataDir$/SimpleMapping.txt +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 2: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 2: DeviceId=0 +MPI Rank 2: precision=float +MPI Rank 2: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: deviceId=0 +MPI Rank 2: command=SimpleMultiGPU +MPI Rank 2: precision=float +MPI Rank 2: parallelTrain=true +MPI Rank 2: SimpleMultiGPU=[ +MPI Rank 2: action=train +MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 2: deviceId=0 +MPI Rank 2: traceLevel=1 +MPI Rank 2: SimpleNetworkBuilder=[ +MPI Rank 2: layerSizes=2:50*2:2 +MPI Rank 2: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 2: evalCriterion=ErrorPrediction +MPI Rank 2: layerTypes=Sigmoid +MPI Rank 2: initValueScale=1.0 +MPI Rank 2: applyMeanVarNorm=true +MPI Rank 2: uniformInit=true +MPI Rank 2: needPrior=true +MPI Rank 2: ] +MPI Rank 2: SGD=[ +MPI Rank 2: epochSize=0 +MPI Rank 2: minibatchSize=25 +MPI Rank 2: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 2: momentumPerMB=0.9 +MPI Rank 2: dropoutRate=0.0 +MPI Rank 2: maxEpochs=4 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationMethod=DataParallelSGD +MPI Rank 2: DataParallelSGD=[ +MPI Rank 
2: gradientBits=1 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: reader=[ +MPI Rank 2: readerType=UCIFastReader +MPI Rank 2: file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 2: miniBatchMode=Partial +MPI Rank 2: randomize=None +MPI Rank 2: verbosity=1 +MPI Rank 2: features=[ +MPI Rank 2: dim=2 +MPI Rank 2: start=0 +MPI Rank 2: ] +MPI Rank 2: labels=[ +MPI Rank 2: start=2 +MPI Rank 2: dim=1 +MPI Rank 2: labelDim=2 +MPI Rank 2: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 2: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 2: DeviceId=0 +MPI Rank 2: precision=float +MPI Rank 2: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU +MPI Rank 2: configparameters: SimpleMultiGPU.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 2: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 2: configparameters: SimpleMultiGPU.config:deviceId=0 +MPI Rank 2: configparameters: SimpleMultiGPU.config:parallelTrain=true +MPI Rank 2: configparameters: SimpleMultiGPU.config:precision=float +MPI Rank 2: 
configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 2: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[ +MPI Rank 2: action=train +MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 2: deviceId=0 +MPI Rank 2: traceLevel=1 +MPI Rank 2: SimpleNetworkBuilder=[ +MPI Rank 2: layerSizes=2:50*2:2 +MPI Rank 2: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 2: evalCriterion=ErrorPrediction +MPI Rank 2: layerTypes=Sigmoid +MPI Rank 2: initValueScale=1.0 +MPI Rank 2: applyMeanVarNorm=true +MPI Rank 2: uniformInit=true +MPI Rank 2: needPrior=true +MPI Rank 2: ] +MPI Rank 2: SGD=[ +MPI Rank 2: epochSize=0 +MPI Rank 2: minibatchSize=25 +MPI Rank 2: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 2: momentumPerMB=0.9 +MPI Rank 2: dropoutRate=0.0 +MPI Rank 2: maxEpochs=4 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationMethod=DataParallelSGD +MPI Rank 2: DataParallelSGD=[ +MPI Rank 2: gradientBits=1 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: reader=[ +MPI Rank 2: readerType=UCIFastReader +MPI Rank 2: file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 2: miniBatchMode=Partial +MPI Rank 2: randomize=None +MPI Rank 2: verbosity=1 +MPI Rank 2: features=[ +MPI Rank 2: dim=2 +MPI Rank 2: start=0 +MPI Rank 2: ] +MPI Rank 2: labels=[ +MPI Rank 2: start=2 +MPI Rank 2: dim=1 +MPI Rank 2: labelDim=2 +MPI Rank 2: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 2: +MPI Rank 2: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr 
+MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: command: SimpleMultiGPU +MPI Rank 2: precision = float +MPI Rank 2: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 2: CNTKCommandTrainInfo: SimpleMultiGPU : 4 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +MPI Rank 2: CNTKCommandTrainBegin: SimpleMultiGPU +MPI Rank 2: SimpleNetworkBuilder Using GPU 0 +MPI Rank 2: reading uci file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 2: SetUniformRandomValue (GPU): creating curand object with seed 1 +MPI Rank 2: GetTrainCriterionNodes ... +MPI Rank 2: GetEvalCriterionNodes ... +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W1*H1 
= Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = 
Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node CrossEntropyWithSoftmax, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 2: Validating --> 
W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Precomputing --> 3 PreCompute nodes found. +MPI Rank 2: +MPI Rank 2: NodeName: InvStdOfFeatures +MPI Rank 2: NodeName: MeanOfFeatures +MPI Rank 2: NodeName: Prior +MPI Rank 2: starting at epoch 0 counting lines to determine record count +MPI Rank 2: +MPI Rank 2: 10000 records found +MPI Rank 2: starting epoch 0 at record count 0, and file position 0 +MPI Rank 2: already there from last epoch +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. +MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: Validating for node InvStdOfFeatures, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. +MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: Validating for node MeanOfFeatures, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node Prior. 2 nodes to process in pass 1. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: Validating for node Prior, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 2: +MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Precomputing --> Completed. +MPI Rank 2: +MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 +MPI Rank 2: starting epoch 0 at record count 0, and file position 0 +MPI Rank 2: already there from last epoch +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 2: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal. +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 1- 10 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.21351s; TotalTimePerSample = 0.85404ms; SamplesPerSecond = 1170 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 11- 20 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.17145s; TotalTimePerSample = 0.68580ms; SamplesPerSecond = 1458 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 21- 30 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.16870s; TotalTimePerSample = 0.67481ms; SamplesPerSecond = 1481 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 31- 40 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.16017s; TotalTimePerSample = 0.64067ms; SamplesPerSecond = 1560 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 41- 50 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70605615; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.16272s; TotalTimePerSample = 0.65088ms; SamplesPerSecond = 1536 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 51- 60 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.15165s; TotalTimePerSample = 0.60661ms; SamplesPerSecond = 1648 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 61- 70 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.14150s; TotalTimePerSample = 0.56598ms; SamplesPerSecond = 1766 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 71- 80 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.79845604; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.14222s; TotalTimePerSample = 0.56890ms; SamplesPerSecond = 1757 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 81- 90 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69665186; 
EvalErr[0]PerSample = 0.46800000; TotalTime = 0.13869s; TotalTimePerSample = 0.55474ms; SamplesPerSecond = 1802 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12734s; TotalTimePerSample = 0.50937ms; SamplesPerSecond = 1963 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71420345; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.13083s; TotalTimePerSample = 0.52330ms; SamplesPerSecond = 1910 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12776s; TotalTimePerSample = 0.51105ms; SamplesPerSecond = 1956 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12404s; TotalTimePerSample = 0.49614ms; SamplesPerSecond = 2015 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71857916; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.11920s; TotalTimePerSample = 0.47680ms; SamplesPerSecond = 2097 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.11919s; TotalTimePerSample = 0.47676ms; SamplesPerSecond = 2097 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71798840; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.11688s; TotalTimePerSample = 0.46752ms; SamplesPerSecond = 2138 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11731s; TotalTimePerSample = 0.46925ms; SamplesPerSecond = 2131 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71835128; EvalErr[0]PerSample 
= 0.51600000; TotalTime = 0.11878s; TotalTimePerSample = 0.47512ms; SamplesPerSecond = 2104 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71529463; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12494s; TotalTimePerSample = 0.49977ms; SamplesPerSecond = 2000 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71727657; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.11999s; TotalTimePerSample = 0.47998ms; SamplesPerSecond = 2083 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11901s; TotalTimePerSample = 0.47604ms; SamplesPerSecond = 2100 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088398; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11499s; TotalTimePerSample = 0.45995ms; SamplesPerSecond = 2174 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72006809; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12383s; TotalTimePerSample = 0.49534ms; SamplesPerSecond = 2018 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12128s; TotalTimePerSample = 0.48511ms; SamplesPerSecond = 2061 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11778s; TotalTimePerSample = 0.47112ms; SamplesPerSecond = 2122 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70129698; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12175s; TotalTimePerSample = 0.48699ms; SamplesPerSecond = 2053 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70768095; EvalErr[0]PerSample = 0.54400000; 
TotalTime = 0.12129s; TotalTimePerSample = 0.48515ms; SamplesPerSecond = 2061 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.11708s; TotalTimePerSample = 0.46833ms; SamplesPerSecond = 2135 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69266187; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.11805s; TotalTimePerSample = 0.47221ms; SamplesPerSecond = 2117 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12085s; TotalTimePerSample = 0.48341ms; SamplesPerSecond = 2068 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69257409; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12586s; TotalTimePerSample = 0.50342ms; SamplesPerSecond = 1986 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12600s; TotalTimePerSample = 0.50399ms; SamplesPerSecond = 1984 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12351s; TotalTimePerSample = 0.49405ms; SamplesPerSecond = 2024 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70192154; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12385s; TotalTimePerSample = 0.49541ms; SamplesPerSecond = 2018 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12401s; TotalTimePerSample = 0.49606ms; SamplesPerSecond = 2015 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.67041492; EvalErr[0]PerSample = 0.39200000; TotalTime = 
0.12184s; TotalTimePerSample = 0.48736ms; SamplesPerSecond = 2051 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.65913973; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12859s; TotalTimePerSample = 0.51435ms; SamplesPerSecond = 1944 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.63919877; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.14455s; TotalTimePerSample = 0.57820ms; SamplesPerSecond = 1729 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.61293883; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.14490s; TotalTimePerSample = 0.57959ms; SamplesPerSecond = 1725 +MPI Rank 2: Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.55255352; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.14556s; TotalTimePerSample = 0.58222ms; SamplesPerSecond = 1717 +MPI Rank 2: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.368498 +MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 2: starting epoch 1 at record count 10000, and file position 0 +MPI Rank 2: already there from last epoch +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.50774625; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.13794s; TotalTimePerSample = 0.55177ms; SamplesPerSecond = 1812 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.43388927; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.13922s; TotalTimePerSample = 0.55688ms; SamplesPerSecond = 1795 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.36674870; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14013s; TotalTimePerSample = 0.56053ms; SamplesPerSecond = 1784 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.33768765; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.13721s; TotalTimePerSample = 0.54884ms; SamplesPerSecond = 1822 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.30320946; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12919s; TotalTimePerSample = 0.51676ms; SamplesPerSecond = 1935 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.29576043; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.14802s; TotalTimePerSample = 0.59206ms; SamplesPerSecond = 1689 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24924491; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14685s; TotalTimePerSample = 0.58740ms; SamplesPerSecond = 1702 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24632415; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14440s; TotalTimePerSample = 0.57762ms; SamplesPerSecond = 1731 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20943158; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13614s; TotalTimePerSample = 0.54457ms; SamplesPerSecond = 1836 +MPI Rank 2: Epoch[ 2 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19115996; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13503s; TotalTimePerSample = 0.54011ms; SamplesPerSecond = 1851 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17923231; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13615s; TotalTimePerSample = 0.54460ms; SamplesPerSecond = 1836 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17075422; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13632s; TotalTimePerSample = 0.54526ms; SamplesPerSecond = 1833 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14442371; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.13087s; TotalTimePerSample = 0.52350ms; SamplesPerSecond = 1910 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17753819; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.13640s; TotalTimePerSample = 0.54560ms; SamplesPerSecond = 1832 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15087855; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13907s; TotalTimePerSample = 0.55627ms; SamplesPerSecond = 1797 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19253023; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13523s; TotalTimePerSample = 0.54090ms; SamplesPerSecond = 1848 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14474s; TotalTimePerSample = 0.57897ms; SamplesPerSecond = 1727 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.14352s; TotalTimePerSample = 0.57407ms; SamplesPerSecond = 1741 +MPI Rank 2: Epoch[ 2 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14338s; TotalTimePerSample = 0.57352ms; SamplesPerSecond = 1743 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14566s; TotalTimePerSample = 0.58264ms; SamplesPerSecond = 1716 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13549s; TotalTimePerSample = 0.54194ms; SamplesPerSecond = 1845 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14315s; TotalTimePerSample = 0.57262ms; SamplesPerSecond = 1746 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14684504; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.14583s; TotalTimePerSample = 0.58333ms; SamplesPerSecond = 1714 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15322115; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14638s; TotalTimePerSample = 0.58552ms; SamplesPerSecond = 1707 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.14054s; TotalTimePerSample = 0.56214ms; SamplesPerSecond = 1778 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13683833; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14019s; TotalTimePerSample = 0.56076ms; SamplesPerSecond = 1783 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18621188; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13443s; TotalTimePerSample = 0.53771ms; SamplesPerSecond = 1859 +MPI Rank 2: Epoch[ 2 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19408048; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13228s; TotalTimePerSample = 0.52910ms; SamplesPerSecond = 1890 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14559s; TotalTimePerSample = 0.58236ms; SamplesPerSecond = 1717 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.13309s; TotalTimePerSample = 0.53238ms; SamplesPerSecond = 1878 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.13118s; TotalTimePerSample = 0.52470ms; SamplesPerSecond = 1905 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12731s; TotalTimePerSample = 0.50923ms; SamplesPerSecond = 1963 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15108451; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12619s; TotalTimePerSample = 0.50474ms; SamplesPerSecond = 1981 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19729184; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12375s; TotalTimePerSample = 0.49498ms; SamplesPerSecond = 2020 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12857332; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12239s; TotalTimePerSample = 0.48956ms; SamplesPerSecond = 2042 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12138s; TotalTimePerSample = 0.48554ms; SamplesPerSecond = 2059 +MPI Rank 2: Epoch[ 2 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12786050; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12245s; TotalTimePerSample = 0.48981ms; SamplesPerSecond = 2041 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12604s; TotalTimePerSample = 0.50418ms; SamplesPerSecond = 1983 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20440409; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12258s; TotalTimePerSample = 0.49033ms; SamplesPerSecond = 2039 +MPI Rank 2: Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566238; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12077s; TotalTimePerSample = 0.48308ms; SamplesPerSecond = 2070 +MPI Rank 2: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373029; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.454115 +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 2: starting epoch 2 at record count 20000, and file position 0 +MPI Rank 2: already there from last epoch +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12007s; TotalTimePerSample = 0.48030ms; SamplesPerSecond = 2082 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17780229; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12502s; TotalTimePerSample = 0.50009ms; SamplesPerSecond = 1999 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12269s; TotalTimePerSample = 0.49075ms; SamplesPerSecond = 2037 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15796895; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12210s; TotalTimePerSample = 0.48842ms; SamplesPerSecond = 2047 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17002999; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12374s; TotalTimePerSample = 0.49496ms; SamplesPerSecond = 2020 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11485s; TotalTimePerSample = 0.45938ms; SamplesPerSecond = 2176 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14643694; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11641s; TotalTimePerSample = 0.46564ms; SamplesPerSecond = 2147 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12303s; TotalTimePerSample = 0.49210ms; SamplesPerSecond = 2032 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12244s; TotalTimePerSample = 0.48977ms; SamplesPerSecond = 2041 +MPI Rank 2: Epoch[ 3 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12013s; TotalTimePerSample = 0.48050ms; SamplesPerSecond = 2081 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13469093; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11843s; TotalTimePerSample = 0.47373ms; SamplesPerSecond = 2110 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12696s; TotalTimePerSample = 0.50785ms; SamplesPerSecond = 1969 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11963s; TotalTimePerSample = 0.47853ms; SamplesPerSecond = 2089 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.11897s; TotalTimePerSample = 0.47587ms; SamplesPerSecond = 2101 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12430s; TotalTimePerSample = 0.49719ms; SamplesPerSecond = 2011 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.11756s; TotalTimePerSample = 0.47023ms; SamplesPerSecond = 2126 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12107s; TotalTimePerSample = 0.48428ms; SamplesPerSecond = 2064 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12174s; TotalTimePerSample = 0.48696ms; SamplesPerSecond = 2053 +MPI Rank 2: Epoch[ 3 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19243443; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12139s; TotalTimePerSample = 0.48555ms; SamplesPerSecond = 2059 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12892s; TotalTimePerSample = 0.51567ms; SamplesPerSecond = 1939 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12850s; TotalTimePerSample = 0.51400ms; SamplesPerSecond = 1945 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18185535; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12240s; TotalTimePerSample = 0.48959ms; SamplesPerSecond = 2042 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14074205; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.13033s; TotalTimePerSample = 0.52130ms; SamplesPerSecond = 1918 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11995s; TotalTimePerSample = 0.47979ms; SamplesPerSecond = 2084 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20299704; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12241s; TotalTimePerSample = 0.48962ms; SamplesPerSecond = 2042 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12852038; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12054s; TotalTimePerSample = 0.48216ms; SamplesPerSecond = 2074 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18660439; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12419s; TotalTimePerSample = 0.49676ms; SamplesPerSecond = 2013 +MPI Rank 2: Epoch[ 3 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12071s; TotalTimePerSample = 0.48283ms; SamplesPerSecond = 2071 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12485s; TotalTimePerSample = 0.49938ms; SamplesPerSecond = 2002 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12526169; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12473s; TotalTimePerSample = 0.49894ms; SamplesPerSecond = 2004 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17392131; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12339s; TotalTimePerSample = 0.49358ms; SamplesPerSecond = 2026 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12287s; TotalTimePerSample = 0.49147ms; SamplesPerSecond = 2034 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.11621s; TotalTimePerSample = 0.46485ms; SamplesPerSecond = 2151 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12491s; TotalTimePerSample = 0.49964ms; SamplesPerSecond = 2001 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12309s; TotalTimePerSample = 0.49238ms; SamplesPerSecond = 2030 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12019s; TotalTimePerSample = 0.48077ms; SamplesPerSecond = 2080 +MPI Rank 2: Epoch[ 3 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12843s; TotalTimePerSample = 0.51372ms; SamplesPerSecond = 1946 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16654369; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.13408s; TotalTimePerSample = 0.53631ms; SamplesPerSecond = 1864 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20658951; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12132s; TotalTimePerSample = 0.48530ms; SamplesPerSecond = 2060 +MPI Rank 2: Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.11488s; TotalTimePerSample = 0.45952ms; SamplesPerSecond = 2176 +MPI Rank 2: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.92626 +MPI Rank 2: Starting Epoch 4: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 2: starting epoch 3 at record count 30000, and file position 0 +MPI Rank 2: already there from last epoch +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12371232; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12031s; TotalTimePerSample = 0.48123ms; SamplesPerSecond = 2078 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12045s; TotalTimePerSample = 0.48179ms; SamplesPerSecond = 2075 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14239730; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12730s; TotalTimePerSample = 0.50922ms; SamplesPerSecond = 1963 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.11938s; TotalTimePerSample = 0.47754ms; SamplesPerSecond = 2094 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11927s; TotalTimePerSample = 0.47708ms; SamplesPerSecond = 2096 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12177s; TotalTimePerSample = 0.48708ms; SamplesPerSecond = 2053 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12015s; TotalTimePerSample = 0.48060ms; SamplesPerSecond = 2080 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18021601; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11793s; TotalTimePerSample = 0.47171ms; SamplesPerSecond = 2119 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11865s; TotalTimePerSample = 0.47460ms; SamplesPerSecond = 2107 +MPI Rank 2: Epoch[ 4 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14474425; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12154s; TotalTimePerSample = 0.48615ms; SamplesPerSecond = 2056 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11991s; TotalTimePerSample = 0.47965ms; SamplesPerSecond = 2084 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13708299; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12222s; TotalTimePerSample = 0.48887ms; SamplesPerSecond = 2045 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11569777; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12080s; TotalTimePerSample = 0.48321ms; SamplesPerSecond = 2069 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16892331; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12001s; TotalTimePerSample = 0.48002ms; SamplesPerSecond = 2083 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12752162; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12061s; TotalTimePerSample = 0.48244ms; SamplesPerSecond = 2072 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12240s; TotalTimePerSample = 0.48961ms; SamplesPerSecond = 2042 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12004s; TotalTimePerSample = 0.48016ms; SamplesPerSecond = 2082 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14105803; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12205s; TotalTimePerSample = 0.48818ms; SamplesPerSecond = 2048 +MPI Rank 2: Epoch[ 4 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19333552; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12591s; TotalTimePerSample = 0.50365ms; SamplesPerSecond = 1985 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20859524; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14391s; TotalTimePerSample = 0.57564ms; SamplesPerSecond = 1737 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12836s; TotalTimePerSample = 0.51342ms; SamplesPerSecond = 1947 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12739s; TotalTimePerSample = 0.50957ms; SamplesPerSecond = 1962 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14037157; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11992s; TotalTimePerSample = 0.47966ms; SamplesPerSecond = 2084 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12109s; TotalTimePerSample = 0.48438ms; SamplesPerSecond = 2064 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20347747; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.11737s; TotalTimePerSample = 0.46947ms; SamplesPerSecond = 2130 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12815012; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12143s; TotalTimePerSample = 0.48571ms; SamplesPerSecond = 2058 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12074s; TotalTimePerSample = 0.48296ms; SamplesPerSecond = 2070 +MPI Rank 2: Epoch[ 4 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19552990; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12188s; TotalTimePerSample = 0.48751ms; SamplesPerSecond = 2051 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12390s; TotalTimePerSample = 0.49558ms; SamplesPerSecond = 2017 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12141s; TotalTimePerSample = 0.48563ms; SamplesPerSecond = 2059 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12232s; TotalTimePerSample = 0.48929ms; SamplesPerSecond = 2043 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14723334; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12365s; TotalTimePerSample = 0.49459ms; SamplesPerSecond = 2021 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12271s; TotalTimePerSample = 0.49083ms; SamplesPerSecond = 2037 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12486s; TotalTimePerSample = 0.49944ms; SamplesPerSecond = 2002 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12591s; TotalTimePerSample = 0.50363ms; SamplesPerSecond = 1985 +MPI Rank 2: Epoch[ 4 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12015s; TotalTimePerSample = 0.48060ms; SamplesPerSecond = 2080 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12200s; TotalTimePerSample = 0.48798ms; SamplesPerSecond = 2049 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12384s; TotalTimePerSample = 0.49536ms; SamplesPerSecond = 2018 +MPI Rank 2: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12606s; TotalTimePerSample = 0.50426ms; SamplesPerSecond = 1983 +MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.931381 +MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU +MPI Rank 2: COMPLETED +MPI Rank 2: ~MPIWrapper +MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3 +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: Build info: +MPI Rank 3: +MPI Rank 3: Built time: Oct 24 2015 13:33:25 +MPI Rank 3: Last modified date: Thu Oct 22 16:00:27 2015 +MPI Rank 3: Built by amitaga on Amitaga-Win-DT3 +MPI Rank 3: Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ +MPI Rank 3: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: running on Amitaga-Win-DT3 at 2015/10/24 21:49:39 +MPI Rank 3: command line: +MPI Rank 3: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe 
configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: command=SimpleMultiGPU +MPI Rank 3: precision=float +MPI Rank 3: parallelTrain=true +MPI Rank 3: SimpleMultiGPU=[ +MPI Rank 3: action=train +MPI Rank 3: modelPath=$RunDir$/models/Simple.dnn +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: traceLevel=1 +MPI Rank 3: SimpleNetworkBuilder=[ +MPI Rank 3: layerSizes=2:50*2:2 +MPI Rank 3: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 3: evalCriterion=ErrorPrediction +MPI Rank 3: layerTypes=Sigmoid +MPI Rank 3: initValueScale=1.0 +MPI Rank 3: applyMeanVarNorm=true +MPI Rank 3: uniformInit=true +MPI Rank 3: needPrior=true +MPI Rank 3: ] +MPI Rank 3: SGD=[ +MPI Rank 3: epochSize=0 +MPI Rank 3: minibatchSize=25 +MPI Rank 3: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 3: momentumPerMB=0.9 +MPI Rank 3: dropoutRate=0.0 +MPI Rank 3: maxEpochs=4 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationMethod=DataParallelSGD +MPI Rank 3: DataParallelSGD=[ +MPI Rank 3: gradientBits=1 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: reader=[ +MPI Rank 3: readerType=UCIFastReader +MPI Rank 3: file=$DataDir$/SimpleDataTrain.txt +MPI Rank 3: miniBatchMode=Partial +MPI Rank 3: randomize=None +MPI Rank 3: verbosity=1 +MPI Rank 3: features=[ +MPI Rank 3: dim=2 +MPI Rank 3: start=0 +MPI Rank 3: ] +MPI Rank 3: 
labels=[ +MPI Rank 3: start=2 +MPI Rank 3: dim=1 +MPI Rank 3: labelDim=2 +MPI Rank 3: labelMappingFile=$DataDir$/SimpleMapping.txt +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 3: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 3: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 3: DeviceId=0 +MPI Rank 3: precision=float +MPI Rank 3: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: deviceId=0 +MPI Rank 3: command=SimpleMultiGPU +MPI Rank 3: precision=float +MPI Rank 3: parallelTrain=true +MPI Rank 3: SimpleMultiGPU=[ +MPI Rank 3: action=train +MPI Rank 3: modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 3: deviceId=0 +MPI Rank 3: traceLevel=1 +MPI Rank 3: SimpleNetworkBuilder=[ +MPI Rank 3: layerSizes=2:50*2:2 +MPI Rank 3: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 3: evalCriterion=ErrorPrediction +MPI Rank 3: layerTypes=Sigmoid +MPI Rank 3: initValueScale=1.0 +MPI Rank 3: applyMeanVarNorm=true +MPI Rank 3: uniformInit=true +MPI Rank 3: needPrior=true +MPI Rank 3: ] +MPI Rank 3: SGD=[ +MPI Rank 3: epochSize=0 +MPI Rank 3: minibatchSize=25 +MPI Rank 3: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 3: momentumPerMB=0.9 +MPI Rank 3: dropoutRate=0.0 +MPI Rank 3: maxEpochs=4 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationMethod=DataParallelSGD +MPI Rank 3: DataParallelSGD=[ +MPI Rank 
3: gradientBits=1 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: reader=[ +MPI Rank 3: readerType=UCIFastReader +MPI Rank 3: file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 3: miniBatchMode=Partial +MPI Rank 3: randomize=None +MPI Rank 3: verbosity=1 +MPI Rank 3: features=[ +MPI Rank 3: dim=2 +MPI Rank 3: start=0 +MPI Rank 3: ] +MPI Rank 3: labels=[ +MPI Rank 3: start=2 +MPI Rank 3: dim=1 +MPI Rank 3: labelDim=2 +MPI Rank 3: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 3: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 3: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 3: DeviceId=0 +MPI Rank 3: precision=float +MPI Rank 3: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU +MPI Rank 3: configparameters: SimpleMultiGPU.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining +MPI Rank 3: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data +MPI Rank 3: configparameters: SimpleMultiGPU.config:deviceId=0 +MPI Rank 3: configparameters: SimpleMultiGPU.config:parallelTrain=true +MPI Rank 3: configparameters: SimpleMultiGPU.config:precision=float +MPI Rank 3: 
configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu +MPI Rank 3: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[ +MPI Rank 3: action=train +MPI Rank 3: modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 3: deviceId=0 +MPI Rank 3: traceLevel=1 +MPI Rank 3: SimpleNetworkBuilder=[ +MPI Rank 3: layerSizes=2:50*2:2 +MPI Rank 3: trainingCriterion=CrossEntropyWithSoftmax +MPI Rank 3: evalCriterion=ErrorPrediction +MPI Rank 3: layerTypes=Sigmoid +MPI Rank 3: initValueScale=1.0 +MPI Rank 3: applyMeanVarNorm=true +MPI Rank 3: uniformInit=true +MPI Rank 3: needPrior=true +MPI Rank 3: ] +MPI Rank 3: SGD=[ +MPI Rank 3: epochSize=0 +MPI Rank 3: minibatchSize=25 +MPI Rank 3: learningRatesPerMB=0.5:0.2*20:0.1 +MPI Rank 3: momentumPerMB=0.9 +MPI Rank 3: dropoutRate=0.0 +MPI Rank 3: maxEpochs=4 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationMethod=DataParallelSGD +MPI Rank 3: DataParallelSGD=[ +MPI Rank 3: gradientBits=1 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: reader=[ +MPI Rank 3: readerType=UCIFastReader +MPI Rank 3: file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 3: miniBatchMode=Partial +MPI Rank 3: randomize=None +MPI Rank 3: verbosity=1 +MPI Rank 3: features=[ +MPI Rank 3: dim=2 +MPI Rank 3: start=0 +MPI Rank 3: ] +MPI Rank 3: labels=[ +MPI Rank 3: start=2 +MPI Rank 3: dim=1 +MPI Rank 3: labelDim=2 +MPI Rank 3: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] +MPI Rank 3: +MPI Rank 3: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr 
+MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: command: SimpleMultiGPU +MPI Rank 3: precision = float +MPI Rank 3: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn +MPI Rank 3: CNTKCommandTrainInfo: SimpleMultiGPU : 4 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +MPI Rank 3: CNTKCommandTrainBegin: SimpleMultiGPU +MPI Rank 3: SimpleNetworkBuilder Using GPU 0 +MPI Rank 3: reading uci file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt +MPI Rank 3: SetUniformRandomValue (GPU): creating curand object with seed 1 +MPI Rank 3: GetTrainCriterionNodes ... +MPI Rank 3: GetEvalCriterionNodes ... +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. +MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W1*H1 
= Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = 
Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: Validating for node CrossEntropyWithSoftmax, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3] +MPI Rank 3: Validating --> 
W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3] +MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Precomputing --> 3 PreCompute nodes found. +MPI Rank 3: +MPI Rank 3: NodeName: InvStdOfFeatures +MPI Rank 3: NodeName: MeanOfFeatures +MPI Rank 3: NodeName: Prior +MPI Rank 3: starting at epoch 0 counting lines to determine record count +MPI Rank 3: +MPI Rank 3: 10000 records found +MPI Rank 3: starting epoch 0 at record count 0, and file position 0 +MPI Rank 3: already there from last epoch +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. +MPI Rank 3: +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: Validating for node InvStdOfFeatures, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. +MPI Rank 3: +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: Validating for node MeanOfFeatures, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node Prior. 2 nodes to process in pass 1. +MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: Validating for node Prior, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3] +MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1] +MPI Rank 3: +MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Precomputing --> Completed. +MPI Rank 3: +MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 +MPI Rank 3: starting epoch 0 at record count 0, and file position 0 +MPI Rank 3: already there from last epoch +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50] +MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50] +MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2] +MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25] +MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1] +MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1] +MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25] +MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25] +MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25] +MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1] +MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25] +MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1] +MPI Rank 3: +MPI Rank 3: 9 out of 20 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 3: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal. +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 1- 10 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.21100s; TotalTimePerSample = 0.84399ms; SamplesPerSecond = 1184 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 11- 20 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.17235s; TotalTimePerSample = 0.68940ms; SamplesPerSecond = 1450 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 21- 30 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.16873s; TotalTimePerSample = 0.67493ms; SamplesPerSecond = 1481 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 31- 40 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.16075s; TotalTimePerSample = 0.64300ms; SamplesPerSecond = 1555 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 41- 50 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70605615; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.16197s; TotalTimePerSample = 0.64788ms; SamplesPerSecond = 1543 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 51- 60 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.15188s; TotalTimePerSample = 0.60751ms; SamplesPerSecond = 1646 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 61- 70 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.14181s; TotalTimePerSample = 0.56722ms; SamplesPerSecond = 1762 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 71- 80 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.79845604; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.14178s; TotalTimePerSample = 0.56714ms; SamplesPerSecond = 1763 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 81- 90 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69665186; 
EvalErr[0]PerSample = 0.46800000; TotalTime = 0.13817s; TotalTimePerSample = 0.55266ms; SamplesPerSecond = 1809 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12779s; TotalTimePerSample = 0.51114ms; SamplesPerSecond = 1956 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71420345; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.13020s; TotalTimePerSample = 0.52080ms; SamplesPerSecond = 1920 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12825s; TotalTimePerSample = 0.51300ms; SamplesPerSecond = 1949 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12355s; TotalTimePerSample = 0.49419ms; SamplesPerSecond = 2023 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71857916; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.11948s; TotalTimePerSample = 0.47792ms; SamplesPerSecond = 2092 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.11900s; TotalTimePerSample = 0.47600ms; SamplesPerSecond = 2100 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71798840; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.11648s; TotalTimePerSample = 0.46593ms; SamplesPerSecond = 2146 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11763s; TotalTimePerSample = 0.47053ms; SamplesPerSecond = 2125 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71835128; EvalErr[0]PerSample 
= 0.51600000; TotalTime = 0.11827s; TotalTimePerSample = 0.47306ms; SamplesPerSecond = 2113 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71529463; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12530s; TotalTimePerSample = 0.50120ms; SamplesPerSecond = 1995 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71727657; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12000s; TotalTimePerSample = 0.48001ms; SamplesPerSecond = 2083 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11894s; TotalTimePerSample = 0.47577ms; SamplesPerSecond = 2101 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72088398; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11488s; TotalTimePerSample = 0.45951ms; SamplesPerSecond = 2176 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.72006809; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12400s; TotalTimePerSample = 0.49601ms; SamplesPerSecond = 2016 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12093s; TotalTimePerSample = 0.48372ms; SamplesPerSecond = 2067 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11801s; TotalTimePerSample = 0.47202ms; SamplesPerSecond = 2118 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70129698; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12166s; TotalTimePerSample = 0.48665ms; SamplesPerSecond = 2054 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70768095; EvalErr[0]PerSample = 0.54400000; 
TotalTime = 0.12138s; TotalTimePerSample = 0.48553ms; SamplesPerSecond = 2059 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.11718s; TotalTimePerSample = 0.46872ms; SamplesPerSecond = 2133 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69266187; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.11796s; TotalTimePerSample = 0.47183ms; SamplesPerSecond = 2119 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12167s; TotalTimePerSample = 0.48668ms; SamplesPerSecond = 2054 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69257409; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12564s; TotalTimePerSample = 0.50255ms; SamplesPerSecond = 1989 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50204ms; SamplesPerSecond = 1991 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12341s; TotalTimePerSample = 0.49363ms; SamplesPerSecond = 2025 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.70192154; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12370s; TotalTimePerSample = 0.49480ms; SamplesPerSecond = 2021 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12358s; TotalTimePerSample = 0.49434ms; SamplesPerSecond = 2022 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.67041492; EvalErr[0]PerSample = 0.39200000; TotalTime = 
0.12207s; TotalTimePerSample = 0.48827ms; SamplesPerSecond = 2048 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.65913973; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12947s; TotalTimePerSample = 0.51787ms; SamplesPerSecond = 1930 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.63919877; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.14402s; TotalTimePerSample = 0.57607ms; SamplesPerSecond = 1735 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.61293883; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.14409s; TotalTimePerSample = 0.57637ms; SamplesPerSecond = 1735 +MPI Rank 3: Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample = 0.55255352; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.14499s; TotalTimePerSample = 0.57995ms; SamplesPerSecond = 1724 +MPI Rank 3: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.369658 +MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 3: starting epoch 1 at record count 10000, and file position 0 +MPI Rank 3: already there from last epoch +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.50774625; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.13824s; TotalTimePerSample = 0.55294ms; SamplesPerSecond = 1808 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.43388927; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.13875s; TotalTimePerSample = 0.55502ms; SamplesPerSecond = 1801 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.36674870; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14007s; TotalTimePerSample = 0.56027ms; SamplesPerSecond = 1784 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.33768765; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.13713s; TotalTimePerSample = 0.54853ms; SamplesPerSecond = 1823 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.30320946; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12833s; TotalTimePerSample = 0.51332ms; SamplesPerSecond = 1948 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.29576043; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.14854s; TotalTimePerSample = 0.59414ms; SamplesPerSecond = 1683 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24924491; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14657s; TotalTimePerSample = 0.58628ms; SamplesPerSecond = 1705 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.24632415; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14366s; TotalTimePerSample = 0.57462ms; SamplesPerSecond = 1740 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20943158; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13604s; TotalTimePerSample = 0.54416ms; SamplesPerSecond = 1837 +MPI Rank 3: Epoch[ 2 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19115996; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13442s; TotalTimePerSample = 0.53768ms; SamplesPerSecond = 1859 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17923231; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13593s; TotalTimePerSample = 0.54372ms; SamplesPerSecond = 1839 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17075422; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13592s; TotalTimePerSample = 0.54368ms; SamplesPerSecond = 1839 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14442371; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.13116s; TotalTimePerSample = 0.52464ms; SamplesPerSecond = 1906 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17753819; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.13549s; TotalTimePerSample = 0.54197ms; SamplesPerSecond = 1845 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15087855; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13867s; TotalTimePerSample = 0.55469ms; SamplesPerSecond = 1802 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19253023; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13507s; TotalTimePerSample = 0.54028ms; SamplesPerSecond = 1850 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14524s; TotalTimePerSample = 0.58098ms; SamplesPerSecond = 1721 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.14260s; TotalTimePerSample = 0.57040ms; SamplesPerSecond = 1753 +MPI Rank 3: Epoch[ 2 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14288s; TotalTimePerSample = 0.57152ms; SamplesPerSecond = 1749 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14526s; TotalTimePerSample = 0.58103ms; SamplesPerSecond = 1721 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13489s; TotalTimePerSample = 0.53955ms; SamplesPerSecond = 1853 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14344s; TotalTimePerSample = 0.57375ms; SamplesPerSecond = 1742 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14684504; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.14543s; TotalTimePerSample = 0.58172ms; SamplesPerSecond = 1719 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15322115; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14576s; TotalTimePerSample = 0.58304ms; SamplesPerSecond = 1715 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13993s; TotalTimePerSample = 0.55974ms; SamplesPerSecond = 1786 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13683833; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13986s; TotalTimePerSample = 0.55944ms; SamplesPerSecond = 1787 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18621188; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13396s; TotalTimePerSample = 0.53586ms; SamplesPerSecond = 1866 +MPI Rank 3: Epoch[ 2 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19408048; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13196s; TotalTimePerSample = 0.52782ms; SamplesPerSecond = 1894 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14577s; TotalTimePerSample = 0.58307ms; SamplesPerSecond = 1715 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.13253s; TotalTimePerSample = 0.53014ms; SamplesPerSecond = 1886 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.13034s; TotalTimePerSample = 0.52135ms; SamplesPerSecond = 1918 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12706s; TotalTimePerSample = 0.50823ms; SamplesPerSecond = 1967 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15108451; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12629s; TotalTimePerSample = 0.50514ms; SamplesPerSecond = 1979 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19729184; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12368s; TotalTimePerSample = 0.49470ms; SamplesPerSecond = 2021 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12857332; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12167s; TotalTimePerSample = 0.48667ms; SamplesPerSecond = 2054 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12142s; TotalTimePerSample = 0.48569ms; SamplesPerSecond = 2058 +MPI Rank 3: Epoch[ 2 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12786050; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48977ms; SamplesPerSecond = 2041 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12588s; TotalTimePerSample = 0.50352ms; SamplesPerSecond = 1986 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20440409; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12226s; TotalTimePerSample = 0.48902ms; SamplesPerSecond = 2044 +MPI Rank 3: Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14566238; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12138s; TotalTimePerSample = 0.48551ms; SamplesPerSecond = 2059 +MPI Rank 3: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373029; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.454936 +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 3: starting epoch 2 at record count 20000, and file position 0 +MPI Rank 3: already there from last epoch +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12023s; TotalTimePerSample = 0.48090ms; SamplesPerSecond = 2079 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17780229; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12465s; TotalTimePerSample = 0.49858ms; SamplesPerSecond = 2005 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12297s; TotalTimePerSample = 0.49187ms; SamplesPerSecond = 2033 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15796895; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12257s; TotalTimePerSample = 0.49028ms; SamplesPerSecond = 2039 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17002999; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12288s; TotalTimePerSample = 0.49151ms; SamplesPerSecond = 2034 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11518s; TotalTimePerSample = 0.46071ms; SamplesPerSecond = 2170 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14643694; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11587s; TotalTimePerSample = 0.46348ms; SamplesPerSecond = 2157 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12357s; TotalTimePerSample = 0.49427ms; SamplesPerSecond = 2023 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12249s; TotalTimePerSample = 0.48998ms; SamplesPerSecond = 2040 +MPI Rank 3: Epoch[ 3 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11970s; TotalTimePerSample = 0.47880ms; SamplesPerSecond = 2088 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13469093; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11866s; TotalTimePerSample = 0.47462ms; SamplesPerSecond = 2106 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12676s; TotalTimePerSample = 0.50704ms; SamplesPerSecond = 1972 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11993s; TotalTimePerSample = 0.47971ms; SamplesPerSecond = 2084 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.11853s; TotalTimePerSample = 0.47412ms; SamplesPerSecond = 2109 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12430s; TotalTimePerSample = 0.49721ms; SamplesPerSecond = 2011 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.11774s; TotalTimePerSample = 0.47097ms; SamplesPerSecond = 2123 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12213s; TotalTimePerSample = 0.48850ms; SamplesPerSecond = 2047 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12045s; TotalTimePerSample = 0.48180ms; SamplesPerSecond = 2075 +MPI Rank 3: Epoch[ 3 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19243443; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12144s; TotalTimePerSample = 0.48574ms; SamplesPerSecond = 2058 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12909s; TotalTimePerSample = 0.51637ms; SamplesPerSecond = 1936 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12856s; TotalTimePerSample = 0.51422ms; SamplesPerSecond = 1944 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18185535; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12229s; TotalTimePerSample = 0.48917ms; SamplesPerSecond = 2044 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14074205; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.13026s; TotalTimePerSample = 0.52105ms; SamplesPerSecond = 1919 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12010s; TotalTimePerSample = 0.48041ms; SamplesPerSecond = 2081 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20299704; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12197s; TotalTimePerSample = 0.48786ms; SamplesPerSecond = 2049 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12852038; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12039s; TotalTimePerSample = 0.48156ms; SamplesPerSecond = 2076 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18660439; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12410s; TotalTimePerSample = 0.49640ms; SamplesPerSecond = 2014 +MPI Rank 3: Epoch[ 3 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12024s; TotalTimePerSample = 0.48097ms; SamplesPerSecond = 2079 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12499s; TotalTimePerSample = 0.49996ms; SamplesPerSecond = 2000 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12526169; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49889ms; SamplesPerSecond = 2004 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17392131; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12303s; TotalTimePerSample = 0.49213ms; SamplesPerSecond = 2031 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12350s; TotalTimePerSample = 0.49400ms; SamplesPerSecond = 2024 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.11582s; TotalTimePerSample = 0.46327ms; SamplesPerSecond = 2158 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12306s; TotalTimePerSample = 0.49225ms; SamplesPerSecond = 2031 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12046s; TotalTimePerSample = 0.48184ms; SamplesPerSecond = 2075 +MPI Rank 3: Epoch[ 3 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12798s; TotalTimePerSample = 0.51194ms; SamplesPerSecond = 1953 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16654369; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.13400s; TotalTimePerSample = 0.53600ms; SamplesPerSecond = 1865 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20658951; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12144s; TotalTimePerSample = 0.48578ms; SamplesPerSecond = 2058 +MPI Rank 3: Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.11522s; TotalTimePerSample = 0.46087ms; SamplesPerSecond = 2169 +MPI Rank 3: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.926698 +MPI Rank 3: Starting Epoch 4: learning rate per sample = 0.008000 effective momentum = 0.900000 +MPI Rank 3: starting epoch 3 at record count 30000, and file position 0 +MPI Rank 3: already there from last epoch +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32). 
+MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 1- 10 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12371232; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12049s; TotalTimePerSample = 0.48194ms; SamplesPerSecond = 2074 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 11- 20 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12028s; TotalTimePerSample = 0.48113ms; SamplesPerSecond = 2078 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 21- 30 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14239730; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12702s; TotalTimePerSample = 0.50807ms; SamplesPerSecond = 1968 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 31- 40 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.11959s; TotalTimePerSample = 0.47836ms; SamplesPerSecond = 2090 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 41- 50 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11886s; TotalTimePerSample = 0.47544ms; SamplesPerSecond = 2103 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 51- 60 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12210s; TotalTimePerSample = 0.48839ms; SamplesPerSecond = 2047 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 61- 70 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11983s; TotalTimePerSample = 0.47932ms; SamplesPerSecond = 2086 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 71- 80 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18021601; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11840s; TotalTimePerSample = 0.47362ms; SamplesPerSecond = 2111 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 81- 90 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11869s; TotalTimePerSample = 0.47474ms; SamplesPerSecond = 2106 +MPI Rank 3: Epoch[ 4 of 
4]-Minibatch[ 91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14474425; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12097s; TotalTimePerSample = 0.48389ms; SamplesPerSecond = 2066 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12025s; TotalTimePerSample = 0.48102ms; SamplesPerSecond = 2078 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13708299; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12236s; TotalTimePerSample = 0.48943ms; SamplesPerSecond = 2043 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.11569777; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12025s; TotalTimePerSample = 0.48100ms; SamplesPerSecond = 2079 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16892331; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.11957s; TotalTimePerSample = 0.47830ms; SamplesPerSecond = 2090 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12752162; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12099s; TotalTimePerSample = 0.48394ms; SamplesPerSecond = 2066 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12234s; TotalTimePerSample = 0.48934ms; SamplesPerSecond = 2043 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.11970s; TotalTimePerSample = 0.47880ms; SamplesPerSecond = 2088 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14105803; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12238s; TotalTimePerSample = 0.48954ms; SamplesPerSecond = 2042 +MPI Rank 3: Epoch[ 4 of 
4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19333552; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12576s; TotalTimePerSample = 0.50306ms; SamplesPerSecond = 1987 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20859524; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14403s; TotalTimePerSample = 0.57612ms; SamplesPerSecond = 1735 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12854s; TotalTimePerSample = 0.51416ms; SamplesPerSecond = 1944 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12730s; TotalTimePerSample = 0.50919ms; SamplesPerSecond = 1963 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14037157; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11977s; TotalTimePerSample = 0.47907ms; SamplesPerSecond = 2087 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12054s; TotalTimePerSample = 0.48217ms; SamplesPerSecond = 2073 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20347747; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.11766s; TotalTimePerSample = 0.47064ms; SamplesPerSecond = 2124 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12815012; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12102s; TotalTimePerSample = 0.48407ms; SamplesPerSecond = 2065 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12104s; TotalTimePerSample = 0.48415ms; SamplesPerSecond = 2065 +MPI Rank 3: Epoch[ 4 of 
4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19552990; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12194s; TotalTimePerSample = 0.48776ms; SamplesPerSecond = 2050 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12325s; TotalTimePerSample = 0.49300ms; SamplesPerSecond = 2028 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12225s; TotalTimePerSample = 0.48901ms; SamplesPerSecond = 2044 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12176s; TotalTimePerSample = 0.48705ms; SamplesPerSecond = 2053 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12281s; TotalTimePerSample = 0.49122ms; SamplesPerSecond = 2035 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14723334; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12368s; TotalTimePerSample = 0.49473ms; SamplesPerSecond = 2021 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12262s; TotalTimePerSample = 0.49048ms; SamplesPerSecond = 2038 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12474s; TotalTimePerSample = 0.49897ms; SamplesPerSecond = 2004 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12588s; TotalTimePerSample = 0.50350ms; SamplesPerSecond = 1986 +MPI Rank 3: Epoch[ 4 of 
4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12028s; TotalTimePerSample = 0.48113ms; SamplesPerSecond = 2078 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12155s; TotalTimePerSample = 0.48618ms; SamplesPerSecond = 2056 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12413s; TotalTimePerSample = 0.49654ms; SamplesPerSecond = 2013 +MPI Rank 3: Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample = 0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12588s; TotalTimePerSample = 0.50353ms; SamplesPerSecond = 1985 +MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.931393 +MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU +MPI Rank 3: COMPLETED +MPI Rank 3: ~MPIWrapper diff --git a/Tests/Speech/DNN/DiscriminativePreTraining/baseline.gpu.txt b/Tests/Speech/DNN/DiscriminativePreTraining/baseline.gpu.txt index f0f1c5727..dc101bc81 100644 --- a/Tests/Speech/DNN/DiscriminativePreTraining/baseline.gpu.txt +++ b/Tests/Speech/DNN/DiscriminativePreTraining/baseline.gpu.txt @@ -1,13 +1,13 @@ -=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/cntk_dpt.config RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 -running on localhost at 2015/10/12 18:49:16 -command line options: -configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/cntk_dpt.config 
RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 +=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/cntk_dpt.config RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining DeviceId=0 +running on localhost at 2015/10/24 12:51:56 +command line: +/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/cntk_dpt.config RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining DeviceId=0 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float deviceId=$DeviceId$ command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain -ndlMacros=$DataDir$/ndl/macros.txt +ndlMacros=$ConfigDir$/macros.txt GlobalMean=GlobalStats/mean.363 GlobalInvStd=GlobalStats/var.363 GlobalPrior=GlobalStats/prior.132 @@ -25,7 +25,7 @@ DPT_Pre1=[ action=train modelPath=$RunDir$/models/Pre1/cntkSpeech NDLNetworkBuilder=[ - networkDescription=$DataDir$/ndl/dnn_1layer.txt + networkDescription=$ConfigDir$/dnn_1layer.txt ] ] AddLayer2=[ @@ -34,13 +34,13 @@ AddLayer2=[ NewLayer=2 CurrModel=$RunDir$/models/Pre1/cntkSpeech NewModel=$RunDir$/models/Pre2/cntkSpeech.0 - editPath=$DataDir$/ndl/add_layer.mel + editPath=$ConfigDir$/add_layer.mel ] DPT_Pre2=[ action=train modelPath=$RunDir$/models/Pre2/cntkSpeech NDLNetworkBuilder=[ - networkDescription=$DataDir$/ndl/dnn_1layer.txt + networkDescription=$ConfigDir$/dnn_1layer.txt ] ] AddLayer3=[ @@ -49,7 +49,7 @@ 
AddLayer3=[ NewLayer=3 CurrModel=$RunDir$/models/Pre2/cntkSpeech NewModel=$RunDir$/models/cntkSpeech.0 - editPath=$DataDir$/ndl/add_layer.mel + editPath=$ConfigDir$/add_layer.mel ] speechTrain=[ action=train @@ -57,7 +57,7 @@ speechTrain=[ deviceId=$DeviceId$ traceLevel=1 NDLNetworkBuilder=[ - networkDescription=$DataDir$/ndl/dnn.txt + networkDescription=$ConfigDir$/dnn.txt ] SGD=[ epochSize=81920 @@ -90,8 +90,9 @@ reader=[ labelType=Category ] ] -RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu +RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< @@ -100,7 +101,7 @@ DeviceId=0 precision=float deviceId=0 command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain -ndlMacros=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/macros.txt +ndlMacros=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/macros.txt GlobalMean=GlobalStats/mean.363 GlobalInvStd=GlobalStats/var.363 GlobalPrior=GlobalStats/prior.132 @@ -116,41 +117,41 @@ SGD=[ ] DPT_Pre1=[ action=train - modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech + modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn_1layer.txt + networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn_1layer.txt ] ] AddLayer2=[ action=edit CurrLayer=1 NewLayer=2 - CurrModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech - 
NewModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0 - editPath=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/add_layer.mel + CurrModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech + NewModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0 + editPath=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/add_layer.mel ] DPT_Pre2=[ action=train - modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech + modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn_1layer.txt + networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn_1layer.txt ] ] AddLayer3=[ action=edit CurrLayer=2 NewLayer=3 - CurrModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech - NewModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0 - editPath=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/add_layer.mel + CurrModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech + NewModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0 + editPath=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/add_layer.mel ] speechTrain=[ action=train - modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech + modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech deviceId=0 traceLevel=1 
NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn.txt + networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn.txt ] SGD=[ epochSize=81920 @@ -183,8 +184,9 @@ reader=[ labelType=Category ] ] -RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu +RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< @@ -194,43 +196,44 @@ configparameters: cntk_dpt.config:AddLayer2=[ action=edit CurrLayer=1 NewLayer=2 - CurrModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech - NewModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0 - editPath=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/add_layer.mel + CurrModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech + NewModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0 + editPath=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/add_layer.mel ] configparameters: cntk_dpt.config:AddLayer3=[ action=edit CurrLayer=2 NewLayer=3 - CurrModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech - NewModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0 - editPath=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/add_layer.mel + CurrModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech + 
NewModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0 + editPath=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/add_layer.mel ] configparameters: cntk_dpt.config:command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain +configparameters: cntk_dpt.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining configparameters: cntk_dpt.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data configparameters: cntk_dpt.config:deviceId=0 configparameters: cntk_dpt.config:DPT_Pre1=[ action=train - modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech + modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn_1layer.txt + networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn_1layer.txt ] ] configparameters: cntk_dpt.config:DPT_Pre2=[ action=train - modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech + modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn_1layer.txt + networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn_1layer.txt ] ] configparameters: cntk_dpt.config:GlobalInvStd=GlobalStats/var.363 configparameters: cntk_dpt.config:GlobalMean=GlobalStats/mean.363 configparameters: cntk_dpt.config:GlobalPrior=GlobalStats/prior.132 -configparameters: cntk_dpt.config:ndlMacros=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/macros.txt +configparameters: 
cntk_dpt.config:ndlMacros=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/macros.txt configparameters: cntk_dpt.config:precision=float configparameters: cntk_dpt.config:reader=[ readerType=HTKMLFReader @@ -251,7 +254,7 @@ configparameters: cntk_dpt.config:reader=[ ] ] -configparameters: cntk_dpt.config:RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu +configparameters: cntk_dpt.config:RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu configparameters: cntk_dpt.config:SGD=[ epochSize=81920 minibatchSize=256 @@ -264,11 +267,11 @@ configparameters: cntk_dpt.config:SGD=[ configparameters: cntk_dpt.config:speechTrain=[ action=train - modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech + modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech deviceId=0 traceLevel=1 NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn.txt + networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn.txt ] SGD=[ epochSize=81920 @@ -288,11 +291,11 @@ configparameters: cntk_dpt.config:traceLevel=1 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: DPT_Pre1 AddLayer2 DPT_Pre2 AddLayer3 speechTrain precision = float -CNTKModelPath: /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech +CNTKModelPath: /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech CNTKCommandTrainInfo: DPT_Pre1 : 2 -CNTKModelPath: /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech +CNTKModelPath: /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech 
CNTKCommandTrainInfo: DPT_Pre2 : 2 -CNTKModelPath: /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech +CNTKModelPath: /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech CNTKCommandTrainInfo: speechTrain : 4 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 8 CNTKCommandTrainBegin: DPT_Pre1 @@ -400,6 +403,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] +Validating for node cr. 6 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] + Validating for node cr, final verification. 
Validating --> labels = InputValue -> [132, MBSize 1] @@ -441,7 +462,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1] -Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2. Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL1.W = LearnableParameter -> [512, 363] @@ -502,6 +523,25 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1] +Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = 
Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -543,7 +583,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 6 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 1] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -601,6 +641,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] +Validating for node Err. 6 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] +Validating --> Err = 
ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 1] @@ -621,6 +679,7 @@ Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) 7 out of 15 nodes do not share the minibatch layout with the input data. +SetUniformRandomValue (GPU): creating curand object with seed 1 GetTrainCriterionNodes ... GetEvalCriterionNodes ... No PreCompute nodes found, skipping PreCompute step @@ -630,79 +689,79 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms Starting minibatch loop. -EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. - Epoch[ 1 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 3.74183846; EvalErr[0]PerSample = 0.80195313; TotalTime = 0.30483s; TotalTimePerSample = 0.11907ms; SamplesPerSecond = 8398 - Epoch[ 1 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.91124763; EvalErr[0]PerSample = 0.70898438; TotalTime = 0.12917s; TotalTimePerSample = 0.05046ms; SamplesPerSecond = 19818 - Epoch[ 1 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.58015976; EvalErr[0]PerSample = 0.66640625; TotalTime = 0.12870s; TotalTimePerSample = 0.05027ms; SamplesPerSecond = 19891 - Epoch[ 1 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.27427139; EvalErr[0]PerSample = 0.58750000; TotalTime = 0.12889s; TotalTimePerSample = 0.05035ms; SamplesPerSecond = 19861 - Epoch[ 1 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.05503616; EvalErr[0]PerSample = 0.56093750; TotalTime = 0.12856s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19912 - Epoch[ 1 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.91055145; 
EvalErr[0]PerSample = 0.52812500; TotalTime = 0.12907s; TotalTimePerSample = 0.05042ms; SamplesPerSecond = 19833 - Epoch[ 1 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.81562653; EvalErr[0]PerSample = 0.51171875; TotalTime = 0.12874s; TotalTimePerSample = 0.05029ms; SamplesPerSecond = 19884 - Epoch[ 1 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.68803253; EvalErr[0]PerSample = 0.48476562; TotalTime = 0.12379s; TotalTimePerSample = 0.04836ms; SamplesPerSecond = 20680 - Epoch[ 1 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.57382050; EvalErr[0]PerSample = 0.45429687; TotalTime = 0.12941s; TotalTimePerSample = 0.05055ms; SamplesPerSecond = 19781 - Epoch[ 1 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.62090302; EvalErr[0]PerSample = 0.47304687; TotalTime = 0.12857s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19911 +EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. 
+ Epoch[ 1 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 3.74183807; EvalErr[0]PerSample = 0.80195313; TotalTime = 0.15056s; TotalTimePerSample = 0.05881ms; SamplesPerSecond = 17002 + Epoch[ 1 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.91124802; EvalErr[0]PerSample = 0.70898438; TotalTime = 0.06815s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37561 + Epoch[ 1 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.58016052; EvalErr[0]PerSample = 0.66640625; TotalTime = 0.06821s; TotalTimePerSample = 0.02665ms; SamplesPerSecond = 37530 + Epoch[ 1 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.27427139; EvalErr[0]PerSample = 0.58750000; TotalTime = 0.06811s; TotalTimePerSample = 0.02660ms; SamplesPerSecond = 37587 + Epoch[ 1 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.05503540; EvalErr[0]PerSample = 0.56093750; TotalTime = 0.06794s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37680 + Epoch[ 1 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.91055145; EvalErr[0]PerSample = 0.52812500; TotalTime = 0.06802s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37638 + Epoch[ 1 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.81562653; EvalErr[0]PerSample = 0.51171875; TotalTime = 0.06809s; TotalTimePerSample = 0.02660ms; SamplesPerSecond = 37595 + Epoch[ 1 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.68803253; EvalErr[0]PerSample = 0.48476562; TotalTime = 0.06801s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37640 + Epoch[ 1 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.57382050; EvalErr[0]PerSample = 0.45429687; TotalTime = 0.06800s; TotalTimePerSample = 0.02656ms; SamplesPerSecond = 37646 + Epoch[ 1 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.62090149; 
EvalErr[0]PerSample = 0.47304687; TotalTime = 0.06795s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37673 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.59272614; EvalErr[0]PerSample = 0.47500000; TotalTime = 0.12941s; TotalTimePerSample = 0.05055ms; SamplesPerSecond = 19781 - Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.51520386; EvalErr[0]PerSample = 0.44531250; TotalTime = 0.12911s; TotalTimePerSample = 0.05043ms; SamplesPerSecond = 19828 - Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.49181976; EvalErr[0]PerSample = 0.45039062; TotalTime = 0.10931s; TotalTimePerSample = 0.04270ms; SamplesPerSecond = 23418 - Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.53703613; EvalErr[0]PerSample = 0.44804688; TotalTime = 0.09591s; TotalTimePerSample = 0.03746ms; SamplesPerSecond = 26691 - Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.43095093; EvalErr[0]PerSample = 0.41640625; TotalTime = 0.09606s; TotalTimePerSample = 0.03753ms; SamplesPerSecond = 26648 - Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41503601; EvalErr[0]PerSample = 0.40078125; TotalTime = 0.09662s; TotalTimePerSample = 0.03774ms; SamplesPerSecond = 26494 - Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.38912659; EvalErr[0]PerSample = 0.41132812; TotalTime = 0.09588s; TotalTimePerSample = 0.03745ms; SamplesPerSecond = 26700 - Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41208191; EvalErr[0]PerSample = 0.42226562; TotalTime = 0.09560s; TotalTimePerSample = 0.03734ms; SamplesPerSecond = 26779 - Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 
1.39966125; EvalErr[0]PerSample = 0.40664062; TotalTime = 0.09558s; TotalTimePerSample = 0.03734ms; SamplesPerSecond = 26782 - Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.42728577; EvalErr[0]PerSample = 0.42617187; TotalTime = 0.09572s; TotalTimePerSample = 0.03739ms; SamplesPerSecond = 26745 - Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41336365; EvalErr[0]PerSample = 0.42304687; TotalTime = 0.09572s; TotalTimePerSample = 0.03739ms; SamplesPerSecond = 26744 - Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33197937; EvalErr[0]PerSample = 0.39960937; TotalTime = 0.09589s; TotalTimePerSample = 0.03746ms; SamplesPerSecond = 26698 - Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28578796; EvalErr[0]PerSample = 0.38671875; TotalTime = 0.09591s; TotalTimePerSample = 0.03747ms; SamplesPerSecond = 26691 - Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.34131775; EvalErr[0]PerSample = 0.40937500; TotalTime = 0.09552s; TotalTimePerSample = 0.03731ms; SamplesPerSecond = 26800 - Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.32666016; EvalErr[0]PerSample = 0.39648438; TotalTime = 0.09573s; TotalTimePerSample = 0.03740ms; SamplesPerSecond = 26741 - Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21426086; EvalErr[0]PerSample = 0.37226562; TotalTime = 0.09610s; TotalTimePerSample = 0.03754ms; SamplesPerSecond = 26638 - Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23750610; EvalErr[0]PerSample = 0.37382813; TotalTime = 0.10318s; TotalTimePerSample = 0.04031ms; SamplesPerSecond = 24810 - Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.29967957; EvalErr[0]PerSample = 0.39062500; TotalTime = 0.12995s; TotalTimePerSample = 
0.05076ms; SamplesPerSecond = 19699 - Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21233215; EvalErr[0]PerSample = 0.37343750; TotalTime = 0.12914s; TotalTimePerSample = 0.05044ms; SamplesPerSecond = 19823 - Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20534973; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.12942s; TotalTimePerSample = 0.05056ms; SamplesPerSecond = 19779 - Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23558655; EvalErr[0]PerSample = 0.37187500; TotalTime = 0.12904s; TotalTimePerSample = 0.05041ms; SamplesPerSecond = 19838 - Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.25517273; EvalErr[0]PerSample = 0.37890625; TotalTime = 0.11876s; TotalTimePerSample = 0.04639ms; SamplesPerSecond = 21555 -Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.6294192; EvalErrPerSample = 0.46010742; Ave LearnRatePerSample = 0.003125000047; EpochTime=5.361751 + Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.59272461; EvalErr[0]PerSample = 0.47500000; TotalTime = 0.06816s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37559 + Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.51520386; EvalErr[0]PerSample = 0.44531250; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37605 + Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.49181976; EvalErr[0]PerSample = 0.45039062; TotalTime = 0.06793s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37685 + Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.53703613; EvalErr[0]PerSample = 0.44804688; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37605 + Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.43095398; 
EvalErr[0]PerSample = 0.41640625; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37623 + Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41503601; EvalErr[0]PerSample = 0.40078125; TotalTime = 0.06805s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37617 + Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.38913574; EvalErr[0]PerSample = 0.41132812; TotalTime = 0.06795s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37674 + Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41207886; EvalErr[0]PerSample = 0.42226562; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37541 + Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.39968262; EvalErr[0]PerSample = 0.40664062; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37625 + Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.42729187; EvalErr[0]PerSample = 0.42617187; TotalTime = 0.06792s; TotalTimePerSample = 0.02653ms; SamplesPerSecond = 37690 + Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41336365; EvalErr[0]PerSample = 0.42343750; TotalTime = 0.06812s; TotalTimePerSample = 0.02661ms; SamplesPerSecond = 37578 + Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33186951; EvalErr[0]PerSample = 0.39960937; TotalTime = 0.06800s; TotalTimePerSample = 0.02656ms; SamplesPerSecond = 37647 + Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28581238; EvalErr[0]PerSample = 0.38710937; TotalTime = 0.06803s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37628 + Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.34127502; EvalErr[0]PerSample = 0.40976563; TotalTime = 0.06799s; TotalTimePerSample = 0.02656ms; 
SamplesPerSecond = 37652 + Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.32666016; EvalErr[0]PerSample = 0.39726563; TotalTime = 0.06795s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37673 + Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21437378; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.06821s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37532 + Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23749695; EvalErr[0]PerSample = 0.37343750; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37626 + Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.29956665; EvalErr[0]PerSample = 0.39023438; TotalTime = 0.06814s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37570 + Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21198120; EvalErr[0]PerSample = 0.37382813; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37624 + Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20528259; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.06800s; TotalTimePerSample = 0.02656ms; SamplesPerSecond = 37648 + Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23613586; EvalErr[0]PerSample = 0.37343750; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37608 + Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.25615234; EvalErr[0]PerSample = 0.38164063; TotalTime = 0.06263s; TotalTimePerSample = 0.02446ms; SamplesPerSecond = 40877 +Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.6294507; EvalErrPerSample = 0.46030274; Ave LearnRatePerSample = 0.003125000047; EpochTime=3.113533 Starting Epoch 2: learning rate per sample = 0.003125 effective momentum = 0.900000 minibatchiterator: epoch 1: 
frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 2 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23276577; EvalErr[0]PerSample = 0.38125000; TotalTime = 0.13037s; TotalTimePerSample = 0.05093ms; SamplesPerSecond = 19635 - Epoch[ 2 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20353279; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.12890s; TotalTimePerSample = 0.05035ms; SamplesPerSecond = 19860 - Epoch[ 2 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28632336; EvalErr[0]PerSample = 0.37734375; TotalTime = 0.12453s; TotalTimePerSample = 0.04864ms; SamplesPerSecond = 20557 - Epoch[ 2 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23058014; EvalErr[0]PerSample = 0.37812500; TotalTime = 0.09562s; TotalTimePerSample = 0.03735ms; SamplesPerSecond = 26772 - Epoch[ 2 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18196945; EvalErr[0]PerSample = 0.35429688; TotalTime = 0.09557s; TotalTimePerSample = 0.03733ms; SamplesPerSecond = 26785 - Epoch[ 2 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28158035; EvalErr[0]PerSample = 0.38007812; TotalTime = 0.09562s; TotalTimePerSample = 0.03735ms; SamplesPerSecond = 26773 - Epoch[ 2 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22469864; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.09554s; TotalTimePerSample = 0.03732ms; SamplesPerSecond = 26795 - Epoch[ 2 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17930145; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.09564s; TotalTimePerSample = 0.03736ms; SamplesPerSecond = 26767 - Epoch[ 2 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23973160; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.09546s; TotalTimePerSample = 0.03729ms; 
SamplesPerSecond = 26817 - Epoch[ 2 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18514709; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.09566s; TotalTimePerSample = 0.03737ms; SamplesPerSecond = 26762 + Epoch[ 2 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23230944; EvalErr[0]PerSample = 0.38320312; TotalTime = 0.06923s; TotalTimePerSample = 0.02704ms; SamplesPerSecond = 36980 + Epoch[ 2 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20511351; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.06817s; TotalTimePerSample = 0.02663ms; SamplesPerSecond = 37551 + Epoch[ 2 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28783760; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.06806s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37612 + Epoch[ 2 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22809334; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37539 + Epoch[ 2 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18090286; EvalErr[0]PerSample = 0.35468750; TotalTime = 0.06809s; TotalTimePerSample = 0.02660ms; SamplesPerSecond = 37598 + Epoch[ 2 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28175354; EvalErr[0]PerSample = 0.37695312; TotalTime = 0.06818s; TotalTimePerSample = 0.02663ms; SamplesPerSecond = 37546 + Epoch[ 2 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22251205; EvalErr[0]PerSample = 0.37382813; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37542 + Epoch[ 2 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17863007; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.06815s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37566 + Epoch[ 2 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 
1.23061218; EvalErr[0]PerSample = 0.35742188; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37540 + Epoch[ 2 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18048782; EvalErr[0]PerSample = 0.37578125; TotalTime = 0.06799s; TotalTimePerSample = 0.02656ms; SamplesPerSecond = 37654 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20197525; EvalErr[0]PerSample = 0.36171875; TotalTime = 0.09590s; TotalTimePerSample = 0.03746ms; SamplesPerSecond = 26695 - Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18739471; EvalErr[0]PerSample = 0.35312500; TotalTime = 0.09610s; TotalTimePerSample = 0.03754ms; SamplesPerSecond = 26637 - Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16798859; EvalErr[0]PerSample = 0.35742188; TotalTime = 0.09603s; TotalTimePerSample = 0.03751ms; SamplesPerSecond = 26657 - Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13375397; EvalErr[0]PerSample = 0.35273437; TotalTime = 0.09650s; TotalTimePerSample = 0.03769ms; SamplesPerSecond = 26529 - Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09628754; EvalErr[0]PerSample = 0.31992188; TotalTime = 0.09675s; TotalTimePerSample = 0.03779ms; SamplesPerSecond = 26459 - Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10226898; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.09621s; TotalTimePerSample = 0.03758ms; SamplesPerSecond = 26608 - Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20214386; EvalErr[0]PerSample = 0.36015625; TotalTime = 0.09606s; TotalTimePerSample = 0.03753ms; SamplesPerSecond = 26648 - Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; 
TrainLossPerSample = 1.17007599; EvalErr[0]PerSample = 0.36015625; TotalTime = 0.09616s; TotalTimePerSample = 0.03756ms; SamplesPerSecond = 26621 - Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12343140; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.09620s; TotalTimePerSample = 0.03758ms; SamplesPerSecond = 26611 - Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12009735; EvalErr[0]PerSample = 0.34570312; TotalTime = 0.09589s; TotalTimePerSample = 0.03746ms; SamplesPerSecond = 26697 - Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10230255; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.09559s; TotalTimePerSample = 0.03734ms; SamplesPerSecond = 26780 - Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12454529; EvalErr[0]PerSample = 0.34179688; TotalTime = 0.09594s; TotalTimePerSample = 0.03748ms; SamplesPerSecond = 26682 - Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13382874; EvalErr[0]PerSample = 0.34921875; TotalTime = 0.09603s; TotalTimePerSample = 0.03751ms; SamplesPerSecond = 26657 - Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27786255; EvalErr[0]PerSample = 0.39296875; TotalTime = 0.09608s; TotalTimePerSample = 0.03753ms; SamplesPerSecond = 26644 - Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16416626; EvalErr[0]PerSample = 0.34960938; TotalTime = 0.09607s; TotalTimePerSample = 0.03753ms; SamplesPerSecond = 26647 - Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12371216; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.09599s; TotalTimePerSample = 0.03750ms; SamplesPerSecond = 26668 - Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13847351; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.09585s; 
TotalTimePerSample = 0.03744ms; SamplesPerSecond = 26707 - Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14408264; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.09598s; TotalTimePerSample = 0.03749ms; SamplesPerSecond = 26671 - Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06380920; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.09587s; TotalTimePerSample = 0.03745ms; SamplesPerSecond = 26702 - Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09358521; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.09592s; TotalTimePerSample = 0.03747ms; SamplesPerSecond = 26690 - Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08025513; EvalErr[0]PerSample = 0.33046875; TotalTime = 0.09581s; TotalTimePerSample = 0.03743ms; SamplesPerSecond = 26718 - Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05906372; EvalErr[0]PerSample = 0.32968750; TotalTime = 0.08985s; TotalTimePerSample = 0.03510ms; SamplesPerSecond = 28490 -Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.164273; EvalErrPerSample = 0.35511476; Ave LearnRatePerSample = 0.003125000047; EpochTime=3.177082 + Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19648056; EvalErr[0]PerSample = 0.35976562; TotalTime = 0.06812s; TotalTimePerSample = 0.02661ms; SamplesPerSecond = 37582 + Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18896942; EvalErr[0]PerSample = 0.35429688; TotalTime = 0.06823s; TotalTimePerSample = 0.02665ms; SamplesPerSecond = 37521 + Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16628113; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.06815s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37563 + Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; 
TrainLossPerSample = 1.12856445; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.06806s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37613 + Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10083466; EvalErr[0]PerSample = 0.32617188; TotalTime = 0.06827s; TotalTimePerSample = 0.02667ms; SamplesPerSecond = 37496 + Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09875336; EvalErr[0]PerSample = 0.33906250; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37610 + Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18634949; EvalErr[0]PerSample = 0.35820313; TotalTime = 0.06811s; TotalTimePerSample = 0.02661ms; SamplesPerSecond = 37585 + Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15709991; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37540 + Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10971069; EvalErr[0]PerSample = 0.34960938; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37605 + Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11317139; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.06824s; TotalTimePerSample = 0.02665ms; SamplesPerSecond = 37516 + Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08727722; EvalErr[0]PerSample = 0.32578125; TotalTime = 0.06816s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37558 + Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12295990; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37626 + Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12966003; EvalErr[0]PerSample = 0.35078125; TotalTime = 0.06816s; 
TotalTimePerSample = 0.02663ms; SamplesPerSecond = 37557 + Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27489319; EvalErr[0]PerSample = 0.39257812; TotalTime = 0.06816s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37559 + Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17423401; EvalErr[0]PerSample = 0.35156250; TotalTime = 0.06803s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37632 + Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13240051; EvalErr[0]PerSample = 0.35625000; TotalTime = 0.06813s; TotalTimePerSample = 0.02661ms; SamplesPerSecond = 37574 + Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13792114; EvalErr[0]PerSample = 0.34335938; TotalTime = 0.06802s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37635 + Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13433228; EvalErr[0]PerSample = 0.33710937; TotalTime = 0.06814s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37568 + Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05835876; EvalErr[0]PerSample = 0.33710937; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37539 + Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09596558; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37540 + Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08180847; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.06802s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37634 + Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06572876; EvalErr[0]PerSample = 0.33632812; TotalTime = 0.06260s; TotalTimePerSample = 0.02445ms; SamplesPerSecond = 40895 +Finished Epoch[ 2 of 2]: 
[Training Set] TrainLossPerSample = 1.1615628; EvalErrPerSample = 0.35460207; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.185375 CNTKCommandTrainEnd: DPT_Pre1 @@ -800,6 +859,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 6 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -841,7 +918,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] -Validating for node ScaledLogLikelihood. 
2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2. Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL1.W = LearnableParameter -> [512, 363] @@ -902,6 +979,25 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. 
Validating --> OL.W = LearnableParameter -> [132, 512] @@ -943,7 +1039,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 6 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1001,6 +1097,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 6 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -1069,7 +1183,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node cr. 3 nodes to process in pass 2. +Validating for node cr. 9 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1142,6 +1256,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating 
--> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1193,6 +1330,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = 
Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1245,6 +1406,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, 
MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1296,6 +1481,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> 
OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1346,6 +1554,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], 
OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1380,7 +1611,7 @@ htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/gl ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames -Starting from checkpoint. Load Network From File /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0. +Starting from checkpoint. Load Network From File /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0. Printing Gradient Computation Node Order ... @@ -1502,6 +1733,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 9 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1553,7 +1807,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] -Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. 
Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL2.W = LearnableParameter -> [512, 512] @@ -1629,6 +1883,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node 
ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1680,7 +1958,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 9 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1753,6 +2031,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) 
-> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1787,78 +2088,78 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms Starting minibatch loop. - Epoch[ 1 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 4.33646812; EvalErr[0]PerSample = 0.80507812; TotalTime = 0.17076s; TotalTimePerSample = 0.06670ms; SamplesPerSecond = 14991 - Epoch[ 1 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.78729973; EvalErr[0]PerSample = 0.71328125; TotalTime = 0.16588s; TotalTimePerSample = 0.06480ms; SamplesPerSecond = 15432 - Epoch[ 1 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.21825867; EvalErr[0]PerSample = 0.58007812; TotalTime = 0.13480s; TotalTimePerSample = 0.05266ms; SamplesPerSecond = 18991 - Epoch[ 1 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.89405746; EvalErr[0]PerSample = 0.50468750; TotalTime = 0.12949s; TotalTimePerSample = 0.05058ms; SamplesPerSecond = 19769 - Epoch[ 1 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.71779938; EvalErr[0]PerSample = 0.47578125; TotalTime = 0.16601s; TotalTimePerSample = 0.06485ms; SamplesPerSecond = 15420 - Epoch[ 1 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.60265808; EvalErr[0]PerSample = 0.45000000; TotalTime = 0.16532s; TotalTimePerSample = 0.06458ms; SamplesPerSecond = 15484 - Epoch[ 1 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 
1.56439209; EvalErr[0]PerSample = 0.44843750; TotalTime = 0.16557s; TotalTimePerSample = 0.06468ms; SamplesPerSecond = 15461 - Epoch[ 1 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.47621765; EvalErr[0]PerSample = 0.42578125; TotalTime = 0.13552s; TotalTimePerSample = 0.05294ms; SamplesPerSecond = 18890 - Epoch[ 1 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.39409637; EvalErr[0]PerSample = 0.40625000; TotalTime = 0.12883s; TotalTimePerSample = 0.05032ms; SamplesPerSecond = 19871 - Epoch[ 1 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.42145081; EvalErr[0]PerSample = 0.42343750; TotalTime = 0.12926s; TotalTimePerSample = 0.05049ms; SamplesPerSecond = 19805 + Epoch[ 1 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 4.30124588; EvalErr[0]PerSample = 0.80703125; TotalTime = 0.09340s; TotalTimePerSample = 0.03649ms; SamplesPerSecond = 27407 + Epoch[ 1 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.75448074; EvalErr[0]PerSample = 0.69960937; TotalTime = 0.09098s; TotalTimePerSample = 0.03554ms; SamplesPerSecond = 28139 + Epoch[ 1 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.20926208; EvalErr[0]PerSample = 0.58515625; TotalTime = 0.09073s; TotalTimePerSample = 0.03544ms; SamplesPerSecond = 28216 + Epoch[ 1 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.88578110; EvalErr[0]PerSample = 0.50117188; TotalTime = 0.09085s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28176 + Epoch[ 1 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.71906204; EvalErr[0]PerSample = 0.47773437; TotalTime = 0.09080s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28193 + Epoch[ 1 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.60130463; EvalErr[0]PerSample = 0.44648437; TotalTime = 0.09085s; TotalTimePerSample = 0.03549ms; 
SamplesPerSecond = 28177 + Epoch[ 1 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.56077118; EvalErr[0]PerSample = 0.45000000; TotalTime = 0.09086s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28175 + Epoch[ 1 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.47116547; EvalErr[0]PerSample = 0.42460938; TotalTime = 0.09079s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28197 + Epoch[ 1 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.38874512; EvalErr[0]PerSample = 0.40781250; TotalTime = 0.09069s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28227 + Epoch[ 1 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41911163; EvalErr[0]PerSample = 0.42539063; TotalTime = 0.09077s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28202 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.39049683; EvalErr[0]PerSample = 0.42148438; TotalTime = 0.12864s; TotalTimePerSample = 0.05025ms; SamplesPerSecond = 19900 - Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.36727448; EvalErr[0]PerSample = 0.41054687; TotalTime = 0.12862s; TotalTimePerSample = 0.05024ms; SamplesPerSecond = 19903 - Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33726044; EvalErr[0]PerSample = 0.40703125; TotalTime = 0.15213s; TotalTimePerSample = 0.05943ms; SamplesPerSecond = 16827 - Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.40177307; EvalErr[0]PerSample = 0.40781250; TotalTime = 0.12857s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19910 - Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33615417; EvalErr[0]PerSample = 0.39570312; TotalTime = 0.12867s; TotalTimePerSample = 
0.05026ms; SamplesPerSecond = 19895 - Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.34133606; EvalErr[0]PerSample = 0.40273437; TotalTime = 0.12841s; TotalTimePerSample = 0.05016ms; SamplesPerSecond = 19936 - Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.26413574; EvalErr[0]PerSample = 0.37304688; TotalTime = 0.12802s; TotalTimePerSample = 0.05001ms; SamplesPerSecond = 19996 - Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28038635; EvalErr[0]PerSample = 0.38593750; TotalTime = 0.12841s; TotalTimePerSample = 0.05016ms; SamplesPerSecond = 19936 - Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.29767151; EvalErr[0]PerSample = 0.39179687; TotalTime = 0.16430s; TotalTimePerSample = 0.06418ms; SamplesPerSecond = 15581 - Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28023682; EvalErr[0]PerSample = 0.39687500; TotalTime = 0.16454s; TotalTimePerSample = 0.06427ms; SamplesPerSecond = 15558 - Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.26818542; EvalErr[0]PerSample = 0.38945313; TotalTime = 0.16489s; TotalTimePerSample = 0.06441ms; SamplesPerSecond = 15525 - Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21394043; EvalErr[0]PerSample = 0.36250000; TotalTime = 0.16427s; TotalTimePerSample = 0.06417ms; SamplesPerSecond = 15583 - Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20627136; EvalErr[0]PerSample = 0.36953125; TotalTime = 0.16384s; TotalTimePerSample = 0.06400ms; SamplesPerSecond = 15624 - Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.25008850; EvalErr[0]PerSample = 0.37929687; TotalTime = 0.16415s; TotalTimePerSample = 0.06412ms; SamplesPerSecond = 15595 - Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: 
SamplesSeen = 2560; TrainLossPerSample = 1.22965393; EvalErr[0]PerSample = 0.37617187; TotalTime = 0.16463s; TotalTimePerSample = 0.06431ms; SamplesPerSecond = 15550 - Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15062561; EvalErr[0]PerSample = 0.34960938; TotalTime = 0.16421s; TotalTimePerSample = 0.06414ms; SamplesPerSecond = 15590 - Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16630554; EvalErr[0]PerSample = 0.35390625; TotalTime = 0.13011s; TotalTimePerSample = 0.05082ms; SamplesPerSecond = 19675 - Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22966309; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.12816s; TotalTimePerSample = 0.05006ms; SamplesPerSecond = 19975 - Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16364136; EvalErr[0]PerSample = 0.36445312; TotalTime = 0.12827s; TotalTimePerSample = 0.05010ms; SamplesPerSecond = 19958 - Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17280579; EvalErr[0]PerSample = 0.35351562; TotalTime = 0.12890s; TotalTimePerSample = 0.05035ms; SamplesPerSecond = 19860 - Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16119995; EvalErr[0]PerSample = 0.34687500; TotalTime = 0.12864s; TotalTimePerSample = 0.05025ms; SamplesPerSecond = 19901 - Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16999512; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.12263s; TotalTimePerSample = 0.04790ms; SamplesPerSecond = 20875 -Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.5028688; EvalErrPerSample = 0.42475587; Ave LearnRatePerSample = 0.003125000047; EpochTime=5.763236 + Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.38730774; EvalErr[0]PerSample = 0.42148438; TotalTime = 0.09085s; TotalTimePerSample = 
0.03549ms; SamplesPerSecond = 28178 + Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.36617889; EvalErr[0]PerSample = 0.41015625; TotalTime = 0.09080s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28194 + Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33381653; EvalErr[0]PerSample = 0.40781250; TotalTime = 0.09084s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28180 + Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.39802246; EvalErr[0]PerSample = 0.40546875; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28189 + Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33336182; EvalErr[0]PerSample = 0.40195313; TotalTime = 0.09078s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28200 + Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33834229; EvalErr[0]PerSample = 0.40195313; TotalTime = 0.09078s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28199 + Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.26663208; EvalErr[0]PerSample = 0.37578125; TotalTime = 0.09074s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28211 + Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28086243; EvalErr[0]PerSample = 0.39296875; TotalTime = 0.09072s; TotalTimePerSample = 0.03544ms; SamplesPerSecond = 28218 + Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.29481506; EvalErr[0]PerSample = 0.39531250; TotalTime = 0.09076s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28207 + Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27625122; EvalErr[0]PerSample = 0.39375000; TotalTime = 0.09079s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28196 + Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: 
SamplesSeen = 2560; TrainLossPerSample = 1.26905518; EvalErr[0]PerSample = 0.38984375; TotalTime = 0.09070s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28223 + Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21494751; EvalErr[0]PerSample = 0.36250000; TotalTime = 0.09077s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28204 + Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20699158; EvalErr[0]PerSample = 0.36914062; TotalTime = 0.09076s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28207 + Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.25002136; EvalErr[0]PerSample = 0.37851563; TotalTime = 0.09080s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28192 + Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22617493; EvalErr[0]PerSample = 0.37656250; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28189 + Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14840393; EvalErr[0]PerSample = 0.35468750; TotalTime = 0.09064s; TotalTimePerSample = 0.03541ms; SamplesPerSecond = 28242 + Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16649780; EvalErr[0]PerSample = 0.35468750; TotalTime = 0.09076s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28206 + Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22885742; EvalErr[0]PerSample = 0.36992188; TotalTime = 0.09083s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28185 + Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16533203; EvalErr[0]PerSample = 0.36484375; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28190 + Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17502136; EvalErr[0]PerSample = 0.35664062; 
TotalTime = 0.09073s; TotalTimePerSample = 0.03544ms; SamplesPerSecond = 28215 + Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16159058; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.09071s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28223 + Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17113953; EvalErr[0]PerSample = 0.35429688; TotalTime = 0.08532s; TotalTimePerSample = 0.03333ms; SamplesPerSecond = 30005 +Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.4990798; EvalErrPerSample = 0.42547607; Ave LearnRatePerSample = 0.003125000047; EpochTime=3.754035 Starting Epoch 2: learning rate per sample = 0.003125 effective momentum = 0.900000 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 2 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14169836; EvalErr[0]PerSample = 0.35156250; TotalTime = 0.12945s; TotalTimePerSample = 0.05057ms; SamplesPerSecond = 19775 - Epoch[ 2 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16675386; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.12838s; TotalTimePerSample = 0.05015ms; SamplesPerSecond = 19940 - Epoch[ 2 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23896408; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.12855s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19914 - Epoch[ 2 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17921028; EvalErr[0]PerSample = 0.36289063; TotalTime = 0.12850s; TotalTimePerSample = 0.05019ms; SamplesPerSecond = 19922 - Epoch[ 2 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13760986; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.12836s; TotalTimePerSample = 0.05014ms; SamplesPerSecond = 19943 - Epoch[ 2 of 2]-Minibatch[ 51- 60 of 320]: 
SamplesSeen = 2560; TrainLossPerSample = 1.21572113; EvalErr[0]PerSample = 0.36601563; TotalTime = 0.12828s; TotalTimePerSample = 0.05011ms; SamplesPerSecond = 19956 - Epoch[ 2 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14051437; EvalErr[0]PerSample = 0.34140625; TotalTime = 0.13201s; TotalTimePerSample = 0.05157ms; SamplesPerSecond = 19392 - Epoch[ 2 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12286606; EvalErr[0]PerSample = 0.34492187; TotalTime = 0.16368s; TotalTimePerSample = 0.06394ms; SamplesPerSecond = 15640 - Epoch[ 2 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14243240; EvalErr[0]PerSample = 0.33789062; TotalTime = 0.16444s; TotalTimePerSample = 0.06424ms; SamplesPerSecond = 15567 - Epoch[ 2 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12677765; EvalErr[0]PerSample = 0.35390625; TotalTime = 0.16509s; TotalTimePerSample = 0.06449ms; SamplesPerSecond = 15506 + Epoch[ 2 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14215412; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.09197s; TotalTimePerSample = 0.03593ms; SamplesPerSecond = 27835 + Epoch[ 2 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17049236; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.09082s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28189 + Epoch[ 2 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.24373856; EvalErr[0]PerSample = 0.37460938; TotalTime = 0.09080s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28193 + Epoch[ 2 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18655586; EvalErr[0]PerSample = 0.36445312; TotalTime = 0.09076s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28204 + Epoch[ 2 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13848000; EvalErr[0]PerSample = 0.35039063; TotalTime = 0.09077s; 
TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28202 + Epoch[ 2 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21884232; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.09072s; TotalTimePerSample = 0.03544ms; SamplesPerSecond = 28219 + Epoch[ 2 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14372940; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.09091s; TotalTimePerSample = 0.03551ms; SamplesPerSecond = 28160 + Epoch[ 2 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12769089; EvalErr[0]PerSample = 0.34960938; TotalTime = 0.09067s; TotalTimePerSample = 0.03542ms; SamplesPerSecond = 28235 + Epoch[ 2 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14114227; EvalErr[0]PerSample = 0.33554688; TotalTime = 0.09074s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28212 + Epoch[ 2 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12445145; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.09068s; TotalTimePerSample = 0.03542ms; SamplesPerSecond = 28231 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. 
- Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14400177; EvalErr[0]PerSample = 0.33984375; TotalTime = 0.16439s; TotalTimePerSample = 0.06422ms; SamplesPerSecond = 15572 - Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12832870; EvalErr[0]PerSample = 0.34531250; TotalTime = 0.16414s; TotalTimePerSample = 0.06412ms; SamplesPerSecond = 15596 - Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11099091; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.16406s; TotalTimePerSample = 0.06409ms; SamplesPerSecond = 15603 - Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06680908; EvalErr[0]PerSample = 0.32304688; TotalTime = 0.16356s; TotalTimePerSample = 0.06389ms; SamplesPerSecond = 15652 - Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05362549; EvalErr[0]PerSample = 0.30859375; TotalTime = 0.16398s; TotalTimePerSample = 0.06405ms; SamplesPerSecond = 15611 - Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06292725; EvalErr[0]PerSample = 0.32734375; TotalTime = 0.16541s; TotalTimePerSample = 0.06461ms; SamplesPerSecond = 15476 - Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14273834; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.16545s; TotalTimePerSample = 0.06463ms; SamplesPerSecond = 15472 - Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14362183; EvalErr[0]PerSample = 0.35859375; TotalTime = 0.13098s; TotalTimePerSample = 0.05116ms; SamplesPerSecond = 19544 - Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08687897; EvalErr[0]PerSample = 0.33671875; TotalTime = 0.12837s; TotalTimePerSample = 0.05014ms; SamplesPerSecond = 19942 - Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 
1.07546844; EvalErr[0]PerSample = 0.33632812; TotalTime = 0.12850s; TotalTimePerSample = 0.05019ms; SamplesPerSecond = 19922 - Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06579132; EvalErr[0]PerSample = 0.32695313; TotalTime = 0.12887s; TotalTimePerSample = 0.05034ms; SamplesPerSecond = 19864 - Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09530640; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.12837s; TotalTimePerSample = 0.05014ms; SamplesPerSecond = 19942 - Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11944122; EvalErr[0]PerSample = 0.35117188; TotalTime = 0.12823s; TotalTimePerSample = 0.05009ms; SamplesPerSecond = 19963 - Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13388062; EvalErr[0]PerSample = 0.35507813; TotalTime = 0.12828s; TotalTimePerSample = 0.05011ms; SamplesPerSecond = 19955 - Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08914795; EvalErr[0]PerSample = 0.33437500; TotalTime = 0.12840s; TotalTimePerSample = 0.05016ms; SamplesPerSecond = 19937 - Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06987000; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.12851s; TotalTimePerSample = 0.05020ms; SamplesPerSecond = 19920 - Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06095581; EvalErr[0]PerSample = 0.32109375; TotalTime = 0.12857s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19911 - Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09798889; EvalErr[0]PerSample = 0.33085938; TotalTime = 0.12868s; TotalTimePerSample = 0.05027ms; SamplesPerSecond = 19894 - Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.02103271; EvalErr[0]PerSample = 0.32890625; TotalTime = 0.12890s; TotalTimePerSample = 
0.05035ms; SamplesPerSecond = 19860 - Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06984253; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.12823s; TotalTimePerSample = 0.05009ms; SamplesPerSecond = 19964 - Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06397095; EvalErr[0]PerSample = 0.32929687; TotalTime = 0.12842s; TotalTimePerSample = 0.05016ms; SamplesPerSecond = 19935 - Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05246582; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.12215s; TotalTimePerSample = 0.04772ms; SamplesPerSecond = 20957 -Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1114886; EvalErrPerSample = 0.34130859; Ave LearnRatePerSample = 0.003125000047; EpochTime=4.486329 + Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14137955; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.09082s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28186 + Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12705154; EvalErr[0]PerSample = 0.33867188; TotalTime = 0.09065s; TotalTimePerSample = 0.03541ms; SamplesPerSecond = 28241 + Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10779419; EvalErr[0]PerSample = 0.34531250; TotalTime = 0.09078s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28200 + Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.07003021; EvalErr[0]PerSample = 0.32500000; TotalTime = 0.09075s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28210 + Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05308990; EvalErr[0]PerSample = 0.31406250; TotalTime = 0.09077s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28202 + Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06392975; 
EvalErr[0]PerSample = 0.33085938; TotalTime = 0.09070s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28224 + Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14430847; EvalErr[0]PerSample = 0.35507813; TotalTime = 0.09084s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28180 + Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14809570; EvalErr[0]PerSample = 0.35859375; TotalTime = 0.09087s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28173 + Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08184509; EvalErr[0]PerSample = 0.33515625; TotalTime = 0.09083s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28185 + Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.07637024; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28189 + Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06249695; EvalErr[0]PerSample = 0.32500000; TotalTime = 0.09063s; TotalTimePerSample = 0.03540ms; SamplesPerSecond = 28247 + Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09361877; EvalErr[0]PerSample = 0.33320312; TotalTime = 0.09059s; TotalTimePerSample = 0.03539ms; SamplesPerSecond = 28257 + Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12118530; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.09067s; TotalTimePerSample = 0.03542ms; SamplesPerSecond = 28233 + Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13457642; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.09075s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28209 + Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09024963; EvalErr[0]PerSample = 0.33984375; TotalTime = 0.09067s; TotalTimePerSample = 0.03542ms; 
SamplesPerSecond = 28235 + Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.07457275; EvalErr[0]PerSample = 0.33164063; TotalTime = 0.09070s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28225 + Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05975952; EvalErr[0]PerSample = 0.32070312; TotalTime = 0.09076s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28205 + Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09778137; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.09082s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28186 + Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.01963196; EvalErr[0]PerSample = 0.32539062; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28190 + Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.07533875; EvalErr[0]PerSample = 0.33515625; TotalTime = 0.09069s; TotalTimePerSample = 0.03542ms; SamplesPerSecond = 28228 + Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06417236; EvalErr[0]PerSample = 0.33007812; TotalTime = 0.09071s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28221 + Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.04990234; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.08542s; TotalTimePerSample = 0.03337ms; SamplesPerSecond = 29970 +Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1123269; EvalErrPerSample = 0.34179688; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.909345 CNTKCommandTrainEnd: DPT_Pre2 @@ -1981,6 +2282,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 
9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -2032,7 +2356,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] -Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. 
Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL2.W = LearnableParameter -> [512, 512] @@ -2108,6 +2432,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node 
ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2159,7 +2507,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 9 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2232,6 +2580,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) 
-> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -2315,7 +2686,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node cr. 3 nodes to process in pass 2. +Validating for node cr. 12 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2403,6 +2774,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -2464,6 +2863,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = 
Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2526,6 +2954,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = 
LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2587,6 +3044,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -2647,6 +3132,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 12 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> 
[132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -2686,7 +3199,7 @@ htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/gl ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames -Starting from checkpoint. Load Network From File /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0. +Starting from checkpoint. Load Network From File /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0. Printing Gradient Computation Node Order ... @@ -2833,6 +3346,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -2894,7 +3435,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] -Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2. Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL3.W = LearnableParameter -> [512, 512] @@ -2985,6 +3526,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, 
MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -3046,7 +3616,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 12 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -3134,6 +3704,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -3173,105 +3771,105 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms Starting minibatch loop. - Epoch[ 1 of 4]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 3.96939201; EvalErr[0]PerSample = 0.81250000; TotalTime = 0.20433s; TotalTimePerSample = 0.07982ms; SamplesPerSecond = 12528 - Epoch[ 1 of 4]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.64767342; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.19765s; TotalTimePerSample = 0.07721ms; SamplesPerSecond = 12952 - Epoch[ 1 of 4]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.02707901; EvalErr[0]PerSample = 0.53867188; TotalTime = 0.20620s; TotalTimePerSample = 0.08055ms; SamplesPerSecond = 12415 - Epoch[ 1 of 4]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.74281921; EvalErr[0]PerSample = 0.47343750; TotalTime = 0.19865s; TotalTimePerSample = 0.07760ms; SamplesPerSecond = 12886 - Epoch[ 1 of 4]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.58044128; EvalErr[0]PerSample = 0.45156250; TotalTime = 0.19802s; TotalTimePerSample = 0.07735ms; SamplesPerSecond = 12928 - Epoch[ 1 of 4]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.47565231; EvalErr[0]PerSample = 0.41757813; TotalTime = 0.19878s; TotalTimePerSample = 0.07765ms; SamplesPerSecond = 12878 - Epoch[ 1 of 4]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.43280945; EvalErr[0]PerSample = 0.41132812; TotalTime = 0.17110s; TotalTimePerSample = 0.06684ms; SamplesPerSecond = 14961 - Epoch[ 1 of 4]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.35942993; EvalErr[0]PerSample = 0.39531250; TotalTime = 0.16138s; TotalTimePerSample = 0.06304ms; SamplesPerSecond = 15862 - Epoch[ 1 of 4]-Minibatch[ 81- 90 of 320]: 
SamplesSeen = 2560; TrainLossPerSample = 1.28088837; EvalErr[0]PerSample = 0.37812500; TotalTime = 0.16122s; TotalTimePerSample = 0.06298ms; SamplesPerSecond = 15879 - Epoch[ 1 of 4]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.29705811; EvalErr[0]PerSample = 0.39570312; TotalTime = 0.16121s; TotalTimePerSample = 0.06297ms; SamplesPerSecond = 15879 + Epoch[ 1 of 4]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 3.97086334; EvalErr[0]PerSample = 0.81445312; TotalTime = 0.11658s; TotalTimePerSample = 0.04554ms; SamplesPerSecond = 21959 + Epoch[ 1 of 4]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.63975830; EvalErr[0]PerSample = 0.63320312; TotalTime = 0.11338s; TotalTimePerSample = 0.04429ms; SamplesPerSecond = 22579 + Epoch[ 1 of 4]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.02565231; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.11354s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22546 + Epoch[ 1 of 4]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.74204865; EvalErr[0]PerSample = 0.47500000; TotalTime = 0.11328s; TotalTimePerSample = 0.04425ms; SamplesPerSecond = 22599 + Epoch[ 1 of 4]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.58343964; EvalErr[0]PerSample = 0.45156250; TotalTime = 0.11348s; TotalTimePerSample = 0.04433ms; SamplesPerSecond = 22559 + Epoch[ 1 of 4]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.47893143; EvalErr[0]PerSample = 0.42343750; TotalTime = 0.11351s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22553 + Epoch[ 1 of 4]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.43405457; EvalErr[0]PerSample = 0.40898438; TotalTime = 0.11369s; TotalTimePerSample = 0.04441ms; SamplesPerSecond = 22517 + Epoch[ 1 of 4]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.35973663; EvalErr[0]PerSample = 0.39648438; TotalTime = 0.11353s; 
TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22548 + Epoch[ 1 of 4]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28108978; EvalErr[0]PerSample = 0.37968750; TotalTime = 0.11343s; TotalTimePerSample = 0.04431ms; SamplesPerSecond = 22568 + Epoch[ 1 of 4]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.29773560; EvalErr[0]PerSample = 0.39765625; TotalTime = 0.11329s; TotalTimePerSample = 0.04426ms; SamplesPerSecond = 22596 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 1 of 4]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28361969; EvalErr[0]PerSample = 0.39101562; TotalTime = 0.16147s; TotalTimePerSample = 0.06308ms; SamplesPerSecond = 15853 - Epoch[ 1 of 4]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27552490; EvalErr[0]PerSample = 0.38515625; TotalTime = 0.16204s; TotalTimePerSample = 0.06330ms; SamplesPerSecond = 15798 - Epoch[ 1 of 4]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23978882; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.16103s; TotalTimePerSample = 0.06290ms; SamplesPerSecond = 15897 - Epoch[ 1 of 4]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.31328888; EvalErr[0]PerSample = 0.38593750; TotalTime = 0.16089s; TotalTimePerSample = 0.06285ms; SamplesPerSecond = 15911 - Epoch[ 1 of 4]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.25646362; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.18754s; TotalTimePerSample = 0.07326ms; SamplesPerSecond = 13650 - Epoch[ 1 of 4]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27446442; EvalErr[0]PerSample = 0.38398437; TotalTime = 0.19911s; TotalTimePerSample = 0.07778ms; SamplesPerSecond = 12857 - Epoch[ 1 of 4]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20181580; EvalErr[0]PerSample = 0.36289063; TotalTime = 
0.19988s; TotalTimePerSample = 0.07808ms; SamplesPerSecond = 12807 - Epoch[ 1 of 4]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20729980; EvalErr[0]PerSample = 0.36796875; TotalTime = 0.19928s; TotalTimePerSample = 0.07784ms; SamplesPerSecond = 12846 - Epoch[ 1 of 4]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20639648; EvalErr[0]PerSample = 0.36914062; TotalTime = 0.19860s; TotalTimePerSample = 0.07758ms; SamplesPerSecond = 12890 - Epoch[ 1 of 4]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20577698; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.17294s; TotalTimePerSample = 0.06755ms; SamplesPerSecond = 14803 - Epoch[ 1 of 4]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20345459; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.16089s; TotalTimePerSample = 0.06285ms; SamplesPerSecond = 15911 - Epoch[ 1 of 4]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14157104; EvalErr[0]PerSample = 0.34609375; TotalTime = 0.16185s; TotalTimePerSample = 0.06322ms; SamplesPerSecond = 15817 - Epoch[ 1 of 4]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14772339; EvalErr[0]PerSample = 0.35351562; TotalTime = 0.16116s; TotalTimePerSample = 0.06295ms; SamplesPerSecond = 15884 - Epoch[ 1 of 4]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19301453; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.16144s; TotalTimePerSample = 0.06306ms; SamplesPerSecond = 15857 - Epoch[ 1 of 4]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16928101; EvalErr[0]PerSample = 0.36406250; TotalTime = 0.16115s; TotalTimePerSample = 0.06295ms; SamplesPerSecond = 15885 - Epoch[ 1 of 4]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08552246; EvalErr[0]PerSample = 0.34062500; TotalTime = 0.16084s; TotalTimePerSample = 0.06283ms; SamplesPerSecond = 15916 - Epoch[ 1 of 
4]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11441040; EvalErr[0]PerSample = 0.33945313; TotalTime = 0.20073s; TotalTimePerSample = 0.07841ms; SamplesPerSecond = 12753 - Epoch[ 1 of 4]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17764893; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.20118s; TotalTimePerSample = 0.07859ms; SamplesPerSecond = 12724 - Epoch[ 1 of 4]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11296692; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.19897s; TotalTimePerSample = 0.07772ms; SamplesPerSecond = 12866 - Epoch[ 1 of 4]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13165283; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.19846s; TotalTimePerSample = 0.07752ms; SamplesPerSecond = 12899 - Epoch[ 1 of 4]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12458191; EvalErr[0]PerSample = 0.34570312; TotalTime = 0.19971s; TotalTimePerSample = 0.07801ms; SamplesPerSecond = 12818 - Epoch[ 1 of 4]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12154541; EvalErr[0]PerSample = 0.33906250; TotalTime = 0.19018s; TotalTimePerSample = 0.07429ms; SamplesPerSecond = 13461 -Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 1.406283; EvalErrPerSample = 0.40246582; Ave LearnRatePerSample = 0.003125000047; EpochTime=7.080416 + Epoch[ 1 of 4]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.28441925; EvalErr[0]PerSample = 0.39062500; TotalTime = 0.11343s; TotalTimePerSample = 0.04431ms; SamplesPerSecond = 22567 + Epoch[ 1 of 4]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27777252; EvalErr[0]PerSample = 0.38164063; TotalTime = 0.11341s; TotalTimePerSample = 0.04430ms; SamplesPerSecond = 22573 + Epoch[ 1 of 4]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23615112; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.11341s; 
TotalTimePerSample = 0.04430ms; SamplesPerSecond = 22573 + Epoch[ 1 of 4]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.31171112; EvalErr[0]PerSample = 0.38671875; TotalTime = 0.11351s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22552 + Epoch[ 1 of 4]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.25573883; EvalErr[0]PerSample = 0.37773438; TotalTime = 0.11337s; TotalTimePerSample = 0.04429ms; SamplesPerSecond = 22580 + Epoch[ 1 of 4]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27382965; EvalErr[0]PerSample = 0.38398437; TotalTime = 0.11349s; TotalTimePerSample = 0.04433ms; SamplesPerSecond = 22556 + Epoch[ 1 of 4]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20634155; EvalErr[0]PerSample = 0.36406250; TotalTime = 0.11336s; TotalTimePerSample = 0.04428ms; SamplesPerSecond = 22582 + Epoch[ 1 of 4]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20973816; EvalErr[0]PerSample = 0.36562500; TotalTime = 0.11355s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22546 + Epoch[ 1 of 4]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20688782; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.11352s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22550 + Epoch[ 1 of 4]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20260315; EvalErr[0]PerSample = 0.37226562; TotalTime = 0.11337s; TotalTimePerSample = 0.04429ms; SamplesPerSecond = 22580 + Epoch[ 1 of 4]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20553894; EvalErr[0]PerSample = 0.37187500; TotalTime = 0.11352s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22551 + Epoch[ 1 of 4]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14160156; EvalErr[0]PerSample = 0.34726563; TotalTime = 0.11316s; TotalTimePerSample = 0.04420ms; SamplesPerSecond = 22623 + Epoch[ 1 of 4]-Minibatch[ 221- 
230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15316467; EvalErr[0]PerSample = 0.35273437; TotalTime = 0.11336s; TotalTimePerSample = 0.04428ms; SamplesPerSecond = 22583 + Epoch[ 1 of 4]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19352417; EvalErr[0]PerSample = 0.35468750; TotalTime = 0.11343s; TotalTimePerSample = 0.04431ms; SamplesPerSecond = 22568 + Epoch[ 1 of 4]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17192078; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.11335s; TotalTimePerSample = 0.04428ms; SamplesPerSecond = 22584 + Epoch[ 1 of 4]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08281555; EvalErr[0]PerSample = 0.33867188; TotalTime = 0.11366s; TotalTimePerSample = 0.04440ms; SamplesPerSecond = 22522 + Epoch[ 1 of 4]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11028442; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.11344s; TotalTimePerSample = 0.04431ms; SamplesPerSecond = 22567 + Epoch[ 1 of 4]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17454224; EvalErr[0]PerSample = 0.35312500; TotalTime = 0.11337s; TotalTimePerSample = 0.04428ms; SamplesPerSecond = 22581 + Epoch[ 1 of 4]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11068115; EvalErr[0]PerSample = 0.34531250; TotalTime = 0.11339s; TotalTimePerSample = 0.04429ms; SamplesPerSecond = 22577 + Epoch[ 1 of 4]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12955627; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.11348s; TotalTimePerSample = 0.04433ms; SamplesPerSecond = 22559 + Epoch[ 1 of 4]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12482300; EvalErr[0]PerSample = 0.34570312; TotalTime = 0.11350s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22554 + Epoch[ 1 of 4]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12771912; EvalErr[0]PerSample = 
0.34453125; TotalTime = 0.10801s; TotalTimePerSample = 0.04219ms; SamplesPerSecond = 23701 +Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 1.4063962; EvalErrPerSample = 0.40274659; Ave LearnRatePerSample = 0.003125000047; EpochTime=4.485052 Starting Epoch 2: learning rate per sample = 0.003125 effective momentum = 0.810210 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 2 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.49601746; EvalErr[0]PerSample = 0.41562500; TotalTime = 0.23368s; TotalTimePerSample = 0.04564ms; SamplesPerSecond = 21910 - Epoch[ 2 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.33961754; EvalErr[0]PerSample = 0.39316406; TotalTime = 0.22738s; TotalTimePerSample = 0.04441ms; SamplesPerSecond = 22516 - Epoch[ 2 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.19400368; EvalErr[0]PerSample = 0.36679688; TotalTime = 0.26154s; TotalTimePerSample = 0.05108ms; SamplesPerSecond = 19576 - Epoch[ 2 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.11921158; EvalErr[0]PerSample = 0.34023437; TotalTime = 0.29191s; TotalTimePerSample = 0.05701ms; SamplesPerSecond = 17539 - Epoch[ 2 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.12285690; EvalErr[0]PerSample = 0.34140625; TotalTime = 0.29226s; TotalTimePerSample = 0.05708ms; SamplesPerSecond = 17518 - Epoch[ 2 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.13342743; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.27248s; TotalTimePerSample = 0.05322ms; SamplesPerSecond = 18790 - Epoch[ 2 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08950500; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.22818s; TotalTimePerSample = 0.04457ms; SamplesPerSecond = 22438 - Epoch[ 2 of 4]-Minibatch[ 71- 80 of 160]: 
SamplesSeen = 5120; TrainLossPerSample = 1.06079788; EvalErr[0]PerSample = 0.32363281; TotalTime = 0.22761s; TotalTimePerSample = 0.04446ms; SamplesPerSecond = 22494 - Epoch[ 2 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.18579025; EvalErr[0]PerSample = 0.36933594; TotalTime = 0.22761s; TotalTimePerSample = 0.04446ms; SamplesPerSecond = 22494 - Epoch[ 2 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08288193; EvalErr[0]PerSample = 0.34140625; TotalTime = 0.22663s; TotalTimePerSample = 0.04426ms; SamplesPerSecond = 22591 + Epoch[ 2 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.51739788; EvalErr[0]PerSample = 0.41425781; TotalTime = 0.17081s; TotalTimePerSample = 0.03336ms; SamplesPerSecond = 29974 + Epoch[ 2 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.25793457; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.16557s; TotalTimePerSample = 0.03234ms; SamplesPerSecond = 30923 + Epoch[ 2 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.18638287; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.16542s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30950 + Epoch[ 2 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.12794571; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.16542s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30950 + Epoch[ 2 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.14070625; EvalErr[0]PerSample = 0.34570312; TotalTime = 0.16550s; TotalTimePerSample = 0.03232ms; SamplesPerSecond = 30936 + Epoch[ 2 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.14582825; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.16544s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30948 + Epoch[ 2 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.11193542; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.16536s; 
TotalTimePerSample = 0.03230ms; SamplesPerSecond = 30963 + Epoch[ 2 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08574600; EvalErr[0]PerSample = 0.33789062; TotalTime = 0.16542s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30951 + Epoch[ 2 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.21058884; EvalErr[0]PerSample = 0.37363281; TotalTime = 0.16562s; TotalTimePerSample = 0.03235ms; SamplesPerSecond = 30914 + Epoch[ 2 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09668579; EvalErr[0]PerSample = 0.34335938; TotalTime = 0.16571s; TotalTimePerSample = 0.03237ms; SamplesPerSecond = 30897 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 2 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05400925; EvalErr[0]PerSample = 0.32578125; TotalTime = 0.25990s; TotalTimePerSample = 0.05076ms; SamplesPerSecond = 19700 - Epoch[ 2 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.14049835; EvalErr[0]PerSample = 0.35664062; TotalTime = 0.29239s; TotalTimePerSample = 0.05711ms; SamplesPerSecond = 17510 - Epoch[ 2 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.11492462; EvalErr[0]PerSample = 0.34648438; TotalTime = 0.29289s; TotalTimePerSample = 0.05720ms; SamplesPerSecond = 17481 - Epoch[ 2 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07589722; EvalErr[0]PerSample = 0.32265625; TotalTime = 0.29237s; TotalTimePerSample = 0.05710ms; SamplesPerSecond = 17512 - Epoch[ 2 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04273682; EvalErr[0]PerSample = 0.32871094; TotalTime = 0.28067s; TotalTimePerSample = 0.05482ms; SamplesPerSecond = 18242 - Epoch[ 2 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05914001; EvalErr[0]PerSample = 0.32421875; TotalTime = 
0.25271s; TotalTimePerSample = 0.04936ms; SamplesPerSecond = 20260 -Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 1.1444572; EvalErrPerSample = 0.34843752; Ave LearnRatePerSample = 0.003125000047; EpochTime=4.181761 + Epoch[ 2 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05844955; EvalErr[0]PerSample = 0.32675781; TotalTime = 0.16548s; TotalTimePerSample = 0.03232ms; SamplesPerSecond = 30940 + Epoch[ 2 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.10728455; EvalErr[0]PerSample = 0.34726563; TotalTime = 0.16561s; TotalTimePerSample = 0.03235ms; SamplesPerSecond = 30916 + Epoch[ 2 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08716888; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.16526s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30981 + Epoch[ 2 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06779022; EvalErr[0]PerSample = 0.31855469; TotalTime = 0.16545s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30946 + Epoch[ 2 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04079590; EvalErr[0]PerSample = 0.32910156; TotalTime = 0.16529s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30974 + Epoch[ 2 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06249695; EvalErr[0]PerSample = 0.32968750; TotalTime = 0.15482s; TotalTimePerSample = 0.03024ms; SamplesPerSecond = 33071 +Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 1.1440711; EvalErrPerSample = 0.34866944; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.658179 Starting Epoch 3: learning rate per sample = 0.003125 effective momentum = 0.810210 minibatchiterator: epoch 2: frames [163840..245760] (first utterance at frame 163840), data subset 0 of 1, with 1 datapasses Starting minibatch loop. 
- Epoch[ 3 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.11066093; EvalErr[0]PerSample = 0.33886719; TotalTime = 0.22848s; TotalTimePerSample = 0.04462ms; SamplesPerSecond = 22409 - Epoch[ 3 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.10548515; EvalErr[0]PerSample = 0.34511719; TotalTime = 0.22788s; TotalTimePerSample = 0.04451ms; SamplesPerSecond = 22468 - Epoch[ 3 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.10001144; EvalErr[0]PerSample = 0.34902344; TotalTime = 0.22845s; TotalTimePerSample = 0.04462ms; SamplesPerSecond = 22411 - Epoch[ 3 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.12368736; EvalErr[0]PerSample = 0.33847656; TotalTime = 0.22749s; TotalTimePerSample = 0.04443ms; SamplesPerSecond = 22506 - Epoch[ 3 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.12565804; EvalErr[0]PerSample = 0.34316406; TotalTime = 0.22824s; TotalTimePerSample = 0.04458ms; SamplesPerSecond = 22432 - Epoch[ 3 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08098526; EvalErr[0]PerSample = 0.33652344; TotalTime = 0.25245s; TotalTimePerSample = 0.04931ms; SamplesPerSecond = 20281 - Epoch[ 3 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09546432; EvalErr[0]PerSample = 0.33964844; TotalTime = 0.29113s; TotalTimePerSample = 0.05686ms; SamplesPerSecond = 17586 - Epoch[ 3 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07909393; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.29145s; TotalTimePerSample = 0.05692ms; SamplesPerSecond = 17567 - Epoch[ 3 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02718582; EvalErr[0]PerSample = 0.31562500; TotalTime = 0.29116s; TotalTimePerSample = 0.05687ms; SamplesPerSecond = 17584 - Epoch[ 3 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04374771; 
EvalErr[0]PerSample = 0.31953125; TotalTime = 0.28709s; TotalTimePerSample = 0.05607ms; SamplesPerSecond = 17834 + Epoch[ 3 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.11238871; EvalErr[0]PerSample = 0.34804687; TotalTime = 0.16758s; TotalTimePerSample = 0.03273ms; SamplesPerSecond = 30552 + Epoch[ 3 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09456167; EvalErr[0]PerSample = 0.34121094; TotalTime = 0.16526s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30982 + Epoch[ 3 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.10800095; EvalErr[0]PerSample = 0.34667969; TotalTime = 0.16558s; TotalTimePerSample = 0.03234ms; SamplesPerSecond = 30921 + Epoch[ 3 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.16617966; EvalErr[0]PerSample = 0.35566406; TotalTime = 0.16543s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30949 + Epoch[ 3 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.14173546; EvalErr[0]PerSample = 0.34550781; TotalTime = 0.16551s; TotalTimePerSample = 0.03233ms; SamplesPerSecond = 30935 + Epoch[ 3 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07876015; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.16532s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30970 + Epoch[ 3 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08043213; EvalErr[0]PerSample = 0.33437500; TotalTime = 0.16507s; TotalTimePerSample = 0.03224ms; SamplesPerSecond = 31017 + Epoch[ 3 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07423630; EvalErr[0]PerSample = 0.33007812; TotalTime = 0.16543s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30948 + Epoch[ 3 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02659454; EvalErr[0]PerSample = 0.31113281; TotalTime = 0.16533s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 
30967 + Epoch[ 3 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04602737; EvalErr[0]PerSample = 0.31855469; TotalTime = 0.16517s; TotalTimePerSample = 0.03226ms; SamplesPerSecond = 30997 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 3 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05229645; EvalErr[0]PerSample = 0.33457031; TotalTime = 0.29182s; TotalTimePerSample = 0.05700ms; SamplesPerSecond = 17545 - Epoch[ 3 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08028870; EvalErr[0]PerSample = 0.33769531; TotalTime = 0.23230s; TotalTimePerSample = 0.04537ms; SamplesPerSecond = 22040 - Epoch[ 3 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05386963; EvalErr[0]PerSample = 0.31933594; TotalTime = 0.22718s; TotalTimePerSample = 0.04437ms; SamplesPerSecond = 22536 - Epoch[ 3 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02473297; EvalErr[0]PerSample = 0.32167969; TotalTime = 0.22772s; TotalTimePerSample = 0.04448ms; SamplesPerSecond = 22483 - Epoch[ 3 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04503784; EvalErr[0]PerSample = 0.33085938; TotalTime = 0.22719s; TotalTimePerSample = 0.04437ms; SamplesPerSecond = 22536 - Epoch[ 3 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01943665; EvalErr[0]PerSample = 0.32050781; TotalTime = 0.21543s; TotalTimePerSample = 0.04208ms; SamplesPerSecond = 23766 -Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 1.0729777; EvalErrPerSample = 0.33269045; Ave LearnRatePerSample = 0.003125000047; EpochTime=3.995354 + Epoch[ 3 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05524902; EvalErr[0]PerSample = 0.33613281; TotalTime = 0.16554s; TotalTimePerSample = 0.03233ms; SamplesPerSecond = 30928 + Epoch[ 3 of 4]-Minibatch[ 111- 
120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07627411; EvalErr[0]PerSample = 0.33613281; TotalTime = 0.16533s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30967 + Epoch[ 3 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05101776; EvalErr[0]PerSample = 0.31660156; TotalTime = 0.16521s; TotalTimePerSample = 0.03227ms; SamplesPerSecond = 30991 + Epoch[ 3 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.03016815; EvalErr[0]PerSample = 0.32480469; TotalTime = 0.16532s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30970 + Epoch[ 3 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04644623; EvalErr[0]PerSample = 0.32929687; TotalTime = 0.16540s; TotalTimePerSample = 0.03230ms; SamplesPerSecond = 30956 + Epoch[ 3 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02751465; EvalErr[0]PerSample = 0.32265625; TotalTime = 0.15429s; TotalTimePerSample = 0.03013ms; SamplesPerSecond = 33185 +Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 1.0759742; EvalErrPerSample = 0.33315429; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.652503 Starting Epoch 4: learning rate per sample = 0.003125 effective momentum = 0.810210 minibatchiterator: epoch 3: frames [245760..327680] (first utterance at frame 245760), data subset 0 of 1, with 1 datapasses Starting minibatch loop. 
- Epoch[ 4 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02822218; EvalErr[0]PerSample = 0.31328125; TotalTime = 0.22742s; TotalTimePerSample = 0.04442ms; SamplesPerSecond = 22513 - Epoch[ 4 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 4926; TrainLossPerSample = 1.04848684; EvalErr[0]PerSample = 0.32967925; TotalTime = 0.51921s; TotalTimePerSample = 0.10540ms; SamplesPerSecond = 9487 - Epoch[ 4 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01644306; EvalErr[0]PerSample = 0.32148437; TotalTime = 0.29076s; TotalTimePerSample = 0.05679ms; SamplesPerSecond = 17608 - Epoch[ 4 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99039593; EvalErr[0]PerSample = 0.31425781; TotalTime = 0.29139s; TotalTimePerSample = 0.05691ms; SamplesPerSecond = 17570 - Epoch[ 4 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99446030; EvalErr[0]PerSample = 0.31562500; TotalTime = 0.29187s; TotalTimePerSample = 0.05701ms; SamplesPerSecond = 17541 - Epoch[ 4 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00650482; EvalErr[0]PerSample = 0.32382813; TotalTime = 0.29209s; TotalTimePerSample = 0.05705ms; SamplesPerSecond = 17528 - Epoch[ 4 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02417755; EvalErr[0]PerSample = 0.32031250; TotalTime = 0.29194s; TotalTimePerSample = 0.05702ms; SamplesPerSecond = 17537 - Epoch[ 4 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01169128; EvalErr[0]PerSample = 0.31855469; TotalTime = 0.24643s; TotalTimePerSample = 0.04813ms; SamplesPerSecond = 20776 - Epoch[ 4 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99888992; EvalErr[0]PerSample = 0.30937500; TotalTime = 0.22709s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22546 - Epoch[ 4 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00544128; 
EvalErr[0]PerSample = 0.31113281; TotalTime = 0.22708s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22547 + Epoch[ 4 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.03003817; EvalErr[0]PerSample = 0.31289062; TotalTime = 0.16620s; TotalTimePerSample = 0.03246ms; SamplesPerSecond = 30805 + Epoch[ 4 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 4926; TrainLossPerSample = 1.04547925; EvalErr[0]PerSample = 0.32947625; TotalTime = 0.37782s; TotalTimePerSample = 0.07670ms; SamplesPerSecond = 13037 + Epoch[ 4 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01249599; EvalErr[0]PerSample = 0.32246094; TotalTime = 0.16536s; TotalTimePerSample = 0.03230ms; SamplesPerSecond = 30962 + Epoch[ 4 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99796467; EvalErr[0]PerSample = 0.31425781; TotalTime = 0.16531s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30972 + Epoch[ 4 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99781761; EvalErr[0]PerSample = 0.31464844; TotalTime = 0.16525s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30983 + Epoch[ 4 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00107079; EvalErr[0]PerSample = 0.31855469; TotalTime = 0.16515s; TotalTimePerSample = 0.03226ms; SamplesPerSecond = 31002 + Epoch[ 4 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02518806; EvalErr[0]PerSample = 0.31972656; TotalTime = 0.16521s; TotalTimePerSample = 0.03227ms; SamplesPerSecond = 30990 + Epoch[ 4 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00891876; EvalErr[0]PerSample = 0.31660156; TotalTime = 0.16531s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30972 + Epoch[ 4 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99774780; EvalErr[0]PerSample = 0.30585937; TotalTime = 0.16522s; TotalTimePerSample = 0.03227ms; SamplesPerSecond = 
30989 + Epoch[ 4 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00037842; EvalErr[0]PerSample = 0.30722656; TotalTime = 0.16522s; TotalTimePerSample = 0.03227ms; SamplesPerSecond = 30989 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 4 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01687851; EvalErr[0]PerSample = 0.31093750; TotalTime = 0.22702s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22553 - Epoch[ 4 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.03951569; EvalErr[0]PerSample = 0.32851562; TotalTime = 0.26397s; TotalTimePerSample = 0.05156ms; SamplesPerSecond = 19396 - Epoch[ 4 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.98455429; EvalErr[0]PerSample = 0.30234375; TotalTime = 0.28984s; TotalTimePerSample = 0.05661ms; SamplesPerSecond = 17664 - Epoch[ 4 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.96297150; EvalErr[0]PerSample = 0.30136719; TotalTime = 0.29115s; TotalTimePerSample = 0.05687ms; SamplesPerSecond = 17585 - Epoch[ 4 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.98015137; EvalErr[0]PerSample = 0.31054688; TotalTime = 0.29163s; TotalTimePerSample = 0.05696ms; SamplesPerSecond = 17556 - Epoch[ 4 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.97653656; EvalErr[0]PerSample = 0.29863281; TotalTime = 0.27506s; TotalTimePerSample = 0.05372ms; SamplesPerSecond = 18614 -Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 1.0051814; EvalErrPerSample = 0.31445312; Ave LearnRatePerSample = 0.003125000047; EpochTime=4.579516 + Epoch[ 4 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02586746; EvalErr[0]PerSample = 0.31816406; TotalTime = 0.16529s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30975 + Epoch[ 4 of 4]-Minibatch[ 111- 
120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06024628; EvalErr[0]PerSample = 0.33574219; TotalTime = 0.16542s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30952 + Epoch[ 4 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.98301010; EvalErr[0]PerSample = 0.30214844; TotalTime = 0.16545s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30946 + Epoch[ 4 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.96488800; EvalErr[0]PerSample = 0.30156250; TotalTime = 0.16533s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30968 + Epoch[ 4 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99069977; EvalErr[0]PerSample = 0.31640625; TotalTime = 0.16536s; TotalTimePerSample = 0.03230ms; SamplesPerSecond = 30963 + Epoch[ 4 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.97961731; EvalErr[0]PerSample = 0.29921875; TotalTime = 0.15761s; TotalTimePerSample = 0.03078ms; SamplesPerSecond = 32486 +Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 1.0073979; EvalErrPerSample = 0.31477052; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.874394 CNTKCommandTrainEnd: speechTrain COMPLETED diff --git a/Tests/Speech/DNN/DiscriminativePreTraining/baseline.windows.gpu.txt b/Tests/Speech/DNN/DiscriminativePreTraining/baseline.windows.gpu.txt index cbad133ab..9216dfc05 100644 --- a/Tests/Speech/DNN/DiscriminativePreTraining/baseline.windows.gpu.txt +++ b/Tests/Speech/DNN/DiscriminativePreTraining/baseline.windows.gpu.txt @@ -1,22 +1,22 @@ -=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining\cntk_dpt.config RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data DeviceId=0 +=== Running 
/cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/cntk_dpt.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining DeviceId=0 ------------------------------------------------------------------- Build info: - Built time: Oct 12 2015 17:58:56 - Last modified date: Sat Oct 10 19:47:14 2015 + Built time: Oct 24 2015 13:33:25 + Last modified date: Thu Oct 22 16:00:27 2015 Built by amitaga on Amitaga-Win-DT3 Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 ------------------------------------------------------------------- -running on Amitaga-Win-DT3 at 2015/10/13 02:34:55 -command line options: -configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining\cntk_dpt.config RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data DeviceId=0 +running on Amitaga-Win-DT3 at 2015/10/24 22:09:53 +command line: +E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/cntk_dpt.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining DeviceId=0 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float deviceId=$DeviceId$ command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain 
-ndlMacros=$DataDir$/ndl/macros.txt +ndlMacros=$ConfigDir$/macros.txt GlobalMean=GlobalStats/mean.363 GlobalInvStd=GlobalStats/var.363 GlobalPrior=GlobalStats/prior.132 @@ -34,7 +34,7 @@ DPT_Pre1=[ action=train modelPath=$RunDir$/models/Pre1/cntkSpeech NDLNetworkBuilder=[ - networkDescription=$DataDir$/ndl/dnn_1layer.txt + networkDescription=$ConfigDir$/dnn_1layer.txt ] ] AddLayer2=[ @@ -43,13 +43,13 @@ AddLayer2=[ NewLayer=2 CurrModel=$RunDir$/models/Pre1/cntkSpeech NewModel=$RunDir$/models/Pre2/cntkSpeech.0 - editPath=$DataDir$/ndl/add_layer.mel + editPath=$ConfigDir$/add_layer.mel ] DPT_Pre2=[ action=train modelPath=$RunDir$/models/Pre2/cntkSpeech NDLNetworkBuilder=[ - networkDescription=$DataDir$/ndl/dnn_1layer.txt + networkDescription=$ConfigDir$/dnn_1layer.txt ] ] AddLayer3=[ @@ -58,7 +58,7 @@ AddLayer3=[ NewLayer=3 CurrModel=$RunDir$/models/Pre2/cntkSpeech NewModel=$RunDir$/models/cntkSpeech.0 - editPath=$DataDir$/ndl/add_layer.mel + editPath=$ConfigDir$/add_layer.mel ] speechTrain=[ action=train @@ -66,7 +66,7 @@ speechTrain=[ deviceId=$DeviceId$ traceLevel=1 NDLNetworkBuilder=[ - networkDescription=$DataDir$/ndl/dnn.txt + networkDescription=$ConfigDir$/dnn.txt ] SGD=[ epochSize=81920 @@ -99,8 +99,9 @@ reader=[ labelType=Category ] ] -RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu +RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< @@ -109,7 +110,7 @@ DeviceId=0 precision=float deviceId=0 command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain -ndlMacros=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/macros.txt 
+ndlMacros=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/macros.txt GlobalMean=GlobalStats/mean.363 GlobalInvStd=GlobalStats/var.363 GlobalPrior=GlobalStats/prior.132 @@ -125,41 +126,41 @@ SGD=[ ] DPT_Pre1=[ action=train - modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech + modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn_1layer.txt + networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn_1layer.txt ] ] AddLayer2=[ action=edit CurrLayer=1 NewLayer=2 - CurrModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech - NewModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0 - editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/add_layer.mel + CurrModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech + NewModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0 + editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/add_layer.mel ] DPT_Pre2=[ action=train - modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech + modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn_1layer.txt + 
networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn_1layer.txt ] ] AddLayer3=[ action=edit CurrLayer=2 NewLayer=3 - CurrModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech - NewModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0 - editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/add_layer.mel + CurrModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech + NewModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0 + editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/add_layer.mel ] speechTrain=[ action=train - modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech + modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech deviceId=0 traceLevel=1 NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn.txt + networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn.txt ] SGD=[ epochSize=81920 @@ -192,8 +193,9 @@ reader=[ labelType=Category ] ] -RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu +RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< @@ -203,43 +205,44 @@ configparameters: 
cntk_dpt.config:AddLayer2=[ action=edit CurrLayer=1 NewLayer=2 - CurrModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech - NewModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0 - editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/add_layer.mel + CurrModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech + NewModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0 + editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/add_layer.mel ] configparameters: cntk_dpt.config:AddLayer3=[ action=edit CurrLayer=2 NewLayer=3 - CurrModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech - NewModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0 - editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/add_layer.mel + CurrModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech + NewModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0 + editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/add_layer.mel ] configparameters: cntk_dpt.config:command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain +configparameters: cntk_dpt.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining configparameters: cntk_dpt.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data configparameters: cntk_dpt.config:deviceId=0 configparameters: cntk_dpt.config:DPT_Pre1=[ action=train - 
modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech + modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn_1layer.txt + networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn_1layer.txt ] ] configparameters: cntk_dpt.config:DPT_Pre2=[ action=train - modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech + modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn_1layer.txt + networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn_1layer.txt ] ] configparameters: cntk_dpt.config:GlobalInvStd=GlobalStats/var.363 configparameters: cntk_dpt.config:GlobalMean=GlobalStats/mean.363 configparameters: cntk_dpt.config:GlobalPrior=GlobalStats/prior.132 -configparameters: cntk_dpt.config:ndlMacros=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/macros.txt +configparameters: cntk_dpt.config:ndlMacros=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/macros.txt configparameters: cntk_dpt.config:precision=float configparameters: cntk_dpt.config:reader=[ readerType=HTKMLFReader @@ -260,7 +263,7 @@ configparameters: cntk_dpt.config:reader=[ ] ] -configparameters: cntk_dpt.config:RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu +configparameters: cntk_dpt.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu configparameters: 
cntk_dpt.config:SGD=[ epochSize=81920 minibatchSize=256 @@ -273,11 +276,11 @@ configparameters: cntk_dpt.config:SGD=[ configparameters: cntk_dpt.config:speechTrain=[ action=train - modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech + modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech deviceId=0 traceLevel=1 NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn.txt + networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn.txt ] SGD=[ epochSize=81920 @@ -297,11 +300,11 @@ configparameters: cntk_dpt.config:traceLevel=1 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: DPT_Pre1 AddLayer2 DPT_Pre2 AddLayer3 speechTrain precision = float -CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech +CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech CNTKCommandTrainInfo: DPT_Pre1 : 2 -CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech +CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech CNTKCommandTrainInfo: DPT_Pre2 : 2 -CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech +CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech CNTKCommandTrainInfo: speechTrain : 4 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 8 CNTKCommandTrainBegin: DPT_Pre1 @@ -409,6 +412,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1] 
Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] +Validating for node cr. 6 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] + Validating for node cr, final verification. Validating --> labels = InputValue -> [132, MBSize 1] @@ -450,7 +471,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1] -Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2. 
Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL1.W = LearnableParameter -> [512, 363] @@ -511,6 +532,25 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1] +Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1] + Validating for node ScaledLogLikelihood, final verification. 
Validating --> OL.W = LearnableParameter -> [132, 512] @@ -552,7 +592,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 6 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 1] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -610,6 +650,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] +Validating for node Err. 6 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1] +Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1] + Validating for node Err, final verification. 
Validating --> labels = InputValue -> [132, MBSize 1] @@ -630,6 +688,7 @@ Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) 7 out of 15 nodes do not share the minibatch layout with the input data. +SetUniformRandomValue (GPU): creating curand object with seed 1 GetTrainCriterionNodes ... GetEvalCriterionNodes ... No PreCompute nodes found, skipping PreCompute step @@ -639,78 +698,78 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms Starting minibatch loop. - Epoch[ 1 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 3.89978218; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.70651s; TotalTimePerSample = 0.27598ms; SamplesPerSecond = 3623 - Epoch[ 1 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.96755714; EvalErr[0]PerSample = 0.72031250; TotalTime = 0.28515s; TotalTimePerSample = 0.11139ms; SamplesPerSecond = 8977 - Epoch[ 1 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.55723495; EvalErr[0]PerSample = 0.65859375; TotalTime = 0.26848s; TotalTimePerSample = 0.10488ms; SamplesPerSecond = 9535 - Epoch[ 1 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.29642715; EvalErr[0]PerSample = 0.61992187; TotalTime = 0.25356s; TotalTimePerSample = 0.09905ms; SamplesPerSecond = 10096 - Epoch[ 1 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.02396469; EvalErr[0]PerSample = 0.55117187; TotalTime = 0.24481s; TotalTimePerSample = 0.09563ms; SamplesPerSecond = 10457 - Epoch[ 1 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.87309418; EvalErr[0]PerSample = 0.51484375; TotalTime = 0.23464s; TotalTimePerSample = 0.09166ms; SamplesPerSecond = 10910 - Epoch[ 1 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.78157196; EvalErr[0]PerSample = 0.50507813; 
TotalTime = 0.22702s; TotalTimePerSample = 0.08868ms; SamplesPerSecond = 11276 - Epoch[ 1 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.75391235; EvalErr[0]PerSample = 0.50781250; TotalTime = 0.21845s; TotalTimePerSample = 0.08533ms; SamplesPerSecond = 11719 - Epoch[ 1 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.66460266; EvalErr[0]PerSample = 0.45742187; TotalTime = 0.21084s; TotalTimePerSample = 0.08236ms; SamplesPerSecond = 12142 - Epoch[ 1 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.62184296; EvalErr[0]PerSample = 0.47968750; TotalTime = 0.20613s; TotalTimePerSample = 0.08052ms; SamplesPerSecond = 12419 + Epoch[ 1 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 3.89978180; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.62266s; TotalTimePerSample = 0.24323ms; SamplesPerSecond = 4111 + Epoch[ 1 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.96755676; EvalErr[0]PerSample = 0.72031250; TotalTime = 0.30410s; TotalTimePerSample = 0.11879ms; SamplesPerSecond = 8418 + Epoch[ 1 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.55723495; EvalErr[0]PerSample = 0.65859375; TotalTime = 0.30677s; TotalTimePerSample = 0.11983ms; SamplesPerSecond = 8344 + Epoch[ 1 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.29642792; EvalErr[0]PerSample = 0.61992187; TotalTime = 0.29877s; TotalTimePerSample = 0.11671ms; SamplesPerSecond = 8568 + Epoch[ 1 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.02396469; EvalErr[0]PerSample = 0.55117187; TotalTime = 0.27956s; TotalTimePerSample = 0.10920ms; SamplesPerSecond = 9157 + Epoch[ 1 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.87309265; EvalErr[0]PerSample = 0.51484375; TotalTime = 0.26339s; TotalTimePerSample = 0.10289ms; SamplesPerSecond = 9719 + Epoch[ 1 of 2]-Minibatch[ 61- 70 
of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.78157196; EvalErr[0]PerSample = 0.50507813; TotalTime = 0.27964s; TotalTimePerSample = 0.10923ms; SamplesPerSecond = 9154 + Epoch[ 1 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.75391235; EvalErr[0]PerSample = 0.50781250; TotalTime = 0.29762s; TotalTimePerSample = 0.11626ms; SamplesPerSecond = 8601 + Epoch[ 1 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.66460266; EvalErr[0]PerSample = 0.45742187; TotalTime = 0.27883s; TotalTimePerSample = 0.10892ms; SamplesPerSecond = 9181 + Epoch[ 1 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.62184143; EvalErr[0]PerSample = 0.47968750; TotalTime = 0.26243s; TotalTimePerSample = 0.10251ms; SamplesPerSecond = 9755 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.65328217; EvalErr[0]PerSample = 0.47265625; TotalTime = 0.20100s; TotalTimePerSample = 0.07851ms; SamplesPerSecond = 12736 - Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.50686798; EvalErr[0]PerSample = 0.44921875; TotalTime = 0.20189s; TotalTimePerSample = 0.07886ms; SamplesPerSecond = 12680 - Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.46723938; EvalErr[0]PerSample = 0.42304687; TotalTime = 0.20090s; TotalTimePerSample = 0.07847ms; SamplesPerSecond = 12742 - Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.49163513; EvalErr[0]PerSample = 0.44140625; TotalTime = 0.20162s; TotalTimePerSample = 0.07876ms; SamplesPerSecond = 12697 - Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.46437683; EvalErr[0]PerSample = 0.43398437; TotalTime = 0.20111s; TotalTimePerSample = 0.07856ms; SamplesPerSecond = 12729 - Epoch[ 1 of 2]-Minibatch[ 151- 
160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.43047485; EvalErr[0]PerSample = 0.43867187; TotalTime = 0.20070s; TotalTimePerSample = 0.07840ms; SamplesPerSecond = 12755 - Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.42105103; EvalErr[0]PerSample = 0.41992188; TotalTime = 0.20147s; TotalTimePerSample = 0.07870ms; SamplesPerSecond = 12706 - Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.46536560; EvalErr[0]PerSample = 0.42460938; TotalTime = 0.20084s; TotalTimePerSample = 0.07845ms; SamplesPerSecond = 12746 - Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.47426147; EvalErr[0]PerSample = 0.44062500; TotalTime = 0.20085s; TotalTimePerSample = 0.07846ms; SamplesPerSecond = 12745 - Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.42851257; EvalErr[0]PerSample = 0.44062500; TotalTime = 0.20094s; TotalTimePerSample = 0.07849ms; SamplesPerSecond = 12740 - Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.34079895; EvalErr[0]PerSample = 0.41171875; TotalTime = 0.20082s; TotalTimePerSample = 0.07844ms; SamplesPerSecond = 12747 - Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.39476929; EvalErr[0]PerSample = 0.42773438; TotalTime = 0.20133s; TotalTimePerSample = 0.07864ms; SamplesPerSecond = 12715 - Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.40154724; EvalErr[0]PerSample = 0.41250000; TotalTime = 0.20108s; TotalTimePerSample = 0.07855ms; SamplesPerSecond = 12731 - Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.39338379; EvalErr[0]PerSample = 0.42656250; TotalTime = 0.20143s; TotalTimePerSample = 0.07868ms; SamplesPerSecond = 12709 - Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.32473145; EvalErr[0]PerSample = 
0.40117188; TotalTime = 0.20309s; TotalTimePerSample = 0.07933ms; SamplesPerSecond = 12605 - Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27008972; EvalErr[0]PerSample = 0.39960937; TotalTime = 0.24300s; TotalTimePerSample = 0.09492ms; SamplesPerSecond = 10534 - Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.32326355; EvalErr[0]PerSample = 0.39296875; TotalTime = 0.20162s; TotalTimePerSample = 0.07876ms; SamplesPerSecond = 12697 - Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.25377502; EvalErr[0]PerSample = 0.38359375; TotalTime = 0.20118s; TotalTimePerSample = 0.07859ms; SamplesPerSecond = 12725 - Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23349915; EvalErr[0]PerSample = 0.37070313; TotalTime = 0.20074s; TotalTimePerSample = 0.07842ms; SamplesPerSecond = 12752 - Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20884399; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.20064s; TotalTimePerSample = 0.07838ms; SamplesPerSecond = 12759 - Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23698425; EvalErr[0]PerSample = 0.36914063; TotalTime = 0.20078s; TotalTimePerSample = 0.07843ms; SamplesPerSecond = 12750 - Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22963867; EvalErr[0]PerSample = 0.37382813; TotalTime = 0.18087s; TotalTimePerSample = 0.07065ms; SamplesPerSecond = 14153 -Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.6516994; EvalErrPerSample = 0.46788332; Ave LearnRatePerSample = 0.003125000047; EpochTime=11.137228 + Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.65328064; EvalErr[0]PerSample = 0.47265625; TotalTime = 0.24968s; TotalTimePerSample = 0.09753ms; SamplesPerSecond = 10253 + Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: 
SamplesSeen = 2560; TrainLossPerSample = 1.50686951; EvalErr[0]PerSample = 0.44921875; TotalTime = 0.23939s; TotalTimePerSample = 0.09351ms; SamplesPerSecond = 10693 + Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.46723938; EvalErr[0]PerSample = 0.42304687; TotalTime = 0.28085s; TotalTimePerSample = 0.10971ms; SamplesPerSecond = 9115 + Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.49163513; EvalErr[0]PerSample = 0.44140625; TotalTime = 0.31287s; TotalTimePerSample = 0.12222ms; SamplesPerSecond = 8182 + Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.46437683; EvalErr[0]PerSample = 0.43398437; TotalTime = 0.29536s; TotalTimePerSample = 0.11538ms; SamplesPerSecond = 8667 + Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.43047485; EvalErr[0]PerSample = 0.43867187; TotalTime = 0.28569s; TotalTimePerSample = 0.11160ms; SamplesPerSecond = 8960 + Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.42106018; EvalErr[0]PerSample = 0.41992188; TotalTime = 0.30841s; TotalTimePerSample = 0.12047ms; SamplesPerSecond = 8300 + Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.46538086; EvalErr[0]PerSample = 0.42421875; TotalTime = 0.28988s; TotalTimePerSample = 0.11323ms; SamplesPerSecond = 8831 + Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.47427673; EvalErr[0]PerSample = 0.44062500; TotalTime = 0.30135s; TotalTimePerSample = 0.11772ms; SamplesPerSecond = 8495 + Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.42847290; EvalErr[0]PerSample = 0.44023438; TotalTime = 0.31460s; TotalTimePerSample = 0.12289ms; SamplesPerSecond = 8137 + Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.34078064; EvalErr[0]PerSample = 0.41171875; TotalTime = 
0.30250s; TotalTimePerSample = 0.11816ms; SamplesPerSecond = 8462 + Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.39474487; EvalErr[0]PerSample = 0.42734375; TotalTime = 0.28411s; TotalTimePerSample = 0.11098ms; SamplesPerSecond = 9010 + Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.40151062; EvalErr[0]PerSample = 0.41250000; TotalTime = 0.26734s; TotalTimePerSample = 0.10443ms; SamplesPerSecond = 9575 + Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.39345703; EvalErr[0]PerSample = 0.42734375; TotalTime = 0.27080s; TotalTimePerSample = 0.10578ms; SamplesPerSecond = 9453 + Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.32485046; EvalErr[0]PerSample = 0.40156250; TotalTime = 0.31168s; TotalTimePerSample = 0.12175ms; SamplesPerSecond = 8213 + Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27032471; EvalErr[0]PerSample = 0.39765625; TotalTime = 0.29405s; TotalTimePerSample = 0.11486ms; SamplesPerSecond = 8706 + Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.32375488; EvalErr[0]PerSample = 0.39257813; TotalTime = 0.28044s; TotalTimePerSample = 0.10955ms; SamplesPerSecond = 9128 + Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.25393982; EvalErr[0]PerSample = 0.38320312; TotalTime = 0.31065s; TotalTimePerSample = 0.12135ms; SamplesPerSecond = 8240 + Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23377075; EvalErr[0]PerSample = 0.36953125; TotalTime = 0.29165s; TotalTimePerSample = 0.11393ms; SamplesPerSecond = 8777 + Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20861511; EvalErr[0]PerSample = 0.35976562; TotalTime = 0.31200s; TotalTimePerSample = 0.12187ms; SamplesPerSecond = 8205 + Epoch[ 1 of 2]-Minibatch[ 301- 
310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23675232; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.29517s; TotalTimePerSample = 0.11530ms; SamplesPerSecond = 8672 + Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22960205; EvalErr[0]PerSample = 0.37460938; TotalTime = 0.24842s; TotalTimePerSample = 0.09704ms; SamplesPerSecond = 10305 +Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.6517237; EvalErrPerSample = 0.46774903; Ave LearnRatePerSample = 0.003125000047; EpochTime=14.544218 Starting Epoch 2: learning rate per sample = 0.003125 effective momentum = 0.900000 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 2 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21834393; EvalErr[0]PerSample = 0.37070313; TotalTime = 0.30891s; TotalTimePerSample = 0.12067ms; SamplesPerSecond = 8287 - Epoch[ 2 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18350792; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.29223s; TotalTimePerSample = 0.11415ms; SamplesPerSecond = 8760 - Epoch[ 2 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17285366; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.28457s; TotalTimePerSample = 0.11116ms; SamplesPerSecond = 8995 - Epoch[ 2 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20266953; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.26168s; TotalTimePerSample = 0.10222ms; SamplesPerSecond = 9782 - Epoch[ 2 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19678535; EvalErr[0]PerSample = 0.37890625; TotalTime = 0.24893s; TotalTimePerSample = 0.09724ms; SamplesPerSecond = 10283 - Epoch[ 2 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16507607; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.23976s; 
TotalTimePerSample = 0.09365ms; SamplesPerSecond = 10677 - Epoch[ 2 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13885193; EvalErr[0]PerSample = 0.34804687; TotalTime = 0.22890s; TotalTimePerSample = 0.08941ms; SamplesPerSecond = 11184 - Epoch[ 2 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19503098; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.22024s; TotalTimePerSample = 0.08603ms; SamplesPerSecond = 11623 - Epoch[ 2 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.24308472; EvalErr[0]PerSample = 0.37812500; TotalTime = 0.21525s; TotalTimePerSample = 0.08408ms; SamplesPerSecond = 11892 - Epoch[ 2 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19112320; EvalErr[0]PerSample = 0.36445312; TotalTime = 0.20867s; TotalTimePerSample = 0.08151ms; SamplesPerSecond = 12268 + Epoch[ 2 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21869726; EvalErr[0]PerSample = 0.36992188; TotalTime = 0.29967s; TotalTimePerSample = 0.11706ms; SamplesPerSecond = 8542 + Epoch[ 2 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18345709; EvalErr[0]PerSample = 0.36679688; TotalTime = 0.30109s; TotalTimePerSample = 0.11761ms; SamplesPerSecond = 8502 + Epoch[ 2 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17220440; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.30479s; TotalTimePerSample = 0.11906ms; SamplesPerSecond = 8399 + Epoch[ 2 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20035286; EvalErr[0]PerSample = 0.35781250; TotalTime = 0.30655s; TotalTimePerSample = 0.11975ms; SamplesPerSecond = 8350 + Epoch[ 2 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19499779; EvalErr[0]PerSample = 0.37460938; TotalTime = 0.31306s; TotalTimePerSample = 0.12229ms; SamplesPerSecond = 8177 + Epoch[ 2 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 
2560; TrainLossPerSample = 1.16373482; EvalErr[0]PerSample = 0.34687500; TotalTime = 0.29670s; TotalTimePerSample = 0.11590ms; SamplesPerSecond = 8628 + Epoch[ 2 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13869247; EvalErr[0]PerSample = 0.34804687; TotalTime = 0.31042s; TotalTimePerSample = 0.12126ms; SamplesPerSecond = 8246 + Epoch[ 2 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19293823; EvalErr[0]PerSample = 0.36992188; TotalTime = 0.29057s; TotalTimePerSample = 0.11350ms; SamplesPerSecond = 8810 + Epoch[ 2 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23978348; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.28823s; TotalTimePerSample = 0.11259ms; SamplesPerSecond = 8881 + Epoch[ 2 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18622742; EvalErr[0]PerSample = 0.36406250; TotalTime = 0.29621s; TotalTimePerSample = 0.11571ms; SamplesPerSecond = 8642 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. 
- Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16928406; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.20151s; TotalTimePerSample = 0.07871ms; SamplesPerSecond = 12704 - Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.24290924; EvalErr[0]PerSample = 0.38085938; TotalTime = 0.20117s; TotalTimePerSample = 0.07858ms; SamplesPerSecond = 12725 - Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17869263; EvalErr[0]PerSample = 0.35039063; TotalTime = 0.20060s; TotalTimePerSample = 0.07836ms; SamplesPerSecond = 12761 - Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21065826; EvalErr[0]PerSample = 0.36914063; TotalTime = 0.20114s; TotalTimePerSample = 0.07857ms; SamplesPerSecond = 12727 - Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19442291; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.20060s; TotalTimePerSample = 0.07836ms; SamplesPerSecond = 12761 - Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14822540; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.20130s; TotalTimePerSample = 0.07863ms; SamplesPerSecond = 12717 - Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14246521; EvalErr[0]PerSample = 0.35664062; TotalTime = 0.20146s; TotalTimePerSample = 0.07869ms; SamplesPerSecond = 12707 - Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18051453; EvalErr[0]PerSample = 0.35078125; TotalTime = 0.20061s; TotalTimePerSample = 0.07836ms; SamplesPerSecond = 12760 - Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15268555; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.20114s; TotalTimePerSample = 0.07857ms; SamplesPerSecond = 12727 - Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 
1.08914642; EvalErr[0]PerSample = 0.33945313; TotalTime = 0.20109s; TotalTimePerSample = 0.07855ms; SamplesPerSecond = 12730 - Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14708710; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.20140s; TotalTimePerSample = 0.07867ms; SamplesPerSecond = 12711 - Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17114868; EvalErr[0]PerSample = 0.35625000; TotalTime = 0.20171s; TotalTimePerSample = 0.07879ms; SamplesPerSecond = 12691 - Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19283752; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.20057s; TotalTimePerSample = 0.07835ms; SamplesPerSecond = 12763 - Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14776306; EvalErr[0]PerSample = 0.34921875; TotalTime = 0.20076s; TotalTimePerSample = 0.07842ms; SamplesPerSecond = 12751 - Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15021973; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.20060s; TotalTimePerSample = 0.07836ms; SamplesPerSecond = 12761 - Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08143616; EvalErr[0]PerSample = 0.32851562; TotalTime = 0.20156s; TotalTimePerSample = 0.07873ms; SamplesPerSecond = 12700 - Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09953003; EvalErr[0]PerSample = 0.34648438; TotalTime = 0.20113s; TotalTimePerSample = 0.07857ms; SamplesPerSecond = 12727 - Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06625977; EvalErr[0]PerSample = 0.33750000; TotalTime = 0.20081s; TotalTimePerSample = 0.07844ms; SamplesPerSecond = 12748 - Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09338989; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.20097s; TotalTimePerSample = 
0.07850ms; SamplesPerSecond = 12738 - Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15601807; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.20053s; TotalTimePerSample = 0.07833ms; SamplesPerSecond = 12766 - Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10992432; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.20125s; TotalTimePerSample = 0.07861ms; SamplesPerSecond = 12720 - Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.07529907; EvalErr[0]PerSample = 0.32890625; TotalTime = 0.18021s; TotalTimePerSample = 0.07040ms; SamplesPerSecond = 14205 -Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1596014; EvalErrPerSample = 0.35587159; Ave LearnRatePerSample = 0.003125000047; EpochTime=6.960407 + Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16710815; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.26584s; TotalTimePerSample = 0.10384ms; SamplesPerSecond = 9629 + Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.24683685; EvalErr[0]PerSample = 0.38554688; TotalTime = 0.28340s; TotalTimePerSample = 0.11070ms; SamplesPerSecond = 9033 + Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18601685; EvalErr[0]PerSample = 0.35273437; TotalTime = 0.31050s; TotalTimePerSample = 0.12129ms; SamplesPerSecond = 8244 + Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21721497; EvalErr[0]PerSample = 0.37617187; TotalTime = 0.30730s; TotalTimePerSample = 0.12004ms; SamplesPerSecond = 8330 + Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19934692; EvalErr[0]PerSample = 0.36953125; TotalTime = 0.28662s; TotalTimePerSample = 0.11196ms; SamplesPerSecond = 8931 + Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15099945; 
EvalErr[0]PerSample = 0.34257813; TotalTime = 0.27093s; TotalTimePerSample = 0.10583ms; SamplesPerSecond = 9448 + Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14984589; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.25383s; TotalTimePerSample = 0.09915ms; SamplesPerSecond = 10085 + Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19028320; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.27065s; TotalTimePerSample = 0.10572ms; SamplesPerSecond = 9458 + Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16434784; EvalErr[0]PerSample = 0.36406250; TotalTime = 0.31594s; TotalTimePerSample = 0.12341ms; SamplesPerSecond = 8102 + Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08853760; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.30973s; TotalTimePerSample = 0.12099ms; SamplesPerSecond = 8265 + Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15194244; EvalErr[0]PerSample = 0.35039063; TotalTime = 0.31183s; TotalTimePerSample = 0.12181ms; SamplesPerSecond = 8209 + Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16113434; EvalErr[0]PerSample = 0.35625000; TotalTime = 0.31285s; TotalTimePerSample = 0.12221ms; SamplesPerSecond = 8182 + Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18479004; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.29723s; TotalTimePerSample = 0.11611ms; SamplesPerSecond = 8612 + Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14554138; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.27815s; TotalTimePerSample = 0.10865ms; SamplesPerSecond = 9203 + Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15263367; EvalErr[0]PerSample = 0.35390625; TotalTime = 0.26319s; TotalTimePerSample = 0.10281ms; 
SamplesPerSecond = 9726 + Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08563538; EvalErr[0]PerSample = 0.33437500; TotalTime = 0.24924s; TotalTimePerSample = 0.09736ms; SamplesPerSecond = 10271 + Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10797424; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.23954s; TotalTimePerSample = 0.09357ms; SamplesPerSecond = 10687 + Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.07031860; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.23508s; TotalTimePerSample = 0.09183ms; SamplesPerSecond = 10889 + Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09429016; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.30298s; TotalTimePerSample = 0.11835ms; SamplesPerSecond = 8449 + Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14634094; EvalErr[0]PerSample = 0.35351563; TotalTime = 0.29330s; TotalTimePerSample = 0.11457ms; SamplesPerSecond = 8728 + Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10476990; EvalErr[0]PerSample = 0.34335938; TotalTime = 0.27493s; TotalTimePerSample = 0.10740ms; SamplesPerSecond = 9311 + Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.07355957; EvalErr[0]PerSample = 0.32695313; TotalTime = 0.23165s; TotalTimePerSample = 0.09049ms; SamplesPerSecond = 11051 +Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1603298; EvalErrPerSample = 0.35574952; Ave LearnRatePerSample = 0.003125000047; EpochTime=9.225137 CNTKCommandTrainEnd: DPT_Pre1 @@ -808,6 +867,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 
6 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -849,7 +926,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] -Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2. 
Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL1.W = LearnableParameter -> [512, 363] @@ -910,6 +987,25 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. 
Validating --> OL.W = LearnableParameter -> [132, 512] @@ -951,7 +1047,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 6 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1009,6 +1105,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 6 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -1077,7 +1191,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node cr. 3 nodes to process in pass 2. +Validating for node cr. 9 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1150,6 +1264,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating 
--> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1201,6 +1338,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = 
Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1253,6 +1414,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, 
MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1304,6 +1489,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> 
OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1354,6 +1562,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], 
OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1388,7 +1619,7 @@ htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Spe ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames -Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0. +Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0. Printing Gradient Computation Node Order ... @@ -1510,6 +1741,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 9 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1561,7 +1815,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] -Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. 
Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL2.W = LearnableParameter -> [512, 512] @@ -1637,6 +1891,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node 
ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1688,7 +1966,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 9 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -1761,6 +2039,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) 
-> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -1795,78 +2096,78 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms Starting minibatch loop. - Epoch[ 1 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 4.36024933; EvalErr[0]PerSample = 0.80703125; TotalTime = 0.25347s; TotalTimePerSample = 0.09901ms; SamplesPerSecond = 10099 - Epoch[ 1 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.80374603; EvalErr[0]PerSample = 0.67890625; TotalTime = 0.23590s; TotalTimePerSample = 0.09215ms; SamplesPerSecond = 10852 - Epoch[ 1 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.23118515; EvalErr[0]PerSample = 0.59218750; TotalTime = 0.22699s; TotalTimePerSample = 0.08867ms; SamplesPerSecond = 11277 - Epoch[ 1 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.89543457; EvalErr[0]PerSample = 0.50625000; TotalTime = 0.22699s; TotalTimePerSample = 0.08867ms; SamplesPerSecond = 11278 - Epoch[ 1 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.69047775; EvalErr[0]PerSample = 0.47460938; TotalTime = 0.22702s; TotalTimePerSample = 0.08868ms; SamplesPerSecond = 11276 - Epoch[ 1 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.58694305; EvalErr[0]PerSample = 0.45898438; TotalTime = 0.22739s; TotalTimePerSample = 0.08882ms; SamplesPerSecond = 11258 - Epoch[ 1 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 
1.48457794; EvalErr[0]PerSample = 0.43281250; TotalTime = 0.22649s; TotalTimePerSample = 0.08847ms; SamplesPerSecond = 11303 - Epoch[ 1 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.48614807; EvalErr[0]PerSample = 0.43203125; TotalTime = 0.22697s; TotalTimePerSample = 0.08866ms; SamplesPerSecond = 11278 - Epoch[ 1 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.45286255; EvalErr[0]PerSample = 0.41992188; TotalTime = 0.22646s; TotalTimePerSample = 0.08846ms; SamplesPerSecond = 11304 - Epoch[ 1 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41294861; EvalErr[0]PerSample = 0.40937500; TotalTime = 0.22741s; TotalTimePerSample = 0.08883ms; SamplesPerSecond = 11257 + Epoch[ 1 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 4.49739113; EvalErr[0]PerSample = 0.80429688; TotalTime = 0.30184s; TotalTimePerSample = 0.11790ms; SamplesPerSecond = 8481 + Epoch[ 1 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.83226433; EvalErr[0]PerSample = 0.68125000; TotalTime = 0.26951s; TotalTimePerSample = 0.10528ms; SamplesPerSecond = 9498 + Epoch[ 1 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.25921097; EvalErr[0]PerSample = 0.59921875; TotalTime = 0.25048s; TotalTimePerSample = 0.09784ms; SamplesPerSecond = 10220 + Epoch[ 1 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.91240921; EvalErr[0]PerSample = 0.51210937; TotalTime = 0.26010s; TotalTimePerSample = 0.10160ms; SamplesPerSecond = 9842 + Epoch[ 1 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.69259949; EvalErr[0]PerSample = 0.46679688; TotalTime = 0.34494s; TotalTimePerSample = 0.13474ms; SamplesPerSecond = 7421 + Epoch[ 1 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.59069672; EvalErr[0]PerSample = 0.45312500; TotalTime = 0.33370s; TotalTimePerSample = 0.13035ms; 
SamplesPerSecond = 7671 + Epoch[ 1 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.48813324; EvalErr[0]PerSample = 0.43789062; TotalTime = 0.32515s; TotalTimePerSample = 0.12701ms; SamplesPerSecond = 7873 + Epoch[ 1 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.48960571; EvalErr[0]PerSample = 0.43515625; TotalTime = 0.30350s; TotalTimePerSample = 0.11856ms; SamplesPerSecond = 8434 + Epoch[ 1 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.45628204; EvalErr[0]PerSample = 0.42187500; TotalTime = 0.28491s; TotalTimePerSample = 0.11129ms; SamplesPerSecond = 8985 + Epoch[ 1 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41567383; EvalErr[0]PerSample = 0.40820313; TotalTime = 0.27054s; TotalTimePerSample = 0.10568ms; SamplesPerSecond = 9462 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.41962891; EvalErr[0]PerSample = 0.41132812; TotalTime = 0.22724s; TotalTimePerSample = 0.08877ms; SamplesPerSecond = 11265 - Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33667145; EvalErr[0]PerSample = 0.39335938; TotalTime = 0.22657s; TotalTimePerSample = 0.08851ms; SamplesPerSecond = 11298 - Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.31494751; EvalErr[0]PerSample = 0.38281250; TotalTime = 0.22699s; TotalTimePerSample = 0.08867ms; SamplesPerSecond = 11278 - Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33030090; EvalErr[0]PerSample = 0.39648438; TotalTime = 0.22693s; TotalTimePerSample = 0.08864ms; SamplesPerSecond = 11281 - Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.31785889; EvalErr[0]PerSample = 0.38789062; TotalTime = 0.22728s; TotalTimePerSample = 0.08878ms; 
SamplesPerSecond = 11263 - Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27231445; EvalErr[0]PerSample = 0.38125000; TotalTime = 0.22750s; TotalTimePerSample = 0.08887ms; SamplesPerSecond = 11252 - Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.29524231; EvalErr[0]PerSample = 0.38359375; TotalTime = 0.22861s; TotalTimePerSample = 0.08930ms; SamplesPerSecond = 11198 - Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.32531738; EvalErr[0]PerSample = 0.39023438; TotalTime = 0.24451s; TotalTimePerSample = 0.09551ms; SamplesPerSecond = 10470 - Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33986511; EvalErr[0]PerSample = 0.41367188; TotalTime = 0.22770s; TotalTimePerSample = 0.08895ms; SamplesPerSecond = 11242 - Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.31717529; EvalErr[0]PerSample = 0.41093750; TotalTime = 0.22722s; TotalTimePerSample = 0.08876ms; SamplesPerSecond = 11266 - Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23374634; EvalErr[0]PerSample = 0.37695313; TotalTime = 0.22688s; TotalTimePerSample = 0.08862ms; SamplesPerSecond = 11283 - Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.26781921; EvalErr[0]PerSample = 0.38867188; TotalTime = 0.22676s; TotalTimePerSample = 0.08858ms; SamplesPerSecond = 11289 - Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.26815796; EvalErr[0]PerSample = 0.37890625; TotalTime = 0.22765s; TotalTimePerSample = 0.08893ms; SamplesPerSecond = 11245 - Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23847656; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.22690s; TotalTimePerSample = 0.08863ms; SamplesPerSecond = 11282 - Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; 
TrainLossPerSample = 1.21604004; EvalErr[0]PerSample = 0.36640625; TotalTime = 0.22660s; TotalTimePerSample = 0.08851ms; SamplesPerSecond = 11297 - Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18456726; EvalErr[0]PerSample = 0.36562500; TotalTime = 0.26425s; TotalTimePerSample = 0.10322ms; SamplesPerSecond = 9687 - Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.24215698; EvalErr[0]PerSample = 0.36796875; TotalTime = 0.32872s; TotalTimePerSample = 0.12841ms; SamplesPerSecond = 7787 - Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18797607; EvalErr[0]PerSample = 0.36445312; TotalTime = 0.33424s; TotalTimePerSample = 0.13056ms; SamplesPerSecond = 7659 - Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16840210; EvalErr[0]PerSample = 0.35351563; TotalTime = 0.34198s; TotalTimePerSample = 0.13359ms; SamplesPerSecond = 7485 - Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14445496; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.33989s; TotalTimePerSample = 0.13277ms; SamplesPerSecond = 7531 - Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17367554; EvalErr[0]PerSample = 0.35039063; TotalTime = 0.32337s; TotalTimePerSample = 0.12632ms; SamplesPerSecond = 7916 - Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18674622; EvalErr[0]PerSample = 0.36875000; TotalTime = 0.27449s; TotalTimePerSample = 0.10722ms; SamplesPerSecond = 9326 -Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.5058161; EvalErrPerSample = 0.42365724; Ave LearnRatePerSample = 0.003125000047; EpochTime=11.880998 + Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.42048950; EvalErr[0]PerSample = 0.41406250; TotalTime = 0.25862s; TotalTimePerSample = 0.10102ms; SamplesPerSecond = 9898 
+ Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.34279480; EvalErr[0]PerSample = 0.39726563; TotalTime = 0.24826s; TotalTimePerSample = 0.09698ms; SamplesPerSecond = 10311 + Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.31633148; EvalErr[0]PerSample = 0.38789062; TotalTime = 0.29231s; TotalTimePerSample = 0.11418ms; SamplesPerSecond = 8757 + Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33296814; EvalErr[0]PerSample = 0.39804688; TotalTime = 0.34247s; TotalTimePerSample = 0.13378ms; SamplesPerSecond = 7475 + Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.32084351; EvalErr[0]PerSample = 0.39609375; TotalTime = 0.34517s; TotalTimePerSample = 0.13483ms; SamplesPerSecond = 7416 + Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27189636; EvalErr[0]PerSample = 0.38125000; TotalTime = 0.34273s; TotalTimePerSample = 0.13388ms; SamplesPerSecond = 7469 + Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.29380188; EvalErr[0]PerSample = 0.38554688; TotalTime = 0.33912s; TotalTimePerSample = 0.13247ms; SamplesPerSecond = 7548 + Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.31463013; EvalErr[0]PerSample = 0.38984375; TotalTime = 0.32910s; TotalTimePerSample = 0.12855ms; SamplesPerSecond = 7778 + Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.33578796; EvalErr[0]PerSample = 0.40664062; TotalTime = 0.34127s; TotalTimePerSample = 0.13331ms; SamplesPerSecond = 7501 + Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.32202454; EvalErr[0]PerSample = 0.41484375; TotalTime = 0.31738s; TotalTimePerSample = 0.12398ms; SamplesPerSecond = 8066 + Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23669434; 
EvalErr[0]PerSample = 0.37460938; TotalTime = 0.32630s; TotalTimePerSample = 0.12746ms; SamplesPerSecond = 7845 + Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.27109985; EvalErr[0]PerSample = 0.38906250; TotalTime = 0.34553s; TotalTimePerSample = 0.13497ms; SamplesPerSecond = 7408 + Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.26419678; EvalErr[0]PerSample = 0.37578125; TotalTime = 0.33855s; TotalTimePerSample = 0.13224ms; SamplesPerSecond = 7561 + Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23778992; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.32655s; TotalTimePerSample = 0.12756ms; SamplesPerSecond = 7839 + Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21040344; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.34363s; TotalTimePerSample = 0.13423ms; SamplesPerSecond = 7449 + Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18387146; EvalErr[0]PerSample = 0.36562500; TotalTime = 0.33771s; TotalTimePerSample = 0.13192ms; SamplesPerSecond = 7580 + Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23827515; EvalErr[0]PerSample = 0.37148437; TotalTime = 0.32123s; TotalTimePerSample = 0.12548ms; SamplesPerSecond = 7969 + Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18418274; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.31703s; TotalTimePerSample = 0.12384ms; SamplesPerSecond = 8074 + Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16851501; EvalErr[0]PerSample = 0.35234375; TotalTime = 0.34084s; TotalTimePerSample = 0.13314ms; SamplesPerSecond = 7510 + Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14337463; EvalErr[0]PerSample = 0.34375000; TotalTime = 0.34387s; TotalTimePerSample = 0.13432ms; 
SamplesPerSecond = 7444 + Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17227478; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.34417s; TotalTimePerSample = 0.13444ms; SamplesPerSecond = 7438 + Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18431091; EvalErr[0]PerSample = 0.36835937; TotalTime = 0.30969s; TotalTimePerSample = 0.12097ms; SamplesPerSecond = 8266 +Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.5125258; EvalErrPerSample = 0.42452392; Ave LearnRatePerSample = 0.003125000047; EpochTime=14.904725 Starting Epoch 2: learning rate per sample = 0.003125 effective momentum = 0.900000 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 2 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17634354; EvalErr[0]PerSample = 0.35351563; TotalTime = 0.34560s; TotalTimePerSample = 0.13500ms; SamplesPerSecond = 7407 - Epoch[ 2 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14589901; EvalErr[0]PerSample = 0.35664062; TotalTime = 0.34276s; TotalTimePerSample = 0.13389ms; SamplesPerSecond = 7468 - Epoch[ 2 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15637836; EvalErr[0]PerSample = 0.35117188; TotalTime = 0.32955s; TotalTimePerSample = 0.12873ms; SamplesPerSecond = 7768 - Epoch[ 2 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14355202; EvalErr[0]PerSample = 0.34179688; TotalTime = 0.33820s; TotalTimePerSample = 0.13211ms; SamplesPerSecond = 7569 - Epoch[ 2 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14718361; EvalErr[0]PerSample = 0.36093750; TotalTime = 0.33625s; TotalTimePerSample = 0.13135ms; SamplesPerSecond = 7613 - Epoch[ 2 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14584732; 
EvalErr[0]PerSample = 0.33945313; TotalTime = 0.33316s; TotalTimePerSample = 0.13014ms; SamplesPerSecond = 7684 - Epoch[ 2 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09562225; EvalErr[0]PerSample = 0.33789063; TotalTime = 0.32798s; TotalTimePerSample = 0.12812ms; SamplesPerSecond = 7805 - Epoch[ 2 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16124268; EvalErr[0]PerSample = 0.35859375; TotalTime = 0.33002s; TotalTimePerSample = 0.12892ms; SamplesPerSecond = 7757 - Epoch[ 2 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16578064; EvalErr[0]PerSample = 0.36210938; TotalTime = 0.35216s; TotalTimePerSample = 0.13756ms; SamplesPerSecond = 7269 - Epoch[ 2 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12110596; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.34197s; TotalTimePerSample = 0.13358ms; SamplesPerSecond = 7486 + Epoch[ 2 of 2]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17448177; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.32358s; TotalTimePerSample = 0.12640ms; SamplesPerSecond = 7911 + Epoch[ 2 of 2]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14536781; EvalErr[0]PerSample = 0.35664062; TotalTime = 0.32840s; TotalTimePerSample = 0.12828ms; SamplesPerSecond = 7795 + Epoch[ 2 of 2]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.15722904; EvalErr[0]PerSample = 0.34531250; TotalTime = 0.33313s; TotalTimePerSample = 0.13013ms; SamplesPerSecond = 7684 + Epoch[ 2 of 2]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14344521; EvalErr[0]PerSample = 0.34804687; TotalTime = 0.34423s; TotalTimePerSample = 0.13446ms; SamplesPerSecond = 7436 + Epoch[ 2 of 2]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14842377; EvalErr[0]PerSample = 0.36562500; TotalTime = 0.33629s; TotalTimePerSample = 0.13136ms; SamplesPerSecond = 7612 + 
Epoch[ 2 of 2]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.14489059; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.32385s; TotalTimePerSample = 0.12650ms; SamplesPerSecond = 7905 + Epoch[ 2 of 2]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09631195; EvalErr[0]PerSample = 0.33984375; TotalTime = 0.30827s; TotalTimePerSample = 0.12042ms; SamplesPerSecond = 8304 + Epoch[ 2 of 2]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16026917; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.33824s; TotalTimePerSample = 0.13212ms; SamplesPerSecond = 7568 + Epoch[ 2 of 2]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16528091; EvalErr[0]PerSample = 0.36015625; TotalTime = 0.34935s; TotalTimePerSample = 0.13646ms; SamplesPerSecond = 7327 + Epoch[ 2 of 2]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12257309; EvalErr[0]PerSample = 0.34492187; TotalTime = 0.30700s; TotalTimePerSample = 0.11992ms; SamplesPerSecond = 8338 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. 
- Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12307053; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.32871s; TotalTimePerSample = 0.12840ms; SamplesPerSecond = 7787 - Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18570023; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.32470s; TotalTimePerSample = 0.12683ms; SamplesPerSecond = 7884 - Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12998352; EvalErr[0]PerSample = 0.33789063; TotalTime = 0.33742s; TotalTimePerSample = 0.13181ms; SamplesPerSecond = 7586 - Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16931915; EvalErr[0]PerSample = 0.35429688; TotalTime = 0.34228s; TotalTimePerSample = 0.13370ms; SamplesPerSecond = 7479 - Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11923828; EvalErr[0]PerSample = 0.34335938; TotalTime = 0.33777s; TotalTimePerSample = 0.13194ms; SamplesPerSecond = 7579 - Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09157715; EvalErr[0]PerSample = 0.33789063; TotalTime = 0.34325s; TotalTimePerSample = 0.13408ms; SamplesPerSecond = 7458 - Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10869598; EvalErr[0]PerSample = 0.33945313; TotalTime = 0.34204s; TotalTimePerSample = 0.13361ms; SamplesPerSecond = 7484 - Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12568817; EvalErr[0]PerSample = 0.33515625; TotalTime = 0.36284s; TotalTimePerSample = 0.14173ms; SamplesPerSecond = 7055 - Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10168304; EvalErr[0]PerSample = 0.33945313; TotalTime = 0.33356s; TotalTimePerSample = 0.13030ms; SamplesPerSecond = 7674 - Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05705414; 
EvalErr[0]PerSample = 0.33281250; TotalTime = 0.31607s; TotalTimePerSample = 0.12347ms; SamplesPerSecond = 8099 - Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10415344; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.33578s; TotalTimePerSample = 0.13116ms; SamplesPerSecond = 7624 - Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13464966; EvalErr[0]PerSample = 0.34375000; TotalTime = 0.34351s; TotalTimePerSample = 0.13418ms; SamplesPerSecond = 7452 - Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12070007; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.32422s; TotalTimePerSample = 0.12665ms; SamplesPerSecond = 7895 - Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10966797; EvalErr[0]PerSample = 0.33671875; TotalTime = 0.33828s; TotalTimePerSample = 0.13214ms; SamplesPerSecond = 7567 - Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09540100; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.33439s; TotalTimePerSample = 0.13062ms; SamplesPerSecond = 7655 - Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05154724; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.31050s; TotalTimePerSample = 0.12129ms; SamplesPerSecond = 8244 - Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.04845581; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.29355s; TotalTimePerSample = 0.11467ms; SamplesPerSecond = 8720 - Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.03578491; EvalErr[0]PerSample = 0.32226563; TotalTime = 0.27642s; TotalTimePerSample = 0.10798ms; SamplesPerSecond = 9261 - Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05906372; EvalErr[0]PerSample = 0.32539062; TotalTime = 0.26455s; TotalTimePerSample = 0.10334ms; 
SamplesPerSecond = 9676 - Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11199341; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.25517s; TotalTimePerSample = 0.09968ms; SamplesPerSecond = 10032 - Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08496399; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.24490s; TotalTimePerSample = 0.09566ms; SamplesPerSecond = 10453 - Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.04550171; EvalErr[0]PerSample = 0.31914063; TotalTime = 0.21540s; TotalTimePerSample = 0.08414ms; SamplesPerSecond = 11884 -Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1147765; EvalErrPerSample = 0.34230956; Ave LearnRatePerSample = 0.003125000047; EpochTime=10.335151 + Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12313004; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.33577s; TotalTimePerSample = 0.13116ms; SamplesPerSecond = 7624 + Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18492050; EvalErr[0]PerSample = 0.36171875; TotalTime = 0.33462s; TotalTimePerSample = 0.13071ms; SamplesPerSecond = 7650 + Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13058014; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.31019s; TotalTimePerSample = 0.12117ms; SamplesPerSecond = 8253 + Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16725922; EvalErr[0]PerSample = 0.35781250; TotalTime = 0.29240s; TotalTimePerSample = 0.11422ms; SamplesPerSecond = 8755 + Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12244568; EvalErr[0]PerSample = 0.34648438; TotalTime = 0.37010s; TotalTimePerSample = 0.14457ms; SamplesPerSecond = 6917 + Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09480591; EvalErr[0]PerSample 
= 0.33671875; TotalTime = 0.30263s; TotalTimePerSample = 0.11822ms; SamplesPerSecond = 8459 + Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11218109; EvalErr[0]PerSample = 0.34140625; TotalTime = 0.33987s; TotalTimePerSample = 0.13276ms; SamplesPerSecond = 7532 + Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11966095; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.31333s; TotalTimePerSample = 0.12240ms; SamplesPerSecond = 8170 + Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10485687; EvalErr[0]PerSample = 0.33671875; TotalTime = 0.29728s; TotalTimePerSample = 0.11613ms; SamplesPerSecond = 8611 + Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.06019897; EvalErr[0]PerSample = 0.32617188; TotalTime = 0.27964s; TotalTimePerSample = 0.10923ms; SamplesPerSecond = 9154 + Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10600891; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.26485s; TotalTimePerSample = 0.10346ms; SamplesPerSecond = 9665 + Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13724976; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.31315s; TotalTimePerSample = 0.12232ms; SamplesPerSecond = 8175 + Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12464752; EvalErr[0]PerSample = 0.34609375; TotalTime = 0.35469s; TotalTimePerSample = 0.13855ms; SamplesPerSecond = 7217 + Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10831604; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.33481s; TotalTimePerSample = 0.13079ms; SamplesPerSecond = 7646 + Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.09707031; EvalErr[0]PerSample = 0.34023437; TotalTime = 0.33923s; TotalTimePerSample = 0.13251ms; SamplesPerSecond = 7546 + Epoch[ 
2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.04812317; EvalErr[0]PerSample = 0.32773438; TotalTime = 0.33522s; TotalTimePerSample = 0.13094ms; SamplesPerSecond = 7636 + Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.04979248; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.33766s; TotalTimePerSample = 0.13190ms; SamplesPerSecond = 7581 + Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.03223572; EvalErr[0]PerSample = 0.31835938; TotalTime = 0.31202s; TotalTimePerSample = 0.12188ms; SamplesPerSecond = 8204 + Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.05677490; EvalErr[0]PerSample = 0.32773438; TotalTime = 0.34007s; TotalTimePerSample = 0.13284ms; SamplesPerSecond = 7527 + Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10880737; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.34820s; TotalTimePerSample = 0.13601ms; SamplesPerSecond = 7352 + Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.08513489; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.29751s; TotalTimePerSample = 0.11622ms; SamplesPerSecond = 8604 + Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.04244080; EvalErr[0]PerSample = 0.31757812; TotalTime = 0.24686s; TotalTimePerSample = 0.09643ms; SamplesPerSecond = 10370 +Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1148411; EvalErrPerSample = 0.34190676; Ave LearnRatePerSample = 0.003125000047; EpochTime=10.343029 CNTKCommandTrainEnd: DPT_Pre2 @@ -1989,6 +2290,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 9 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -2040,7 +2364,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] -Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. 
Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL2.W = LearnableParameter -> [512, 512] @@ -2116,6 +2440,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node 
ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2167,7 +2515,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 9 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2240,6 +2588,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) 
-> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -2323,7 +2694,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node cr. 3 nodes to process in pass 2. +Validating for node cr. 12 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2411,6 +2782,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -2472,6 +2871,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = 
Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2534,6 +2962,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = 
LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -2595,6 +3052,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -2655,6 +3140,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 12 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> 
[132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. Validating --> labels = InputValue -> [132, MBSize 0] @@ -2694,7 +3207,7 @@ htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Spe ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames -Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0. +Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0. Printing Gradient Computation Node Order ... @@ -2841,6 +3354,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node cr. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node cr, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -2902,7 +3443,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] -Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2. +Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2. Validating --> OL.W = LearnableParameter -> [132, 512] Validating --> HL3.W = LearnableParameter -> [512, 512] @@ -2993,6 +3534,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1] Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] +Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2. + +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, 
MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> GlobalPrior = LearnableParameter -> [132, 1] +Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0] + Validating for node ScaledLogLikelihood, final verification. Validating --> OL.W = LearnableParameter -> [132, 512] @@ -3054,7 +3624,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] -Validating for node Err. 1 nodes to process in pass 2. +Validating for node Err. 12 nodes to process in pass 2. Validating --> labels = InputValue -> [132, MBSize 0] Validating --> OL.W = LearnableParameter -> [132, 512] @@ -3142,6 +3712,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1] Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] +Validating for node Err. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> OL.W = LearnableParameter -> [132, 512] +Validating --> HL3.W = LearnableParameter -> [512, 512] +Validating --> HL2.W = LearnableParameter -> [512, 512] +Validating --> HL1.W = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> GlobalMean = LearnableParameter -> [363, 1] +Validating --> GlobalInvStd = LearnableParameter -> [363, 1] +Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0] +Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0] +Validating --> HL1.b = LearnableParameter -> [512, 1] +Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0] +Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL2.b = LearnableParameter -> [512, 1] +Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0] +Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0] +Validating --> HL3.b = LearnableParameter -> [512, 1] +Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0] +Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0] +Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0] +Validating --> OL.b = LearnableParameter -> [132, 1] +Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0] +Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1] + Validating for node Err, final verification. 
Validating --> labels = InputValue -> [132, MBSize 0] @@ -3181,105 +3779,105 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms Starting minibatch loop. - Epoch[ 1 of 4]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 4.10662956; EvalErr[0]PerSample = 0.82890625; TotalTime = 0.27483s; TotalTimePerSample = 0.10736ms; SamplesPerSecond = 9314 - Epoch[ 1 of 4]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.55908470; EvalErr[0]PerSample = 0.63164062; TotalTime = 0.25240s; TotalTimePerSample = 0.09859ms; SamplesPerSecond = 10142 - Epoch[ 1 of 4]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.03446350; EvalErr[0]PerSample = 0.53906250; TotalTime = 0.25247s; TotalTimePerSample = 0.09862ms; SamplesPerSecond = 10139 - Epoch[ 1 of 4]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.73968811; EvalErr[0]PerSample = 0.47812500; TotalTime = 0.25275s; TotalTimePerSample = 0.09873ms; SamplesPerSecond = 10128 - Epoch[ 1 of 4]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.54626236; EvalErr[0]PerSample = 0.43867187; TotalTime = 0.25343s; TotalTimePerSample = 0.09900ms; SamplesPerSecond = 10101 - Epoch[ 1 of 4]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.44772797; EvalErr[0]PerSample = 0.41171875; TotalTime = 0.25254s; TotalTimePerSample = 0.09865ms; SamplesPerSecond = 10137 - Epoch[ 1 of 4]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.36287384; EvalErr[0]PerSample = 0.40937500; TotalTime = 0.35783s; TotalTimePerSample = 0.13978ms; SamplesPerSecond = 7154 - Epoch[ 1 of 4]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.36141815; EvalErr[0]PerSample = 0.39921875; TotalTime = 0.35367s; TotalTimePerSample = 0.13815ms; SamplesPerSecond = 7238 - Epoch[ 1 of 4]-Minibatch[ 81- 90 of 320]: 
SamplesSeen = 2560; TrainLossPerSample = 1.34428864; EvalErr[0]PerSample = 0.38710937; TotalTime = 0.37068s; TotalTimePerSample = 0.14480ms; SamplesPerSecond = 6906 - Epoch[ 1 of 4]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.30752716; EvalErr[0]PerSample = 0.38242188; TotalTime = 0.34724s; TotalTimePerSample = 0.13564ms; SamplesPerSecond = 7372 + Epoch[ 1 of 4]-Minibatch[ 1- 10 of 320]: SamplesSeen = 2560; TrainLossPerSample = 4.12455330; EvalErr[0]PerSample = 0.82734375; TotalTime = 0.37556s; TotalTimePerSample = 0.14670ms; SamplesPerSecond = 6816 + Epoch[ 1 of 4]-Minibatch[ 11- 20 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.55599785; EvalErr[0]PerSample = 0.63007813; TotalTime = 0.36775s; TotalTimePerSample = 0.14365ms; SamplesPerSecond = 6961 + Epoch[ 1 of 4]-Minibatch[ 21- 30 of 320]: SamplesSeen = 2560; TrainLossPerSample = 2.03516159; EvalErr[0]PerSample = 0.53945312; TotalTime = 0.38102s; TotalTimePerSample = 0.14884ms; SamplesPerSecond = 6718 + Epoch[ 1 of 4]-Minibatch[ 31- 40 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.73739853; EvalErr[0]PerSample = 0.47500000; TotalTime = 0.36620s; TotalTimePerSample = 0.14305ms; SamplesPerSecond = 6990 + Epoch[ 1 of 4]-Minibatch[ 41- 50 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.54207916; EvalErr[0]PerSample = 0.43515625; TotalTime = 0.34129s; TotalTimePerSample = 0.13332ms; SamplesPerSecond = 7500 + Epoch[ 1 of 4]-Minibatch[ 51- 60 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.44409790; EvalErr[0]PerSample = 0.41328125; TotalTime = 0.31795s; TotalTimePerSample = 0.12420ms; SamplesPerSecond = 8051 + Epoch[ 1 of 4]-Minibatch[ 61- 70 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.36059418; EvalErr[0]PerSample = 0.40898438; TotalTime = 0.34866s; TotalTimePerSample = 0.13620ms; SamplesPerSecond = 7342 + Epoch[ 1 of 4]-Minibatch[ 71- 80 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.35930023; EvalErr[0]PerSample = 0.40117188; TotalTime = 0.36478s; 
TotalTimePerSample = 0.14249ms; SamplesPerSecond = 7018 + Epoch[ 1 of 4]-Minibatch[ 81- 90 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.34254303; EvalErr[0]PerSample = 0.38632813; TotalTime = 0.35487s; TotalTimePerSample = 0.13862ms; SamplesPerSecond = 7213 + Epoch[ 1 of 4]-Minibatch[ 91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.30505676; EvalErr[0]PerSample = 0.38320312; TotalTime = 0.35420s; TotalTimePerSample = 0.13836ms; SamplesPerSecond = 7227 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 1 of 4]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.30951538; EvalErr[0]PerSample = 0.38671875; TotalTime = 0.37789s; TotalTimePerSample = 0.14761ms; SamplesPerSecond = 6774 - Epoch[ 1 of 4]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23730469; EvalErr[0]PerSample = 0.36914063; TotalTime = 0.37117s; TotalTimePerSample = 0.14499ms; SamplesPerSecond = 6897 - Epoch[ 1 of 4]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21422424; EvalErr[0]PerSample = 0.35625000; TotalTime = 0.37390s; TotalTimePerSample = 0.14606ms; SamplesPerSecond = 6846 - Epoch[ 1 of 4]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23798065; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.34378s; TotalTimePerSample = 0.13429ms; SamplesPerSecond = 7446 - Epoch[ 1 of 4]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23455658; EvalErr[0]PerSample = 0.36914063; TotalTime = 0.32221s; TotalTimePerSample = 0.12586ms; SamplesPerSecond = 7945 - Epoch[ 1 of 4]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19309692; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.30479s; TotalTimePerSample = 0.11906ms; SamplesPerSecond = 8399 - Epoch[ 1 of 4]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21521301; EvalErr[0]PerSample = 0.36679688; TotalTime = 0.29147s; 
TotalTimePerSample = 0.11386ms; SamplesPerSecond = 8782 - Epoch[ 1 of 4]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.24454651; EvalErr[0]PerSample = 0.37343750; TotalTime = 0.27830s; TotalTimePerSample = 0.10871ms; SamplesPerSecond = 9198 - Epoch[ 1 of 4]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.26795959; EvalErr[0]PerSample = 0.38750000; TotalTime = 0.26773s; TotalTimePerSample = 0.10458ms; SamplesPerSecond = 9561 - Epoch[ 1 of 4]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22294617; EvalErr[0]PerSample = 0.38085938; TotalTime = 0.26019s; TotalTimePerSample = 0.10164ms; SamplesPerSecond = 9839 - Epoch[ 1 of 4]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18269348; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.25254s; TotalTimePerSample = 0.09865ms; SamplesPerSecond = 10137 - Epoch[ 1 of 4]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.20206299; EvalErr[0]PerSample = 0.37148437; TotalTime = 0.25223s; TotalTimePerSample = 0.09853ms; SamplesPerSecond = 10149 - Epoch[ 1 of 4]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22170105; EvalErr[0]PerSample = 0.36132813; TotalTime = 0.25269s; TotalTimePerSample = 0.09871ms; SamplesPerSecond = 10131 - Epoch[ 1 of 4]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18813477; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.25354s; TotalTimePerSample = 0.09904ms; SamplesPerSecond = 10096 - Epoch[ 1 of 4]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17123108; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.31785s; TotalTimePerSample = 0.12416ms; SamplesPerSecond = 8054 - Epoch[ 1 of 4]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12467346; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.36902s; TotalTimePerSample = 0.14415ms; SamplesPerSecond = 6937 - Epoch[ 1 of 4]-Minibatch[ 261- 270 of 
320]: SamplesSeen = 2560; TrainLossPerSample = 1.19338379; EvalErr[0]PerSample = 0.36210938; TotalTime = 0.34072s; TotalTimePerSample = 0.13309ms; SamplesPerSecond = 7513 - Epoch[ 1 of 4]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13827820; EvalErr[0]PerSample = 0.34648438; TotalTime = 0.34513s; TotalTimePerSample = 0.13482ms; SamplesPerSecond = 7417 - Epoch[ 1 of 4]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12540894; EvalErr[0]PerSample = 0.33710937; TotalTime = 0.37958s; TotalTimePerSample = 0.14827ms; SamplesPerSecond = 6744 - Epoch[ 1 of 4]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.10656738; EvalErr[0]PerSample = 0.33632812; TotalTime = 0.38014s; TotalTimePerSample = 0.14849ms; SamplesPerSecond = 6734 - Epoch[ 1 of 4]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13371277; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.34945s; TotalTimePerSample = 0.13650ms; SamplesPerSecond = 7325 - Epoch[ 1 of 4]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12843018; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.33717s; TotalTimePerSample = 0.13171ms; SamplesPerSecond = 7592 -Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 1.4094871; EvalErrPerSample = 0.4010376; Ave LearnRatePerSample = 0.003125000047; EpochTime=13.636564 + Epoch[ 1 of 4]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.30881348; EvalErr[0]PerSample = 0.38476563; TotalTime = 0.38684s; TotalTimePerSample = 0.15111ms; SamplesPerSecond = 6617 + Epoch[ 1 of 4]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23755188; EvalErr[0]PerSample = 0.37304688; TotalTime = 0.38140s; TotalTimePerSample = 0.14899ms; SamplesPerSecond = 6712 + Epoch[ 1 of 4]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21070251; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.37976s; TotalTimePerSample = 0.14834ms; 
SamplesPerSecond = 6741 + Epoch[ 1 of 4]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.24008789; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.35028s; TotalTimePerSample = 0.13683ms; SamplesPerSecond = 7308 + Epoch[ 1 of 4]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.23422089; EvalErr[0]PerSample = 0.36835937; TotalTime = 0.32867s; TotalTimePerSample = 0.12839ms; SamplesPerSecond = 7789 + Epoch[ 1 of 4]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19425964; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.36256s; TotalTimePerSample = 0.14162ms; SamplesPerSecond = 7060 + Epoch[ 1 of 4]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.21415710; EvalErr[0]PerSample = 0.36289063; TotalTime = 0.36688s; TotalTimePerSample = 0.14331ms; SamplesPerSecond = 6977 + Epoch[ 1 of 4]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.24289856; EvalErr[0]PerSample = 0.37031250; TotalTime = 0.36730s; TotalTimePerSample = 0.14348ms; SamplesPerSecond = 6969 + Epoch[ 1 of 4]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.26465454; EvalErr[0]PerSample = 0.38359375; TotalTime = 0.36054s; TotalTimePerSample = 0.14083ms; SamplesPerSecond = 7100 + Epoch[ 1 of 4]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.22050476; EvalErr[0]PerSample = 0.38085938; TotalTime = 0.33458s; TotalTimePerSample = 0.13069ms; SamplesPerSecond = 7651 + Epoch[ 1 of 4]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.17745056; EvalErr[0]PerSample = 0.35507813; TotalTime = 0.34743s; TotalTimePerSample = 0.13571ms; SamplesPerSecond = 7368 + Epoch[ 1 of 4]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.19851379; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.36452s; TotalTimePerSample = 0.14239ms; SamplesPerSecond = 7022 + Epoch[ 1 of 4]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; 
TrainLossPerSample = 1.21453857; EvalErr[0]PerSample = 0.35820313; TotalTime = 0.34422s; TotalTimePerSample = 0.13446ms; SamplesPerSecond = 7437 + Epoch[ 1 of 4]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18011475; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.31584s; TotalTimePerSample = 0.12337ms; SamplesPerSecond = 8105 + Epoch[ 1 of 4]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.16693726; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.29809s; TotalTimePerSample = 0.11644ms; SamplesPerSecond = 8588 + Epoch[ 1 of 4]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12398987; EvalErr[0]PerSample = 0.35234375; TotalTime = 0.28314s; TotalTimePerSample = 0.11060ms; SamplesPerSecond = 9041 + Epoch[ 1 of 4]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.18822021; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.27412s; TotalTimePerSample = 0.10708ms; SamplesPerSecond = 9339 + Epoch[ 1 of 4]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13831482; EvalErr[0]PerSample = 0.35078125; TotalTime = 0.28390s; TotalTimePerSample = 0.11090ms; SamplesPerSecond = 9017 + Epoch[ 1 of 4]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12718811; EvalErr[0]PerSample = 0.33984375; TotalTime = 0.38089s; TotalTimePerSample = 0.14878ms; SamplesPerSecond = 6721 + Epoch[ 1 of 4]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.11155701; EvalErr[0]PerSample = 0.34179688; TotalTime = 0.38982s; TotalTimePerSample = 0.15227ms; SamplesPerSecond = 6567 + Epoch[ 1 of 4]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.13423157; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.38594s; TotalTimePerSample = 0.15076ms; SamplesPerSecond = 6633 + Epoch[ 1 of 4]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample = 1.12716675; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.33494s; 
TotalTimePerSample = 0.13084ms; SamplesPerSecond = 7643 +Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 1.4082143; EvalErrPerSample = 0.4008545; Ave LearnRatePerSample = 0.003125000047; EpochTime=16.150435 Starting Epoch 2: learning rate per sample = 0.003125 effective momentum = 0.810210 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 2 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.20010586; EvalErr[0]PerSample = 0.36894531; TotalTime = 0.62208s; TotalTimePerSample = 0.12150ms; SamplesPerSecond = 8230 - Epoch[ 2 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.15503139; EvalErr[0]PerSample = 0.34570313; TotalTime = 0.61113s; TotalTimePerSample = 0.11936ms; SamplesPerSecond = 8377 - Epoch[ 2 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09869881; EvalErr[0]PerSample = 0.33535156; TotalTime = 0.56470s; TotalTimePerSample = 0.11029ms; SamplesPerSecond = 9066 - Epoch[ 2 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09688034; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.51043s; TotalTimePerSample = 0.09969ms; SamplesPerSecond = 10030 - Epoch[ 2 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.25485992; EvalErr[0]PerSample = 0.37636719; TotalTime = 0.47167s; TotalTimePerSample = 0.09212ms; SamplesPerSecond = 10855 - Epoch[ 2 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.15979233; EvalErr[0]PerSample = 0.36191406; TotalTime = 0.44445s; TotalTimePerSample = 0.08681ms; SamplesPerSecond = 11519 - Epoch[ 2 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.14767456; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.42450s; TotalTimePerSample = 0.08291ms; SamplesPerSecond = 12061 - Epoch[ 2 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; 
TrainLossPerSample = 1.08805161; EvalErr[0]PerSample = 0.33222656; TotalTime = 0.62083s; TotalTimePerSample = 0.12126ms; SamplesPerSecond = 8246 - Epoch[ 2 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09917145; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.59906s; TotalTimePerSample = 0.11700ms; SamplesPerSecond = 8546 - Epoch[ 2 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06098633; EvalErr[0]PerSample = 0.32734375; TotalTime = 0.53178s; TotalTimePerSample = 0.10386ms; SamplesPerSecond = 9628 + Epoch[ 2 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.20089607; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.63575s; TotalTimePerSample = 0.12417ms; SamplesPerSecond = 8053 + Epoch[ 2 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.15295639; EvalErr[0]PerSample = 0.34550781; TotalTime = 0.57244s; TotalTimePerSample = 0.11180ms; SamplesPerSecond = 8944 + Epoch[ 2 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09945831; EvalErr[0]PerSample = 0.33613281; TotalTime = 0.55825s; TotalTimePerSample = 0.10903ms; SamplesPerSecond = 9171 + Epoch[ 2 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09916496; EvalErr[0]PerSample = 0.33867188; TotalTime = 0.61763s; TotalTimePerSample = 0.12063ms; SamplesPerSecond = 8289 + Epoch[ 2 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.17260475; EvalErr[0]PerSample = 0.36230469; TotalTime = 0.57866s; TotalTimePerSample = 0.11302ms; SamplesPerSecond = 8847 + Epoch[ 2 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.15717964; EvalErr[0]PerSample = 0.35820313; TotalTime = 0.61745s; TotalTimePerSample = 0.12060ms; SamplesPerSecond = 8292 + Epoch[ 2 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.14431229; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.59477s; TotalTimePerSample = 
0.11617ms; SamplesPerSecond = 8608 + Epoch[ 2 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.10515747; EvalErr[0]PerSample = 0.34394531; TotalTime = 0.58508s; TotalTimePerSample = 0.11427ms; SamplesPerSecond = 8750 + Epoch[ 2 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.15175400; EvalErr[0]PerSample = 0.35449219; TotalTime = 0.59203s; TotalTimePerSample = 0.11563ms; SamplesPerSecond = 8648 + Epoch[ 2 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.11654053; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.57091s; TotalTimePerSample = 0.11151ms; SamplesPerSecond = 8968 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 2 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.10802689; EvalErr[0]PerSample = 0.33925781; TotalTime = 0.48630s; TotalTimePerSample = 0.09498ms; SamplesPerSecond = 10528 - Epoch[ 2 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.14810791; EvalErr[0]PerSample = 0.35449219; TotalTime = 0.45579s; TotalTimePerSample = 0.08902ms; SamplesPerSecond = 11233 - Epoch[ 2 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05741577; EvalErr[0]PerSample = 0.32734375; TotalTime = 0.42856s; TotalTimePerSample = 0.08370ms; SamplesPerSecond = 11946 - Epoch[ 2 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02627869; EvalErr[0]PerSample = 0.32187500; TotalTime = 0.44359s; TotalTimePerSample = 0.08664ms; SamplesPerSecond = 11542 - Epoch[ 2 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07954559; EvalErr[0]PerSample = 0.32402344; TotalTime = 0.43958s; TotalTimePerSample = 0.08586ms; SamplesPerSecond = 11647 - Epoch[ 2 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06135712; EvalErr[0]PerSample = 0.32148437; TotalTime = 0.38011s; TotalTimePerSample 
= 0.07424ms; SamplesPerSecond = 13469 -Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 1.1151241; EvalErrPerSample = 0.34069824; Ave LearnRatePerSample = 0.003125000047; EpochTime=8.123637 + Epoch[ 2 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.11851807; EvalErr[0]PerSample = 0.34472656; TotalTime = 0.58517s; TotalTimePerSample = 0.11429ms; SamplesPerSecond = 8749 + Epoch[ 2 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.11374054; EvalErr[0]PerSample = 0.34492187; TotalTime = 0.58136s; TotalTimePerSample = 0.11355ms; SamplesPerSecond = 8806 + Epoch[ 2 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04686737; EvalErr[0]PerSample = 0.32265625; TotalTime = 0.51156s; TotalTimePerSample = 0.09991ms; SamplesPerSecond = 10008 + Epoch[ 2 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.02721252; EvalErr[0]PerSample = 0.32246094; TotalTime = 0.61121s; TotalTimePerSample = 0.11938ms; SamplesPerSecond = 8376 + Epoch[ 2 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08386230; EvalErr[0]PerSample = 0.33144531; TotalTime = 0.58963s; TotalTimePerSample = 0.11516ms; SamplesPerSecond = 8683 + Epoch[ 2 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06164856; EvalErr[0]PerSample = 0.32558594; TotalTime = 0.52979s; TotalTimePerSample = 0.10347ms; SamplesPerSecond = 9664 +Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 1.1157421; EvalErrPerSample = 0.34266359; Ave LearnRatePerSample = 0.003125000047; EpochTime=9.420168 Starting Epoch 3: learning rate per sample = 0.003125 effective momentum = 0.810210 minibatchiterator: epoch 2: frames [163840..245760] (first utterance at frame 163840), data subset 0 of 1, with 1 datapasses Starting minibatch loop. 
- Epoch[ 3 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.12565956; EvalErr[0]PerSample = 0.34511719; TotalTime = 0.58787s; TotalTimePerSample = 0.11482ms; SamplesPerSecond = 8709 - Epoch[ 3 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08568897; EvalErr[0]PerSample = 0.33847656; TotalTime = 0.52737s; TotalTimePerSample = 0.10300ms; SamplesPerSecond = 9708 - Epoch[ 3 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08227139; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.48594s; TotalTimePerSample = 0.09491ms; SamplesPerSecond = 10536 - Epoch[ 3 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09552345; EvalErr[0]PerSample = 0.33769531; TotalTime = 0.45227s; TotalTimePerSample = 0.08833ms; SamplesPerSecond = 11320 - Epoch[ 3 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07601204; EvalErr[0]PerSample = 0.33691406; TotalTime = 0.42756s; TotalTimePerSample = 0.08351ms; SamplesPerSecond = 11975 - Epoch[ 3 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05121803; EvalErr[0]PerSample = 0.33046875; TotalTime = 0.42206s; TotalTimePerSample = 0.08243ms; SamplesPerSecond = 12131 - Epoch[ 3 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.09072342; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.41994s; TotalTimePerSample = 0.08202ms; SamplesPerSecond = 12192 - Epoch[ 3 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07286148; EvalErr[0]PerSample = 0.32265625; TotalTime = 0.42157s; TotalTimePerSample = 0.08234ms; SamplesPerSecond = 12145 - Epoch[ 3 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04907379; EvalErr[0]PerSample = 0.32558594; TotalTime = 0.42022s; TotalTimePerSample = 0.08207ms; SamplesPerSecond = 12184 - Epoch[ 3 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05518036; 
EvalErr[0]PerSample = 0.32714844; TotalTime = 0.42096s; TotalTimePerSample = 0.08222ms; SamplesPerSecond = 12162 + Epoch[ 3 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.12331724; EvalErr[0]PerSample = 0.34121094; TotalTime = 0.60252s; TotalTimePerSample = 0.11768ms; SamplesPerSecond = 8497 + Epoch[ 3 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07871103; EvalErr[0]PerSample = 0.33652344; TotalTime = 0.61255s; TotalTimePerSample = 0.11964ms; SamplesPerSecond = 8358 + Epoch[ 3 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06784973; EvalErr[0]PerSample = 0.33183594; TotalTime = 0.56505s; TotalTimePerSample = 0.11036ms; SamplesPerSecond = 9061 + Epoch[ 3 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08440666; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.55108s; TotalTimePerSample = 0.10763ms; SamplesPerSecond = 9290 + Epoch[ 3 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.07466774; EvalErr[0]PerSample = 0.33320312; TotalTime = 0.58281s; TotalTimePerSample = 0.11383ms; SamplesPerSecond = 8785 + Epoch[ 3 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05427513; EvalErr[0]PerSample = 0.33125000; TotalTime = 0.59333s; TotalTimePerSample = 0.11589ms; SamplesPerSecond = 8629 + Epoch[ 3 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06873093; EvalErr[0]PerSample = 0.32773438; TotalTime = 0.60744s; TotalTimePerSample = 0.11864ms; SamplesPerSecond = 8428 + Epoch[ 3 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08097610; EvalErr[0]PerSample = 0.33007813; TotalTime = 0.53753s; TotalTimePerSample = 0.10499ms; SamplesPerSecond = 9525 + Epoch[ 3 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05431290; EvalErr[0]PerSample = 0.32792969; TotalTime = 0.48923s; TotalTimePerSample = 0.09555ms; SamplesPerSecond = 10465 + 
Epoch[ 3 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06173096; EvalErr[0]PerSample = 0.32695313; TotalTime = 0.45004s; TotalTimePerSample = 0.08790ms; SamplesPerSecond = 11376 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 3 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.03641891; EvalErr[0]PerSample = 0.32148437; TotalTime = 0.42161s; TotalTimePerSample = 0.08235ms; SamplesPerSecond = 12143 - Epoch[ 3 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.08370361; EvalErr[0]PerSample = 0.33710937; TotalTime = 0.42014s; TotalTimePerSample = 0.08206ms; SamplesPerSecond = 12186 - Epoch[ 3 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.10984344; EvalErr[0]PerSample = 0.33164063; TotalTime = 0.42099s; TotalTimePerSample = 0.08222ms; SamplesPerSecond = 12161 - Epoch[ 3 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06258087; EvalErr[0]PerSample = 0.32714844; TotalTime = 0.42017s; TotalTimePerSample = 0.08206ms; SamplesPerSecond = 12185 - Epoch[ 3 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.06584320; EvalErr[0]PerSample = 0.33671875; TotalTime = 0.42112s; TotalTimePerSample = 0.08225ms; SamplesPerSecond = 12158 - Epoch[ 3 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05016174; EvalErr[0]PerSample = 0.33183594; TotalTime = 0.41651s; TotalTimePerSample = 0.08135ms; SamplesPerSecond = 12292 -Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 1.0745478; EvalErrPerSample = 0.33234864; Ave LearnRatePerSample = 0.003125000047; EpochTime=7.18526 + Epoch[ 3 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04505692; EvalErr[0]PerSample = 0.32792969; TotalTime = 0.42087s; TotalTimePerSample = 0.08220ms; SamplesPerSecond = 12165 + Epoch[ 3 of 4]-Minibatch[ 111- 120 of 
160]: SamplesSeen = 5120; TrainLossPerSample = 1.08151245; EvalErr[0]PerSample = 0.33574219; TotalTime = 0.51025s; TotalTimePerSample = 0.09966ms; SamplesPerSecond = 10034 + Epoch[ 3 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.10628204; EvalErr[0]PerSample = 0.33437500; TotalTime = 0.61115s; TotalTimePerSample = 0.11936ms; SamplesPerSecond = 8377 + Epoch[ 3 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05827026; EvalErr[0]PerSample = 0.32636719; TotalTime = 0.61738s; TotalTimePerSample = 0.12058ms; SamplesPerSecond = 8293 + Epoch[ 3 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.05841064; EvalErr[0]PerSample = 0.33574219; TotalTime = 0.62912s; TotalTimePerSample = 0.12287ms; SamplesPerSecond = 8138 + Epoch[ 3 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04437714; EvalErr[0]PerSample = 0.32773438; TotalTime = 0.53966s; TotalTimePerSample = 0.10540ms; SamplesPerSecond = 9487 +Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 1.0714306; EvalErrPerSample = 0.33178711; Ave LearnRatePerSample = 0.003125000047; EpochTime=9.000243 Starting Epoch 4: learning rate per sample = 0.003125 effective momentum = 0.810210 minibatchiterator: epoch 3: frames [245760..327680] (first utterance at frame 245760), data subset 0 of 1, with 1 datapasses Starting minibatch loop. 
- Epoch[ 4 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04058170; EvalErr[0]PerSample = 0.32382813; TotalTime = 0.61361s; TotalTimePerSample = 0.11985ms; SamplesPerSecond = 8344 - Epoch[ 4 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 4926; TrainLossPerSample = 1.03329491; EvalErr[0]PerSample = 0.31465692; TotalTime = 1.68883s; TotalTimePerSample = 0.34284ms; SamplesPerSecond = 2916 - Epoch[ 4 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01529274; EvalErr[0]PerSample = 0.31835938; TotalTime = 0.45719s; TotalTimePerSample = 0.08929ms; SamplesPerSecond = 11198 - Epoch[ 4 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00803413; EvalErr[0]PerSample = 0.31679687; TotalTime = 0.42910s; TotalTimePerSample = 0.08381ms; SamplesPerSecond = 11932 - Epoch[ 4 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01046181; EvalErr[0]PerSample = 0.31894531; TotalTime = 0.42059s; TotalTimePerSample = 0.08215ms; SamplesPerSecond = 12173 - Epoch[ 4 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99893723; EvalErr[0]PerSample = 0.31367187; TotalTime = 0.42160s; TotalTimePerSample = 0.08234ms; SamplesPerSecond = 12144 - Epoch[ 4 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99259148; EvalErr[0]PerSample = 0.30644531; TotalTime = 0.42045s; TotalTimePerSample = 0.08212ms; SamplesPerSecond = 12177 - Epoch[ 4 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01776657; EvalErr[0]PerSample = 0.31914063; TotalTime = 0.42189s; TotalTimePerSample = 0.08240ms; SamplesPerSecond = 12135 - Epoch[ 4 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99872665; EvalErr[0]PerSample = 0.31503906; TotalTime = 0.42067s; TotalTimePerSample = 0.08216ms; SamplesPerSecond = 12171 - Epoch[ 4 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.97249756; 
EvalErr[0]PerSample = 0.31191406; TotalTime = 0.42106s; TotalTimePerSample = 0.08224ms; SamplesPerSecond = 12159 + Epoch[ 4 of 4]-Minibatch[ 1- 10 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.04450397; EvalErr[0]PerSample = 0.33125000; TotalTime = 0.60059s; TotalTimePerSample = 0.11730ms; SamplesPerSecond = 8524 + Epoch[ 4 of 4]-Minibatch[ 11- 20 of 160]: SamplesSeen = 4926; TrainLossPerSample = 1.02895867; EvalErr[0]PerSample = 0.31567194; TotalTime = 1.93158s; TotalTimePerSample = 0.39212ms; SamplesPerSecond = 2550 + Epoch[ 4 of 4]-Minibatch[ 21- 30 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00198059; EvalErr[0]PerSample = 0.31601563; TotalTime = 0.56293s; TotalTimePerSample = 0.10995ms; SamplesPerSecond = 9095 + Epoch[ 4 of 4]-Minibatch[ 31- 40 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00561543; EvalErr[0]PerSample = 0.31777344; TotalTime = 0.59339s; TotalTimePerSample = 0.11590ms; SamplesPerSecond = 8628 + Epoch[ 4 of 4]-Minibatch[ 41- 50 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00148926; EvalErr[0]PerSample = 0.31601563; TotalTime = 0.61272s; TotalTimePerSample = 0.11967ms; SamplesPerSecond = 8356 + Epoch[ 4 of 4]-Minibatch[ 51- 60 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00593376; EvalErr[0]PerSample = 0.31406250; TotalTime = 0.53038s; TotalTimePerSample = 0.10359ms; SamplesPerSecond = 9653 + Epoch[ 4 of 4]-Minibatch[ 61- 70 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.98752327; EvalErr[0]PerSample = 0.30722656; TotalTime = 0.48194s; TotalTimePerSample = 0.09413ms; SamplesPerSecond = 10623 + Epoch[ 4 of 4]-Minibatch[ 71- 80 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01428757; EvalErr[0]PerSample = 0.31992188; TotalTime = 0.44727s; TotalTimePerSample = 0.08736ms; SamplesPerSecond = 11447 + Epoch[ 4 of 4]-Minibatch[ 81- 90 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99691544; EvalErr[0]PerSample = 0.31621094; TotalTime = 0.48183s; TotalTimePerSample = 0.09411ms; SamplesPerSecond = 10626 + 
Epoch[ 4 of 4]-Minibatch[ 91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.96604996; EvalErr[0]PerSample = 0.30937500; TotalTime = 0.59759s; TotalTimePerSample = 0.11672ms; SamplesPerSecond = 8567 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 4 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99847946; EvalErr[0]PerSample = 0.30937500; TotalTime = 0.42112s; TotalTimePerSample = 0.08225ms; SamplesPerSecond = 12157 - Epoch[ 4 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99825592; EvalErr[0]PerSample = 0.30859375; TotalTime = 0.42117s; TotalTimePerSample = 0.08226ms; SamplesPerSecond = 12156 - Epoch[ 4 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.01118851; EvalErr[0]PerSample = 0.31523438; TotalTime = 0.42075s; TotalTimePerSample = 0.08218ms; SamplesPerSecond = 12168 - Epoch[ 4 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99189148; EvalErr[0]PerSample = 0.31132813; TotalTime = 0.42094s; TotalTimePerSample = 0.08222ms; SamplesPerSecond = 12163 - Epoch[ 4 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.95366974; EvalErr[0]PerSample = 0.30312500; TotalTime = 0.42195s; TotalTimePerSample = 0.08241ms; SamplesPerSecond = 12134 - Epoch[ 4 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99163055; EvalErr[0]PerSample = 0.31074219; TotalTime = 0.39088s; TotalTimePerSample = 0.07634ms; SamplesPerSecond = 13098 -Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 1.0018568; EvalErrPerSample = 0.31358644; Ave LearnRatePerSample = 0.003125000047; EpochTime=8.314413 + Epoch[ 4 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.99062958; EvalErr[0]PerSample = 0.30527344; TotalTime = 0.58136s; TotalTimePerSample = 0.11355ms; SamplesPerSecond = 8806 + Epoch[ 4 of 4]-Minibatch[ 111- 120 of 
160]: SamplesSeen = 5120; TrainLossPerSample = 0.99886856; EvalErr[0]PerSample = 0.30976562; TotalTime = 0.57362s; TotalTimePerSample = 0.11203ms; SamplesPerSecond = 8925 + Epoch[ 4 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample = 1.00958328; EvalErr[0]PerSample = 0.31523438; TotalTime = 0.60384s; TotalTimePerSample = 0.11794ms; SamplesPerSecond = 8479 + Epoch[ 4 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.97942047; EvalErr[0]PerSample = 0.31171875; TotalTime = 0.60621s; TotalTimePerSample = 0.11840ms; SamplesPerSecond = 8445 + Epoch[ 4 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.94226837; EvalErr[0]PerSample = 0.30136719; TotalTime = 0.60218s; TotalTimePerSample = 0.11761ms; SamplesPerSecond = 8502 + Epoch[ 4 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample = 0.96711578; EvalErr[0]PerSample = 0.30175781; TotalTime = 0.49045s; TotalTimePerSample = 0.09579ms; SamplesPerSecond = 10439 +Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.99611807; EvalErrPerSample = 0.31303713; Ave LearnRatePerSample = 0.003125000047; EpochTime=10.396508 CNTKCommandTrainEnd: speechTrain COMPLETED diff --git a/Tests/Speech/DNN/DiscriminativePreTraining/testcases.yml b/Tests/Speech/DNN/DiscriminativePreTraining/testcases.yml index b7b5d9b27..b48f31b15 100644 --- a/Tests/Speech/DNN/DiscriminativePreTraining/testcases.yml +++ b/Tests/Speech/DNN/DiscriminativePreTraining/testcases.yml @@ -3,11 +3,7 @@ tags: # running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations: - bvt-s (flavor=='debug') ^ (device=='cpu') # running unconditionally on every Nightly job in 'S' leg - # TODO: Temporary disabling Release-GPU because of a known bug causing large variance between - # Release and Debug configurations for GPU only for this (Speech/DNN/DiscriminativePreTraining) test. - # This will be re-enabled after the bug has been addressed. 
- # DO NOT COPY this disablement for other tests!! - - nightly-s (flavor!='release') or (device!='gpu') + - nightly-s testCases: CNTK Run must be completed: diff --git a/Tests/Speech/DNN/ParallelNoQuantization/baseline.gpu.txt b/Tests/Speech/DNN/ParallelNoQuantization/baseline.gpu.txt index d77e6b08c..4bafb9779 100644 --- a/Tests/Speech/DNN/ParallelNoQuantization/baseline.gpu.txt +++ b/Tests/Speech/DNN/ParallelNoQuantization/baseline.gpu.txt @@ -1,40 +1,37 @@ -=== Running mpiexec -n 3 /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +=== Running mpiexec -n 3 /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. 
DeviceId=0 stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPIWrapper: initializing MPI MPIWrapper: initializing MPI MPIWrapper: initializing MPI ping [requestnodes (before change)]: 3 nodes pinging each other -ping [requestnodes (before change)]: 3 nodes pinging each other -ping [requestnodes (before change)]: 3 nodes pinging each other -ping [requestnodes (before change)]: all 3 nodes responded -requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (1) are in (participating) -ping [requestnodes (after change)]: 3 nodes pinging each other -ping [requestnodes (before change)]: all 3 nodes responded -requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (0) are in (participating) -ping [requestnodes (after change)]: 3 nodes pinging each other ping [requestnodes (before change)]: all 3 nodes responded requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (2) are in (participating) ping [requestnodes (after change)]: 3 nodes pinging each other ping [requestnodes (after change)]: all 3 nodes responded -mpihelper: we are cog 0 in a gearbox of 3 -ping [mpihelper]: 3 nodes pinging each other -ping [requestnodes (after change)]: all 3 nodes responded -mpihelper: we are cog 1 in a gearbox of 3 -ping [mpihelper]: 3 nodes pinging each other -ping [requestnodes (after change)]: all 3 nodes responded mpihelper: we are cog 2 in a gearbox of 3 ping [mpihelper]: 3 nodes pinging each other ping [mpihelper]: all 3 nodes responded +ping [requestnodes (before change)]: 3 nodes pinging each other +ping [requestnodes (before change)]: all 3 nodes responded +requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (0) are in (participating) +ping [requestnodes (after change)]: 3 nodes pinging each other +ping [requestnodes (after change)]: all 3 nodes responded +mpihelper: we are cog 0 in a gearbox of 3 +ping [mpihelper]: 3 nodes pinging each other ping [mpihelper]: all 3 
nodes responded +ping [requestnodes (before change)]: 3 nodes pinging each other +ping [requestnodes (before change)]: all 3 nodes responded +requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 3 nodes pinging each other +ping [requestnodes (after change)]: all 3 nodes responded +mpihelper: we are cog 1 in a gearbox of 3 +ping [mpihelper]: 3 nodes pinging each other ping [mpihelper]: all 3 nodes responded -Redirecting stderr to file /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank0 -CNTKModelPath: /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn -Redirecting stderr to file /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank1 -CNTKModelPath: /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn -Redirecting stderr to file /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank2 -CNTKModelPath: /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn -MPI Rank 0: running on localhost at 2015/10/02 13:38:52 -MPI Rank 0: command line options: -MPI Rank 0: configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +Redirecting stderr to file /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank0 +Redirecting stderr to file /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank1 +Redirecting 
stderr to file /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank2 +MPI Rank 0: running on localhost at 2015/10/24 12:56:11 +MPI Rank 0: command line: +MPI Rank 0: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. DeviceId=0 stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 0: MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 0: precision=float @@ -126,10 +123,11 @@ MPI Rank 0: labelType=Category MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] -MPI Rank 0: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu +MPI Rank 0: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu MPI Rank 0: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +MPI Rank 0: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. 
MPI Rank 0: DeviceId=0 -MPI Rank 0: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 0: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 0: MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 0: @@ -140,7 +138,7 @@ MPI Rank 0: deviceId=0 MPI Rank 0: parallelTrain=true MPI Rank 0: speechTrain=[ MPI Rank 0: action=train -MPI Rank 0: modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 0: modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 0: deviceId=0 MPI Rank 0: traceLevel=1 MPI Rank 0: SimpleNetworkBuilder=[ @@ -223,23 +221,25 @@ MPI Rank 0: labelType=Category MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] -MPI Rank 0: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu +MPI Rank 0: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu MPI Rank 0: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +MPI Rank 0: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. MPI Rank 0: DeviceId=0 -MPI Rank 0: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 0: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 0: MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 0: MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 0: configparameters: cntk.config:command=speechTrain +MPI Rank 0: configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. 
MPI Rank 0: configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data MPI Rank 0: configparameters: cntk.config:deviceId=0 MPI Rank 0: configparameters: cntk.config:parallelTrain=true MPI Rank 0: configparameters: cntk.config:precision=float -MPI Rank 0: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu +MPI Rank 0: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu MPI Rank 0: configparameters: cntk.config:speechTrain=[ MPI Rank 0: action=train -MPI Rank 0: modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 0: modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 0: deviceId=0 MPI Rank 0: traceLevel=1 MPI Rank 0: SimpleNetworkBuilder=[ @@ -323,10 +323,11 @@ MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: -MPI Rank 0: configparameters: cntk.config:stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 0: configparameters: cntk.config:stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 0: command: speechTrain MPI Rank 0: precision = float +MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 0: CNTKCommandTrainInfo: speechTrain : 3 MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 MPI Rank 0: CNTKCommandTrainBegin: speechTrain @@ -338,11 +339,12 @@ MPI Rank 0: htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Sp MPI Rank 0: ...............................................................................................feature set 0: 252734 frames in 948 
out of 948 utterances MPI Rank 0: label set 0: 129 classes MPI Rank 0: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 0: GetTrainCriterionNodes ... MPI Rank 0: GetEvalCriterionNodes ... MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node CrossEntropyWithSoftmax +MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. MPI Rank 0: MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] @@ -365,13 +367,57 @@ MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 0: +MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 0: MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 0: Found 6 PreCompute nodes -MPI Rank 0: NodeName: InvStdOfFeatures -MPI Rank 0: NodeName: MeanOfFeatures -MPI Rank 0: NodeName: Prior +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Precomputing --> 3 PreCompute nodes found. +MPI Rank 0: MPI Rank 0: NodeName: InvStdOfFeatures MPI Rank 0: NodeName: MeanOfFeatures MPI Rank 0: NodeName: Prior @@ -379,126 +425,192 @@ MPI Rank 0: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node InvStdOfFeatures +MPI Rank 0: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 0: -MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 0: +MPI Rank 0: Validating for node InvStdOfFeatures, final verification. MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 0: MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node MeanOfFeatures MPI Rank 0: -MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1] +MPI Rank 0: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 0: +MPI Rank 0: Validating for node MeanOfFeatures, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 0: MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node Prior MPI Rank 0: -MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 64] -MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1] +MPI Rank 0: Validating for node Prior. 2 nodes to process in pass 1. MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 0: +MPI Rank 0: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] +MPI Rank 0: +MPI Rank 0: Validating for node Prior, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 0: MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. +MPI Rank 0: +MPI Rank 0: Precomputing --> Completed. +MPI Rank 0: MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. 
-MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000 +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 MPI Rank 0: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 3, with 1 datapasses MPI Rank 0: -MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. MPI Rank 0: -MPI Rank 0: Validating node EvalErrorPrediction -MPI Rank 0: -MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 29] +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62] MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] -MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 29] -MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 29]) -> [363, 1] -MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 29]) -> [363, 1] -MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 29], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 29] -MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 29]) -> [512, MBSize 29] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 
363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] -MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 29], B0[512, 1]) -> [512, MBSize 29] -MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 29]) -> [512, MBSize 29] -MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 29]) -> [512, MBSize 29] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] -MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 29], B1[512, 1]) -> [512, MBSize 29] -MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 29]) -> [512, MBSize 29] -MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 29]) -> [132, MBSize 29] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] -MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 29], B2[132, 1]) -> [132, MBSize 29] -MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 29], HLast[132, MBSize 29]) -> [1, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 0: MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.32135414; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.25642s; TotalTimePerSample = 0.40065ms; SamplesPerSecond = 2495 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.15070930; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.24686s; TotalTimePerSample = 0.38571ms; SamplesPerSecond = 2592 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.99901060; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24575s; TotalTimePerSample = 0.38398ms; SamplesPerSecond = 2604 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.86945780; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24696s; TotalTimePerSample = 0.38588ms; SamplesPerSecond = 2591 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.80219517; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.24516s; TotalTimePerSample = 0.38307ms; SamplesPerSecond = 2610 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72890717; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.24462s; TotalTimePerSample = 0.38221ms; SamplesPerSecond = 2616 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.56187025; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.24416s; TotalTimePerSample = 0.38150ms; SamplesPerSecond = 2621 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.42790310; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.24566s; TotalTimePerSample = 0.38384ms; SamplesPerSecond = 2605 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33928303; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.24437s; TotalTimePerSample = 0.38184ms; SamplesPerSecond = 2618 -MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.36398734; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.24545s; TotalTimePerSample = 0.38352ms; SamplesPerSecond = 2607 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.21223679; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.24567s; TotalTimePerSample = 0.38385ms; SamplesPerSecond = 2605 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.31265333; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.24655s; TotalTimePerSample = 0.38523ms; SamplesPerSecond = 2595 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.14081673; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.24650s; TotalTimePerSample = 0.38515ms; SamplesPerSecond = 2596 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00690023; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.24591s; TotalTimePerSample = 0.38424ms; SamplesPerSecond = 2602 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00496087; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.24734s; TotalTimePerSample = 0.38648ms; SamplesPerSecond = 2587 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.97859121; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.24469s; TotalTimePerSample = 0.38233ms; SamplesPerSecond = 2615 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.85686638; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.24529s; TotalTimePerSample = 0.38327ms; SamplesPerSecond = 2609 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.24514s; TotalTimePerSample = 0.38303ms; SamplesPerSecond = 2610 -MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.78653366; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.24606s; TotalTimePerSample = 0.38447ms; SamplesPerSecond = 2600 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.57702529; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24668s; TotalTimePerSample = 0.38543ms; SamplesPerSecond = 2594 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.61570793; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.24435s; TotalTimePerSample = 0.38180ms; SamplesPerSecond = 2619 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.55235603; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.24639s; TotalTimePerSample = 0.38499ms; SamplesPerSecond = 2597 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.48211165; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.24605s; TotalTimePerSample = 0.38446ms; SamplesPerSecond = 2601 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.38778376; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.24591s; TotalTimePerSample = 0.38423ms; SamplesPerSecond = 2602 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.36900911; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24475s; TotalTimePerSample = 0.38243ms; SamplesPerSecond = 2614 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.43967760; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.24451s; TotalTimePerSample = 0.38205ms; SamplesPerSecond = 2617 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30281011; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.24558s; TotalTimePerSample = 0.38371ms; SamplesPerSecond = 2606 -MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.19669121; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.24470s; TotalTimePerSample = 0.38235ms; SamplesPerSecond = 2615 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.28979560; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.24495s; TotalTimePerSample = 0.38273ms; SamplesPerSecond = 2612 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.17750506; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.24520s; TotalTimePerSample = 0.38313ms; SamplesPerSecond = 2610 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.26264305; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.24493s; TotalTimePerSample = 0.38270ms; SamplesPerSecond = 2613 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15073149; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.24465s; TotalTimePerSample = 0.38226ms; SamplesPerSecond = 2616 -MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799568; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.871485 -MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.32135295; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.24478s; TotalTimePerSample = 0.38247ms; SamplesPerSecond = 2614 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.15070941; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.23448s; TotalTimePerSample = 0.36637ms; SamplesPerSecond = 2729 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.99901066; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22704s; TotalTimePerSample = 0.35475ms; SamplesPerSecond = 2818 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.86945816; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22872s; TotalTimePerSample = 0.35738ms; SamplesPerSecond = 2798 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.80219557; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.22743s; TotalTimePerSample = 0.35535ms; SamplesPerSecond = 2814 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72890766; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.22625s; TotalTimePerSample = 0.35352ms; SamplesPerSecond = 2828 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.56187065; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.22749s; TotalTimePerSample = 0.35546ms; SamplesPerSecond = 2813 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.42790299; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.22725s; TotalTimePerSample = 0.35508ms; SamplesPerSecond = 2816 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33928338; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.22716s; TotalTimePerSample = 0.35493ms; SamplesPerSecond = 2817 +MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.36398772; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.22771s; TotalTimePerSample = 0.35580ms; SamplesPerSecond = 2810 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.21223693; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.22768s; TotalTimePerSample = 0.35575ms; SamplesPerSecond = 2810 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.31265357; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.22736s; TotalTimePerSample = 0.35525ms; SamplesPerSecond = 2814 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.14081698; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.22896s; TotalTimePerSample = 0.35775ms; SamplesPerSecond = 2795 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00690035; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.22665s; TotalTimePerSample = 0.35414ms; SamplesPerSecond = 2823 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00496066; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.22970s; TotalTimePerSample = 0.35891ms; SamplesPerSecond = 2786 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.97859081; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.22699s; TotalTimePerSample = 0.35468ms; SamplesPerSecond = 2819 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.85686609; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.22767s; TotalTimePerSample = 0.35574ms; SamplesPerSecond = 2811 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.22778s; TotalTimePerSample = 0.35590ms; SamplesPerSecond = 2809 +MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.78653376; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.22753s; TotalTimePerSample = 0.35551ms; SamplesPerSecond = 2812 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.57702533; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22747s; TotalTimePerSample = 0.35542ms; SamplesPerSecond = 2813 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.61570805; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.22869s; TotalTimePerSample = 0.35733ms; SamplesPerSecond = 2798 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.55235582; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.22822s; TotalTimePerSample = 0.35660ms; SamplesPerSecond = 2804 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.48211151; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.22783s; TotalTimePerSample = 0.35599ms; SamplesPerSecond = 2809 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.38778372; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.22805s; TotalTimePerSample = 0.35633ms; SamplesPerSecond = 2806 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.36900902; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22808s; TotalTimePerSample = 0.35637ms; SamplesPerSecond = 2806 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.43967781; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.22820s; TotalTimePerSample = 0.35656ms; SamplesPerSecond = 2804 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30281039; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.22746s; TotalTimePerSample = 0.35541ms; SamplesPerSecond = 2813 +MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.19669146; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.22798s; TotalTimePerSample = 0.35621ms; SamplesPerSecond = 2807 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.28979581; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.22790s; TotalTimePerSample = 0.35610ms; SamplesPerSecond = 2808 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.17750535; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.22741s; TotalTimePerSample = 0.35532ms; SamplesPerSecond = 2814 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.26264398; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.22641s; TotalTimePerSample = 0.35377ms; SamplesPerSecond = 2826 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15073110; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.22676s; TotalTimePerSample = 0.35431ms; SamplesPerSecond = 2822 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799569; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.319218 +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 MPI Rank 0: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 3, with 1 datapasses MPI Rank 0: MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
-MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.01598530; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.28942s; TotalTimePerSample = 0.11306ms; SamplesPerSecond = 8845 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98818586; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.28095s; TotalTimePerSample = 0.10975ms; SamplesPerSecond = 9111 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98698123; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.28222s; TotalTimePerSample = 0.11024ms; SamplesPerSecond = 9071 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.93126298; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.27954s; TotalTimePerSample = 0.10920ms; SamplesPerSecond = 9157 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90067741; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.27987s; TotalTimePerSample = 0.10933ms; SamplesPerSecond = 9146 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.97115807; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.27910s; TotalTimePerSample = 0.10903ms; SamplesPerSecond = 9172 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89518067; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.27764s; TotalTimePerSample = 0.10846ms; SamplesPerSecond = 9220 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90450396; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.27504s; TotalTimePerSample = 0.10744ms; SamplesPerSecond = 9307 -MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=2.248479 -MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098 momentum 
= 0.656119 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.01598514; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25250s; TotalTimePerSample = 0.09863ms; SamplesPerSecond = 10138 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98818590; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.24831s; TotalTimePerSample = 0.09700ms; SamplesPerSecond = 10309 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98698122; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25203s; TotalTimePerSample = 0.09845ms; SamplesPerSecond = 10157 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.93126295; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.25003s; TotalTimePerSample = 0.09767ms; SamplesPerSecond = 10238 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90067743; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.24767s; TotalTimePerSample = 0.09675ms; SamplesPerSecond = 10336 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.97115808; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25090s; TotalTimePerSample = 0.09801ms; SamplesPerSecond = 10203 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89518061; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.24830s; TotalTimePerSample = 0.09699ms; SamplesPerSecond = 10309 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90450394; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.24457s; TotalTimePerSample = 0.09553ms; SamplesPerSecond = 10467 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=1.999658 +MPI Rank 0: Starting Epoch 3: learning rate per sample 
= 0.000098 effective momentum = 0.656119 MPI Rank 0: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 3, with 1 datapasses MPI Rank 0: MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. -MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87359851; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.44176s; TotalTimePerSample = 0.04314ms; SamplesPerSecond = 23179 -MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86656277; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.42652s; TotalTimePerSample = 0.04165ms; SamplesPerSecond = 24008 -MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.881328 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87359841; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.36717s; TotalTimePerSample = 0.03586ms; SamplesPerSecond = 27888 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86656271; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.34559s; TotalTimePerSample = 0.03375ms; SamplesPerSecond = 29630 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.728516 MPI Rank 0: CNTKCommandTrainEnd: speechTrain MPI Rank 0: COMPLETED MPI Rank 0: ~MPIWrapper -MPI Rank 1: running on localhost at 2015/10/02 13:38:53 -MPI Rank 1: command line options: -MPI Rank 1: configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data 
DeviceId=0 stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 1: running on localhost at 2015/10/24 12:56:12 +MPI Rank 1: command line: +MPI Rank 1: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. DeviceId=0 stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 1: MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 1: precision=float @@ -590,10 +702,11 @@ MPI Rank 1: labelType=Category MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] -MPI Rank 1: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu +MPI Rank 1: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu MPI Rank 1: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +MPI Rank 1: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. 
MPI Rank 1: DeviceId=0 -MPI Rank 1: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 1: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 1: MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 1: @@ -604,7 +717,7 @@ MPI Rank 1: deviceId=0 MPI Rank 1: parallelTrain=true MPI Rank 1: speechTrain=[ MPI Rank 1: action=train -MPI Rank 1: modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 1: modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 1: deviceId=0 MPI Rank 1: traceLevel=1 MPI Rank 1: SimpleNetworkBuilder=[ @@ -687,23 +800,25 @@ MPI Rank 1: labelType=Category MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] -MPI Rank 1: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu +MPI Rank 1: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu MPI Rank 1: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +MPI Rank 1: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. MPI Rank 1: DeviceId=0 -MPI Rank 1: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 1: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 1: MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 1: MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 1: configparameters: cntk.config:command=speechTrain +MPI Rank 1: configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. 
MPI Rank 1: configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data MPI Rank 1: configparameters: cntk.config:deviceId=0 MPI Rank 1: configparameters: cntk.config:parallelTrain=true MPI Rank 1: configparameters: cntk.config:precision=float -MPI Rank 1: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu +MPI Rank 1: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu MPI Rank 1: configparameters: cntk.config:speechTrain=[ MPI Rank 1: action=train -MPI Rank 1: modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 1: modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 1: deviceId=0 MPI Rank 1: traceLevel=1 MPI Rank 1: SimpleNetworkBuilder=[ @@ -787,10 +902,11 @@ MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: -MPI Rank 1: configparameters: cntk.config:stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 1: configparameters: cntk.config:stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 1: command: speechTrain MPI Rank 1: precision = float +MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 1: CNTKCommandTrainInfo: speechTrain : 3 MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 MPI Rank 1: CNTKCommandTrainBegin: speechTrain @@ -802,11 +918,12 @@ MPI Rank 1: htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Sp MPI Rank 1: ...............................................................................................feature set 0: 252734 frames in 948 
out of 948 utterances MPI Rank 1: label set 0: 129 classes MPI Rank 1: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 1: GetTrainCriterionNodes ... MPI Rank 1: GetEvalCriterionNodes ... MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node CrossEntropyWithSoftmax +MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. MPI Rank 1: MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] @@ -829,13 +946,57 @@ MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 1: +MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 1: MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 1: Found 6 PreCompute nodes -MPI Rank 1: NodeName: InvStdOfFeatures -MPI Rank 1: NodeName: MeanOfFeatures -MPI Rank 1: NodeName: Prior +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Precomputing --> 3 PreCompute nodes found. +MPI Rank 1: MPI Rank 1: NodeName: InvStdOfFeatures MPI Rank 1: NodeName: MeanOfFeatures MPI Rank 1: NodeName: Prior @@ -843,126 +1004,192 @@ MPI Rank 1: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node InvStdOfFeatures +MPI Rank 1: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 1: -MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 1: +MPI Rank 1: Validating for node InvStdOfFeatures, final verification. MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 1: MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node MeanOfFeatures MPI Rank 1: -MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1] +MPI Rank 1: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 1: +MPI Rank 1: Validating for node MeanOfFeatures, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 1: MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node Prior MPI Rank 1: -MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 64] -MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1] +MPI Rank 1: Validating for node Prior. 2 nodes to process in pass 1. MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 1: +MPI Rank 1: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] +MPI Rank 1: +MPI Rank 1: Validating for node Prior, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 1: MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. +MPI Rank 1: +MPI Rank 1: Precomputing --> Completed. +MPI Rank 1: MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. 
-MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000 +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 MPI Rank 1: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 1 of 3, with 1 datapasses MPI Rank 1: -MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. MPI Rank 1: -MPI Rank 1: Validating node EvalErrorPrediction -MPI Rank 1: -MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 28] +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62] MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] -MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 28] -MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 28]) -> [363, 1] -MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 28]) -> [363, 1] -MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 28], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 28] -MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 28]) -> [512, MBSize 28] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 
363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] -MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 28], B0[512, 1]) -> [512, MBSize 28] -MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 28]) -> [512, MBSize 28] -MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 28]) -> [512, MBSize 28] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] -MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 28], B1[512, 1]) -> [512, MBSize 28] -MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 28]) -> [512, MBSize 28] -MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 28]) -> [132, MBSize 28] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] -MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 28], B2[132, 1]) -> [132, MBSize 28] -MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 28], HLast[132, MBSize 28]) -> [1, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 1: MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.32135414; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.25660s; TotalTimePerSample = 0.40093ms; SamplesPerSecond = 2494 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.15070930; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.24685s; TotalTimePerSample = 0.38571ms; SamplesPerSecond = 2592 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.99901060; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24575s; TotalTimePerSample = 0.38398ms; SamplesPerSecond = 2604 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.86945780; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24696s; TotalTimePerSample = 0.38587ms; SamplesPerSecond = 2591 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.80219517; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.24515s; TotalTimePerSample = 0.38305ms; SamplesPerSecond = 2610 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72890717; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.24461s; TotalTimePerSample = 0.38220ms; SamplesPerSecond = 2616 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.56187025; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.24416s; TotalTimePerSample = 0.38149ms; SamplesPerSecond = 2621 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.42790310; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.24565s; TotalTimePerSample = 0.38382ms; SamplesPerSecond = 2605 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33928303; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.24435s; TotalTimePerSample = 0.38179ms; SamplesPerSecond = 2619 -MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.36398734; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.24545s; TotalTimePerSample = 0.38352ms; SamplesPerSecond = 2607 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.21223679; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.24566s; TotalTimePerSample = 0.38384ms; SamplesPerSecond = 2605 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.31265333; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.24655s; TotalTimePerSample = 0.38523ms; SamplesPerSecond = 2595 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.14081673; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.24648s; TotalTimePerSample = 0.38513ms; SamplesPerSecond = 2596 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00690023; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.24592s; TotalTimePerSample = 0.38424ms; SamplesPerSecond = 2602 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00496087; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.24733s; TotalTimePerSample = 0.38646ms; SamplesPerSecond = 2587 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.97859121; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.24469s; TotalTimePerSample = 0.38232ms; SamplesPerSecond = 2615 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.85686638; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.24529s; TotalTimePerSample = 0.38326ms; SamplesPerSecond = 2609 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.24513s; TotalTimePerSample = 0.38302ms; SamplesPerSecond = 2610 -MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.78653366; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.24605s; TotalTimePerSample = 0.38446ms; SamplesPerSecond = 2601 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.57702529; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24667s; TotalTimePerSample = 0.38542ms; SamplesPerSecond = 2594 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.61570793; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.24435s; TotalTimePerSample = 0.38179ms; SamplesPerSecond = 2619 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.55235603; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.24639s; TotalTimePerSample = 0.38498ms; SamplesPerSecond = 2597 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.48211165; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.24605s; TotalTimePerSample = 0.38445ms; SamplesPerSecond = 2601 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.38778376; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.24590s; TotalTimePerSample = 0.38422ms; SamplesPerSecond = 2602 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.36900911; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24475s; TotalTimePerSample = 0.38242ms; SamplesPerSecond = 2614 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.43967760; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.24451s; TotalTimePerSample = 0.38204ms; SamplesPerSecond = 2617 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30281011; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.24557s; TotalTimePerSample = 0.38370ms; SamplesPerSecond = 2606 -MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.19669121; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.24469s; TotalTimePerSample = 0.38234ms; SamplesPerSecond = 2615 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.28979560; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.24494s; TotalTimePerSample = 0.38272ms; SamplesPerSecond = 2612 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.17750506; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.24520s; TotalTimePerSample = 0.38312ms; SamplesPerSecond = 2610 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.26264305; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.24493s; TotalTimePerSample = 0.38270ms; SamplesPerSecond = 2613 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15073149; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.24494s; TotalTimePerSample = 0.38271ms; SamplesPerSecond = 2612 -MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799568; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.871133 -MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.32135295; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.24521s; TotalTimePerSample = 0.38315ms; SamplesPerSecond = 2609 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.15070941; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.23443s; TotalTimePerSample = 0.36629ms; SamplesPerSecond = 2730 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.99901066; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22705s; TotalTimePerSample = 0.35477ms; SamplesPerSecond = 2818 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.86945816; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22773s; TotalTimePerSample = 0.35583ms; SamplesPerSecond = 2810 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.80219557; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.22741s; TotalTimePerSample = 0.35533ms; SamplesPerSecond = 2814 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72890766; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.22725s; TotalTimePerSample = 0.35507ms; SamplesPerSecond = 2816 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.56187065; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.22534s; TotalTimePerSample = 0.35209ms; SamplesPerSecond = 2840 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.42790299; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.22941s; TotalTimePerSample = 0.35846ms; SamplesPerSecond = 2789 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33928338; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.22614s; TotalTimePerSample = 0.35334ms; SamplesPerSecond = 2830 +MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.36398772; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.22771s; TotalTimePerSample = 0.35580ms; SamplesPerSecond = 2810 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.21223693; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.22797s; TotalTimePerSample = 0.35620ms; SamplesPerSecond = 2807 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.31265357; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.22806s; TotalTimePerSample = 0.35634ms; SamplesPerSecond = 2806 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.14081698; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.22797s; TotalTimePerSample = 0.35621ms; SamplesPerSecond = 2807 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00690035; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.22873s; TotalTimePerSample = 0.35740ms; SamplesPerSecond = 2798 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00496066; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.22763s; TotalTimePerSample = 0.35567ms; SamplesPerSecond = 2811 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.97859081; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.22697s; TotalTimePerSample = 0.35465ms; SamplesPerSecond = 2819 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.85686609; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.22767s; TotalTimePerSample = 0.35573ms; SamplesPerSecond = 2811 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.22776s; TotalTimePerSample = 0.35587ms; SamplesPerSecond = 2810 +MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.78653376; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.22751s; TotalTimePerSample = 0.35548ms; SamplesPerSecond = 2813 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.57702533; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22746s; TotalTimePerSample = 0.35541ms; SamplesPerSecond = 2813 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.61570805; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.22868s; TotalTimePerSample = 0.35731ms; SamplesPerSecond = 2798 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.55235582; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.22821s; TotalTimePerSample = 0.35658ms; SamplesPerSecond = 2804 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.48211151; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.22782s; TotalTimePerSample = 0.35597ms; SamplesPerSecond = 2809 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.38778372; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.22683s; TotalTimePerSample = 0.35443ms; SamplesPerSecond = 2821 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.36900902; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.23027s; TotalTimePerSample = 0.35980ms; SamplesPerSecond = 2779 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.43967781; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.22756s; TotalTimePerSample = 0.35556ms; SamplesPerSecond = 2812 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30281039; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.22812s; TotalTimePerSample = 0.35643ms; SamplesPerSecond = 2805 +MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.19669146; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.22729s; TotalTimePerSample = 0.35514ms; SamplesPerSecond = 2815 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.28979581; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.22758s; TotalTimePerSample = 0.35560ms; SamplesPerSecond = 2812 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.17750535; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.22775s; TotalTimePerSample = 0.35585ms; SamplesPerSecond = 2810 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.26264398; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.22606s; TotalTimePerSample = 0.35322ms; SamplesPerSecond = 2831 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15073110; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.22674s; TotalTimePerSample = 0.35429ms; SamplesPerSecond = 2822 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799569; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.319114 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 MPI Rank 1: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 1 of 3, with 1 datapasses MPI Rank 1: MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
-MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.01598530; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.29109s; TotalTimePerSample = 0.11371ms; SamplesPerSecond = 8794 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98818586; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.28094s; TotalTimePerSample = 0.10974ms; SamplesPerSecond = 9112 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98698123; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.28221s; TotalTimePerSample = 0.11024ms; SamplesPerSecond = 9071 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.93126298; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.27954s; TotalTimePerSample = 0.10919ms; SamplesPerSecond = 9157 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90067741; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.27987s; TotalTimePerSample = 0.10932ms; SamplesPerSecond = 9147 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.97115807; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.27910s; TotalTimePerSample = 0.10902ms; SamplesPerSecond = 9172 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89518067; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.27764s; TotalTimePerSample = 0.10845ms; SamplesPerSecond = 9220 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90450396; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.27505s; TotalTimePerSample = 0.10744ms; SamplesPerSecond = 9307 -MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=2.248133 -MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098 momentum 
= 0.656119 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.01598514; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25371s; TotalTimePerSample = 0.09911ms; SamplesPerSecond = 10090 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98818590; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.24939s; TotalTimePerSample = 0.09742ms; SamplesPerSecond = 10265 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98698122; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25094s; TotalTimePerSample = 0.09802ms; SamplesPerSecond = 10201 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.93126295; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.24942s; TotalTimePerSample = 0.09743ms; SamplesPerSecond = 10263 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90067743; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.24938s; TotalTimePerSample = 0.09741ms; SamplesPerSecond = 10265 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.97115808; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.24886s; TotalTimePerSample = 0.09721ms; SamplesPerSecond = 10286 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89518061; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.24865s; TotalTimePerSample = 0.09713ms; SamplesPerSecond = 10295 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90450394; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.24518s; TotalTimePerSample = 0.09577ms; SamplesPerSecond = 10441 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=1.999721 +MPI Rank 1: Starting Epoch 3: learning rate per sample 
= 0.000098 effective momentum = 0.656119 MPI Rank 1: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 1 of 3, with 1 datapasses MPI Rank 1: MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. -MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87359851; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.44649s; TotalTimePerSample = 0.04360ms; SamplesPerSecond = 22934 -MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86656277; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.42652s; TotalTimePerSample = 0.04165ms; SamplesPerSecond = 24008 -MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.880982 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87359841; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.36750s; TotalTimePerSample = 0.03589ms; SamplesPerSecond = 27863 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86656271; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.34559s; TotalTimePerSample = 0.03375ms; SamplesPerSecond = 29630 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.728446 MPI Rank 1: CNTKCommandTrainEnd: speechTrain MPI Rank 1: COMPLETED MPI Rank 1: ~MPIWrapper -MPI Rank 2: running on localhost at 2015/10/02 13:38:53 -MPI Rank 2: command line options: -MPI Rank 2: configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data 
DeviceId=0 stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 2: running on localhost at 2015/10/24 12:56:12 +MPI Rank 2: command line: +MPI Rank 2: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. DeviceId=0 stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 2: MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 2: precision=float @@ -1054,10 +1281,11 @@ MPI Rank 2: labelType=Category MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] -MPI Rank 2: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu +MPI Rank 2: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu MPI Rank 2: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +MPI Rank 2: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. 
MPI Rank 2: DeviceId=0 -MPI Rank 2: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 2: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 2: MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 2: @@ -1068,7 +1296,7 @@ MPI Rank 2: deviceId=0 MPI Rank 2: parallelTrain=true MPI Rank 2: speechTrain=[ MPI Rank 2: action=train -MPI Rank 2: modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 2: modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 2: deviceId=0 MPI Rank 2: traceLevel=1 MPI Rank 2: SimpleNetworkBuilder=[ @@ -1151,23 +1379,25 @@ MPI Rank 2: labelType=Category MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] -MPI Rank 2: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu +MPI Rank 2: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu MPI Rank 2: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +MPI Rank 2: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. MPI Rank 2: DeviceId=0 -MPI Rank 2: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 2: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 2: MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 2: MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 2: configparameters: cntk.config:command=speechTrain +MPI Rank 2: configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. 
MPI Rank 2: configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data MPI Rank 2: configparameters: cntk.config:deviceId=0 MPI Rank 2: configparameters: cntk.config:parallelTrain=true MPI Rank 2: configparameters: cntk.config:precision=float -MPI Rank 2: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu +MPI Rank 2: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu MPI Rank 2: configparameters: cntk.config:speechTrain=[ MPI Rank 2: action=train -MPI Rank 2: modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 2: modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 2: deviceId=0 MPI Rank 2: traceLevel=1 MPI Rank 2: SimpleNetworkBuilder=[ @@ -1251,10 +1481,11 @@ MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: -MPI Rank 2: configparameters: cntk.config:stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 2: configparameters: cntk.config:stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 2: command: speechTrain MPI Rank 2: precision = float +MPI Rank 2: CNTKModelPath: /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 2: CNTKCommandTrainInfo: speechTrain : 3 MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 MPI Rank 2: CNTKCommandTrainBegin: speechTrain @@ -1266,11 +1497,12 @@ MPI Rank 2: htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Sp MPI Rank 2: ...............................................................................................feature set 0: 252734 frames in 
948 out of 948 utterances MPI Rank 2: label set 0: 129 classes MPI Rank 2: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 2: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 2: GetTrainCriterionNodes ... MPI Rank 2: GetEvalCriterionNodes ... MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node CrossEntropyWithSoftmax +MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. MPI Rank 2: MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] @@ -1293,13 +1525,57 @@ MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 2: +MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 2: MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 2: Found 6 PreCompute nodes -MPI Rank 2: NodeName: InvStdOfFeatures -MPI Rank 2: NodeName: MeanOfFeatures -MPI Rank 2: NodeName: Prior +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Precomputing --> 3 PreCompute nodes found. +MPI Rank 2: MPI Rank 2: NodeName: InvStdOfFeatures MPI Rank 2: NodeName: MeanOfFeatures MPI Rank 2: NodeName: Prior @@ -1307,120 +1583,186 @@ MPI Rank 2: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f MPI Rank 2: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node InvStdOfFeatures +MPI Rank 2: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 2: -MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 2: +MPI Rank 2: Validating for node InvStdOfFeatures, final verification. MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 2: MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node MeanOfFeatures MPI Rank 2: -MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1] +MPI Rank 2: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 2: +MPI Rank 2: Validating for node MeanOfFeatures, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 2: MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node Prior MPI Rank 2: -MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 64] -MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1] +MPI Rank 2: Validating for node Prior. 2 nodes to process in pass 1. MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 2: +MPI Rank 2: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] +MPI Rank 2: +MPI Rank 2: Validating for node Prior, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 2: MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. +MPI Rank 2: +MPI Rank 2: Precomputing --> Completed. +MPI Rank 2: MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. 
-MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000 +MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 MPI Rank 2: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 2 of 3, with 1 datapasses MPI Rank 2: -MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. MPI Rank 2: -MPI Rank 2: Validating node EvalErrorPrediction -MPI Rank 2: -MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 7] +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62] MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] -MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 7] -MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 7]) -> [363, 1] -MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 7]) -> [363, 1] -MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 7], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 7] -MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 7]) -> [512, MBSize 7] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], 
MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] -MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 7], B0[512, 1]) -> [512, MBSize 7] -MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 7]) -> [512, MBSize 7] -MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 7]) -> [512, MBSize 7] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] -MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 7], B1[512, 1]) -> [512, MBSize 7] -MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 7]) -> [512, MBSize 7] -MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 7]) -> [132, MBSize 7] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] -MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 7], B2[132, 1]) -> [132, MBSize 7] -MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 7], HLast[132, MBSize 7]) -> [1, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 2: MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.32135414; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.25673s; TotalTimePerSample = 0.40114ms; SamplesPerSecond = 2492 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.15070930; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.24685s; TotalTimePerSample = 0.38571ms; SamplesPerSecond = 2592 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.99901060; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24575s; TotalTimePerSample = 0.38398ms; SamplesPerSecond = 2604 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.86945780; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24696s; TotalTimePerSample = 0.38588ms; SamplesPerSecond = 2591 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.80219517; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.24516s; TotalTimePerSample = 0.38307ms; SamplesPerSecond = 2610 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72890717; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.24462s; TotalTimePerSample = 0.38221ms; SamplesPerSecond = 2616 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.56187025; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.24416s; TotalTimePerSample = 0.38150ms; SamplesPerSecond = 2621 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.42790310; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.24565s; TotalTimePerSample = 0.38383ms; SamplesPerSecond = 2605 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33928303; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.24437s; TotalTimePerSample = 0.38183ms; SamplesPerSecond = 2618 -MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.36398734; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.24545s; TotalTimePerSample = 0.38352ms; SamplesPerSecond = 2607 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.21223679; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.24567s; TotalTimePerSample = 0.38385ms; SamplesPerSecond = 2605 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.31265333; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.24655s; TotalTimePerSample = 0.38523ms; SamplesPerSecond = 2595 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.14081673; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.24650s; TotalTimePerSample = 0.38515ms; SamplesPerSecond = 2596 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00690023; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.24591s; TotalTimePerSample = 0.38424ms; SamplesPerSecond = 2602 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00496087; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.24734s; TotalTimePerSample = 0.38647ms; SamplesPerSecond = 2587 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.97859121; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.24469s; TotalTimePerSample = 0.38233ms; SamplesPerSecond = 2615 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.85686638; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.24529s; TotalTimePerSample = 0.38327ms; SamplesPerSecond = 2609 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.24514s; TotalTimePerSample = 0.38303ms; SamplesPerSecond = 2610 -MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.78653366; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.24606s; TotalTimePerSample = 0.38447ms; SamplesPerSecond = 2600 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.57702529; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24668s; TotalTimePerSample = 0.38543ms; SamplesPerSecond = 2594 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.61570793; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.24435s; TotalTimePerSample = 0.38180ms; SamplesPerSecond = 2619 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.55235603; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.24639s; TotalTimePerSample = 0.38499ms; SamplesPerSecond = 2597 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.48211165; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.24605s; TotalTimePerSample = 0.38446ms; SamplesPerSecond = 2601 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.38778376; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.24590s; TotalTimePerSample = 0.38423ms; SamplesPerSecond = 2602 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.36900911; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24475s; TotalTimePerSample = 0.38242ms; SamplesPerSecond = 2614 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.43967760; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.24451s; TotalTimePerSample = 0.38205ms; SamplesPerSecond = 2617 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30281011; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.24558s; TotalTimePerSample = 0.38371ms; SamplesPerSecond = 2606 -MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.19669121; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.24470s; TotalTimePerSample = 0.38235ms; SamplesPerSecond = 2615 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.28979560; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.24495s; TotalTimePerSample = 0.38273ms; SamplesPerSecond = 2612 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.17750506; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.24520s; TotalTimePerSample = 0.38313ms; SamplesPerSecond = 2610 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.26264305; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.24493s; TotalTimePerSample = 0.38270ms; SamplesPerSecond = 2613 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15073149; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.24465s; TotalTimePerSample = 0.38226ms; SamplesPerSecond = 2616 -MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799568; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.87131 -MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119 +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
+MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.32135295; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.24689s; TotalTimePerSample = 0.38577ms; SamplesPerSecond = 2592 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.15070941; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.23655s; TotalTimePerSample = 0.36962ms; SamplesPerSecond = 2705 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.99901066; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22706s; TotalTimePerSample = 0.35478ms; SamplesPerSecond = 2818 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.86945816; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22778s; TotalTimePerSample = 0.35590ms; SamplesPerSecond = 2809 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.80219557; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.22743s; TotalTimePerSample = 0.35536ms; SamplesPerSecond = 2814 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72890766; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.22722s; TotalTimePerSample = 0.35502ms; SamplesPerSecond = 2816 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.56187065; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.22646s; TotalTimePerSample = 0.35384ms; SamplesPerSecond = 2826 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.42790299; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.22831s; TotalTimePerSample = 0.35673ms; SamplesPerSecond = 2803 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33928338; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.22619s; TotalTimePerSample = 0.35342ms; SamplesPerSecond = 2829 +MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.36398772; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.22774s; TotalTimePerSample = 0.35584ms; SamplesPerSecond = 2810 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.21223693; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.22830s; TotalTimePerSample = 0.35671ms; SamplesPerSecond = 2803 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.31265357; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.22772s; TotalTimePerSample = 0.35581ms; SamplesPerSecond = 2810 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.14081698; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.22802s; TotalTimePerSample = 0.35628ms; SamplesPerSecond = 2806 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00690035; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.22880s; TotalTimePerSample = 0.35751ms; SamplesPerSecond = 2797 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00496066; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.22758s; TotalTimePerSample = 0.35559ms; SamplesPerSecond = 2812 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.97859081; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.22700s; TotalTimePerSample = 0.35468ms; SamplesPerSecond = 2819 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.85686609; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.22768s; TotalTimePerSample = 0.35575ms; SamplesPerSecond = 2810 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.22778s; TotalTimePerSample = 0.35590ms; SamplesPerSecond = 2809 +MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.78653376; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.22753s; TotalTimePerSample = 0.35551ms; SamplesPerSecond = 2812 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.57702533; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22748s; TotalTimePerSample = 0.35544ms; SamplesPerSecond = 2813 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.61570805; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.22869s; TotalTimePerSample = 0.35733ms; SamplesPerSecond = 2798 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.55235582; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.22823s; TotalTimePerSample = 0.35661ms; SamplesPerSecond = 2804 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.48211151; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.22784s; TotalTimePerSample = 0.35600ms; SamplesPerSecond = 2809 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.38778372; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.22795s; TotalTimePerSample = 0.35618ms; SamplesPerSecond = 2807 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.36900902; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22914s; TotalTimePerSample = 0.35803ms; SamplesPerSecond = 2793 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.43967781; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.22690s; TotalTimePerSample = 0.35454ms; SamplesPerSecond = 2820 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30281039; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.22876s; TotalTimePerSample = 0.35743ms; SamplesPerSecond = 2797 +MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.19669146; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.22764s; TotalTimePerSample = 0.35569ms; SamplesPerSecond = 2811 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.28979581; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.22730s; TotalTimePerSample = 0.35515ms; SamplesPerSecond = 2815 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.17750535; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.22705s; TotalTimePerSample = 0.35477ms; SamplesPerSecond = 2818 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.26264398; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.22676s; TotalTimePerSample = 0.35431ms; SamplesPerSecond = 2822 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15073110; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.22676s; TotalTimePerSample = 0.35431ms; SamplesPerSecond = 2822 +MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799569; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.319151 +MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 MPI Rank 2: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 2 of 3, with 1 datapasses MPI Rank 2: MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
-MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.01598530; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.29002s; TotalTimePerSample = 0.11329ms; SamplesPerSecond = 8826 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98818586; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.28095s; TotalTimePerSample = 0.10975ms; SamplesPerSecond = 9111 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98698123; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.28222s; TotalTimePerSample = 0.11024ms; SamplesPerSecond = 9071 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.93126298; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.27954s; TotalTimePerSample = 0.10920ms; SamplesPerSecond = 9157 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90067741; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.27987s; TotalTimePerSample = 0.10933ms; SamplesPerSecond = 9146 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.97115807; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.27910s; TotalTimePerSample = 0.10902ms; SamplesPerSecond = 9172 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89518067; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.27764s; TotalTimePerSample = 0.10846ms; SamplesPerSecond = 9220 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90450396; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.27504s; TotalTimePerSample = 0.10744ms; SamplesPerSecond = 9307 -MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=2.248306 -MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000098 momentum 
= 0.656119 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.01598514; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25401s; TotalTimePerSample = 0.09922ms; SamplesPerSecond = 10078 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98818590; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.24945s; TotalTimePerSample = 0.09744ms; SamplesPerSecond = 10262 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98698122; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.24991s; TotalTimePerSample = 0.09762ms; SamplesPerSecond = 10243 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.93126295; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.24970s; TotalTimePerSample = 0.09754ms; SamplesPerSecond = 10252 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90067743; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.25009s; TotalTimePerSample = 0.09769ms; SamplesPerSecond = 10236 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.97115808; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.24884s; TotalTimePerSample = 0.09720ms; SamplesPerSecond = 10287 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89518061; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.24796s; TotalTimePerSample = 0.09686ms; SamplesPerSecond = 10324 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90450394; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.24485s; TotalTimePerSample = 0.09565ms; SamplesPerSecond = 10455 +MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=1.999609 +MPI Rank 2: Starting Epoch 3: learning rate per sample 
= 0.000098 effective momentum = 0.656119 MPI Rank 2: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 2 of 3, with 1 datapasses MPI Rank 2: MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. -MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87359851; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.44455s; TotalTimePerSample = 0.04341ms; SamplesPerSecond = 23034 -MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86656277; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.42652s; TotalTimePerSample = 0.04165ms; SamplesPerSecond = 24008 -MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.881154 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87359841; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.37054s; TotalTimePerSample = 0.03619ms; SamplesPerSecond = 27635 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86656271; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.34556s; TotalTimePerSample = 0.03375ms; SamplesPerSecond = 29632 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.728405 MPI Rank 2: CNTKCommandTrainEnd: speechTrain MPI Rank 2: COMPLETED MPI Rank 2: ~MPIWrapper diff --git a/Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.gpu.txt b/Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.gpu.txt index 3449e702d..f658bc92c 100644 --- a/Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.gpu.txt +++ b/Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.gpu.txt @@ -1,4 +1,4 @@ -=== 
Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 3 E:\NetScale\CNTK\git_repos\cplx_master\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\DNN\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 3 E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPIWrapper: initializing MPI MPIWrapper: initializing MPI MPIWrapper: initializing MPI @@ -7,41 +7,38 @@ ping [requestnodes (before change)]: 3 nodes pinging each other ping [requestnodes (before change)]: 3 nodes pinging each other ping [requestnodes (before change)]: all 3 nodes responded requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (2) are in (participating) -ping [requestnodes (after change)]: 3 nodes pinging each other ping [requestnodes (before change)]: all 3 nodes responded +ping [requestnodes (after change)]: 3 nodes pinging each other requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (1) are in (participating) ping [requestnodes (before change)]: all 3 nodes responded ping [requestnodes (after change)]: 3 nodes pinging each other requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (0) are in (participating) ping [requestnodes (after change)]: 3 nodes pinging each other ping 
[requestnodes (after change)]: all 3 nodes responded -ping [requestnodes (after change)]: all 3 nodes responded -mpihelper: we are cog 0 in a gearbox of 3 -ping [requestnodes (after change)]: all 3 nodes responded mpihelper: we are cog 1 in a gearbox of 3 +ping [requestnodes (after change)]: all 3 nodes responded +ping [requestnodes (after change)]: all 3 nodes responded ping [mpihelper]: 3 nodes pinging each other +mpihelper: we are cog 0 in a gearbox of 3 mpihelper: we are cog 2 in a gearbox of 3 ping [mpihelper]: 3 nodes pinging each other ping [mpihelper]: 3 nodes pinging each other ping [mpihelper]: all 3 nodes responded ping [mpihelper]: all 3 nodes responded ping [mpihelper]: all 3 nodes responded -CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn -CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn -CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn -MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank0 +MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank0 MPI Rank 0: ------------------------------------------------------------------- MPI Rank 0: Build info: MPI Rank 0: -MPI Rank 0: Built time: Oct 2 2015 13:14:34 -MPI Rank 0: Last modified date: Fri Oct 2 13:09:06 2015 +MPI Rank 0: Built time: Oct 24 2015 13:33:25 +MPI Rank 0: Last modified date: Thu Oct 22 16:00:27 2015 MPI Rank 0: Built by amitaga on Amitaga-Win-DT3 -MPI Rank 0: Build Path: E:\NetScale\CNTK\git_repos\cplx_master\MachineLearning\CNTK\ +MPI Rank 0: Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ MPI Rank 0: CUDA_PATH: 
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 MPI Rank 0: ------------------------------------------------------------------- -MPI Rank 0: running on Amitaga-Win-DT3 at 2015/10/02 21:20:29 -MPI Rank 0: command line options: -MPI Rank 0: configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\DNN\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 0: running on Amitaga-Win-DT3 at 2015/10/24 22:14:12 +MPI Rank 0: command line: +MPI Rank 0: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 0: MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 0: precision=float @@ -133,10 +130,11 @@ MPI Rank 0: labelType=Category MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] -MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu -MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu +MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +MPI Rank 0: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN MPI Rank 0: DeviceId=0 -MPI Rank 0: 
stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 0: MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 0: @@ -147,7 +145,7 @@ MPI Rank 0: deviceId=0 MPI Rank 0: parallelTrain=true MPI Rank 0: speechTrain=[ MPI Rank 0: action=train -MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 0: deviceId=0 MPI Rank 0: traceLevel=1 MPI Rank 0: SimpleNetworkBuilder=[ @@ -223,30 +221,32 @@ MPI Rank 0: type=Real MPI Rank 0: scpFile=glob_0000.scp MPI Rank 0: ] MPI Rank 0: labels=[ -MPI Rank 0: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf -MPI Rank 0: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list +MPI Rank 0: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf +MPI Rank 0: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list MPI Rank 0: labelDim=132 MPI Rank 0: labelType=Category MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] -MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu -MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu +MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +MPI Rank 0: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN MPI Rank 0: DeviceId=0 -MPI Rank 0: 
stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 0: MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 0: MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 0: configparameters: cntk.config:command=speechTrain -MPI Rank 0: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +MPI Rank 0: configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN +MPI Rank 0: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data MPI Rank 0: configparameters: cntk.config:deviceId=0 MPI Rank 0: configparameters: cntk.config:parallelTrain=true MPI Rank 0: configparameters: cntk.config:precision=float -MPI Rank 0: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu +MPI Rank 0: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu MPI Rank 0: configparameters: cntk.config:speechTrain=[ MPI Rank 0: action=train -MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 0: deviceId=0 MPI Rank 0: traceLevel=1 MPI Rank 0: SimpleNetworkBuilder=[ @@ -322,34 +322,36 @@ MPI Rank 0: type=Real MPI Rank 0: scpFile=glob_0000.scp MPI Rank 0: ] MPI Rank 0: labels=[ -MPI Rank 0: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf -MPI Rank 0: 
labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list +MPI Rank 0: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf +MPI Rank 0: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list MPI Rank 0: labelDim=132 MPI Rank 0: labelType=Category MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: ] MPI Rank 0: -MPI Rank 0: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 0: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 0: command: speechTrain MPI Rank 0: precision = float +MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 0: CNTKCommandTrainInfo: speechTrain : 3 MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 MPI Rank 0: CNTKCommandTrainBegin: speechTrain MPI Rank 0: SimpleNetworkBuilder Using GPU 0 MPI Rank 0: reading script file glob_0000.scp ... 948 entries MPI Rank 0: trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion -MPI Rank 0: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list -MPI Rank 0: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 0: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list +MPI Rank 0: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... 
total 948 entries MPI Rank 0: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances MPI Rank 0: label set 0: 129 classes MPI Rank 0: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 0: GetTrainCriterionNodes ... MPI Rank 0: GetEvalCriterionNodes ... MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node CrossEntropyWithSoftmax +MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. MPI Rank 0: MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] @@ -372,13 +374,57 @@ MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 0: +MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 0: MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 0: Found 6 PreCompute nodes -MPI Rank 0: NodeName: InvStdOfFeatures -MPI Rank 0: NodeName: MeanOfFeatures -MPI Rank 0: NodeName: Prior +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Precomputing --> 3 PreCompute nodes found. +MPI Rank 0: MPI Rank 0: NodeName: InvStdOfFeatures MPI Rank 0: NodeName: MeanOfFeatures MPI Rank 0: NodeName: Prior @@ -386,136 +432,201 @@ MPI Rank 0: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node InvStdOfFeatures +MPI Rank 0: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 0: -MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 0: +MPI Rank 0: Validating for node InvStdOfFeatures, final verification. MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 0: MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node MeanOfFeatures MPI Rank 0: -MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1] +MPI Rank 0: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 0: +MPI Rank 0: Validating for node MeanOfFeatures, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 0: MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 0: MPI Rank 0: -MPI Rank 0: Validating node Prior MPI Rank 0: -MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 64] -MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1] +MPI Rank 0: Validating for node Prior. 2 nodes to process in pass 1. MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 0: +MPI Rank 0: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] +MPI Rank 0: +MPI Rank 0: Validating for node Prior, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 0: MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Precomputing --> Completed. +MPI Rank 0: MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. -MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000 +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 MPI Rank 0: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 3, with 1 datapasses MPI Rank 0: -MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
MPI Rank 0: -MPI Rank 0: Validating node EvalErrorPrediction -MPI Rank 0: -MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 33] +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62] MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] -MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 33] -MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 33]) -> [363, 1] -MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 33]) -> [363, 1] -MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 33], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 33] -MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 33]) -> [512, MBSize 33] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] -MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 33], B0[512, 1]) -> [512, MBSize 33] -MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 33]) -> [512, MBSize 33] -MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 33]) -> [512, MBSize 33] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating 
--> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] -MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 33], B1[512, 1]) -> [512, MBSize 33] -MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 33]) -> [512, MBSize 33] -MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 33]) -> [132, MBSize 33] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] -MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 33], B2[132, 1]) -> [132, MBSize 33] -MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 33], HLast[132, MBSize 33]) -> [1, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] +MPI Rank 0: +MPI Rank 0: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 0: MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45646170; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.49931s; TotalTimePerSample = 0.78017ms; SamplesPerSecond = 1281 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315661; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.41658s; TotalTimePerSample = 0.65091ms; SamplesPerSecond = 1536 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180607; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.39417s; TotalTimePerSample = 0.61589ms; SamplesPerSecond = 1623 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94158019; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.40679s; TotalTimePerSample = 0.63561ms; SamplesPerSecond = 1573 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668726; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.38408s; TotalTimePerSample = 0.60013ms; SamplesPerSecond = 1666 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866371; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.38362s; TotalTimePerSample = 0.59940ms; SamplesPerSecond = 1668 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51808934; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.37649s; TotalTimePerSample = 0.58826ms; SamplesPerSecond = 1699 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48455124; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.37230s; TotalTimePerSample = 0.58172ms; SamplesPerSecond = 1719 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829281; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.38200s; TotalTimePerSample = 0.59688ms; SamplesPerSecond = 1675 -MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167446; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.37789s; TotalTimePerSample = 0.59045ms; SamplesPerSecond = 1693 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861682; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.37564s; TotalTimePerSample = 0.58693ms; SamplesPerSecond = 1703 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32616995; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.37496s; TotalTimePerSample = 0.58588ms; SamplesPerSecond = 1706 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16897953; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.37711s; TotalTimePerSample = 0.58923ms; SamplesPerSecond = 1697 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08892002; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.38238s; TotalTimePerSample = 0.59747ms; SamplesPerSecond = 1673 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06004848; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37484s; TotalTimePerSample = 0.58569ms; SamplesPerSecond = 1707 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128321; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.37006s; TotalTimePerSample = 0.57822ms; SamplesPerSecond = 1729 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90171920; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37733s; TotalTimePerSample = 0.58958ms; SamplesPerSecond = 1696 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73262413; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.37413s; TotalTimePerSample = 0.58458ms; SamplesPerSecond = 1710 -MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515363; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.38770s; TotalTimePerSample = 0.60578ms; SamplesPerSecond = 1650 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67382489; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.37570s; TotalTimePerSample = 0.58703ms; SamplesPerSecond = 1703 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869718; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37952s; TotalTimePerSample = 0.59299ms; SamplesPerSecond = 1686 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60031970; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.37927s; TotalTimePerSample = 0.59261ms; SamplesPerSecond = 1687 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134087; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.37869s; TotalTimePerSample = 0.59171ms; SamplesPerSecond = 1690 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362164; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.37509s; TotalTimePerSample = 0.58608ms; SamplesPerSecond = 1706 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640677; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.36971s; TotalTimePerSample = 0.57768ms; SamplesPerSecond = 1731 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745369; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.37111s; TotalTimePerSample = 0.57986ms; SamplesPerSecond = 1724 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16416032; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.37536s; TotalTimePerSample = 0.58650ms; SamplesPerSecond = 1705 -MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30346910; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37963s; TotalTimePerSample = 0.59317ms; SamplesPerSecond = 1685 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398823; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.37262s; TotalTimePerSample = 0.58221ms; SamplesPerSecond = 1717 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322470; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.37330s; TotalTimePerSample = 0.58328ms; SamplesPerSecond = 1714 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664598; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.38548s; TotalTimePerSample = 0.60232ms; SamplesPerSecond = 1660 -MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246635; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.37795s; TotalTimePerSample = 0.59054ms; SamplesPerSecond = 1693 -MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=12.320558 -MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45645981; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.33332s; TotalTimePerSample = 0.52081ms; SamplesPerSecond = 1920 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315785; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.28125s; TotalTimePerSample = 0.43946ms; SamplesPerSecond = 2275 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180676; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.25998s; TotalTimePerSample = 0.40622ms; SamplesPerSecond = 2461 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94158071; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.28931s; TotalTimePerSample = 0.45204ms; SamplesPerSecond = 2212 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668763; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.26162s; TotalTimePerSample = 0.40878ms; SamplesPerSecond = 2446 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866399; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.24786s; TotalTimePerSample = 0.38728ms; SamplesPerSecond = 2582 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51808951; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.26001s; TotalTimePerSample = 0.40627ms; SamplesPerSecond = 2461 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48455147; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.25240s; TotalTimePerSample = 0.39438ms; SamplesPerSecond = 2535 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829288; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.23911s; TotalTimePerSample = 0.37360ms; SamplesPerSecond = 2676 +MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167490; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.24356s; TotalTimePerSample = 0.38057ms; SamplesPerSecond = 2627 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861768; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.23348s; TotalTimePerSample = 0.36482ms; SamplesPerSecond = 2741 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32617094; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.23074s; TotalTimePerSample = 0.36054ms; SamplesPerSecond = 2773 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16898033; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.22036s; TotalTimePerSample = 0.34431ms; SamplesPerSecond = 2904 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08892100; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.21850s; TotalTimePerSample = 0.34140ms; SamplesPerSecond = 2929 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06004828; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.24011s; TotalTimePerSample = 0.37518ms; SamplesPerSecond = 2665 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128317; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.32731s; TotalTimePerSample = 0.51141ms; SamplesPerSecond = 1955 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90171901; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.32067s; TotalTimePerSample = 0.50105ms; SamplesPerSecond = 1995 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73262447; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.33546s; TotalTimePerSample = 0.52416ms; SamplesPerSecond = 1907 +MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515410; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.35016s; TotalTimePerSample = 0.54712ms; SamplesPerSecond = 1827 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67382540; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.33016s; TotalTimePerSample = 0.51587ms; SamplesPerSecond = 1938 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869780; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.35647s; TotalTimePerSample = 0.55699ms; SamplesPerSecond = 1795 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60032086; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.36474s; TotalTimePerSample = 0.56991ms; SamplesPerSecond = 1754 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134188; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.33930s; TotalTimePerSample = 0.53015ms; SamplesPerSecond = 1886 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362252; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.34390s; TotalTimePerSample = 0.53735ms; SamplesPerSecond = 1860 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640740; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.33520s; TotalTimePerSample = 0.52375ms; SamplesPerSecond = 1909 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745478; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.34931s; TotalTimePerSample = 0.54580ms; SamplesPerSecond = 1832 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16416053; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.36688s; TotalTimePerSample = 0.57324ms; SamplesPerSecond = 1744 +MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30346869; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37710s; TotalTimePerSample = 0.58922ms; SamplesPerSecond = 1697 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398831; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.36403s; TotalTimePerSample = 0.56879ms; SamplesPerSecond = 1758 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322487; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.34077s; TotalTimePerSample = 0.53246ms; SamplesPerSecond = 1878 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664627; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.35019s; TotalTimePerSample = 0.54718ms; SamplesPerSecond = 1827 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246685; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.32892s; TotalTimePerSample = 0.51394ms; SamplesPerSecond = 1945 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000035; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=9.770017 +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 MPI Rank 0: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 3, with 1 datapasses MPI Rank 0: MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
-MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151923; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.49257s; TotalTimePerSample = 0.19241ms; SamplesPerSecond = 5197 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395650; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.45941s; TotalTimePerSample = 0.17946ms; SamplesPerSecond = 5572 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575441; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.47006s; TotalTimePerSample = 0.18362ms; SamplesPerSecond = 5446 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90485007; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.47298s; TotalTimePerSample = 0.18476ms; SamplesPerSecond = 5412 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324108; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.48624s; TotalTimePerSample = 0.18994ms; SamplesPerSecond = 5264 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109287; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.48020s; TotalTimePerSample = 0.18758ms; SamplesPerSecond = 5331 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496218; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.45121s; TotalTimePerSample = 0.17625ms; SamplesPerSecond = 5673 -MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944253; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.43800s; TotalTimePerSample = 0.17110ms; SamplesPerSecond = 5844 -MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.769975 -MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098 momentum 
= 0.656119 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151948; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.56668s; TotalTimePerSample = 0.22136ms; SamplesPerSecond = 4517 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395688; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.46958s; TotalTimePerSample = 0.18343ms; SamplesPerSecond = 5451 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575479; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.41833s; TotalTimePerSample = 0.16341ms; SamplesPerSecond = 6119 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90485039; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.33510s; TotalTimePerSample = 0.13090ms; SamplesPerSecond = 7639 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324146; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.31206s; TotalTimePerSample = 0.12190ms; SamplesPerSecond = 8203 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109327; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.29685s; TotalTimePerSample = 0.11596ms; SamplesPerSecond = 8623 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496253; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.32672s; TotalTimePerSample = 0.12762ms; SamplesPerSecond = 7835 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944295; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.34130s; TotalTimePerSample = 0.13332ms; SamplesPerSecond = 7500 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356027; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.100422 +MPI Rank 0: Starting Epoch 3: learning rate per sample = 
0.000098 effective momentum = 0.656119 MPI Rank 0: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 3, with 1 datapasses MPI Rank 0: MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. -MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752815; EvalErr[0]PerSample = 0.52177734; TotalTime = 0.95049s; TotalTimePerSample = 0.09282ms; SamplesPerSecond = 10773 -MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358797; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.79398s; TotalTimePerSample = 0.07754ms; SamplesPerSecond = 12897 -MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705581; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.792685 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752856; EvalErr[0]PerSample = 0.52177734; TotalTime = 1.01461s; TotalTimePerSample = 0.09908ms; SamplesPerSecond = 10092 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358831; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.82689s; TotalTimePerSample = 0.08075ms; SamplesPerSecond = 12383 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.909274 MPI Rank 0: CNTKCommandTrainEnd: speechTrain MPI Rank 0: COMPLETED MPI Rank 0: ~MPIWrapper -MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank1 +MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank1 MPI Rank 1: 
------------------------------------------------------------------- MPI Rank 1: Build info: MPI Rank 1: -MPI Rank 1: Built time: Oct 2 2015 13:14:34 -MPI Rank 1: Last modified date: Fri Oct 2 13:09:06 2015 +MPI Rank 1: Built time: Oct 24 2015 13:33:25 +MPI Rank 1: Last modified date: Thu Oct 22 16:00:27 2015 MPI Rank 1: Built by amitaga on Amitaga-Win-DT3 -MPI Rank 1: Build Path: E:\NetScale\CNTK\git_repos\cplx_master\MachineLearning\CNTK\ +MPI Rank 1: Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ MPI Rank 1: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 MPI Rank 1: ------------------------------------------------------------------- -MPI Rank 1: running on Amitaga-Win-DT3 at 2015/10/02 21:20:29 -MPI Rank 1: command line options: -MPI Rank 1: configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\DNN\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 1: running on Amitaga-Win-DT3 at 2015/10/24 22:14:12 +MPI Rank 1: command line: +MPI Rank 1: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 1: MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 1: precision=float @@ -607,10 +718,11 @@ MPI Rank 1: labelType=Category MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] -MPI Rank 1: 
RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu -MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu +MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +MPI Rank 1: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN MPI Rank 1: DeviceId=0 -MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 1: MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 1: @@ -621,7 +733,7 @@ MPI Rank 1: deviceId=0 MPI Rank 1: parallelTrain=true MPI Rank 1: speechTrain=[ MPI Rank 1: action=train -MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 1: deviceId=0 MPI Rank 1: traceLevel=1 MPI Rank 1: SimpleNetworkBuilder=[ @@ -697,30 +809,32 @@ MPI Rank 1: type=Real MPI Rank 1: scpFile=glob_0000.scp MPI Rank 1: ] MPI Rank 1: labels=[ -MPI Rank 1: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf -MPI Rank 1: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list +MPI Rank 1: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf +MPI Rank 1: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list MPI Rank 1: labelDim=132 MPI Rank 1: labelType=Category MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] -MPI Rank 1: 
RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu -MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu +MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +MPI Rank 1: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN MPI Rank 1: DeviceId=0 -MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 1: MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 1: MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 1: configparameters: cntk.config:command=speechTrain -MPI Rank 1: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +MPI Rank 1: configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN +MPI Rank 1: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data MPI Rank 1: configparameters: cntk.config:deviceId=0 MPI Rank 1: configparameters: cntk.config:parallelTrain=true MPI Rank 1: configparameters: cntk.config:precision=float -MPI Rank 1: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu +MPI Rank 1: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu MPI Rank 1: configparameters: cntk.config:speechTrain=[ MPI Rank 1: action=train -MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 
1: modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 1: deviceId=0 MPI Rank 1: traceLevel=1 MPI Rank 1: SimpleNetworkBuilder=[ @@ -796,34 +910,36 @@ MPI Rank 1: type=Real MPI Rank 1: scpFile=glob_0000.scp MPI Rank 1: ] MPI Rank 1: labels=[ -MPI Rank 1: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf -MPI Rank 1: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list +MPI Rank 1: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf +MPI Rank 1: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list MPI Rank 1: labelDim=132 MPI Rank 1: labelType=Category MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: ] MPI Rank 1: -MPI Rank 1: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 1: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 1: command: speechTrain MPI Rank 1: precision = float +MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 1: CNTKCommandTrainInfo: speechTrain : 3 MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 MPI Rank 1: CNTKCommandTrainBegin: speechTrain MPI Rank 1: SimpleNetworkBuilder Using GPU 0 MPI Rank 1: reading script file glob_0000.scp ... 
948 entries MPI Rank 1: trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion -MPI Rank 1: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list -MPI Rank 1: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 1: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list +MPI Rank 1: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries MPI Rank 1: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances MPI Rank 1: label set 0: 129 classes MPI Rank 1: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 1: GetTrainCriterionNodes ... MPI Rank 1: GetEvalCriterionNodes ... MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node CrossEntropyWithSoftmax +MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. MPI Rank 1: MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] @@ -846,13 +962,57 @@ MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 1: +MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 1: MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 1: Found 6 PreCompute nodes -MPI Rank 1: NodeName: InvStdOfFeatures -MPI Rank 1: NodeName: MeanOfFeatures -MPI Rank 1: NodeName: Prior +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Precomputing --> 3 PreCompute nodes found. +MPI Rank 1: MPI Rank 1: NodeName: InvStdOfFeatures MPI Rank 1: NodeName: MeanOfFeatures MPI Rank 1: NodeName: Prior @@ -860,136 +1020,201 @@ MPI Rank 1: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node InvStdOfFeatures +MPI Rank 1: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 1: -MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 1: +MPI Rank 1: Validating for node InvStdOfFeatures, final verification. MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 1: MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node MeanOfFeatures MPI Rank 1: -MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1] +MPI Rank 1: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 1: +MPI Rank 1: Validating for node MeanOfFeatures, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 1: MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 1: MPI Rank 1: -MPI Rank 1: Validating node Prior MPI Rank 1: -MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 64] -MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1] +MPI Rank 1: Validating for node Prior. 2 nodes to process in pass 1. MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 1: +MPI Rank 1: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] +MPI Rank 1: +MPI Rank 1: Validating for node Prior, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 1: MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Precomputing --> Completed. +MPI Rank 1: MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. -MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000 +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 MPI Rank 1: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 1 of 3, with 1 datapasses MPI Rank 1: -MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
MPI Rank 1: -MPI Rank 1: Validating node EvalErrorPrediction -MPI Rank 1: -MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 15] +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62] MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] -MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 15] -MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 15]) -> [363, 1] -MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 15]) -> [363, 1] -MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 15], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 15] -MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 15]) -> [512, MBSize 15] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] -MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 15], B0[512, 1]) -> [512, MBSize 15] -MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 15]) -> [512, MBSize 15] -MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 15]) -> [512, MBSize 15] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating 
--> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] -MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 15], B1[512, 1]) -> [512, MBSize 15] -MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 15]) -> [512, MBSize 15] -MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 15]) -> [132, MBSize 15] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] -MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 15], B2[132, 1]) -> [132, MBSize 15] -MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 15], HLast[132, MBSize 15]) -> [1, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] +MPI Rank 1: +MPI Rank 1: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 1: MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45646170; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.50010s; TotalTimePerSample = 0.78140ms; SamplesPerSecond = 1279 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315661; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.41657s; TotalTimePerSample = 0.65089ms; SamplesPerSecond = 1536 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180607; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.39416s; TotalTimePerSample = 0.61588ms; SamplesPerSecond = 1623 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94158019; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.40678s; TotalTimePerSample = 0.63559ms; SamplesPerSecond = 1573 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668726; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.38408s; TotalTimePerSample = 0.60012ms; SamplesPerSecond = 1666 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866371; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.38362s; TotalTimePerSample = 0.59941ms; SamplesPerSecond = 1668 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51808934; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.37649s; TotalTimePerSample = 0.58827ms; SamplesPerSecond = 1699 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48455124; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.37226s; TotalTimePerSample = 0.58166ms; SamplesPerSecond = 1719 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829281; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.38200s; TotalTimePerSample = 0.59687ms; SamplesPerSecond = 1675 -MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167446; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.37789s; TotalTimePerSample = 0.59045ms; SamplesPerSecond = 1693 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861682; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.37564s; TotalTimePerSample = 0.58693ms; SamplesPerSecond = 1703 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32616995; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.37496s; TotalTimePerSample = 0.58588ms; SamplesPerSecond = 1706 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16897953; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.37711s; TotalTimePerSample = 0.58923ms; SamplesPerSecond = 1697 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08892002; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.38239s; TotalTimePerSample = 0.59748ms; SamplesPerSecond = 1673 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06004848; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37484s; TotalTimePerSample = 0.58569ms; SamplesPerSecond = 1707 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128321; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.37009s; TotalTimePerSample = 0.57826ms; SamplesPerSecond = 1729 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90171920; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37731s; TotalTimePerSample = 0.58955ms; SamplesPerSecond = 1696 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73262413; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.37414s; TotalTimePerSample = 0.58459ms; SamplesPerSecond = 1710 -MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515363; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.38771s; TotalTimePerSample = 0.60580ms; SamplesPerSecond = 1650 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67382489; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.37570s; TotalTimePerSample = 0.58703ms; SamplesPerSecond = 1703 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869718; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37950s; TotalTimePerSample = 0.59297ms; SamplesPerSecond = 1686 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60031970; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.37925s; TotalTimePerSample = 0.59258ms; SamplesPerSecond = 1687 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134087; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.37868s; TotalTimePerSample = 0.59169ms; SamplesPerSecond = 1690 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362164; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.37507s; TotalTimePerSample = 0.58605ms; SamplesPerSecond = 1706 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640677; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.36971s; TotalTimePerSample = 0.57767ms; SamplesPerSecond = 1731 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745369; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.37111s; TotalTimePerSample = 0.57986ms; SamplesPerSecond = 1724 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16416032; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.37534s; TotalTimePerSample = 0.58647ms; SamplesPerSecond = 1705 -MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30346910; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37964s; TotalTimePerSample = 0.59319ms; SamplesPerSecond = 1685 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398823; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.37259s; TotalTimePerSample = 0.58217ms; SamplesPerSecond = 1717 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322470; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.37329s; TotalTimePerSample = 0.58327ms; SamplesPerSecond = 1714 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664598; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.38547s; TotalTimePerSample = 0.60230ms; SamplesPerSecond = 1660 -MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246635; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.37794s; TotalTimePerSample = 0.59053ms; SamplesPerSecond = 1693 -MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=12.320764 -MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45645981; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.33798s; TotalTimePerSample = 0.52810ms; SamplesPerSecond = 1893 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315785; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.28013s; TotalTimePerSample = 0.43770ms; SamplesPerSecond = 2284 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180676; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.25668s; TotalTimePerSample = 0.40105ms; SamplesPerSecond = 2493 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94158071; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.29127s; TotalTimePerSample = 0.45511ms; SamplesPerSecond = 2197 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668763; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.26257s; TotalTimePerSample = 0.41026ms; SamplesPerSecond = 2437 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866399; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.24697s; TotalTimePerSample = 0.38590ms; SamplesPerSecond = 2591 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51808951; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.25941s; TotalTimePerSample = 0.40532ms; SamplesPerSecond = 2467 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48455147; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.25278s; TotalTimePerSample = 0.39497ms; SamplesPerSecond = 2531 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829288; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.23921s; TotalTimePerSample = 0.37376ms; SamplesPerSecond = 2675 +MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167490; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.24341s; TotalTimePerSample = 0.38032ms; SamplesPerSecond = 2629 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861768; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.23534s; TotalTimePerSample = 0.36772ms; SamplesPerSecond = 2719 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32617094; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.22909s; TotalTimePerSample = 0.35795ms; SamplesPerSecond = 2793 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16898033; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.21901s; TotalTimePerSample = 0.34220ms; SamplesPerSecond = 2922 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08892100; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.21937s; TotalTimePerSample = 0.34277ms; SamplesPerSecond = 2917 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06004828; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.24068s; TotalTimePerSample = 0.37606ms; SamplesPerSecond = 2659 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128317; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.32338s; TotalTimePerSample = 0.50528ms; SamplesPerSecond = 1979 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90171901; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.32313s; TotalTimePerSample = 0.50489ms; SamplesPerSecond = 1980 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73262447; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.33982s; TotalTimePerSample = 0.53097ms; SamplesPerSecond = 1883 +MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515410; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.34907s; TotalTimePerSample = 0.54543ms; SamplesPerSecond = 1833 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67382540; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.33119s; TotalTimePerSample = 0.51748ms; SamplesPerSecond = 1932 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869780; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.35522s; TotalTimePerSample = 0.55503ms; SamplesPerSecond = 1801 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60032086; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.36689s; TotalTimePerSample = 0.57327ms; SamplesPerSecond = 1744 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134188; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.33598s; TotalTimePerSample = 0.52497ms; SamplesPerSecond = 1904 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362252; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.34488s; TotalTimePerSample = 0.53888ms; SamplesPerSecond = 1855 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640740; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.33076s; TotalTimePerSample = 0.51681ms; SamplesPerSecond = 1934 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745478; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.35552s; TotalTimePerSample = 0.55550ms; SamplesPerSecond = 1800 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16416053; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.36347s; TotalTimePerSample = 0.56791ms; SamplesPerSecond = 1760 +MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30346869; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37626s; TotalTimePerSample = 0.58791ms; SamplesPerSecond = 1700 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398831; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.36571s; TotalTimePerSample = 0.57142ms; SamplesPerSecond = 1750 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322487; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.34686s; TotalTimePerSample = 0.54198ms; SamplesPerSecond = 1845 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664627; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.34436s; TotalTimePerSample = 0.53806ms; SamplesPerSecond = 1858 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246685; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.33067s; TotalTimePerSample = 0.51667ms; SamplesPerSecond = 1935 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000035; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=9.771287 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 MPI Rank 1: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 1 of 3, with 1 datapasses MPI Rank 1: MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
-MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151923; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.49230s; TotalTimePerSample = 0.19230ms; SamplesPerSecond = 5200 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395650; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.45941s; TotalTimePerSample = 0.17946ms; SamplesPerSecond = 5572 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575441; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.47004s; TotalTimePerSample = 0.18361ms; SamplesPerSecond = 5446 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90485007; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.47297s; TotalTimePerSample = 0.18476ms; SamplesPerSecond = 5412 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324108; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.48623s; TotalTimePerSample = 0.18993ms; SamplesPerSecond = 5265 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109287; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.48019s; TotalTimePerSample = 0.18757ms; SamplesPerSecond = 5331 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496218; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.45120s; TotalTimePerSample = 0.17625ms; SamplesPerSecond = 5673 -MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944253; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.43800s; TotalTimePerSample = 0.17109ms; SamplesPerSecond = 5844 -MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.770171 -MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098 momentum 
= 0.656119 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151948; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.56765s; TotalTimePerSample = 0.22174ms; SamplesPerSecond = 4509 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395688; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.46990s; TotalTimePerSample = 0.18356ms; SamplesPerSecond = 5447 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575479; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.42066s; TotalTimePerSample = 0.16432ms; SamplesPerSecond = 6085 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90485039; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.33456s; TotalTimePerSample = 0.13069ms; SamplesPerSecond = 7651 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324146; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.31133s; TotalTimePerSample = 0.12161ms; SamplesPerSecond = 8222 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109327; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.29677s; TotalTimePerSample = 0.11593ms; SamplesPerSecond = 8626 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496253; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.32641s; TotalTimePerSample = 0.12750ms; SamplesPerSecond = 7842 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944295; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.34823s; TotalTimePerSample = 0.13603ms; SamplesPerSecond = 7351 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356027; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.105348 +MPI Rank 1: Starting Epoch 3: learning rate per sample = 
0.000098 effective momentum = 0.656119 MPI Rank 1: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 1 of 3, with 1 datapasses MPI Rank 1: MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. -MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752815; EvalErr[0]PerSample = 0.52177734; TotalTime = 0.95579s; TotalTimePerSample = 0.09334ms; SamplesPerSecond = 10713 -MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358797; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.79393s; TotalTimePerSample = 0.07753ms; SamplesPerSecond = 12897 -MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705581; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.792467 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752856; EvalErr[0]PerSample = 0.52177734; TotalTime = 1.01546s; TotalTimePerSample = 0.09917ms; SamplesPerSecond = 10084 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358831; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.82899s; TotalTimePerSample = 0.08096ms; SamplesPerSecond = 12352 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.90997 MPI Rank 1: CNTKCommandTrainEnd: speechTrain MPI Rank 1: COMPLETED MPI Rank 1: ~MPIWrapper -MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank2 +MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank2 MPI Rank 2: 
------------------------------------------------------------------- MPI Rank 2: Build info: MPI Rank 2: -MPI Rank 2: Built time: Oct 2 2015 13:14:34 -MPI Rank 2: Last modified date: Fri Oct 2 13:09:06 2015 +MPI Rank 2: Built time: Oct 24 2015 13:33:25 +MPI Rank 2: Last modified date: Thu Oct 22 16:00:27 2015 MPI Rank 2: Built by amitaga on Amitaga-Win-DT3 -MPI Rank 2: Build Path: E:\NetScale\CNTK\git_repos\cplx_master\MachineLearning\CNTK\ +MPI Rank 2: Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ MPI Rank 2: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 MPI Rank 2: ------------------------------------------------------------------- -MPI Rank 2: running on Amitaga-Win-DT3 at 2015/10/02 21:20:30 -MPI Rank 2: command line options: -MPI Rank 2: configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\DNN\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 2: running on Amitaga-Win-DT3 at 2015/10/24 22:14:13 +MPI Rank 2: command line: +MPI Rank 2: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 2: MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> MPI Rank 2: precision=float @@ -1081,10 +1306,11 @@ MPI Rank 2: labelType=Category MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] -MPI Rank 2: 
RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu -MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu +MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +MPI Rank 2: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN MPI Rank 2: DeviceId=0 -MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 2: MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< MPI Rank 2: @@ -1095,7 +1321,7 @@ MPI Rank 2: deviceId=0 MPI Rank 2: parallelTrain=true MPI Rank 2: speechTrain=[ MPI Rank 2: action=train -MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 2: deviceId=0 MPI Rank 2: traceLevel=1 MPI Rank 2: SimpleNetworkBuilder=[ @@ -1171,30 +1397,32 @@ MPI Rank 2: type=Real MPI Rank 2: scpFile=glob_0000.scp MPI Rank 2: ] MPI Rank 2: labels=[ -MPI Rank 2: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf -MPI Rank 2: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list +MPI Rank 2: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf +MPI Rank 2: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list MPI Rank 2: labelDim=132 MPI Rank 2: labelType=Category MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] -MPI Rank 2: 
RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu -MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu +MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +MPI Rank 2: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN MPI Rank 2: DeviceId=0 -MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 2: MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 2: MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> MPI Rank 2: configparameters: cntk.config:command=speechTrain -MPI Rank 2: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +MPI Rank 2: configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN +MPI Rank 2: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data MPI Rank 2: configparameters: cntk.config:deviceId=0 MPI Rank 2: configparameters: cntk.config:parallelTrain=true MPI Rank 2: configparameters: cntk.config:precision=float -MPI Rank 2: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu +MPI Rank 2: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu MPI Rank 2: configparameters: cntk.config:speechTrain=[ MPI Rank 2: action=train -MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn +MPI Rank 
2: modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 2: deviceId=0 MPI Rank 2: traceLevel=1 MPI Rank 2: SimpleNetworkBuilder=[ @@ -1270,34 +1498,36 @@ MPI Rank 2: type=Real MPI Rank 2: scpFile=glob_0000.scp MPI Rank 2: ] MPI Rank 2: labels=[ -MPI Rank 2: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf -MPI Rank 2: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list +MPI Rank 2: mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf +MPI Rank 2: labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list MPI Rank 2: labelDim=132 MPI Rank 2: labelType=Category MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: ] MPI Rank 2: -MPI Rank 2: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr +MPI Rank 2: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< MPI Rank 2: command: speechTrain MPI Rank 2: precision = float +MPI Rank 2: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn MPI Rank 2: CNTKCommandTrainInfo: speechTrain : 3 MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 MPI Rank 2: CNTKCommandTrainBegin: speechTrain MPI Rank 2: SimpleNetworkBuilder Using GPU 0 MPI Rank 2: reading script file glob_0000.scp ... 
948 entries MPI Rank 2: trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion -MPI Rank 2: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list -MPI Rank 2: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 2: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list +MPI Rank 2: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries MPI Rank 2: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances MPI Rank 2: label set 0: 129 classes MPI Rank 2: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 2: SetUniformRandomValue (GPU): creating curand object with seed 1 MPI Rank 2: GetTrainCriterionNodes ... MPI Rank 2: GetEvalCriterionNodes ... MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node CrossEntropyWithSoftmax +MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. MPI Rank 2: MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] @@ -1320,13 +1550,57 @@ MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 2: +MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node CrossEntropyWithSoftmax, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] MPI Rank 2: MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 2: Found 6 PreCompute nodes -MPI Rank 2: NodeName: InvStdOfFeatures -MPI Rank 2: NodeName: MeanOfFeatures -MPI Rank 2: NodeName: Prior +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Precomputing --> 3 PreCompute nodes found. +MPI Rank 2: MPI Rank 2: NodeName: InvStdOfFeatures MPI Rank 2: NodeName: MeanOfFeatures MPI Rank 2: NodeName: Prior @@ -1334,120 +1608,185 @@ MPI Rank 2: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f MPI Rank 2: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node InvStdOfFeatures +MPI Rank 2: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. MPI Rank 2: -MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 2: +MPI Rank 2: Validating for node InvStdOfFeatures, final verification. MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] MPI Rank 2: MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node MeanOfFeatures MPI Rank 2: -MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 64] -MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1] +MPI Rank 2: Validating for node MeanOfFeatures. 2 nodes to process in pass 1. MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 2: +MPI Rank 2: Validating for node MeanOfFeatures, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] MPI Rank 2: MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. MPI Rank 2: MPI Rank 2: -MPI Rank 2: Validating node Prior MPI Rank 2: -MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 64] -MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1] +MPI Rank 2: Validating for node Prior. 2 nodes to process in pass 1. MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 2: +MPI Rank 2: Validating for node Prior. 1 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] +MPI Rank 2: +MPI Rank 2: Validating for node Prior, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3] +MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] MPI Rank 2: MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Precomputing --> Completed. +MPI Rank 2: MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. -MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000 +MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 MPI Rank 2: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 2 of 3, with 1 datapasses MPI Rank 2: -MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
MPI Rank 2: -MPI Rank 2: Validating node EvalErrorPrediction -MPI Rank 2: -MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 16] +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62] MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] -MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 16] -MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 16]) -> [363, 1] -MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 16]) -> [363, 1] -MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 16], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 16] -MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 16]) -> [512, MBSize 16] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] -MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 16], B0[512, 1]) -> [512, MBSize 16] -MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 16]) -> [512, MBSize 16] -MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 16]) -> [512, MBSize 16] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating 
--> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] -MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 16], B1[512, 1]) -> [512, MBSize 16] -MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 16]) -> [512, MBSize 16] -MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 16]) -> [132, MBSize 16] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] -MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 16], B2[132, 1]) -> [132, MBSize 16] -MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 16], HLast[132, MBSize 16]) -> [1, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] +MPI Rank 2: +MPI Rank 2: Validating for node EvalErrorPrediction, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62] +MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512] +MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512] +MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363] +MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62] +MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1] +MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1] +MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] MPI Rank 2: MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data. 
-MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45646170; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.49987s; TotalTimePerSample = 0.78104ms; SamplesPerSecond = 1280 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315661; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.41635s; TotalTimePerSample = 0.65054ms; SamplesPerSecond = 1537 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180607; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.39403s; TotalTimePerSample = 0.61567ms; SamplesPerSecond = 1624 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94158019; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.40679s; TotalTimePerSample = 0.63560ms; SamplesPerSecond = 1573 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668726; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.38405s; TotalTimePerSample = 0.60008ms; SamplesPerSecond = 1666 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866371; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.38342s; TotalTimePerSample = 0.59910ms; SamplesPerSecond = 1669 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51808934; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.37619s; TotalTimePerSample = 0.58779ms; SamplesPerSecond = 1701 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48455124; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.37204s; TotalTimePerSample = 0.58132ms; SamplesPerSecond = 1720 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829281; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.38168s; TotalTimePerSample = 0.59637ms; SamplesPerSecond = 1676 -MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167446; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.37763s; TotalTimePerSample = 0.59004ms; SamplesPerSecond = 1694 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861682; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.37533s; TotalTimePerSample = 0.58645ms; SamplesPerSecond = 1705 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32616995; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.37460s; TotalTimePerSample = 0.58531ms; SamplesPerSecond = 1708 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16897953; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.37657s; TotalTimePerSample = 0.58838ms; SamplesPerSecond = 1699 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08892002; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.38202s; TotalTimePerSample = 0.59690ms; SamplesPerSecond = 1675 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06004848; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37470s; TotalTimePerSample = 0.58546ms; SamplesPerSecond = 1708 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128321; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.36968s; TotalTimePerSample = 0.57763ms; SamplesPerSecond = 1731 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90171920; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37682s; TotalTimePerSample = 0.58879ms; SamplesPerSecond = 1698 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73262413; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.37381s; TotalTimePerSample = 0.58408ms; SamplesPerSecond = 1712 -MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515363; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.38731s; TotalTimePerSample = 0.60517ms; SamplesPerSecond = 1652 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67382489; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.37515s; TotalTimePerSample = 0.58618ms; SamplesPerSecond = 1705 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869718; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37936s; TotalTimePerSample = 0.59275ms; SamplesPerSecond = 1687 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60031970; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.37877s; TotalTimePerSample = 0.59183ms; SamplesPerSecond = 1689 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134087; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.37837s; TotalTimePerSample = 0.59121ms; SamplesPerSecond = 1691 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362164; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.37481s; TotalTimePerSample = 0.58564ms; SamplesPerSecond = 1707 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640677; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.36931s; TotalTimePerSample = 0.57705ms; SamplesPerSecond = 1732 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745369; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.37068s; TotalTimePerSample = 0.57918ms; SamplesPerSecond = 1726 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16416032; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.37472s; TotalTimePerSample = 0.58550ms; SamplesPerSecond = 1707 -MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30346910; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37953s; TotalTimePerSample = 0.59302ms; SamplesPerSecond = 1686 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398823; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.37227s; TotalTimePerSample = 0.58167ms; SamplesPerSecond = 1719 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322470; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.37312s; TotalTimePerSample = 0.58301ms; SamplesPerSecond = 1715 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664598; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.38484s; TotalTimePerSample = 0.60131ms; SamplesPerSecond = 1663 -MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246635; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.37786s; TotalTimePerSample = 0.59041ms; SamplesPerSecond = 1693 -MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=12.321116 -MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119 +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
+MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45645981; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.35109s; TotalTimePerSample = 0.54858ms; SamplesPerSecond = 1822 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315785; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.28226s; TotalTimePerSample = 0.44103ms; SamplesPerSecond = 2267 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180676; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.25635s; TotalTimePerSample = 0.40055ms; SamplesPerSecond = 2496 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94158071; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.29191s; TotalTimePerSample = 0.45612ms; SamplesPerSecond = 2192 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668763; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.26100s; TotalTimePerSample = 0.40781ms; SamplesPerSecond = 2452 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866399; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.24718s; TotalTimePerSample = 0.38621ms; SamplesPerSecond = 2589 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51808951; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.26306s; TotalTimePerSample = 0.41103ms; SamplesPerSecond = 2432 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48455147; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.25239s; TotalTimePerSample = 0.39436ms; SamplesPerSecond = 2535 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829288; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.23938s; TotalTimePerSample = 0.37404ms; SamplesPerSecond = 2673 +MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167490; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.24288s; TotalTimePerSample = 0.37949ms; SamplesPerSecond = 2635 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861768; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.23594s; TotalTimePerSample = 0.36866ms; SamplesPerSecond = 2712 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32617094; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.23051s; TotalTimePerSample = 0.36018ms; SamplesPerSecond = 2776 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16898033; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.21907s; TotalTimePerSample = 0.34230ms; SamplesPerSecond = 2921 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08892100; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.21842s; TotalTimePerSample = 0.34128ms; SamplesPerSecond = 2930 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06004828; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.23960s; TotalTimePerSample = 0.37438ms; SamplesPerSecond = 2671 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128317; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.32538s; TotalTimePerSample = 0.50841ms; SamplesPerSecond = 1966 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90171901; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.32097s; TotalTimePerSample = 0.50152ms; SamplesPerSecond = 1993 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73262447; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.33493s; TotalTimePerSample = 0.52334ms; SamplesPerSecond = 1910 +MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515410; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.35758s; TotalTimePerSample = 0.55872ms; SamplesPerSecond = 1789 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67382540; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.32538s; TotalTimePerSample = 0.50840ms; SamplesPerSecond = 1966 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869780; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.36246s; TotalTimePerSample = 0.56634ms; SamplesPerSecond = 1765 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60032086; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.35998s; TotalTimePerSample = 0.56246ms; SamplesPerSecond = 1777 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134188; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.33777s; TotalTimePerSample = 0.52777ms; SamplesPerSecond = 1894 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362252; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.34429s; TotalTimePerSample = 0.53796ms; SamplesPerSecond = 1858 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640740; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.33129s; TotalTimePerSample = 0.51763ms; SamplesPerSecond = 1931 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745478; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.35530s; TotalTimePerSample = 0.55515ms; SamplesPerSecond = 1801 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16416053; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.36414s; TotalTimePerSample = 0.56897ms; SamplesPerSecond = 1757 +MPI Rank 2: Epoch[ 1 of 
3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30346869; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37427s; TotalTimePerSample = 0.58480ms; SamplesPerSecond = 1710 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398831; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.36414s; TotalTimePerSample = 0.56897ms; SamplesPerSecond = 1757 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322487; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.35425s; TotalTimePerSample = 0.55351ms; SamplesPerSecond = 1806 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664627; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.34279s; TotalTimePerSample = 0.53561ms; SamplesPerSecond = 1867 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246685; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.33153s; TotalTimePerSample = 0.51802ms; SamplesPerSecond = 1930 +MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000035; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=9.773289 +MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 MPI Rank 2: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 2 of 3, with 1 datapasses MPI Rank 2: MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. 
-MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151923; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.49122s; TotalTimePerSample = 0.19188ms; SamplesPerSecond = 5211 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395650; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.45895s; TotalTimePerSample = 0.17928ms; SamplesPerSecond = 5577 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575441; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.46993s; TotalTimePerSample = 0.18357ms; SamplesPerSecond = 5447 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90485007; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.47271s; TotalTimePerSample = 0.18465ms; SamplesPerSecond = 5415 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324108; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.48627s; TotalTimePerSample = 0.18995ms; SamplesPerSecond = 5264 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109287; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.48005s; TotalTimePerSample = 0.18752ms; SamplesPerSecond = 5332 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496218; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.45117s; TotalTimePerSample = 0.17624ms; SamplesPerSecond = 5674 -MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944253; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.43751s; TotalTimePerSample = 0.17090ms; SamplesPerSecond = 5851 -MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.770598 -MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000098 momentum 
= 0.656119 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151948; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.57966s; TotalTimePerSample = 0.22643ms; SamplesPerSecond = 4416 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395688; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.46937s; TotalTimePerSample = 0.18335ms; SamplesPerSecond = 5454 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575479; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.42189s; TotalTimePerSample = 0.16480ms; SamplesPerSecond = 6067 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90485039; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.33354s; TotalTimePerSample = 0.13029ms; SamplesPerSecond = 7675 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324146; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.31174s; TotalTimePerSample = 0.12177ms; SamplesPerSecond = 8211 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109327; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.29732s; TotalTimePerSample = 0.11614ms; SamplesPerSecond = 8610 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496253; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.32607s; TotalTimePerSample = 0.12737ms; SamplesPerSecond = 7851 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944295; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.34337s; TotalTimePerSample = 0.13413ms; SamplesPerSecond = 7455 +MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356027; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.101094 +MPI Rank 2: Starting Epoch 3: learning rate per sample = 
0.000098 effective momentum = 0.656119 MPI Rank 2: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 2 of 3, with 1 datapasses MPI Rank 2: MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED. -MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752815; EvalErr[0]PerSample = 0.52177734; TotalTime = 0.96478s; TotalTimePerSample = 0.09422ms; SamplesPerSecond = 10613 -MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358797; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.79373s; TotalTimePerSample = 0.07751ms; SamplesPerSecond = 12901 -MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705581; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.792243 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752856; EvalErr[0]PerSample = 0.52177734; TotalTime = 1.03760s; TotalTimePerSample = 0.10133ms; SamplesPerSecond = 9868 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358831; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.82561s; TotalTimePerSample = 0.08063ms; SamplesPerSecond = 12402 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.906446 MPI Rank 2: CNTKCommandTrainEnd: speechTrain MPI Rank 2: COMPLETED MPI Rank 2: ~MPIWrapper diff --git a/Tests/Speech/LSTM/FullUtterance/baseline.gpu.txt b/Tests/Speech/LSTM/FullUtterance/baseline.gpu.txt index 84158c7cc..44f79cf16 100644 --- a/Tests/Speech/LSTM/FullUtterance/baseline.gpu.txt +++ b/Tests/Speech/LSTM/FullUtterance/baseline.gpu.txt @@ -1,7 +1,7 @@ -=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk 
configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../cntk.config RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] -running on localhost at 2015/10/01 13:54:42 -command line options: -configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../cntk.config RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] +=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../cntk.config RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. DeviceId=0 Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] +running on localhost at 2015/10/24 12:59:03 +command line: +/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../cntk.config RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. 
DeviceId=0 Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float @@ -15,9 +15,6 @@ speechTrain=[ modelPath=$RunDir$/models/cntkSpeech.dnn deviceId=$DeviceId$ traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$NDLDir$/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -191,10 +188,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output') // sadly we can't say x - y since we want to assign a tag ] ] -RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu +RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. DeviceId=0 -NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. 
Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] @@ -212,12 +209,9 @@ frameMode=false Truncated=true speechTrain=[ action=train - modelPath=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn + modelPath=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn deviceId=0 traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -391,10 +385,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output') // sadly we can't say x - y since we want to assign a tag ] ] -RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu +RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. DeviceId=0 -NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] @@ -405,21 +399,18 @@ speechTrain=[SGD=[numMBsToShowResult=1]] >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> configparameters: cntk.config:command=speechTrain +configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data configparameters: cntk.config:deviceId=0 configparameters: cntk.config:frameMode=false -configparameters: cntk.config:NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. 
configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu +configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu configparameters: cntk.config:speechTrain=[ action=train - modelPath=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn + modelPath=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn deviceId=0 traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -598,9 +589,11 @@ configparameters: cntk.config:Truncated=false <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: speechTrain precision = float -CNTKModelPath: /tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn +CNTKModelPath: /tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn +CNTKCommandTrainInfo: speechTrain : 2 +CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 2 CNTKCommandTrainBegin: speechTrain -NDLBuilder Using GPU 0 +ExperimentalNetworkBuilder using GPU 0 reading script file /home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.scp ... 
948 entries trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion total 132 state names in state list /home/mluser/src/cplx_master/Tests/Speech/Data/state.list @@ -608,2124 +601,3382 @@ htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/gl ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames - nodes in the recurrent loops : -LSTMoutput1.unnamed174 LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.bit LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.unnamed224 LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.bit LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 
LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.unnamed274 LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.bit LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.unnamed174 LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.bit LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.unnamed224 LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.bit LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 
LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.unnamed274 LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.bit LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Printing Gradient Computation Node Order ... 
- -cr[0, 0] = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[0, 0]) -LSTMoutputW[0, 0] = Plus(unnamed283[0, 0], b[132, 1]) -b[132, 1] = LearnableParameter -unnamed283[0, 0] = Times(W[132, 256], unnamed284[0, 0]) -unnamed284[0, 0] = Scale(expsW[0, 0], LSTMoutput3.output[0, 0]) -LSTMoutput3.output[0, 0] = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[0, 0]) -LSTMoutput3.unnamed275[0, 0] = Scale(LSTMoutput3.expsWmr[0, 0], LSTMoutput3.mt[0, 0]) -LSTMoutput3.mt[0, 0] = ElementTimes(LSTMoutput3.ot[0, 0], LSTMoutput3.unnamed274[0, 0]) -LSTMoutput3.unnamed274[0, 0] = Tanh(LSTMoutput3.ct[0, 0]) -LSTMoutput3.ot[0, 0] = Sigmoid(LSTMoutput3.unnamed271[0, 0]) -LSTMoutput3.unnamed271[0, 0] = Plus(LSTMoutput3.unnamed272[0, 0], LSTMoutput3.Wcoct[0, 0]) -LSTMoutput3.Wcoct[0, 0] = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[0, 0]) -LSTMoutput3.unnamed270[0, 0] = Scale(LSTMoutput3.expsWco[0, 0], LSTMoutput3.ct[0, 0]) -LSTMoutput3.ct[0, 0] = Plus(LSTMoutput3.bft[0, 0], LSTMoutput3.bit[0, 0]) -LSTMoutput3.bit[0, 0] = ElementTimes(LSTMoutput3.it[0, 0], LSTMoutput3.unnamed259[0, 0]) -LSTMoutput3.unnamed259[0, 0] = Tanh(LSTMoutput3.unnamed260[0, 0]) -LSTMoutput3.unnamed260[0, 0] = Plus(LSTMoutput3.Wxcx[0, 0], LSTMoutput3.unnamed261[0, 0]) -LSTMoutput3.unnamed261[0, 0] = Plus(LSTMoutput3.Whcdh[0, 0], LSTMoutput3.bc[1024, 1]) -LSTMoutput3.Whcdh[0, 0] = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[0, 0]) -LSTMoutput3.unnamed258[0, 0] = Scale(LSTMoutput3.expsWhc[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.it[0, 0] = Sigmoid(LSTMoutput3.unnamed254[0, 0]) -LSTMoutput3.unnamed254[0, 0] = Plus(LSTMoutput3.unnamed255[0, 0], LSTMoutput3.Wcidc[0, 0]) -LSTMoutput3.Wcidc[0, 0] = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[0, 0]) -LSTMoutput3.unnamed253[0, 0] = Scale(LSTMoutput3.expsWci[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.unnamed255[0, 0] = Plus(LSTMoutput3.unnamed256[0, 0], LSTMoutput3.Whidh[0, 0]) -LSTMoutput3.Whidh[0, 0] = 
Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[0, 0]) -LSTMoutput3.unnamed252[0, 0] = Scale(LSTMoutput3.expsWhi[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.bft[0, 0] = ElementTimes(LSTMoutput3.ft[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.ft[0, 0] = Sigmoid(LSTMoutput3.unnamed265[0, 0]) -LSTMoutput3.unnamed265[0, 0] = Plus(LSTMoutput3.unnamed266[0, 0], LSTMoutput3.Wcfdc[0, 0]) -LSTMoutput3.Wcfdc[0, 0] = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[0, 0]) -LSTMoutput3.unnamed264[0, 0] = Scale(LSTMoutput3.expsWcf[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.dc[1024, 1] = PastValue(LSTMoutput3.ct[0, 0]) -LSTMoutput3.unnamed266[0, 0] = Plus(LSTMoutput3.unnamed267[0, 0], LSTMoutput3.Whfdh[0, 0]) -LSTMoutput3.Whfdh[0, 0] = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[0, 0]) -LSTMoutput3.unnamed263[0, 0] = Scale(LSTMoutput3.expsWhf[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.unnamed272[0, 0] = Plus(LSTMoutput3.unnamed273[0, 0], LSTMoutput3.Whodh[0, 0]) -LSTMoutput3.Whodh[0, 0] = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[0, 0]) -LSTMoutput3.unnamed269[0, 0] = Scale(LSTMoutput3.expsWho[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.dh[256, 1] = PastValue(LSTMoutput3.output[0, 0]) -LSTMoutput3.bc[1024, 1] = LearnableParameter -LSTMoutput3.expsWhc[0, 0] = Exp(LSTMoutput3.sWhc[1, 1]) -LSTMoutput3.sWhc[1, 1] = LearnableParameter -LSTMoutput3.Whc[1024, 256] = LearnableParameter -LSTMoutput3.Wxcx[0, 0] = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[0, 0]) -LSTMoutput3.unnamed257[0, 0] = Scale(LSTMoutput3.expsWxc[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxc[0, 0] = Exp(LSTMoutput3.sWxc[1, 1]) -LSTMoutput3.sWxc[1, 1] = LearnableParameter -LSTMoutput3.Wxc[1024, 256] = LearnableParameter -LSTMoutput3.expsWci[0, 0] = Exp(LSTMoutput3.sWci[1, 1]) -LSTMoutput3.sWci[1, 1] = LearnableParameter -LSTMoutput3.Wci[1024, 1] = LearnableParameter -LSTMoutput3.expsWhi[0, 0] = Exp(LSTMoutput3.sWhi[1, 1]) -LSTMoutput3.sWhi[1, 1] 
= LearnableParameter -LSTMoutput3.Whi[1024, 256] = LearnableParameter -LSTMoutput3.unnamed256[0, 0] = Plus(LSTMoutput3.Wxix[0, 0], LSTMoutput3.bi[1024, 1]) -LSTMoutput3.bi[1024, 1] = LearnableParameter -LSTMoutput3.Wxix[0, 0] = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[0, 0]) -LSTMoutput3.unnamed251[0, 0] = Scale(LSTMoutput3.expsWxi[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxi[0, 0] = Exp(LSTMoutput3.sWxi[1, 1]) -LSTMoutput3.sWxi[1, 1] = LearnableParameter -LSTMoutput3.Wxi[1024, 256] = LearnableParameter -LSTMoutput3.expsWcf[0, 0] = Exp(LSTMoutput3.sWcf[1, 1]) -LSTMoutput3.sWcf[1, 1] = LearnableParameter -LSTMoutput3.Wcf[1024, 1] = LearnableParameter -LSTMoutput3.expsWhf[0, 0] = Exp(LSTMoutput3.sWhf[1, 1]) -LSTMoutput3.sWhf[1, 1] = LearnableParameter -LSTMoutput3.Whf[1024, 256] = LearnableParameter -LSTMoutput3.unnamed267[0, 0] = Plus(LSTMoutput3.Wxfx[0, 0], LSTMoutput3.bf[1024, 1]) -LSTMoutput3.bf[1024, 1] = LearnableParameter -LSTMoutput3.Wxfx[0, 0] = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[0, 0]) -LSTMoutput3.unnamed262[0, 0] = Scale(LSTMoutput3.expsWxf[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxf[0, 0] = Exp(LSTMoutput3.sWxf[1, 1]) -LSTMoutput3.sWxf[1, 1] = LearnableParameter -LSTMoutput3.Wxf[1024, 256] = LearnableParameter -LSTMoutput3.expsWco[0, 0] = Exp(LSTMoutput3.sWco[1, 1]) -LSTMoutput3.sWco[1, 1] = LearnableParameter -LSTMoutput3.Wco[1024, 1] = LearnableParameter -LSTMoutput3.expsWho[0, 0] = Exp(LSTMoutput3.sWho[1, 1]) -LSTMoutput3.sWho[1, 1] = LearnableParameter -LSTMoutput3.Who[1024, 256] = LearnableParameter -LSTMoutput3.unnamed273[0, 0] = Plus(LSTMoutput3.Wxox[0, 0], LSTMoutput3.bo[1024, 1]) -LSTMoutput3.bo[1024, 1] = LearnableParameter -LSTMoutput3.Wxox[0, 0] = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[0, 0]) -LSTMoutput3.unnamed268[0, 0] = Scale(LSTMoutput3.expsWxo[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput2.output[0, 0] = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[0, 
0]) -LSTMoutput2.unnamed225[0, 0] = Scale(LSTMoutput2.expsWmr[0, 0], LSTMoutput2.mt[0, 0]) -LSTMoutput2.mt[0, 0] = ElementTimes(LSTMoutput2.ot[0, 0], LSTMoutput2.unnamed224[0, 0]) -LSTMoutput2.unnamed224[0, 0] = Tanh(LSTMoutput2.ct[0, 0]) -LSTMoutput2.ot[0, 0] = Sigmoid(LSTMoutput2.unnamed221[0, 0]) -LSTMoutput2.unnamed221[0, 0] = Plus(LSTMoutput2.unnamed222[0, 0], LSTMoutput2.Wcoct[0, 0]) -LSTMoutput2.Wcoct[0, 0] = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[0, 0]) -LSTMoutput2.unnamed220[0, 0] = Scale(LSTMoutput2.expsWco[0, 0], LSTMoutput2.ct[0, 0]) -LSTMoutput2.ct[0, 0] = Plus(LSTMoutput2.bft[0, 0], LSTMoutput2.bit[0, 0]) -LSTMoutput2.bit[0, 0] = ElementTimes(LSTMoutput2.it[0, 0], LSTMoutput2.unnamed209[0, 0]) -LSTMoutput2.unnamed209[0, 0] = Tanh(LSTMoutput2.unnamed210[0, 0]) -LSTMoutput2.unnamed210[0, 0] = Plus(LSTMoutput2.Wxcx[0, 0], LSTMoutput2.unnamed211[0, 0]) -LSTMoutput2.unnamed211[0, 0] = Plus(LSTMoutput2.Whcdh[0, 0], LSTMoutput2.bc[1024, 1]) -LSTMoutput2.Whcdh[0, 0] = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[0, 0]) -LSTMoutput2.unnamed208[0, 0] = Scale(LSTMoutput2.expsWhc[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.it[0, 0] = Sigmoid(LSTMoutput2.unnamed204[0, 0]) -LSTMoutput2.unnamed204[0, 0] = Plus(LSTMoutput2.unnamed205[0, 0], LSTMoutput2.Wcidc[0, 0]) -LSTMoutput2.Wcidc[0, 0] = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[0, 0]) -LSTMoutput2.unnamed203[0, 0] = Scale(LSTMoutput2.expsWci[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.unnamed205[0, 0] = Plus(LSTMoutput2.unnamed206[0, 0], LSTMoutput2.Whidh[0, 0]) -LSTMoutput2.Whidh[0, 0] = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[0, 0]) -LSTMoutput2.unnamed202[0, 0] = Scale(LSTMoutput2.expsWhi[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.bft[0, 0] = ElementTimes(LSTMoutput2.ft[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.ft[0, 0] = Sigmoid(LSTMoutput2.unnamed215[0, 0]) -LSTMoutput2.unnamed215[0, 0] = Plus(LSTMoutput2.unnamed216[0, 0], 
LSTMoutput2.Wcfdc[0, 0]) -LSTMoutput2.Wcfdc[0, 0] = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[0, 0]) -LSTMoutput2.unnamed214[0, 0] = Scale(LSTMoutput2.expsWcf[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.dc[1024, 1] = PastValue(LSTMoutput2.ct[0, 0]) -LSTMoutput2.unnamed216[0, 0] = Plus(LSTMoutput2.unnamed217[0, 0], LSTMoutput2.Whfdh[0, 0]) -LSTMoutput2.Whfdh[0, 0] = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[0, 0]) -LSTMoutput2.unnamed213[0, 0] = Scale(LSTMoutput2.expsWhf[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.unnamed222[0, 0] = Plus(LSTMoutput2.unnamed223[0, 0], LSTMoutput2.Whodh[0, 0]) -LSTMoutput2.Whodh[0, 0] = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[0, 0]) -LSTMoutput2.unnamed219[0, 0] = Scale(LSTMoutput2.expsWho[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.dh[256, 1] = PastValue(LSTMoutput2.output[0, 0]) -LSTMoutput2.bc[1024, 1] = LearnableParameter -LSTMoutput2.expsWhc[0, 0] = Exp(LSTMoutput2.sWhc[1, 1]) -LSTMoutput2.sWhc[1, 1] = LearnableParameter -LSTMoutput2.Whc[1024, 256] = LearnableParameter -LSTMoutput2.Wxcx[0, 0] = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[0, 0]) -LSTMoutput2.unnamed207[0, 0] = Scale(LSTMoutput2.expsWxc[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxc[0, 0] = Exp(LSTMoutput2.sWxc[1, 1]) -LSTMoutput2.sWxc[1, 1] = LearnableParameter -LSTMoutput2.Wxc[1024, 256] = LearnableParameter -LSTMoutput2.expsWci[0, 0] = Exp(LSTMoutput2.sWci[1, 1]) -LSTMoutput2.sWci[1, 1] = LearnableParameter -LSTMoutput2.Wci[1024, 1] = LearnableParameter -LSTMoutput2.expsWhi[0, 0] = Exp(LSTMoutput2.sWhi[1, 1]) -LSTMoutput2.sWhi[1, 1] = LearnableParameter -LSTMoutput2.Whi[1024, 256] = LearnableParameter -LSTMoutput2.unnamed206[0, 0] = Plus(LSTMoutput2.Wxix[0, 0], LSTMoutput2.bi[1024, 1]) -LSTMoutput2.bi[1024, 1] = LearnableParameter -LSTMoutput2.Wxix[0, 0] = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[0, 0]) -LSTMoutput2.unnamed201[0, 0] = Scale(LSTMoutput2.expsWxi[0, 0], 
LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxi[0, 0] = Exp(LSTMoutput2.sWxi[1, 1]) -LSTMoutput2.sWxi[1, 1] = LearnableParameter -LSTMoutput2.Wxi[1024, 256] = LearnableParameter -LSTMoutput2.expsWcf[0, 0] = Exp(LSTMoutput2.sWcf[1, 1]) -LSTMoutput2.sWcf[1, 1] = LearnableParameter -LSTMoutput2.Wcf[1024, 1] = LearnableParameter -LSTMoutput2.expsWhf[0, 0] = Exp(LSTMoutput2.sWhf[1, 1]) -LSTMoutput2.sWhf[1, 1] = LearnableParameter -LSTMoutput2.Whf[1024, 256] = LearnableParameter -LSTMoutput2.unnamed217[0, 0] = Plus(LSTMoutput2.Wxfx[0, 0], LSTMoutput2.bf[1024, 1]) -LSTMoutput2.bf[1024, 1] = LearnableParameter -LSTMoutput2.Wxfx[0, 0] = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[0, 0]) -LSTMoutput2.unnamed212[0, 0] = Scale(LSTMoutput2.expsWxf[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxf[0, 0] = Exp(LSTMoutput2.sWxf[1, 1]) -LSTMoutput2.sWxf[1, 1] = LearnableParameter -LSTMoutput2.Wxf[1024, 256] = LearnableParameter -LSTMoutput2.expsWco[0, 0] = Exp(LSTMoutput2.sWco[1, 1]) -LSTMoutput2.sWco[1, 1] = LearnableParameter -LSTMoutput2.Wco[1024, 1] = LearnableParameter -LSTMoutput2.expsWho[0, 0] = Exp(LSTMoutput2.sWho[1, 1]) -LSTMoutput2.sWho[1, 1] = LearnableParameter -LSTMoutput2.Who[1024, 256] = LearnableParameter -LSTMoutput2.unnamed223[0, 0] = Plus(LSTMoutput2.Wxox[0, 0], LSTMoutput2.bo[1024, 1]) -LSTMoutput2.bo[1024, 1] = LearnableParameter -LSTMoutput2.Wxox[0, 0] = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[0, 0]) -LSTMoutput2.unnamed218[0, 0] = Scale(LSTMoutput2.expsWxo[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput1.output[0, 0] = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[0, 0]) -LSTMoutput1.unnamed175[0, 0] = Scale(LSTMoutput1.expsWmr[0, 0], LSTMoutput1.mt[0, 0]) -LSTMoutput1.mt[0, 0] = ElementTimes(LSTMoutput1.ot[0, 0], LSTMoutput1.unnamed174[0, 0]) -LSTMoutput1.unnamed174[0, 0] = Tanh(LSTMoutput1.ct[0, 0]) -LSTMoutput1.ot[0, 0] = Sigmoid(LSTMoutput1.unnamed171[0, 0]) -LSTMoutput1.unnamed171[0, 0] = 
Plus(LSTMoutput1.unnamed172[0, 0], LSTMoutput1.Wcoct[0, 0]) -LSTMoutput1.Wcoct[0, 0] = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[0, 0]) -LSTMoutput1.unnamed170[0, 0] = Scale(LSTMoutput1.expsWco[0, 0], LSTMoutput1.ct[0, 0]) -LSTMoutput1.ct[0, 0] = Plus(LSTMoutput1.bft[0, 0], LSTMoutput1.bit[0, 0]) -LSTMoutput1.bit[0, 0] = ElementTimes(LSTMoutput1.it[0, 0], LSTMoutput1.unnamed159[0, 0]) -LSTMoutput1.unnamed159[0, 0] = Tanh(LSTMoutput1.unnamed160[0, 0]) -LSTMoutput1.unnamed160[0, 0] = Plus(LSTMoutput1.Wxcx[0, 0], LSTMoutput1.unnamed161[0, 0]) -LSTMoutput1.unnamed161[0, 0] = Plus(LSTMoutput1.Whcdh[0, 0], LSTMoutput1.bc[1024, 1]) -LSTMoutput1.Whcdh[0, 0] = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[0, 0]) -LSTMoutput1.unnamed158[0, 0] = Scale(LSTMoutput1.expsWhc[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.it[0, 0] = Sigmoid(LSTMoutput1.unnamed154[0, 0]) -LSTMoutput1.unnamed154[0, 0] = Plus(LSTMoutput1.unnamed155[0, 0], LSTMoutput1.Wcidc[0, 0]) -LSTMoutput1.Wcidc[0, 0] = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[0, 0]) -LSTMoutput1.unnamed153[0, 0] = Scale(LSTMoutput1.expsWci[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.unnamed155[0, 0] = Plus(LSTMoutput1.unnamed156[0, 0], LSTMoutput1.Whidh[0, 0]) -LSTMoutput1.Whidh[0, 0] = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[0, 0]) -LSTMoutput1.unnamed152[0, 0] = Scale(LSTMoutput1.expsWhi[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.bft[0, 0] = ElementTimes(LSTMoutput1.ft[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.ft[0, 0] = Sigmoid(LSTMoutput1.unnamed165[0, 0]) -LSTMoutput1.unnamed165[0, 0] = Plus(LSTMoutput1.unnamed166[0, 0], LSTMoutput1.Wcfdc[0, 0]) -LSTMoutput1.Wcfdc[0, 0] = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[0, 0]) -LSTMoutput1.unnamed164[0, 0] = Scale(LSTMoutput1.expsWcf[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.dc[1024, 1] = PastValue(LSTMoutput1.ct[0, 0]) -LSTMoutput1.unnamed166[0, 0] = Plus(LSTMoutput1.unnamed167[0, 0], 
LSTMoutput1.Whfdh[0, 0]) -LSTMoutput1.Whfdh[0, 0] = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[0, 0]) -LSTMoutput1.unnamed163[0, 0] = Scale(LSTMoutput1.expsWhf[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.unnamed172[0, 0] = Plus(LSTMoutput1.unnamed173[0, 0], LSTMoutput1.Whodh[0, 0]) -LSTMoutput1.Whodh[0, 0] = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[0, 0]) -LSTMoutput1.unnamed169[0, 0] = Scale(LSTMoutput1.expsWho[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.dh[256, 1] = PastValue(LSTMoutput1.output[0, 0]) -LSTMoutput1.bc[1024, 1] = LearnableParameter -LSTMoutput1.expsWhc[0, 0] = Exp(LSTMoutput1.sWhc[1, 1]) -LSTMoutput1.sWhc[1, 1] = LearnableParameter -LSTMoutput1.Whc[1024, 256] = LearnableParameter -LSTMoutput1.Wxcx[0, 0] = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[0, 0]) -LSTMoutput1.unnamed157[0, 0] = Scale(LSTMoutput1.expsWxc[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxc[0, 0] = Exp(LSTMoutput1.sWxc[1, 1]) -LSTMoutput1.sWxc[1, 1] = LearnableParameter -LSTMoutput1.Wxc[1024, 33] = LearnableParameter -LSTMoutput1.expsWci[0, 0] = Exp(LSTMoutput1.sWci[1, 1]) -LSTMoutput1.sWci[1, 1] = LearnableParameter -LSTMoutput1.Wci[1024, 1] = LearnableParameter -LSTMoutput1.expsWhi[0, 0] = Exp(LSTMoutput1.sWhi[1, 1]) -LSTMoutput1.sWhi[1, 1] = LearnableParameter -LSTMoutput1.Whi[1024, 256] = LearnableParameter -LSTMoutput1.unnamed156[0, 0] = Plus(LSTMoutput1.Wxix[0, 0], LSTMoutput1.bi[1024, 1]) -LSTMoutput1.bi[1024, 1] = LearnableParameter -LSTMoutput1.Wxix[0, 0] = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[0, 0]) -LSTMoutput1.unnamed151[0, 0] = Scale(LSTMoutput1.expsWxi[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxi[0, 0] = Exp(LSTMoutput1.sWxi[1, 1]) -LSTMoutput1.sWxi[1, 1] = LearnableParameter -LSTMoutput1.Wxi[1024, 33] = LearnableParameter -LSTMoutput1.expsWcf[0, 0] = Exp(LSTMoutput1.sWcf[1, 1]) -LSTMoutput1.sWcf[1, 1] = LearnableParameter -LSTMoutput1.Wcf[1024, 1] = LearnableParameter -LSTMoutput1.expsWhf[0, 0] = 
Exp(LSTMoutput1.sWhf[1, 1]) -LSTMoutput1.sWhf[1, 1] = LearnableParameter -LSTMoutput1.Whf[1024, 256] = LearnableParameter -LSTMoutput1.unnamed167[0, 0] = Plus(LSTMoutput1.Wxfx[0, 0], LSTMoutput1.bf[1024, 1]) -LSTMoutput1.bf[1024, 1] = LearnableParameter -LSTMoutput1.Wxfx[0, 0] = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[0, 0]) -LSTMoutput1.unnamed162[0, 0] = Scale(LSTMoutput1.expsWxf[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxf[0, 0] = Exp(LSTMoutput1.sWxf[1, 1]) -LSTMoutput1.sWxf[1, 1] = LearnableParameter -LSTMoutput1.Wxf[1024, 33] = LearnableParameter -LSTMoutput1.expsWco[0, 0] = Exp(LSTMoutput1.sWco[1, 1]) -LSTMoutput1.sWco[1, 1] = LearnableParameter -LSTMoutput1.Wco[1024, 1] = LearnableParameter -LSTMoutput1.expsWho[0, 0] = Exp(LSTMoutput1.sWho[1, 1]) -LSTMoutput1.sWho[1, 1] = LearnableParameter -LSTMoutput1.Who[1024, 256] = LearnableParameter -LSTMoutput1.unnamed173[0, 0] = Plus(LSTMoutput1.Wxox[0, 0], LSTMoutput1.bo[1024, 1]) -LSTMoutput1.bo[1024, 1] = LearnableParameter -LSTMoutput1.Wxox[0, 0] = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[0, 0]) -LSTMoutput1.unnamed168[0, 0] = Scale(LSTMoutput1.expsWxo[0, 0], featNorm.xNorm[0, 0]) -featNorm.xNorm[0, 0] = PerDimMeanVarNormalization(feashift[0, 0], featNorm.xMean[0, 0], featNorm.xStdDev[0, 0]) -featNorm.xStdDev[0, 0] = InvStdDev(feashift[0, 0]) -featNorm.xMean[0, 0] = Mean(feashift[0, 0]) -feashift[0, 0] = RowSlice(features[363, 1]) -features[363, 1] = InputValue -LSTMoutput1.expsWxo[0, 0] = Exp(LSTMoutput1.sWxo[1, 1]) -LSTMoutput1.sWxo[1, 1] = LearnableParameter -LSTMoutput1.Wxo[1024, 33] = LearnableParameter -LSTMoutput1.expsWmr[0, 0] = Exp(LSTMoutput1.sWmr[1, 1]) -LSTMoutput1.sWmr[1, 1] = LearnableParameter -LSTMoutput1.Wmr[256, 1024] = LearnableParameter -LSTMoutput2.expsWxo[0, 0] = Exp(LSTMoutput2.sWxo[1, 1]) -LSTMoutput2.sWxo[1, 1] = LearnableParameter -LSTMoutput2.Wxo[1024, 256] = LearnableParameter -LSTMoutput2.expsWmr[0, 0] = Exp(LSTMoutput2.sWmr[1, 1]) 
-LSTMoutput2.sWmr[1, 1] = LearnableParameter -LSTMoutput2.Wmr[256, 1024] = LearnableParameter -LSTMoutput3.expsWxo[0, 0] = Exp(LSTMoutput3.sWxo[1, 1]) -LSTMoutput3.sWxo[1, 1] = LearnableParameter -LSTMoutput3.Wxo[1024, 256] = LearnableParameter -LSTMoutput3.expsWmr[0, 0] = Exp(LSTMoutput3.sWmr[1, 1]) -LSTMoutput3.sWmr[1, 1] = LearnableParameter -LSTMoutput3.Wmr[256, 1024] = LearnableParameter -expsW[0, 0] = Exp(sW[1, 1]) -sW[1, 1] = LearnableParameter -W[132, 256] = LearnableParameter -labels[132, 1] = InputValue - -Validating node cr - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output 
nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 
LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node cr - -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> 
LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 
1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = 
Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [256, 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [256, 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 
1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [256, 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [256, 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, 
MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = 
LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], 
LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [256, 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [256, 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256 {W=4121110508178779188, 
H=3474281011731246130, C=4116338653636141100}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [256, 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 
{W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [256, 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, 
C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] 
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] 
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [256, 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [256, 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, 1]) -> 
[1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [256, 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], 
LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [256, 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 
{W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, 
C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -127 out of 272 nodes do not share the minibatch layout with the input data. - -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> 
[33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = 
Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = 
LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> 
[1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 
1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], 
LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating 
--> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = 
Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 
1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = 
Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = 
Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = 
Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 
256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -127 out of 272 nodes do not share the minibatch layout with the input data. 
- nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 
LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 
LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node ScaledLogLikelihood - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 
LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot 
LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node ScaledLogLikelihood - -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr 
= Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] 
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating 
--> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = 
LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = 
Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc 
= DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, 
C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], 
LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = 
Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, 
H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], 
LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, 
H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> logPrior.Prior = Mean(labels[132, MBSize 1]) -> [132, 1] -Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -> [132, 1] -Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, MBSize 1], logPrior.LogPrior[132, 1]) -> [132, MBSize 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -128 out of 274 nodes do not share the minibatch layout with the input data. 
- -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = 
Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 
= Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = 
Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, 
MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], 
LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> 
LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] 
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> 
[256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = 
Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, 
MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, 
H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = 
DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> logPrior.Prior = Mean(labels[132, MBSize 1]) -> [132, 1] -Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -> [132, 1] -Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, 
MBSize 1], logPrior.LogPrior[132, 1]) -> [132, MBSize 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -128 out of 274 nodes do not share the minibatch layout with the input data. - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 
LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh 
LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node Err - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 
LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh 
LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node Err - -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter 
-> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = 
Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> 
[1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = 
Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> 
[1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = 
LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], 
LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) 
-> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, 
C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] 
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = 
Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = 
Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, 
H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = 
Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> Err = ErrorPrediction(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = 
PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -127 out of 272 nodes do not share the minibatch layout with the input data. - -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, 
MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = 
LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = 
Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = 
Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] 
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] 
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, 
H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = 
Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 
MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> 
[1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = 
LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, 
MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) 
-> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, 
MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> Err = ErrorPrediction(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -127 out of 272 nodes do not share the minibatch layout with the input data. 
+Node --> B = LearnableParameter +Node --> labels = InputValue +Node --> LSTMoutputW./*+*/left./***/left = LearnableParameter +Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].Wmr = LearnableParameter +Node --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].Wmr = LearnableParameter +Node --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].Wmr = LearnableParameter +Node --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> features = InputValue +Node --> feashift = RowSlice +Node --> featNorm.meanVector = Mean +Node --> featNorm.invStdDevVector = InvStdDev +Node --> featNorm = PerDimMeanVarNormalization +Node --> 
LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].dh = PastValue +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].ot.z./*+*/left = Plus +Node --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times +Node --> 
LSTMoutput[1].ft.z./*+*/left = Plus +Node --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].dc = PastValue +Node --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].ft.z = Plus +Node --> LSTMoutput[1].ft = Sigmoid +Node --> LSTMoutput[1].bft = ElementTimes +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].it.z./*+*/left = Plus +Node --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].it.z = Plus +Node --> LSTMoutput[1].it 
= Sigmoid +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[1].bit./*.**/right.z = Plus +Node --> LSTMoutput[1].bit./*.**/right = Tanh +Node --> LSTMoutput[1].bit = ElementTimes +Node --> LSTMoutput[1].ct = Plus +Node --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].ot.z = Plus +Node --> LSTMoutput[1].ot = Sigmoid +Node --> LSTMoutput[1].mt./*.**/right = Tanh +Node --> LSTMoutput[1].mt = ElementTimes +Node --> LSTMoutput[1].output./***/right = Scale +Node --> LSTMoutput[1].output = Times +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].dh = PastValue +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].ot.z./*+*/left = Plus +Node --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].ft.z./*+*/left = Plus +Node --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].dc = PastValue +Node --> 
LSTMoutput[2].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].ft.z = Plus +Node --> LSTMoutput[2].ft = Sigmoid +Node --> LSTMoutput[2].bft = ElementTimes +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].it.z./*+*/left = Plus +Node --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].it.z = Plus +Node --> LSTMoutput[2].it = Sigmoid +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[2].bit./*.**/right.z = Plus +Node --> LSTMoutput[2].bit./*.**/right = Tanh +Node --> LSTMoutput[2].bit = ElementTimes +Node --> LSTMoutput[2].ct = Plus +Node --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].ot.z = Plus +Node --> LSTMoutput[2].ot = Sigmoid +Node --> LSTMoutput[2].mt./*.**/right = Tanh +Node --> LSTMoutput[2].mt = ElementTimes +Node --> LSTMoutput[2].output./***/right = Scale +Node --> LSTMoutput[2].output = Times +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].dh = PastValue +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times +Node --> 
LSTMoutput[3].ot.z./*+*/left = Plus +Node --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[3].ft.z./*+*/left = Plus +Node --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].dc = PastValue +Node --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].ft.z = Plus +Node --> LSTMoutput[3].ft = Sigmoid +Node --> LSTMoutput[3].bft = ElementTimes +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> 
LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[3].it.z./*+*/left = Plus +Node --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].it.z = Plus +Node --> LSTMoutput[3].it = Sigmoid +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[3].bit./*.**/right.z = Plus +Node --> LSTMoutput[3].bit./*.**/right = Tanh +Node --> LSTMoutput[3].bit = ElementTimes +Node --> LSTMoutput[3].ct = Plus +Node --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].ot.z = Plus +Node --> LSTMoutput[3].ot = Sigmoid +Node --> LSTMoutput[3].mt./*.**/right = Tanh +Node --> LSTMoutput[3].mt = ElementTimes +Node --> LSTMoutput[3].output./***/right = Scale +Node --> LSTMoutput[3].output = Times +Node --> LSTMoutputW./*+*/left./***/right = Scale +Node --> LSTMoutputW./*+*/left = Times +Node --> LSTMoutputW = Plus +Node --> Err = ErrorPrediction +Node --> logPrior.x = Mean +Node --> logPrior = Log +Node --> ScaledLogLikelihood = Minus +Node --> cr = CrossEntropyWithSoftmax +N9Microsoft3MSR4CNTK18ComputationNetworkE [ + B : LearnableParameter 132 x 1 () + cr : CrossEntropyWithSoftmax 0 x 0 ( + labels + LSTMoutputW + ) + Err : ErrorPrediction 0 x 0 ( + labels + LSTMoutputW + ) + feashift : RowSlice 0 x 0 ( + features + ) + featNorm : PerDimMeanVarNormalization 0 x 0 ( + feashift + featNorm.meanVector + featNorm.invStdDevVector + ) + featNorm.invStdDevVector : InvStdDev 0 x 0 ( + feashift + ) + featNorm.meanVector : Mean 0 x 0 ( + feashift + ) + features : InputValue 363 x 1 () + labels : InputValue 132 x 1 () + logPrior : Log 0 x 0 ( + logPrior.x + ) + logPrior.x : Mean 0 x 0 ( + labels + ) + LSTMoutput[1].bft : ElementTimes 0 x 0 ( + LSTMoutput[1].ft + LSTMoutput[1].dc + ) + LSTMoutput[1].bit : ElementTimes 0 x 0 ( + LSTMoutput[1].it + 
LSTMoutput[1].bit./*.**/right + ) + LSTMoutput[1].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z + ) + LSTMoutput[1].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left + LSTMoutput[1].bit./*.**/right.z./*+*/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right : 
LearnableParameter 1024 x 1 () + LSTMoutput[1].ct : Plus 0 x 0 ( + LSTMoutput[1].bft + LSTMoutput[1].bit + ) + LSTMoutput[1].dc : PastValue 1024 x 1 ( + LSTMoutput[1].ct + ) + LSTMoutput[1].dh : PastValue 256 x 1 ( + LSTMoutput[1].output + ) + LSTMoutput[1].ft : Sigmoid 0 x 0 ( + LSTMoutput[1].ft.z + ) + LSTMoutput[1].ft.z : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left + LSTMoutput[1].ft.z./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left + LSTMoutput[1].ft.z./*+*/left./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].ft.z./*+*/right.matrix + ) + LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].dc + ) + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it : Sigmoid 0 x 0 ( + LSTMoutput[1].it.z + ) + LSTMoutput[1].it.z : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left + LSTMoutput[1].it.z./*+*/right + ) + LSTMoutput[1].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left + LSTMoutput[1].it.z./*+*/left./*+*/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + 
LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/left + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].it.z./*+*/right.matrix + ) + LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].dc + ) + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].mt : ElementTimes 0 x 0 ( + LSTMoutput[1].ot + LSTMoutput[1].mt./*.**/right + ) + LSTMoutput[1].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[1].ct + ) + LSTMoutput[1].ot : Sigmoid 0 x 0 ( + LSTMoutput[1].ot.z + ) + LSTMoutput[1].ot.z : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left + LSTMoutput[1].ot.z./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left + LSTMoutput[1].ot.z./*+*/left./*+*/right + ) + 
LSTMoutput[1].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].ot.z./*+*/right.matrix + ) + LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].ot.z./*+*/right.matrix : Scale 0 x 
0 ( + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].ct + ) + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].output : Times 0 x 0 ( + LSTMoutput[1].Wmr + LSTMoutput[1].output./***/right + ) + LSTMoutput[1].output./***/right : Scale 0 x 0 ( + LSTMoutput[1].output./***/right.scalarScalingFactor + LSTMoutput[1].mt + ) + LSTMoutput[1].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].Wmr : LearnableParameter 256 x 1024 () + LSTMoutput[2].bft : ElementTimes 0 x 0 ( + LSTMoutput[2].ft + LSTMoutput[2].dc + ) + LSTMoutput[2].bit : ElementTimes 0 x 0 ( + LSTMoutput[2].it + LSTMoutput[2].bit./*.**/right + ) + LSTMoutput[2].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z + ) + LSTMoutput[2].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left + LSTMoutput[2].bit./*.**/right.z./*+*/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + 
LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ct : Plus 0 x 0 ( + LSTMoutput[2].bft + LSTMoutput[2].bit + ) + LSTMoutput[2].dc : PastValue 1024 x 1 ( + LSTMoutput[2].ct + ) + LSTMoutput[2].dh : PastValue 256 x 1 ( + LSTMoutput[2].output + ) + LSTMoutput[2].ft : Sigmoid 0 x 0 ( + LSTMoutput[2].ft.z + ) + LSTMoutput[2].ft.z : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left + LSTMoutput[2].ft.z./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left + LSTMoutput[2].ft.z./*+*/left./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].ft.z./*+*/right.matrix + ) + LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].dc + ) + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it : Sigmoid 0 x 0 ( + LSTMoutput[2].it.z + ) + LSTMoutput[2].it.z : Plus 0 x 0 ( + 
LSTMoutput[2].it.z./*+*/left + LSTMoutput[2].it.z./*+*/right + ) + LSTMoutput[2].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left + LSTMoutput[2].it.z./*+*/left./*+*/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/left + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it.z./*+*/right : DiagTimes 0 x 0 ( + 
LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].it.z./*+*/right.matrix + ) + LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].dc + ) + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].mt : ElementTimes 0 x 0 ( + LSTMoutput[2].ot + LSTMoutput[2].mt./*.**/right + ) + LSTMoutput[2].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[2].ct + ) + LSTMoutput[2].ot : Sigmoid 0 x 0 ( + LSTMoutput[2].ot.z + ) + LSTMoutput[2].ot.z : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left + LSTMoutput[2].ot.z./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left + LSTMoutput[2].ot.z./*+*/left./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + 
LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].ot.z./*+*/right.matrix + ) + LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].ct + ) + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].output : Times 0 x 0 ( + LSTMoutput[2].Wmr + LSTMoutput[2].output./***/right + ) + LSTMoutput[2].output./***/right : Scale 0 x 0 ( + LSTMoutput[2].output./***/right.scalarScalingFactor + LSTMoutput[2].mt + ) + LSTMoutput[2].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].Wmr : LearnableParameter 256 x 1024 () + LSTMoutput[3].bft : ElementTimes 0 x 0 ( + LSTMoutput[3].ft + LSTMoutput[3].dc + ) + LSTMoutput[3].bit : 
ElementTimes 0 x 0 ( + LSTMoutput[3].it + LSTMoutput[3].bit./*.**/right + ) + LSTMoutput[3].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z + ) + LSTMoutput[3].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left + LSTMoutput[3].bit./*.**/right.z./*+*/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ct : Plus 0 x 0 ( + LSTMoutput[3].bft + LSTMoutput[3].bit + ) + LSTMoutput[3].dc : PastValue 1024 x 1 ( + LSTMoutput[3].ct + ) + LSTMoutput[3].dh : PastValue 256 x 1 ( + LSTMoutput[3].output + ) + LSTMoutput[3].ft : Sigmoid 0 x 0 ( + LSTMoutput[3].ft.z + ) + LSTMoutput[3].ft.z : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left + LSTMoutput[3].ft.z./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left + LSTMoutput[3].ft.z./*+*/left./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].ft.z./*+*/right.matrix + ) + LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].dc + ) + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it : Sigmoid 0 x 0 ( + LSTMoutput[3].it.z + ) + LSTMoutput[3].it.z : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left + LSTMoutput[3].it.z./*+*/right + ) + LSTMoutput[3].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left + LSTMoutput[3].it.z./*+*/left./*+*/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + 
LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/left + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].it.z./*+*/right.matrix + ) + LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].dc + ) + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].mt : ElementTimes 0 x 0 ( + LSTMoutput[3].ot + LSTMoutput[3].mt./*.**/right + ) + LSTMoutput[3].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[3].ct + ) + LSTMoutput[3].ot : Sigmoid 0 x 0 ( + LSTMoutput[3].ot.z + ) + LSTMoutput[3].ot.z : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left + 
LSTMoutput[3].ot.z./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left + LSTMoutput[3].ot.z./*+*/left./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector + 
LSTMoutput[3].ot.z./*+*/right.matrix + ) + LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].ct + ) + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].output : Times 0 x 0 ( + LSTMoutput[3].Wmr + LSTMoutput[3].output./***/right + ) + LSTMoutput[3].output./***/right : Scale 0 x 0 ( + LSTMoutput[3].output./***/right.scalarScalingFactor + LSTMoutput[3].mt + ) + LSTMoutput[3].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].Wmr : LearnableParameter 256 x 1024 () + LSTMoutputW : Plus 0 x 0 ( + LSTMoutputW./*+*/left + B + ) + LSTMoutputW./*+*/left : Times 0 x 0 ( + LSTMoutputW./*+*/left./***/left + LSTMoutputW./*+*/left./***/right + ) + LSTMoutputW./*+*/left./***/left : LearnableParameter 132 x 256 () + LSTMoutputW./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutputW./*+*/left./***/right.scalarScalingFactor + LSTMoutput[3].output + ) + LSTMoutputW./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + ScaledLogLikelihood : Minus 0 x 0 ( + LSTMoutputW + logPrior + ) +] GetTrainCriterionNodes ... GetEvalCriterionNodes ... 
-Found 6 PreCompute nodes - NodeName: featNorm.xMean - NodeName: featNorm.xStdDev - NodeName: logPrior.Prior - NodeName: featNorm.xMean - NodeName: featNorm.xStdDev - NodeName: logPrior.Prior + nodes in the recurrent loops : +LSTMoutput[1].mt./*.**/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].mt./*.**/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right 
LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].mt./*.**/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].mt./*.**/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right 
LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].mt./*.**/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].mt./*.**/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right 
LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output + +Validating for node cr. 272 nodes to process in pass 1. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], 
featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], 
LSTMoutput[1].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = 
Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = 
LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = 
LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = 
LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], 
LSTMoutput[2].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = 
Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) 
-> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> 
LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr. 183 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> 
[1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = 
Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = 
ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = 
Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 
MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) 
-> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = 
Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = 
Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = 
Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = 
LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = 
DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> 
[256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr. 60 nodes to process in pass 3. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], 
featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating 
--> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> 
[1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = 
PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 
1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = 
Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = 
Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = 
Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = 
Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = 
DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = 
DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 
MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating 
--> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, 
MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 
1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = 
Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = 
DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = 
PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 
1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 
1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +127 out of 272 nodes do not share the minibatch layout with the input data. + + +Precomputing --> 3 PreCompute nodes found. + + NodeName: featNorm.invStdDevVector + NodeName: featNorm.meanVector + NodeName: logPrior.x minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 
LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 
LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right 
LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node featNorm.xMean 
+Validating for node featNorm.invStdDevVector. 3 nodes to process in pass 1. -Validating --> features = InputValue -> [363, MBSize 348] -Validating --> feashift = RowSlice(features[363, MBSize 348]) -> [33, MBSize 348] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 348]) -> [33, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating for node featNorm.invStdDevVector, final verification. +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] 1 out of 3 nodes do not share the minibatch layout with the input data. + nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 
LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 
LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right 
LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node featNorm.xStdDev 
+Validating for node featNorm.meanVector. 3 nodes to process in pass 1. -Validating --> features = InputValue -> [363, MBSize 348] -Validating --> feashift = RowSlice(features[363, MBSize 348]) -> [33, MBSize 348] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 348]) -> [33, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating for node featNorm.meanVector, final verification. +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] 1 out of 3 nodes do not share the minibatch layout with the input data. + nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc 
LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc 
LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right 
LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node logPrior.Prior 
+Validating for node logPrior.x. 2 nodes to process in pass 1. -Validating --> labels = InputValue -> [132, MBSize 348] -Validating --> logPrior.Prior = Mean(labels[132, MBSize 348]) -> [132, 1] +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] +Validating for node logPrior.x. 1 nodes to process in pass 2. +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] + +Validating for node logPrior.x, final verification. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] 1 out of 2 nodes do not share the minibatch layout with the input data. + +EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. + +Precomputing --> Completed. + Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.025000 momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.025000 effective momentum = 0.000000 minibatchiterator: epoch 0: frames [0..2560] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses + nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right 
LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output + +Validating for node Err. 
272 nodes to process in pass 1. + +Validating --> labels = InputValue -> [132, MBSize 218] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> 
[256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 218] +Validating --> feashift = RowSlice(features[363, MBSize 218]) -> [33, MBSize 218] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 218]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 218]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 218], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] 
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 218], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, 
MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 218], LSTMoutput[1].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 218], LSTMoutput[1].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 218], LSTMoutput[1].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].output = 
Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] 
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 218], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 
218] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 218], LSTMoutput[2].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 218], LSTMoutput[2].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] 
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 218], LSTMoutput[2].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 218], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = 
Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 218], LSTMoutput[3].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 218], LSTMoutput[3].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 218], LSTMoutput[3].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 218]) -> 
[256, MBSize 218] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 218]) -> [132, MBSize 218] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 218], B[132, 1]) -> [132, MBSize 218] +Validating --> Err = ErrorPrediction(labels[132, MBSize 218], LSTMoutputW[132, MBSize 218]) -> [1, 1] + +Validating for node Err. 180 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 218] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 218] +Validating --> feashift = RowSlice(features[363, MBSize 218]) -> [33, MBSize 218] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 218]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 218]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 218], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, 
MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) 
-> [33, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, 
MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left = 
Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 218], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], 
LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 218], LSTMoutput[1].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 218], LSTMoutput[1].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], 
LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 218], LSTMoutput[1].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating 
--> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh 
= PastValue(LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft = 
Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 218], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 218], LSTMoutput[2].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 218], LSTMoutput[2].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 218], LSTMoutput[2].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], 
LSTMoutput[2].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = 
Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 218], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 218], 
LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 218], LSTMoutput[3].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, 
MBSize 218], LSTMoutput[3].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 218], LSTMoutput[3].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 218]) -> [132, MBSize 218] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 218], B[132, 1]) -> [132, MBSize 218] +Validating --> Err = ErrorPrediction(labels[132, MBSize 218], LSTMoutputW[132, MBSize 218]) -> [1, 1] + +Validating for node Err. 6 nodes to process in pass 3. 
+ +Validating --> labels = InputValue -> [132, MBSize 218] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 218] +Validating --> feashift = RowSlice(features[363, MBSize 218]) -> [33, MBSize 218] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 218]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 218]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 218], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z = 
Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 218], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = 
Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 218], LSTMoutput[1].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 218], LSTMoutput[1].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 218], LSTMoutput[1].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].output./***/right = 
Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> 
[1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 218], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 218], LSTMoutput[2].bit./*.**/right[1024, MBSize 
218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 218], LSTMoutput[2].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 218], LSTMoutput[2].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 218], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 218], LSTMoutput[3].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 218], LSTMoutput[3].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 218], LSTMoutput[3].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 218]) -> [132, MBSize 218] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 218], B[132, 1]) -> [132, MBSize 218] +Validating --> Err = ErrorPrediction(labels[132, MBSize 218], LSTMoutputW[132, MBSize 218]) -> [1, 1] + +Validating for node Err, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 218] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 218] +Validating --> feashift = RowSlice(features[363, MBSize 218]) -> [33, MBSize 218] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 218]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 218]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 218], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft.z = 
Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 218], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = 
Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 218], LSTMoutput[1].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 218], LSTMoutput[1].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 218], LSTMoutput[1].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].output./***/right = 
Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> 
[1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 218], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 218], LSTMoutput[2].bit./*.**/right[1024, MBSize 
218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 218], LSTMoutput[2].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 218], LSTMoutput[2].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 218], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 218], LSTMoutput[3].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 218], LSTMoutput[3].bit[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> 
LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 218], LSTMoutput[3].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 218]) -> [1024, MBSize 218] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 218]) -> [132, MBSize 218] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 218], B[132, 1]) -> [132, MBSize 218] +Validating --> Err = ErrorPrediction(labels[132, MBSize 218], LSTMoutputW[132, MBSize 218]) -> [1, 1] + +127 out of 272 nodes do not share the minibatch layout with the input data. + Starting minibatch loop. 
- Epoch[ 1 of 2]-Minibatch[ 1- 1 of 128]: SamplesSeen = 348; TrainLossPerSample = 4.88242489; EvalErr[0]PerSample = 0.99137931; TotalTime = 4.53694s; TotalTimePerSample = 13.03719ms; SamplesPerSecond = 76 - Epoch[ 1 of 2]-Minibatch[ 2- 2 of 128]: SamplesSeen = 168; TrainLossPerSample = 4.61489796; EvalErr[0]PerSample = 0.93452381; TotalTime = 2.28522s; TotalTimePerSample = 13.60248ms; SamplesPerSecond = 73 - Epoch[ 1 of 2]-Minibatch[ 3- 3 of 128]: SamplesSeen = 198; TrainLossPerSample = 4.45420884; EvalErr[0]PerSample = 0.87373737; TotalTime = 2.33478s; TotalTimePerSample = 11.79183ms; SamplesPerSecond = 84 - Epoch[ 1 of 2]-Minibatch[ 4- 4 of 128]: SamplesSeen = 258; TrainLossPerSample = 4.39047076; EvalErr[0]PerSample = 0.83333333; TotalTime = 3.50424s; TotalTimePerSample = 13.58233ms; SamplesPerSecond = 73 - Epoch[ 1 of 2]-Minibatch[ 5- 5 of 128]: SamplesSeen = 248; TrainLossPerSample = 57.29654817; EvalErr[0]PerSample = 0.92338710; TotalTime = 2.91642s; TotalTimePerSample = 11.75974ms; SamplesPerSecond = 85 - Epoch[ 1 of 2]-Minibatch[ 6- 6 of 128]: SamplesSeen = 358; TrainLossPerSample = 5.41497905; EvalErr[0]PerSample = 0.91620112; TotalTime = 4.56634s; TotalTimePerSample = 12.75514ms; SamplesPerSecond = 78 - Epoch[ 1 of 2]-Minibatch[ 7- 7 of 128]: SamplesSeen = 98; TrainLossPerSample = 4.44218351; EvalErr[0]PerSample = 0.88775510; TotalTime = 1.34473s; TotalTimePerSample = 13.72177ms; SamplesPerSecond = 72 - Epoch[ 1 of 2]-Minibatch[ 8- 8 of 128]: SamplesSeen = 278; TrainLossPerSample = 4.20048336; EvalErr[0]PerSample = 0.77697842; TotalTime = 3.89413s; TotalTimePerSample = 14.00767ms; SamplesPerSecond = 71 - Epoch[ 1 of 2]-Minibatch[ 9- 9 of 128]: SamplesSeen = 288; TrainLossPerSample = 4.66156684; EvalErr[0]PerSample = 0.92361111; TotalTime = 3.77711s; TotalTimePerSample = 13.11497ms; SamplesPerSecond = 76 - Epoch[ 1 of 2]-Minibatch[ 10- 10 of 128]: SamplesSeen = 258; TrainLossPerSample = 4.35901920; EvalErr[0]PerSample = 0.86434109; TotalTime = 3.59676s; 
TotalTimePerSample = 13.94092ms; SamplesPerSecond = 71 + Epoch[ 1 of 2]-Minibatch[ 1- 1 of 128]: SamplesSeen = 348; TrainLossPerSample = 4.88242489; EvalErr[0]PerSample = 0.99137931; TotalTime = 4.44437s; TotalTimePerSample = 12.77117ms; SamplesPerSecond = 78 + Epoch[ 1 of 2]-Minibatch[ 2- 2 of 128]: SamplesSeen = 168; TrainLossPerSample = 4.61489796; EvalErr[0]PerSample = 0.93452381; TotalTime = 2.16304s; TotalTimePerSample = 12.87527ms; SamplesPerSecond = 77 + Epoch[ 1 of 2]-Minibatch[ 3- 3 of 128]: SamplesSeen = 198; TrainLossPerSample = 4.45420760; EvalErr[0]PerSample = 0.87373737; TotalTime = 2.53836s; TotalTimePerSample = 12.81998ms; SamplesPerSecond = 78 + Epoch[ 1 of 2]-Minibatch[ 4- 4 of 128]: SamplesSeen = 258; TrainLossPerSample = 4.39046981; EvalErr[0]PerSample = 0.83333333; TotalTime = 3.29028s; TotalTimePerSample = 12.75303ms; SamplesPerSecond = 78 + Epoch[ 1 of 2]-Minibatch[ 5- 5 of 128]: SamplesSeen = 248; TrainLossPerSample = 57.29652651; EvalErr[0]PerSample = 0.92338710; TotalTime = 3.15566s; TotalTimePerSample = 12.72446ms; SamplesPerSecond = 78 + Epoch[ 1 of 2]-Minibatch[ 6- 6 of 128]: SamplesSeen = 358; TrainLossPerSample = 5.41497905; EvalErr[0]PerSample = 0.91620112; TotalTime = 4.60661s; TotalTimePerSample = 12.86763ms; SamplesPerSecond = 77 + Epoch[ 1 of 2]-Minibatch[ 7- 7 of 128]: SamplesSeen = 98; TrainLossPerSample = 4.44218351; EvalErr[0]PerSample = 0.88775510; TotalTime = 1.29487s; TotalTimePerSample = 13.21293ms; SamplesPerSecond = 75 + Epoch[ 1 of 2]-Minibatch[ 8- 8 of 128]: SamplesSeen = 278; TrainLossPerSample = 4.20048336; EvalErr[0]PerSample = 0.77697842; TotalTime = 3.55463s; TotalTimePerSample = 12.78645ms; SamplesPerSecond = 78 + Epoch[ 1 of 2]-Minibatch[ 9- 9 of 128]: SamplesSeen = 288; TrainLossPerSample = 4.66156684; EvalErr[0]PerSample = 0.92361111; TotalTime = 3.67845s; TotalTimePerSample = 12.77239ms; SamplesPerSecond = 78 + Epoch[ 1 of 2]-Minibatch[ 10- 10 of 128]: SamplesSeen = 258; TrainLossPerSample = 4.35901920; 
EvalErr[0]PerSample = 0.86434109; TotalTime = 3.30382s; TotalTimePerSample = 12.80552ms; SamplesPerSecond = 78 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 1 of 2]-Minibatch[ 11- 11 of 128]: SamplesSeen = 98; TrainLossPerSample = 3.70348374; EvalErr[0]PerSample = 0.67346939; TotalTime = 1.31943s; TotalTimePerSample = 13.46359ms; SamplesPerSecond = 74 -Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 9.6498594; EvalErrPerSample = 0.88722092; Ave LearnRatePerSample = 0.02500000037; EpochTime=34.105187 -Starting Epoch 2: learning rate per sample = 0.025000 momentum = 0.900000 + Epoch[ 1 of 2]-Minibatch[ 11- 11 of 128]: SamplesSeen = 98; TrainLossPerSample = 3.70348374; EvalErr[0]PerSample = 0.67346939; TotalTime = 1.28901s; TotalTimePerSample = 13.15320ms; SamplesPerSecond = 76 +Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 9.6498575; EvalErrPerSample = 0.88722092; Ave LearnRatePerSample = 0.02500000037; EpochTime=33.380318 +Starting Epoch 2: learning rate per sample = 0.025000 effective momentum = 0.900000 minibatchiterator: epoch 1: frames [2560..5120] (first utterance at frame 2598), data subset 0 of 1, with 1 datapasses Starting minibatch loop. 
- Epoch[ 2 of 2]-Minibatch[ 1- 1 of 128]: SamplesSeen = 138; TrainLossPerSample = 4.16283737; EvalErr[0]PerSample = 0.82608696; TotalTime = 1.62122s; TotalTimePerSample = 11.74799ms; SamplesPerSecond = 85 - Epoch[ 2 of 2]-Minibatch[ 2- 2 of 128]: SamplesSeen = 318; TrainLossPerSample = 4.31444468; EvalErr[0]PerSample = 0.94654088; TotalTime = 3.71468s; TotalTimePerSample = 11.68139ms; SamplesPerSecond = 85 - Epoch[ 2 of 2]-Minibatch[ 3- 3 of 128]: SamplesSeen = 368; TrainLossPerSample = 4.13243335; EvalErr[0]PerSample = 0.88586957; TotalTime = 4.31795s; TotalTimePerSample = 11.73355ms; SamplesPerSecond = 85 - Epoch[ 2 of 2]-Minibatch[ 4- 4 of 128]: SamplesSeen = 98; TrainLossPerSample = 3.75084204; EvalErr[0]PerSample = 1.00000000; TotalTime = 1.16138s; TotalTimePerSample = 11.85083ms; SamplesPerSecond = 84 - Epoch[ 2 of 2]-Minibatch[ 5- 5 of 128]: SamplesSeen = 308; TrainLossPerSample = 3.53811210; EvalErr[0]PerSample = 0.81168831; TotalTime = 3.57337s; TotalTimePerSample = 11.60187ms; SamplesPerSecond = 86 - Epoch[ 2 of 2]-Minibatch[ 6- 6 of 128]: SamplesSeen = 258; TrainLossPerSample = 3.79568458; EvalErr[0]PerSample = 0.94186047; TotalTime = 3.02145s; TotalTimePerSample = 11.71103ms; SamplesPerSecond = 85 - Epoch[ 2 of 2]-Minibatch[ 7- 7 of 128]: SamplesSeen = 238; TrainLossPerSample = 4.43607414; EvalErr[0]PerSample = 0.97058824; TotalTime = 2.94950s; TotalTimePerSample = 12.39287ms; SamplesPerSecond = 80 - Epoch[ 2 of 2]-Minibatch[ 8- 8 of 128]: SamplesSeen = 268; TrainLossPerSample = 4.03240876; EvalErr[0]PerSample = 0.86194030; TotalTime = 3.38811s; TotalTimePerSample = 12.64221ms; SamplesPerSecond = 79 - Epoch[ 2 of 2]-Minibatch[ 9- 9 of 128]: SamplesSeen = 308; TrainLossPerSample = 4.48105849; EvalErr[0]PerSample = 0.95779221; TotalTime = 3.88247s; TotalTimePerSample = 12.60544ms; SamplesPerSecond = 79 - Epoch[ 2 of 2]-Minibatch[ 10- 10 of 128]: SamplesSeen = 288; TrainLossPerSample = 4.29093424; EvalErr[0]PerSample = 0.92708333; TotalTime = 3.66267s; 
TotalTimePerSample = 12.71761ms; SamplesPerSecond = 78 -Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 4.1143761; EvalErrPerSample = 0.90965247; Ave LearnRatePerSample = 0.02500000037; EpochTime=31.297491 + Epoch[ 2 of 2]-Minibatch[ 1- 1 of 128]: SamplesSeen = 138; TrainLossPerSample = 4.16283693; EvalErr[0]PerSample = 0.82608696; TotalTime = 1.79211s; TotalTimePerSample = 12.98633ms; SamplesPerSecond = 77 + Epoch[ 2 of 2]-Minibatch[ 2- 2 of 128]: SamplesSeen = 318; TrainLossPerSample = 4.31444449; EvalErr[0]PerSample = 0.94654088; TotalTime = 4.04166s; TotalTimePerSample = 12.70961ms; SamplesPerSecond = 78 + Epoch[ 2 of 2]-Minibatch[ 3- 3 of 128]: SamplesSeen = 368; TrainLossPerSample = 4.13243302; EvalErr[0]PerSample = 0.88586957; TotalTime = 4.73206s; TotalTimePerSample = 12.85885ms; SamplesPerSecond = 77 + Epoch[ 2 of 2]-Minibatch[ 4- 4 of 128]: SamplesSeen = 98; TrainLossPerSample = 3.75084204; EvalErr[0]PerSample = 1.00000000; TotalTime = 1.29247s; TotalTimePerSample = 13.18848ms; SamplesPerSecond = 75 + Epoch[ 2 of 2]-Minibatch[ 5- 5 of 128]: SamplesSeen = 308; TrainLossPerSample = 3.53811289; EvalErr[0]PerSample = 0.81168831; TotalTime = 3.92750s; TotalTimePerSample = 12.75161ms; SamplesPerSecond = 78 + Epoch[ 2 of 2]-Minibatch[ 6- 6 of 128]: SamplesSeen = 258; TrainLossPerSample = 3.79568458; EvalErr[0]PerSample = 0.94186047; TotalTime = 3.30113s; TotalTimePerSample = 12.79506ms; SamplesPerSecond = 78 + Epoch[ 2 of 2]-Minibatch[ 7- 7 of 128]: SamplesSeen = 238; TrainLossPerSample = 4.43607414; EvalErr[0]PerSample = 0.97058824; TotalTime = 3.05107s; TotalTimePerSample = 12.81964ms; SamplesPerSecond = 78 + Epoch[ 2 of 2]-Minibatch[ 8- 8 of 128]: SamplesSeen = 268; TrainLossPerSample = 4.03240876; EvalErr[0]PerSample = 0.86194030; TotalTime = 3.42811s; TotalTimePerSample = 12.79145ms; SamplesPerSecond = 78 + Epoch[ 2 of 2]-Minibatch[ 9- 9 of 128]: SamplesSeen = 308; TrainLossPerSample = 4.48105849; EvalErr[0]PerSample = 0.95779221; TotalTime = 
3.92167s; TotalTimePerSample = 12.73270ms; SamplesPerSecond = 78 + Epoch[ 2 of 2]-Minibatch[ 10- 10 of 128]: SamplesSeen = 288; TrainLossPerSample = 4.29093424; EvalErr[0]PerSample = 0.92708333; TotalTime = 3.67384s; TotalTimePerSample = 12.75639ms; SamplesPerSecond = 78 +Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 4.1143761; EvalErrPerSample = 0.90965247; Ave LearnRatePerSample = 0.02500000037; EpochTime=33.170301 CNTKCommandTrainEnd: speechTrain COMPLETED diff --git a/Tests/Speech/LSTM/FullUtterance/baseline.windows.gpu.txt b/Tests/Speech/LSTM/FullUtterance/baseline.windows.gpu.txt index b508e0391..a220aa8f7 100644 --- a/Tests/Speech/LSTM/FullUtterance/baseline.windows.gpu.txt +++ b/Tests/Speech/LSTM/FullUtterance/baseline.windows.gpu.txt @@ -1,16 +1,16 @@ -=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] +=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] ------------------------------------------------------------------- 
Build info: - Built time: Sep 30 2015 17:18:44 - Last modified date: Wed Sep 30 14:44:42 2015 + Built time: Oct 24 2015 13:33:25 + Last modified date: Thu Oct 22 16:00:27 2015 Built by amitaga on Amitaga-Win-DT3 - Build Path: E:\NetScale\CNTK\git_repos\cplx_master\MachineLearning\CNTK\ + Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 ------------------------------------------------------------------- -running on Amitaga-Win-DT3 at 2015/10/01 21:35:39 -command line options: -configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] +running on Amitaga-Win-DT3 at 2015/10/24 21:51:44 +command line: +E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float @@ -24,9 +24,6 @@ speechTrain=[ modelPath=$RunDir$/models/cntkSpeech.dnn deviceId=$DeviceId$ traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$NDLDir$/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 
minibatchSize=20 @@ -200,10 +197,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output') // sadly we can't say x - y since we want to assign a tag ] ] -RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu -DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu +DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 -NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] @@ -221,12 +218,9 @@ frameMode=false Truncated=true speechTrain=[ action=train - modelPath=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn + modelPath=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn deviceId=0 traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -246,11 +240,11 @@ speechTrain=[ features=[ dim=363 type=Real - scpFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.scp + scpFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp ] labels=[ - mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf - labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list + mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf + labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list labelDim=132 labelType=Category ] @@ -400,10 +394,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, 
features); ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output') // sadly we can't say x - y since we want to assign a tag ] ] -RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu -DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu +DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 -NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] @@ -414,21 +408,18 @@ speechTrain=[SGD=[numMBsToShowResult=1]] >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> configparameters: cntk.config:command=speechTrain -configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data +configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM +configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data configparameters: cntk.config:deviceId=0 configparameters: cntk.config:frameMode=false -configparameters: cntk.config:NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu +configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu configparameters: cntk.config:speechTrain=[ action=train - modelPath=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn + 
modelPath=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn deviceId=0 traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -448,11 +439,11 @@ configparameters: cntk.config:speechTrain=[ features=[ dim=363 type=Real - scpFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.scp + scpFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp ] labels=[ - mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf - labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list + mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf + labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list labelDim=132 labelType=Category ] @@ -599,8 +590,7 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); Err = ErrorPrediction(labels, LSTMoutputW, tag='eval') // this also gets tracked // decoding logPrior = LogPrior(labels) - ScaledLogLikelihood = Minus(LSTMoutputW, logPCNTKModelPath: C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn -rior, tag='output') // sadly we can't say x - y since we want to assign a tag + ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output') // sadly we can't say x - y since we want to assign a tag ] ] [reader=[nbruttsineachrecurrentiter=1]] [SGD=[epochSize=2560]] [SGD=[maxEpochs=2]] [SGD=[numMBsToShowResult=1]] @@ -608,2129 +598,3389 @@ configparameters: cntk.config:Truncated=false <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: speechTrain precision = float +CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn +CNTKCommandTrainInfo: 
speechTrain : 2 +CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 2 CNTKCommandTrainBegin: speechTrain -NDLBuilder Using GPU 0 -reading script file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.scp ... 948 entries +ExperimentalNetworkBuilder using GPU 0 +reading script file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp ... 948 entries trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion -total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list -htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries +total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list +htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. 
chunk size: 316.0 utterances, 84244.7 frames - nodes in the recurrent loops : -LSTMoutput1.unnamed174 LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.bit LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.unnamed224 LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.bit LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.unnamed274 LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.bit LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.unnamed265 LSTMoutput3.ft 
LSTMoutput3.bft LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.unnamed174 LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.bit LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.unnamed224 LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.bit LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.unnamed274 LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed254 
LSTMoutput3.it LSTMoutput3.bit LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Printing Gradient Computation Node Order ... - -cr[0, 0] = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[0, 0]) -LSTMoutputW[0, 0] = Plus(unnamed283[0, 0], b[132, 1]) -b[132, 1] = LearnableParameter -unnamed283[0, 0] = Times(W[132, 256], unnamed284[0, 0]) -unnamed284[0, 0] = Scale(expsW[0, 0], LSTMoutput3.output[0, 0]) -LSTMoutput3.output[0, 0] = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[0, 0]) -LSTMoutput3.unnamed275[0, 0] = Scale(LSTMoutput3.expsWmr[0, 0], LSTMoutput3.mt[0, 0]) -LSTMoutput3.mt[0, 0] = ElementTimes(LSTMoutput3.ot[0, 0], LSTMoutput3.unnamed274[0, 0]) -LSTMoutput3.unnamed274[0, 0] = Tanh(LSTMoutput3.ct[0, 0]) -LSTMoutput3.ot[0, 0] = Sigmoid(LSTMoutput3.unnamed271[0, 0]) -LSTMoutput3.unnamed271[0, 0] = Plus(LSTMoutput3.unnamed272[0, 0], LSTMoutput3.Wcoct[0, 0]) -LSTMoutput3.Wcoct[0, 0] = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[0, 0]) -LSTMoutput3.unnamed270[0, 0] = Scale(LSTMoutput3.expsWco[0, 0], LSTMoutput3.ct[0, 0]) -LSTMoutput3.ct[0, 0] = Plus(LSTMoutput3.bft[0, 0], LSTMoutput3.bit[0, 0]) -LSTMoutput3.bit[0, 0] = ElementTimes(LSTMoutput3.it[0, 0], LSTMoutput3.unnamed259[0, 0]) -LSTMoutput3.unnamed259[0, 0] = Tanh(LSTMoutput3.unnamed260[0, 0]) -LSTMoutput3.unnamed260[0, 0] = Plus(LSTMoutput3.Wxcx[0, 0], LSTMoutput3.unnamed261[0, 0]) -LSTMoutput3.unnamed261[0, 0] = Plus(LSTMoutput3.Whcdh[0, 0], LSTMoutput3.bc[1024, 1]) -LSTMoutput3.Whcdh[0, 0] = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[0, 0]) -LSTMoutput3.unnamed258[0, 0] = Scale(LSTMoutput3.expsWhc[0, 0], 
LSTMoutput3.dh[256, 1]) -LSTMoutput3.it[0, 0] = Sigmoid(LSTMoutput3.unnamed254[0, 0]) -LSTMoutput3.unnamed254[0, 0] = Plus(LSTMoutput3.unnamed255[0, 0], LSTMoutput3.Wcidc[0, 0]) -LSTMoutput3.Wcidc[0, 0] = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[0, 0]) -LSTMoutput3.unnamed253[0, 0] = Scale(LSTMoutput3.expsWci[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.unnamed255[0, 0] = Plus(LSTMoutput3.unnamed256[0, 0], LSTMoutput3.Whidh[0, 0]) -LSTMoutput3.Whidh[0, 0] = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[0, 0]) -LSTMoutput3.unnamed252[0, 0] = Scale(LSTMoutput3.expsWhi[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.bft[0, 0] = ElementTimes(LSTMoutput3.ft[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.ft[0, 0] = Sigmoid(LSTMoutput3.unnamed265[0, 0]) -LSTMoutput3.unnamed265[0, 0] = Plus(LSTMoutput3.unnamed266[0, 0], LSTMoutput3.Wcfdc[0, 0]) -LSTMoutput3.Wcfdc[0, 0] = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[0, 0]) -LSTMoutput3.unnamed264[0, 0] = Scale(LSTMoutput3.expsWcf[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.dc[1024, 1] = PastValue(LSTMoutput3.ct[0, 0]) -LSTMoutput3.unnamed266[0, 0] = Plus(LSTMoutput3.unnamed267[0, 0], LSTMoutput3.Whfdh[0, 0]) -LSTMoutput3.Whfdh[0, 0] = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[0, 0]) -LSTMoutput3.unnamed263[0, 0] = Scale(LSTMoutput3.expsWhf[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.unnamed272[0, 0] = Plus(LSTMoutput3.unnamed273[0, 0], LSTMoutput3.Whodh[0, 0]) -LSTMoutput3.Whodh[0, 0] = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[0, 0]) -LSTMoutput3.unnamed269[0, 0] = Scale(LSTMoutput3.expsWho[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.dh[256, 1] = PastValue(LSTMoutput3.output[0, 0]) -LSTMoutput3.bc[1024, 1] = LearnableParameter -LSTMoutput3.expsWhc[0, 0] = Exp(LSTMoutput3.sWhc[1, 1]) -LSTMoutput3.sWhc[1, 1] = LearnableParameter -LSTMoutput3.Whc[1024, 256] = LearnableParameter -LSTMoutput3.Wxcx[0, 0] = Times(LSTMoutput3.Wxc[1024, 256], 
LSTMoutput3.unnamed257[0, 0]) -LSTMoutput3.unnamed257[0, 0] = Scale(LSTMoutput3.expsWxc[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxc[0, 0] = Exp(LSTMoutput3.sWxc[1, 1]) -LSTMoutput3.sWxc[1, 1] = LearnableParameter -LSTMoutput3.Wxc[1024, 256] = LearnableParameter -LSTMoutput3.expsWci[0, 0] = Exp(LSTMoutput3.sWci[1, 1]) -LSTMoutput3.sWci[1, 1] = LearnableParameter -LSTMoutput3.Wci[1024, 1] = LearnableParameter -LSTMoutput3.expsWhi[0, 0] = Exp(LSTMoutput3.sWhi[1, 1]) -LSTMoutput3.sWhi[1, 1] = LearnableParameter -LSTMoutput3.Whi[1024, 256] = LearnableParameter -LSTMoutput3.unnamed256[0, 0] = Plus(LSTMoutput3.Wxix[0, 0], LSTMoutput3.bi[1024, 1]) -LSTMoutput3.bi[1024, 1] = LearnableParameter -LSTMoutput3.Wxix[0, 0] = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[0, 0]) -LSTMoutput3.unnamed251[0, 0] = Scale(LSTMoutput3.expsWxi[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxi[0, 0] = Exp(LSTMoutput3.sWxi[1, 1]) -LSTMoutput3.sWxi[1, 1] = LearnableParameter -LSTMoutput3.Wxi[1024, 256] = LearnableParameter -LSTMoutput3.expsWcf[0, 0] = Exp(LSTMoutput3.sWcf[1, 1]) -LSTMoutput3.sWcf[1, 1] = LearnableParameter -LSTMoutput3.Wcf[1024, 1] = LearnableParameter -LSTMoutput3.expsWhf[0, 0] = Exp(LSTMoutput3.sWhf[1, 1]) -LSTMoutput3.sWhf[1, 1] = LearnableParameter -LSTMoutput3.Whf[1024, 256] = LearnableParameter -LSTMoutput3.unnamed267[0, 0] = Plus(LSTMoutput3.Wxfx[0, 0], LSTMoutput3.bf[1024, 1]) -LSTMoutput3.bf[1024, 1] = LearnableParameter -LSTMoutput3.Wxfx[0, 0] = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[0, 0]) -LSTMoutput3.unnamed262[0, 0] = Scale(LSTMoutput3.expsWxf[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxf[0, 0] = Exp(LSTMoutput3.sWxf[1, 1]) -LSTMoutput3.sWxf[1, 1] = LearnableParameter -LSTMoutput3.Wxf[1024, 256] = LearnableParameter -LSTMoutput3.expsWco[0, 0] = Exp(LSTMoutput3.sWco[1, 1]) -LSTMoutput3.sWco[1, 1] = LearnableParameter -LSTMoutput3.Wco[1024, 1] = LearnableParameter -LSTMoutput3.expsWho[0, 0] = 
Exp(LSTMoutput3.sWho[1, 1]) -LSTMoutput3.sWho[1, 1] = LearnableParameter -LSTMoutput3.Who[1024, 256] = LearnableParameter -LSTMoutput3.unnamed273[0, 0] = Plus(LSTMoutput3.Wxox[0, 0], LSTMoutput3.bo[1024, 1]) -LSTMoutput3.bo[1024, 1] = LearnableParameter -LSTMoutput3.Wxox[0, 0] = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[0, 0]) -LSTMoutput3.unnamed268[0, 0] = Scale(LSTMoutput3.expsWxo[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput2.output[0, 0] = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[0, 0]) -LSTMoutput2.unnamed225[0, 0] = Scale(LSTMoutput2.expsWmr[0, 0], LSTMoutput2.mt[0, 0]) -LSTMoutput2.mt[0, 0] = ElementTimes(LSTMoutput2.ot[0, 0], LSTMoutput2.unnamed224[0, 0]) -LSTMoutput2.unnamed224[0, 0] = Tanh(LSTMoutput2.ct[0, 0]) -LSTMoutput2.ot[0, 0] = Sigmoid(LSTMoutput2.unnamed221[0, 0]) -LSTMoutput2.unnamed221[0, 0] = Plus(LSTMoutput2.unnamed222[0, 0], LSTMoutput2.Wcoct[0, 0]) -LSTMoutput2.Wcoct[0, 0] = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[0, 0]) -LSTMoutput2.unnamed220[0, 0] = Scale(LSTMoutput2.expsWco[0, 0], LSTMoutput2.ct[0, 0]) -LSTMoutput2.ct[0, 0] = Plus(LSTMoutput2.bft[0, 0], LSTMoutput2.bit[0, 0]) -LSTMoutput2.bit[0, 0] = ElementTimes(LSTMoutput2.it[0, 0], LSTMoutput2.unnamed209[0, 0]) -LSTMoutput2.unnamed209[0, 0] = Tanh(LSTMoutput2.unnamed210[0, 0]) -LSTMoutput2.unnamed210[0, 0] = Plus(LSTMoutput2.Wxcx[0, 0], LSTMoutput2.unnamed211[0, 0]) -LSTMoutput2.unnamed211[0, 0] = Plus(LSTMoutput2.Whcdh[0, 0], LSTMoutput2.bc[1024, 1]) -LSTMoutput2.Whcdh[0, 0] = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[0, 0]) -LSTMoutput2.unnamed208[0, 0] = Scale(LSTMoutput2.expsWhc[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.it[0, 0] = Sigmoid(LSTMoutput2.unnamed204[0, 0]) -LSTMoutput2.unnamed204[0, 0] = Plus(LSTMoutput2.unnamed205[0, 0], LSTMoutput2.Wcidc[0, 0]) -LSTMoutput2.Wcidc[0, 0] = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[0, 0]) -LSTMoutput2.unnamed203[0, 0] = Scale(LSTMoutput2.expsWci[0, 0], 
LSTMoutput2.dc[1024, 1]) -LSTMoutput2.unnamed205[0, 0] = Plus(LSTMoutput2.unnamed206[0, 0], LSTMoutput2.Whidh[0, 0]) -LSTMoutput2.Whidh[0, 0] = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[0, 0]) -LSTMoutput2.unnamed202[0, 0] = Scale(LSTMoutput2.expsWhi[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.bft[0, 0] = ElementTimes(LSTMoutput2.ft[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.ft[0, 0] = Sigmoid(LSTMoutput2.unnamed215[0, 0]) -LSTMoutput2.unnamed215[0, 0] = Plus(LSTMoutput2.unnamed216[0, 0], LSTMoutput2.Wcfdc[0, 0]) -LSTMoutput2.Wcfdc[0, 0] = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[0, 0]) -LSTMoutput2.unnamed214[0, 0] = Scale(LSTMoutput2.expsWcf[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.dc[1024, 1] = PastValue(LSTMoutput2.ct[0, 0]) -LSTMoutput2.unnamed216[0, 0] = Plus(LSTMoutput2.unnamed217[0, 0], LSTMoutput2.Whfdh[0, 0]) -LSTMoutput2.Whfdh[0, 0] = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[0, 0]) -LSTMoutput2.unnamed213[0, 0] = Scale(LSTMoutput2.expsWhf[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.unnamed222[0, 0] = Plus(LSTMoutput2.unnamed223[0, 0], LSTMoutput2.Whodh[0, 0]) -LSTMoutput2.Whodh[0, 0] = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[0, 0]) -LSTMoutput2.unnamed219[0, 0] = Scale(LSTMoutput2.expsWho[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.dh[256, 1] = PastValue(LSTMoutput2.output[0, 0]) -LSTMoutput2.bc[1024, 1] = LearnableParameter -LSTMoutput2.expsWhc[0, 0] = Exp(LSTMoutput2.sWhc[1, 1]) -LSTMoutput2.sWhc[1, 1] = LearnableParameter -LSTMoutput2.Whc[1024, 256] = LearnableParameter -LSTMoutput2.Wxcx[0, 0] = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[0, 0]) -LSTMoutput2.unnamed207[0, 0] = Scale(LSTMoutput2.expsWxc[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxc[0, 0] = Exp(LSTMoutput2.sWxc[1, 1]) -LSTMoutput2.sWxc[1, 1] = LearnableParameter -LSTMoutput2.Wxc[1024, 256] = LearnableParameter -LSTMoutput2.expsWci[0, 0] = Exp(LSTMoutput2.sWci[1, 1]) -LSTMoutput2.sWci[1, 1] = 
LearnableParameter -LSTMoutput2.Wci[1024, 1] = LearnableParameter -LSTMoutput2.expsWhi[0, 0] = Exp(LSTMoutput2.sWhi[1, 1]) -LSTMoutput2.sWhi[1, 1] = LearnableParameter -LSTMoutput2.Whi[1024, 256] = LearnableParameter -LSTMoutput2.unnamed206[0, 0] = Plus(LSTMoutput2.Wxix[0, 0], LSTMoutput2.bi[1024, 1]) -LSTMoutput2.bi[1024, 1] = LearnableParameter -LSTMoutput2.Wxix[0, 0] = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[0, 0]) -LSTMoutput2.unnamed201[0, 0] = Scale(LSTMoutput2.expsWxi[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxi[0, 0] = Exp(LSTMoutput2.sWxi[1, 1]) -LSTMoutput2.sWxi[1, 1] = LearnableParameter -LSTMoutput2.Wxi[1024, 256] = LearnableParameter -LSTMoutput2.expsWcf[0, 0] = Exp(LSTMoutput2.sWcf[1, 1]) -LSTMoutput2.sWcf[1, 1] = LearnableParameter -LSTMoutput2.Wcf[1024, 1] = LearnableParameter -LSTMoutput2.expsWhf[0, 0] = Exp(LSTMoutput2.sWhf[1, 1]) -LSTMoutput2.sWhf[1, 1] = LearnableParameter -LSTMoutput2.Whf[1024, 256] = LearnableParameter -LSTMoutput2.unnamed217[0, 0] = Plus(LSTMoutput2.Wxfx[0, 0], LSTMoutput2.bf[1024, 1]) -LSTMoutput2.bf[1024, 1] = LearnableParameter -LSTMoutput2.Wxfx[0, 0] = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[0, 0]) -LSTMoutput2.unnamed212[0, 0] = Scale(LSTMoutput2.expsWxf[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxf[0, 0] = Exp(LSTMoutput2.sWxf[1, 1]) -LSTMoutput2.sWxf[1, 1] = LearnableParameter -LSTMoutput2.Wxf[1024, 256] = LearnableParameter -LSTMoutput2.expsWco[0, 0] = Exp(LSTMoutput2.sWco[1, 1]) -LSTMoutput2.sWco[1, 1] = LearnableParameter -LSTMoutput2.Wco[1024, 1] = LearnableParameter -LSTMoutput2.expsWho[0, 0] = Exp(LSTMoutput2.sWho[1, 1]) -LSTMoutput2.sWho[1, 1] = LearnableParameter -LSTMoutput2.Who[1024, 256] = LearnableParameter -LSTMoutput2.unnamed223[0, 0] = Plus(LSTMoutput2.Wxox[0, 0], LSTMoutput2.bo[1024, 1]) -LSTMoutput2.bo[1024, 1] = LearnableParameter -LSTMoutput2.Wxox[0, 0] = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[0, 0]) -LSTMoutput2.unnamed218[0, 0] = 
Scale(LSTMoutput2.expsWxo[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput1.output[0, 0] = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[0, 0]) -LSTMoutput1.unnamed175[0, 0] = Scale(LSTMoutput1.expsWmr[0, 0], LSTMoutput1.mt[0, 0]) -LSTMoutput1.mt[0, 0] = ElementTimes(LSTMoutput1.ot[0, 0], LSTMoutput1.unnamed174[0, 0]) -LSTMoutput1.unnamed174[0, 0] = Tanh(LSTMoutput1.ct[0, 0]) -LSTMoutput1.ot[0, 0] = Sigmoid(LSTMoutput1.unnamed171[0, 0]) -LSTMoutput1.unnamed171[0, 0] = Plus(LSTMoutput1.unnamed172[0, 0], LSTMoutput1.Wcoct[0, 0]) -LSTMoutput1.Wcoct[0, 0] = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[0, 0]) -LSTMoutput1.unnamed170[0, 0] = Scale(LSTMoutput1.expsWco[0, 0], LSTMoutput1.ct[0, 0]) -LSTMoutput1.ct[0, 0] = Plus(LSTMoutput1.bft[0, 0], LSTMoutput1.bit[0, 0]) -LSTMoutput1.bit[0, 0] = ElementTimes(LSTMoutput1.it[0, 0], LSTMoutput1.unnamed159[0, 0]) -LSTMoutput1.unnamed159[0, 0] = Tanh(LSTMoutput1.unnamed160[0, 0]) -LSTMoutput1.unnamed160[0, 0] = Plus(LSTMoutput1.Wxcx[0, 0], LSTMoutput1.unnamed161[0, 0]) -LSTMoutput1.unnamed161[0, 0] = Plus(LSTMoutput1.Whcdh[0, 0], LSTMoutput1.bc[1024, 1]) -LSTMoutput1.Whcdh[0, 0] = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[0, 0]) -LSTMoutput1.unnamed158[0, 0] = Scale(LSTMoutput1.expsWhc[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.it[0, 0] = Sigmoid(LSTMoutput1.unnamed154[0, 0]) -LSTMoutput1.unnamed154[0, 0] = Plus(LSTMoutput1.unnamed155[0, 0], LSTMoutput1.Wcidc[0, 0]) -LSTMoutput1.Wcidc[0, 0] = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[0, 0]) -LSTMoutput1.unnamed153[0, 0] = Scale(LSTMoutput1.expsWci[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.unnamed155[0, 0] = Plus(LSTMoutput1.unnamed156[0, 0], LSTMoutput1.Whidh[0, 0]) -LSTMoutput1.Whidh[0, 0] = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[0, 0]) -LSTMoutput1.unnamed152[0, 0] = Scale(LSTMoutput1.expsWhi[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.bft[0, 0] = ElementTimes(LSTMoutput1.ft[0, 0], LSTMoutput1.dc[1024, 
1]) -LSTMoutput1.ft[0, 0] = Sigmoid(LSTMoutput1.unnamed165[0, 0]) -LSTMoutput1.unnamed165[0, 0] = Plus(LSTMoutput1.unnamed166[0, 0], LSTMoutput1.Wcfdc[0, 0]) -LSTMoutput1.Wcfdc[0, 0] = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[0, 0]) -LSTMoutput1.unnamed164[0, 0] = Scale(LSTMoutput1.expsWcf[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.dc[1024, 1] = PastValue(LSTMoutput1.ct[0, 0]) -LSTMoutput1.unnamed166[0, 0] = Plus(LSTMoutput1.unnamed167[0, 0], LSTMoutput1.Whfdh[0, 0]) -LSTMoutput1.Whfdh[0, 0] = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[0, 0]) -LSTMoutput1.unnamed163[0, 0] = Scale(LSTMoutput1.expsWhf[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.unnamed172[0, 0] = Plus(LSTMoutput1.unnamed173[0, 0], LSTMoutput1.Whodh[0, 0]) -LSTMoutput1.Whodh[0, 0] = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[0, 0]) -LSTMoutput1.unnamed169[0, 0] = Scale(LSTMoutput1.expsWho[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.dh[256, 1] = PastValue(LSTMoutput1.output[0, 0]) -LSTMoutput1.bc[1024, 1] = LearnableParameter -LSTMoutput1.expsWhc[0, 0] = Exp(LSTMoutput1.sWhc[1, 1]) -LSTMoutput1.sWhc[1, 1] = LearnableParameter -LSTMoutput1.Whc[1024, 256] = LearnableParameter -LSTMoutput1.Wxcx[0, 0] = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[0, 0]) -LSTMoutput1.unnamed157[0, 0] = Scale(LSTMoutput1.expsWxc[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxc[0, 0] = Exp(LSTMoutput1.sWxc[1, 1]) -LSTMoutput1.sWxc[1, 1] = LearnableParameter -LSTMoutput1.Wxc[1024, 33] = LearnableParameter -LSTMoutput1.expsWci[0, 0] = Exp(LSTMoutput1.sWci[1, 1]) -LSTMoutput1.sWci[1, 1] = LearnableParameter -LSTMoutput1.Wci[1024, 1] = LearnableParameter -LSTMoutput1.expsWhi[0, 0] = Exp(LSTMoutput1.sWhi[1, 1]) -LSTMoutput1.sWhi[1, 1] = LearnableParameter -LSTMoutput1.Whi[1024, 256] = LearnableParameter -LSTMoutput1.unnamed156[0, 0] = Plus(LSTMoutput1.Wxix[0, 0], LSTMoutput1.bi[1024, 1]) -LSTMoutput1.bi[1024, 1] = LearnableParameter -LSTMoutput1.Wxix[0, 0] = 
Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[0, 0]) -LSTMoutput1.unnamed151[0, 0] = Scale(LSTMoutput1.expsWxi[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxi[0, 0] = Exp(LSTMoutput1.sWxi[1, 1]) -LSTMoutput1.sWxi[1, 1] = LearnableParameter -LSTMoutput1.Wxi[1024, 33] = LearnableParameter -LSTMoutput1.expsWcf[0, 0] = Exp(LSTMoutput1.sWcf[1, 1]) -LSTMoutput1.sWcf[1, 1] = LearnableParameter -LSTMoutput1.Wcf[1024, 1] = LearnableParameter -LSTMoutput1.expsWhf[0, 0] = Exp(LSTMoutput1.sWhf[1, 1]) -LSTMoutput1.sWhf[1, 1] = LearnableParameter -LSTMoutput1.Whf[1024, 256] = LearnableParameter -LSTMoutput1.unnamed167[0, 0] = Plus(LSTMoutput1.Wxfx[0, 0], LSTMoutput1.bf[1024, 1]) -LSTMoutput1.bf[1024, 1] = LearnableParameter -LSTMoutput1.Wxfx[0, 0] = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[0, 0]) -LSTMoutput1.unnamed162[0, 0] = Scale(LSTMoutput1.expsWxf[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxf[0, 0] = Exp(LSTMoutput1.sWxf[1, 1]) -LSTMoutput1.sWxf[1, 1] = LearnableParameter -LSTMoutput1.Wxf[1024, 33] = LearnableParameter -LSTMoutput1.expsWco[0, 0] = Exp(LSTMoutput1.sWco[1, 1]) -LSTMoutput1.sWco[1, 1] = LearnableParameter -LSTMoutput1.Wco[1024, 1] = LearnableParameter -LSTMoutput1.expsWho[0, 0] = Exp(LSTMoutput1.sWho[1, 1]) -LSTMoutput1.sWho[1, 1] = LearnableParameter -LSTMoutput1.Who[1024, 256] = LearnableParameter -LSTMoutput1.unnamed173[0, 0] = Plus(LSTMoutput1.Wxox[0, 0], LSTMoutput1.bo[1024, 1]) -LSTMoutput1.bo[1024, 1] = LearnableParameter -LSTMoutput1.Wxox[0, 0] = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[0, 0]) -LSTMoutput1.unnamed168[0, 0] = Scale(LSTMoutput1.expsWxo[0, 0], featNorm.xNorm[0, 0]) -featNorm.xNorm[0, 0] = PerDimMeanVarNormalization(feashift[0, 0], featNorm.xMean[0, 0], featNorm.xStdDev[0, 0]) -featNorm.xStdDev[0, 0] = InvStdDev(feashift[0, 0]) -featNorm.xMean[0, 0] = Mean(feashift[0, 0]) -feashift[0, 0] = RowSlice(features[363, 1]) -features[363, 1] = InputValue -LSTMoutput1.expsWxo[0, 0] = 
Exp(LSTMoutput1.sWxo[1, 1]) -LSTMoutput1.sWxo[1, 1] = LearnableParameter -LSTMoutput1.Wxo[1024, 33] = LearnableParameter -LSTMoutput1.expsWmr[0, 0] = Exp(LSTMoutput1.sWmr[1, 1]) -LSTMoutput1.sWmr[1, 1] = LearnableParameter -LSTMoutput1.Wmr[256, 1024] = LearnableParameter -LSTMoutput2.expsWxo[0, 0] = Exp(LSTMoutput2.sWxo[1, 1]) -LSTMoutput2.sWxo[1, 1] = LearnableParameter -LSTMoutput2.Wxo[1024, 256] = LearnableParameter -LSTMoutput2.expsWmr[0, 0] = Exp(LSTMoutput2.sWmr[1, 1]) -LSTMoutput2.sWmr[1, 1] = LearnableParameter -LSTMoutput2.Wmr[256, 1024] = LearnableParameter -LSTMoutput3.expsWxo[0, 0] = Exp(LSTMoutput3.sWxo[1, 1]) -LSTMoutput3.sWxo[1, 1] = LearnableParameter -LSTMoutput3.Wxo[1024, 256] = LearnableParameter -LSTMoutput3.expsWmr[0, 0] = Exp(LSTMoutput3.sWmr[1, 1]) -LSTMoutput3.sWmr[1, 1] = LearnableParameter -LSTMoutput3.Wmr[256, 1024] = LearnableParameter -expsW[0, 0] = Exp(sW[1, 1]) -sW[1, 1] = LearnableParameter -W[132, 256] = LearnableParameter -labels[132, 1] = InputValue - -Validating node cr - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc 
LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh 
LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node cr - -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> 
LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> 
LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = 
LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 
1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) 
-> [256, 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -> [1024, 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = 
Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = 
Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> 
LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput2.Whidh = 
Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -> [1024, 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = 
Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] 
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] 
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], 
LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -> [1024, 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> 
[1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -127 out of 272 nodes do not share the minibatch layout with the input data. 
- -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) 
-> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 
1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating 
--> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, 
MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 
MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 
256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = 
Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = 
ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], 
LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> 
LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], 
LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], 
LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> 
[1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -127 out of 272 nodes do not share the minibatch layout with the input data. 
- nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 
LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 
LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node ScaledLogLikelihood - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 
LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot 
LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node ScaledLogLikelihood - -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr 
= Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] 
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], 
LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct 
= Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, 
MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = 
LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = 
Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, 
H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 
MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] 
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, 
MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> logPrior.Prior = Mean(labels[132, MBSize 1]) -> [132, 1] -Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -> [132, 1] -Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, MBSize 1], 
logPrior.LogPrior[132, 1]) -> [132, MBSize 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -128 out of 274 nodes do not share the minibatch layout with the input data. - -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> 
LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> 
[1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, 
MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], 
LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct 
= Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, 
MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = 
LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = 
Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, 
H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 
MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] 
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, 
MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> logPrior.Prior = Mean(labels[132, MBSize 1]) -> [132, 1] -Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -> [132, 1] -Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, MBSize 1], 
logPrior.LogPrior[132, 1]) -> [132, MBSize 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -128 out of 274 nodes do not share the minibatch layout with the input data. - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc 
LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc 
LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node Err - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 
LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 
LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node Err - -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> 
LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], 
LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = 
Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = 
ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 
1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = 
Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, 
C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 
1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, 
MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = 
Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], 
LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> 
LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> Err = ErrorPrediction(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -127 out of 272 nodes do not share the minibatch layout with the input data. 
- -Validating --> labels = InputValue -> [132, MBSize 1] -Validating --> W = LearnableParameter -> [132, 256] -Validating --> sW = LearnableParameter -> [1, 1] -Validating --> expsW = Exp(sW[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024] -Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1] -Validating --> features = InputValue -> [363, MBSize 1] -Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) 
-> [33, MBSize 1] -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 
1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33] -Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1] -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating 
--> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, 
MBSize 1] -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 
MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 
256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.unnamed207 = 
Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bft = 
ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], 
LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1] -Validating --> 
LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], 
LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256] -Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1] -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1] -Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1] -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], 
LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> 
[1024, MBSize 1] -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] 
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1] -Validating --> b = LearnableParameter -> [132, 1] -Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1] -Validating --> Err = ErrorPrediction(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] ---- -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1] -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1] - -Revalidating - - - - - -127 out of 272 nodes do not share the minibatch layout with the input data. 
+Node --> B = LearnableParameter +Node --> labels = InputValue +Node --> LSTMoutputW./*+*/left./***/left = LearnableParameter +Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].Wmr = LearnableParameter +Node --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].Wmr = LearnableParameter +Node --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].Wmr = LearnableParameter +Node --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> features = InputValue +Node --> feashift = RowSlice +Node --> featNorm.meanVector = Mean +Node --> featNorm.invStdDevVector = InvStdDev +Node --> featNorm = PerDimMeanVarNormalization +Node --> 
LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].dh = PastValue +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].ot.z./*+*/left = Plus +Node --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times +Node --> 
LSTMoutput[1].ft.z./*+*/left = Plus +Node --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].dc = PastValue +Node --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].ft.z = Plus +Node --> LSTMoutput[1].ft = Sigmoid +Node --> LSTMoutput[1].bft = ElementTimes +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].it.z./*+*/left = Plus +Node --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].it.z = Plus +Node --> LSTMoutput[1].it 
= Sigmoid +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[1].bit./*.**/right.z = Plus +Node --> LSTMoutput[1].bit./*.**/right = Tanh +Node --> LSTMoutput[1].bit = ElementTimes +Node --> LSTMoutput[1].ct = Plus +Node --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].ot.z = Plus +Node --> LSTMoutput[1].ot = Sigmoid +Node --> LSTMoutput[1].mt./*.**/right = Tanh +Node --> LSTMoutput[1].mt = ElementTimes +Node --> LSTMoutput[1].output./***/right = Scale +Node --> LSTMoutput[1].output = Times +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].dh = PastValue +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].ot.z./*+*/left = Plus +Node --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].ft.z./*+*/left = Plus +Node --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].dc = PastValue +Node --> 
LSTMoutput[2].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].ft.z = Plus +Node --> LSTMoutput[2].ft = Sigmoid +Node --> LSTMoutput[2].bft = ElementTimes +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].it.z./*+*/left = Plus +Node --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].it.z = Plus +Node --> LSTMoutput[2].it = Sigmoid +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[2].bit./*.**/right.z = Plus +Node --> LSTMoutput[2].bit./*.**/right = Tanh +Node --> LSTMoutput[2].bit = ElementTimes +Node --> LSTMoutput[2].ct = Plus +Node --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].ot.z = Plus +Node --> LSTMoutput[2].ot = Sigmoid +Node --> LSTMoutput[2].mt./*.**/right = Tanh +Node --> LSTMoutput[2].mt = ElementTimes +Node --> LSTMoutput[2].output./***/right = Scale +Node --> LSTMoutput[2].output = Times +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].dh = PastValue +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times +Node --> 
LSTMoutput[3].ot.z./*+*/left = Plus +Node --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[3].ft.z./*+*/left = Plus +Node --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].dc = PastValue +Node --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].ft.z = Plus +Node --> LSTMoutput[3].ft = Sigmoid +Node --> LSTMoutput[3].bft = ElementTimes +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> 
LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[3].it.z./*+*/left = Plus +Node --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].it.z = Plus +Node --> LSTMoutput[3].it = Sigmoid +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[3].bit./*.**/right.z = Plus +Node --> LSTMoutput[3].bit./*.**/right = Tanh +Node --> LSTMoutput[3].bit = ElementTimes +Node --> LSTMoutput[3].ct = Plus +Node --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].ot.z = Plus +Node --> LSTMoutput[3].ot = Sigmoid +Node --> LSTMoutput[3].mt./*.**/right = Tanh +Node --> LSTMoutput[3].mt = ElementTimes +Node --> LSTMoutput[3].output./***/right = Scale +Node --> LSTMoutput[3].output = Times +Node --> LSTMoutputW./*+*/left./***/right = Scale +Node --> LSTMoutputW./*+*/left = Times +Node --> LSTMoutputW = Plus +Node --> Err = ErrorPrediction +Node --> logPrior.x = Mean +Node --> logPrior = Log +Node --> ScaledLogLikelihood = Minus +Node --> cr = CrossEntropyWithSoftmax +class Microsoft::MSR::CNTK::ComputationNetwork [ + B : LearnableParameter 132 x 1 () + cr : CrossEntropyWithSoftmax 0 x 0 ( + labels + LSTMoutputW + ) + Err : ErrorPrediction 0 x 0 ( + labels + LSTMoutputW + ) + feashift : RowSlice 0 x 0 ( + features + ) + featNorm : PerDimMeanVarNormalization 0 x 0 ( + feashift + featNorm.meanVector + featNorm.invStdDevVector + ) + featNorm.invStdDevVector : InvStdDev 0 x 0 ( + feashift + ) + featNorm.meanVector : Mean 0 x 0 ( + feashift + ) + features : InputValue 363 x 1 () + labels : InputValue 132 x 1 () + logPrior : Log 0 x 0 ( + logPrior.x + ) + logPrior.x : Mean 0 x 0 ( + labels + ) + LSTMoutput[1].bft : ElementTimes 0 x 0 ( + LSTMoutput[1].ft + LSTMoutput[1].dc + ) + LSTMoutput[1].bit : ElementTimes 0 x 0 ( + LSTMoutput[1].it + 
LSTMoutput[1].bit./*.**/right + ) + LSTMoutput[1].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z + ) + LSTMoutput[1].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left + LSTMoutput[1].bit./*.**/right.z./*+*/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right : 
LearnableParameter 1024 x 1 () + LSTMoutput[1].ct : Plus 0 x 0 ( + LSTMoutput[1].bft + LSTMoutput[1].bit + ) + LSTMoutput[1].dc : PastValue 1024 x 1 ( + LSTMoutput[1].ct + ) + LSTMoutput[1].dh : PastValue 256 x 1 ( + LSTMoutput[1].output + ) + LSTMoutput[1].ft : Sigmoid 0 x 0 ( + LSTMoutput[1].ft.z + ) + LSTMoutput[1].ft.z : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left + LSTMoutput[1].ft.z./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left + LSTMoutput[1].ft.z./*+*/left./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].ft.z./*+*/right.matrix + ) + LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].dc + ) + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it : Sigmoid 0 x 0 ( + LSTMoutput[1].it.z + ) + LSTMoutput[1].it.z : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left + LSTMoutput[1].it.z./*+*/right + ) + LSTMoutput[1].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left + LSTMoutput[1].it.z./*+*/left./*+*/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + 
LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/left + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].it.z./*+*/right.matrix + ) + LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].dc + ) + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].mt : ElementTimes 0 x 0 ( + LSTMoutput[1].ot + LSTMoutput[1].mt./*.**/right + ) + LSTMoutput[1].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[1].ct + ) + LSTMoutput[1].ot : Sigmoid 0 x 0 ( + LSTMoutput[1].ot.z + ) + LSTMoutput[1].ot.z : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left + LSTMoutput[1].ot.z./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left + LSTMoutput[1].ot.z./*+*/left./*+*/right + ) + 
LSTMoutput[1].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].ot.z./*+*/right.matrix + ) + LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].ot.z./*+*/right.matrix : Scale 0 x 
0 ( + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].ct + ) + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].output : Times 0 x 0 ( + LSTMoutput[1].Wmr + LSTMoutput[1].output./***/right + ) + LSTMoutput[1].output./***/right : Scale 0 x 0 ( + LSTMoutput[1].output./***/right.scalarScalingFactor + LSTMoutput[1].mt + ) + LSTMoutput[1].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].Wmr : LearnableParameter 256 x 1024 () + LSTMoutput[2].bft : ElementTimes 0 x 0 ( + LSTMoutput[2].ft + LSTMoutput[2].dc + ) + LSTMoutput[2].bit : ElementTimes 0 x 0 ( + LSTMoutput[2].it + LSTMoutput[2].bit./*.**/right + ) + LSTMoutput[2].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z + ) + LSTMoutput[2].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left + LSTMoutput[2].bit./*.**/right.z./*+*/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + 
LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ct : Plus 0 x 0 ( + LSTMoutput[2].bft + LSTMoutput[2].bit + ) + LSTMoutput[2].dc : PastValue 1024 x 1 ( + LSTMoutput[2].ct + ) + LSTMoutput[2].dh : PastValue 256 x 1 ( + LSTMoutput[2].output + ) + LSTMoutput[2].ft : Sigmoid 0 x 0 ( + LSTMoutput[2].ft.z + ) + LSTMoutput[2].ft.z : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left + LSTMoutput[2].ft.z./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left + LSTMoutput[2].ft.z./*+*/left./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].ft.z./*+*/right.matrix + ) + LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].dc + ) + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it : Sigmoid 0 x 0 ( + LSTMoutput[2].it.z + ) + LSTMoutput[2].it.z : Plus 0 x 0 ( + 
LSTMoutput[2].it.z./*+*/left + LSTMoutput[2].it.z./*+*/right + ) + LSTMoutput[2].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left + LSTMoutput[2].it.z./*+*/left./*+*/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/left + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it.z./*+*/right : DiagTimes 0 x 0 ( + 
LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].it.z./*+*/right.matrix + ) + LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].dc + ) + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].mt : ElementTimes 0 x 0 ( + LSTMoutput[2].ot + LSTMoutput[2].mt./*.**/right + ) + LSTMoutput[2].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[2].ct + ) + LSTMoutput[2].ot : Sigmoid 0 x 0 ( + LSTMoutput[2].ot.z + ) + LSTMoutput[2].ot.z : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left + LSTMoutput[2].ot.z./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left + LSTMoutput[2].ot.z./*+*/left./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + 
LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].ot.z./*+*/right.matrix + ) + LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].ct + ) + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].output : Times 0 x 0 ( + LSTMoutput[2].Wmr + LSTMoutput[2].output./***/right + ) + LSTMoutput[2].output./***/right : Scale 0 x 0 ( + LSTMoutput[2].output./***/right.scalarScalingFactor + LSTMoutput[2].mt + ) + LSTMoutput[2].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].Wmr : LearnableParameter 256 x 1024 () + LSTMoutput[3].bft : ElementTimes 0 x 0 ( + LSTMoutput[3].ft + LSTMoutput[3].dc + ) + LSTMoutput[3].bit : 
ElementTimes 0 x 0 ( + LSTMoutput[3].it + LSTMoutput[3].bit./*.**/right + ) + LSTMoutput[3].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z + ) + LSTMoutput[3].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left + LSTMoutput[3].bit./*.**/right.z./*+*/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ct : Plus 0 x 0 ( + LSTMoutput[3].bft + LSTMoutput[3].bit + ) + LSTMoutput[3].dc : PastValue 1024 x 1 ( + LSTMoutput[3].ct + ) + LSTMoutput[3].dh : PastValue 256 x 1 ( + LSTMoutput[3].output + ) + LSTMoutput[3].ft : Sigmoid 0 x 0 ( + LSTMoutput[3].ft.z + ) + LSTMoutput[3].ft.z : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left + LSTMoutput[3].ft.z./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left + LSTMoutput[3].ft.z./*+*/left./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].ft.z./*+*/right.matrix + ) + LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].dc + ) + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it : Sigmoid 0 x 0 ( + LSTMoutput[3].it.z + ) + LSTMoutput[3].it.z : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left + LSTMoutput[3].it.z./*+*/right + ) + LSTMoutput[3].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left + LSTMoutput[3].it.z./*+*/left./*+*/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + 
LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/left + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].it.z./*+*/right.matrix + ) + LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].dc + ) + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].mt : ElementTimes 0 x 0 ( + LSTMoutput[3].ot + LSTMoutput[3].mt./*.**/right + ) + LSTMoutput[3].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[3].ct + ) + LSTMoutput[3].ot : Sigmoid 0 x 0 ( + LSTMoutput[3].ot.z + ) + LSTMoutput[3].ot.z : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left + 
LSTMoutput[3].ot.z./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left + LSTMoutput[3].ot.z./*+*/left./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector + 
LSTMoutput[3].ot.z./*+*/right.matrix + ) + LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].ct + ) + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].output : Times 0 x 0 ( + LSTMoutput[3].Wmr + LSTMoutput[3].output./***/right + ) + LSTMoutput[3].output./***/right : Scale 0 x 0 ( + LSTMoutput[3].output./***/right.scalarScalingFactor + LSTMoutput[3].mt + ) + LSTMoutput[3].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].Wmr : LearnableParameter 256 x 1024 () + LSTMoutputW : Plus 0 x 0 ( + LSTMoutputW./*+*/left + B + ) + LSTMoutputW./*+*/left : Times 0 x 0 ( + LSTMoutputW./*+*/left./***/left + LSTMoutputW./*+*/left./***/right + ) + LSTMoutputW./*+*/left./***/left : LearnableParameter 132 x 256 () + LSTMoutputW./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutputW./*+*/left./***/right.scalarScalingFactor + LSTMoutput[3].output + ) + LSTMoutputW./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + ScaledLogLikelihood : Minus 0 x 0 ( + LSTMoutputW + logPrior + ) +] GetTrainCriterionNodes ... GetEvalCriterionNodes ... 
-Found 6 PreCompute nodes - NodeName: featNorm.xMean - NodeName: featNorm.xStdDev - NodeName: logPrior.Prior - NodeName: featNorm.xMean - NodeName: featNorm.xStdDev - NodeName: logPrior.Prior + nodes in the recurrent loops : +LSTMoutput[1].mt./*.**/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].mt./*.**/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right 
LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].mt./*.**/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].mt./*.**/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right 
LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].mt./*.**/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].mt./*.**/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right 
LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output + +Validating for node cr. 272 nodes to process in pass 1. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], 
featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], 
LSTMoutput[1].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = 
Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = 
LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = 
LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = 
LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], 
LSTMoutput[2].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = 
Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) 
-> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> 
LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr. 183 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> 
[1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = 
Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = 
ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = 
Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 
MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) 
-> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = 
Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = 
Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = 
Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = 
LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = 
DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> 
[256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr. 60 nodes to process in pass 3. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], 
featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating 
--> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> 
[1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = 
PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 
1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = 
Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = 
Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = 
Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = 
Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = 
DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = 
DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 
MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating 
--> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, 
MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 
1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = 
Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = 
DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = 
PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 
1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 
1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +127 out of 272 nodes do not share the minibatch layout with the input data. + + +Precomputing --> 3 PreCompute nodes found. + + NodeName: featNorm.invStdDevVector + NodeName: featNorm.meanVector + NodeName: logPrior.x minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 
LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 
LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right 
LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node featNorm.xMean 
+Validating for node featNorm.invStdDevVector. 3 nodes to process in pass 1. -Validating --> features = InputValue -> [363, MBSize 308] -Validating --> feashift = RowSlice(features[363, MBSize 308]) -> [33, MBSize 308] -Validating --> featNorm.xMean = Mean(feashift[33, MBSize 308]) -> [33, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating for node featNorm.invStdDevVector, final verification. +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] 1 out of 3 nodes do not share the minibatch layout with the input data. + nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 
LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 
LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right 
LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node featNorm.xStdDev 
+Validating for node featNorm.meanVector. 3 nodes to process in pass 1. -Validating --> features = InputValue -> [363, MBSize 308] -Validating --> feashift = RowSlice(features[363, MBSize 308]) -> [33, MBSize 308] -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 308]) -> [33, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating for node featNorm.meanVector, final verification. +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] 1 out of 3 nodes do not share the minibatch layout with the input data. + nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc 
LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc 
LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right 
LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node logPrior.Prior 
+Validating for node logPrior.x. 2 nodes to process in pass 1. -Validating --> labels = InputValue -> [132, MBSize 308] -Validating --> logPrior.Prior = Mean(labels[132, MBSize 308]) -> [132, 1] +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] +Validating for node logPrior.x. 1 nodes to process in pass 2. +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] + +Validating for node logPrior.x, final verification. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] 1 out of 2 nodes do not share the minibatch layout with the input data. + + +Precomputing --> Completed. + Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.025000 momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.025000 effective momentum = 0.000000 minibatchiterator: epoch 0: frames [0..2560] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses + nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right 
LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft 
LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output + +Validating for node Err. 272 nodes to process in pass 1. 
+ +Validating --> labels = InputValue -> [132, MBSize 378] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 378] +Validating --> feashift = RowSlice(features[363, MBSize 378]) -> [33, MBSize 378] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 378]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 378]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 378], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] 
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 378], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = 
Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 378], LSTMoutput[1].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 378], LSTMoutput[1].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 378], LSTMoutput[1].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 378]) -> 
[256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] 
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 378], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 
378] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 378], LSTMoutput[2].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 378], LSTMoutput[2].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] 
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 378], LSTMoutput[2].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> 
LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 378], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = 
Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 378], LSTMoutput[3].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 378], LSTMoutput[3].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 378], LSTMoutput[3].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 378]) -> 
[256, MBSize 378] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 378]) -> [132, MBSize 378] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 378], B[132, 1]) -> [132, MBSize 378] +Validating --> Err = ErrorPrediction(labels[132, MBSize 378], LSTMoutputW[132, MBSize 378]) -> [1, 1] + +Validating for node Err. 180 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 378] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 378] +Validating --> feashift = RowSlice(features[363, MBSize 378]) -> [33, MBSize 378] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 378]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 378]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 378], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, 
MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) 
-> [33, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, 
MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left = 
Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 378], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], 
LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 378], LSTMoutput[1].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 378], LSTMoutput[1].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], 
LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 378], LSTMoutput[1].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating 
--> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh 
= PastValue(LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft = 
Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 378], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 378], LSTMoutput[2].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 378], LSTMoutput[2].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 378], LSTMoutput[2].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], 
LSTMoutput[2].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = 
Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 378], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 378], 
LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 378], LSTMoutput[3].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, 
MBSize 378], LSTMoutput[3].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 378], LSTMoutput[3].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 378]) -> [132, MBSize 378] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 378], B[132, 1]) -> [132, MBSize 378] +Validating --> Err = ErrorPrediction(labels[132, MBSize 378], LSTMoutputW[132, MBSize 378]) -> [1, 1] + +Validating for node Err. 6 nodes to process in pass 3. 
+ +Validating --> labels = InputValue -> [132, MBSize 378] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 378] +Validating --> feashift = RowSlice(features[363, MBSize 378]) -> [33, MBSize 378] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 378]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 378]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 378], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z = 
Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 378], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = 
Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 378], LSTMoutput[1].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 378], LSTMoutput[1].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 378], LSTMoutput[1].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].output./***/right = 
Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> 
[1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 378], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> 
LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 378], LSTMoutput[2].bit./*.**/right[1024, MBSize 
378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 378], LSTMoutput[2].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 378], LSTMoutput[2].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 378], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 378], LSTMoutput[3].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 378], LSTMoutput[3].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> 
LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 378], LSTMoutput[3].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 378]) -> [132, MBSize 378] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 378], B[132, 1]) -> [132, MBSize 378] +Validating --> Err = ErrorPrediction(labels[132, MBSize 378], LSTMoutputW[132, MBSize 378]) -> [1, 1] + +Validating for node Err, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 378] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 378] +Validating --> feashift = RowSlice(features[363, MBSize 378]) -> [33, MBSize 378] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 378]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 378]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 378], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft.z = 
Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 378], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = 
Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 378], LSTMoutput[1].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 378], LSTMoutput[1].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 378], LSTMoutput[1].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].output./***/right = 
Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> 
[1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 378], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> 
LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 378], LSTMoutput[2].bit./*.**/right[1024, MBSize 
378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 378], LSTMoutput[2].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 378], LSTMoutput[2].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 378], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 378], LSTMoutput[3].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 378], LSTMoutput[3].bit[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> 
LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 378], LSTMoutput[3].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 378]) -> [1024, MBSize 378] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 378]) -> [132, MBSize 378] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 378], B[132, 1]) -> [132, MBSize 378] +Validating --> Err = ErrorPrediction(labels[132, MBSize 378], LSTMoutputW[132, MBSize 378]) -> [1, 1] + +127 out of 272 nodes do not share the minibatch layout with the input data. + Starting minibatch loop. 
- Epoch[ 1 of 2]-Minibatch[ 1- 1 of 128]: SamplesSeen = 308; TrainLossPerSample = 4.88306536; EvalErr[0]PerSample = 0.99025974; TotalTime = 22.51031s; TotalTimePerSample = 73.08543ms; SamplesPerSecond = 13 - Epoch[ 1 of 2]-Minibatch[ 2- 2 of 128]: SamplesSeen = 408; TrainLossPerSample = 4.62618899; EvalErr[0]PerSample = 0.84068627; TotalTime = 32.29271s; TotalTimePerSample = 79.14880ms; SamplesPerSecond = 12 - Epoch[ 1 of 2]-Minibatch[ 3- 3 of 128]: SamplesSeen = 378; TrainLossPerSample = 4.56298699; EvalErr[0]PerSample = 0.94708995; TotalTime = 28.78492s; TotalTimePerSample = 76.15060ms; SamplesPerSecond = 13 - Epoch[ 1 of 2]-Minibatch[ 4- 4 of 128]: SamplesSeen = 478; TrainLossPerSample = 3.76631656; EvalErr[0]PerSample = 0.78661088; TotalTime = 37.13984s; TotalTimePerSample = 77.69841ms; SamplesPerSecond = 12 - Epoch[ 1 of 2]-Minibatch[ 5- 5 of 128]: SamplesSeen = 158; TrainLossPerSample = 8763.14191369; EvalErr[0]PerSample = 0.93037975; TotalTime = 11.48446s; TotalTimePerSample = 72.68647ms; SamplesPerSecond = 13 - Epoch[ 1 of 2]-Minibatch[ 6- 6 of 128]: SamplesSeen = 258; TrainLossPerSample = 4.56298450; EvalErr[0]PerSample = 0.89922481; TotalTime = 21.40300s; TotalTimePerSample = 82.95738ms; SamplesPerSecond = 12 - Epoch[ 1 of 2]-Minibatch[ 7- 7 of 128]: SamplesSeen = 328; TrainLossPerSample = 4.37957317; EvalErr[0]PerSample = 0.87500000; TotalTime = 26.18570s; TotalTimePerSample = 79.83445ms; SamplesPerSecond = 12 - Epoch[ 1 of 2]-Minibatch[ 8- 8 of 128]: SamplesSeen = 288; TrainLossPerSample = 4.65104167; EvalErr[0]PerSample = 0.90625000; TotalTime = 19.35972s; TotalTimePerSample = 67.22124ms; SamplesPerSecond = 14 -Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 535.88568; EvalErrPerSample = 0.88671273; Ave LearnRatePerSample = 0.02500000037; EpochTime=199.19972 -Starting Epoch 2: learning rate per sample = 0.025000 momentum = 0.900000 + Epoch[ 1 of 2]-Minibatch[ 1- 1 of 128]: SamplesSeen = 308; TrainLossPerSample = 4.88306536; 
EvalErr[0]PerSample = 0.99025974; TotalTime = 33.19239s; TotalTimePerSample = 107.76749ms; SamplesPerSecond = 9 + Epoch[ 1 of 2]-Minibatch[ 2- 2 of 128]: SamplesSeen = 408; TrainLossPerSample = 4.62618899; EvalErr[0]PerSample = 0.84068627; TotalTime = 47.40593s; TotalTimePerSample = 116.19102ms; SamplesPerSecond = 8 + Epoch[ 1 of 2]-Minibatch[ 3- 3 of 128]: SamplesSeen = 378; TrainLossPerSample = 4.56298570; EvalErr[0]PerSample = 0.94708995; TotalTime = 45.36358s; TotalTimePerSample = 120.00946ms; SamplesPerSecond = 8 + Epoch[ 1 of 2]-Minibatch[ 4- 4 of 128]: SamplesSeen = 478; TrainLossPerSample = 3.76631656; EvalErr[0]PerSample = 0.78661088; TotalTime = 56.13956s; TotalTimePerSample = 117.44677ms; SamplesPerSecond = 8 + Epoch[ 1 of 2]-Minibatch[ 5- 5 of 128]: SamplesSeen = 158; TrainLossPerSample = 8763.14508134; EvalErr[0]PerSample = 0.93037975; TotalTime = 16.77154s; TotalTimePerSample = 106.14897ms; SamplesPerSecond = 9 + Epoch[ 1 of 2]-Minibatch[ 6- 6 of 128]: SamplesSeen = 258; TrainLossPerSample = 4.56298450; EvalErr[0]PerSample = 0.89922481; TotalTime = 24.72391s; TotalTimePerSample = 95.82912ms; SamplesPerSecond = 10 + Epoch[ 1 of 2]-Minibatch[ 7- 7 of 128]: SamplesSeen = 328; TrainLossPerSample = 4.37957317; EvalErr[0]PerSample = 0.87500000; TotalTime = 28.38026s; TotalTimePerSample = 86.52517ms; SamplesPerSecond = 11 + Epoch[ 1 of 2]-Minibatch[ 8- 8 of 128]: SamplesSeen = 288; TrainLossPerSample = 4.65104167; EvalErr[0]PerSample = 0.90625000; TotalTime = 25.08192s; TotalTimePerSample = 87.09000ms; SamplesPerSecond = 11 +Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 535.88586; EvalErrPerSample = 0.88671273; Ave LearnRatePerSample = 0.02500000037; EpochTime=277.708 +Starting Epoch 2: learning rate per sample = 0.025000 effective momentum = 0.900000 minibatchiterator: epoch 1: frames [2560..5120] (first utterance at frame 2604), data subset 0 of 1, with 1 datapasses Starting minibatch loop. 
- Epoch[ 2 of 2]-Minibatch[ 1- 1 of 128]: SamplesSeen = 508; TrainLossPerSample = 4.26512518; EvalErr[0]PerSample = 0.85433071; TotalTime = 41.28329s; TotalTimePerSample = 81.26632ms; SamplesPerSecond = 12 - Epoch[ 2 of 2]-Minibatch[ 2- 2 of 128]: SamplesSeen = 228; TrainLossPerSample = 3.77295993; EvalErr[0]PerSample = 0.82456140; TotalTime = 14.96108s; TotalTimePerSample = 65.61879ms; SamplesPerSecond = 15 - Epoch[ 2 of 2]-Minibatch[ 3- 3 of 128]: SamplesSeen = 88; TrainLossPerSample = 3.83270264; EvalErr[0]PerSample = 0.89772727; TotalTime = 5.98518s; TotalTimePerSample = 68.01345ms; SamplesPerSecond = 14 - Epoch[ 2 of 2]-Minibatch[ 4- 4 of 128]: SamplesSeen = 208; TrainLossPerSample = 4.20982009; EvalErr[0]PerSample = 0.91826923; TotalTime = 13.63850s; TotalTimePerSample = 65.56970ms; SamplesPerSecond = 15 - Epoch[ 2 of 2]-Minibatch[ 5- 5 of 128]: SamplesSeen = 198; TrainLossPerSample = 4.20819277; EvalErr[0]PerSample = 0.91919192; TotalTime = 13.50887s; TotalTimePerSample = 68.22662ms; SamplesPerSecond = 14 - Epoch[ 2 of 2]-Minibatch[ 6- 6 of 128]: SamplesSeen = 458; TrainLossPerSample = 3.93088581; EvalErr[0]PerSample = 0.93231441; TotalTime = 33.74585s; TotalTimePerSample = 73.68089ms; SamplesPerSecond = 13 - Epoch[ 2 of 2]-Minibatch[ 7- 7 of 128]: SamplesSeen = 258; TrainLossPerSample = 3.87346513; EvalErr[0]PerSample = 0.91860465; TotalTime = 21.74937s; TotalTimePerSample = 84.29989ms; SamplesPerSecond = 11 - Epoch[ 2 of 2]-Minibatch[ 8- 8 of 128]: SamplesSeen = 218; TrainLossPerSample = 3.73194703; EvalErr[0]PerSample = 0.79816514; TotalTime = 16.44576s; TotalTimePerSample = 75.43928ms; SamplesPerSecond = 13 - Epoch[ 2 of 2]-Minibatch[ 9- 9 of 128]: SamplesSeen = 238; TrainLossPerSample = 3.93201402; EvalErr[0]PerSample = 0.81932773; TotalTime = 19.53173s; TotalTimePerSample = 82.06611ms; SamplesPerSecond = 12 - Epoch[ 2 of 2]-Minibatch[ 10- 10 of 128]: SamplesSeen = 248; TrainLossPerSample = 4.68575164; EvalErr[0]PerSample = 0.92741935; TotalTime = 
19.55488s; TotalTimePerSample = 78.85032ms; SamplesPerSecond = 12 -Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 4.0695133; EvalErrPerSample = 0.88188678; Ave LearnRatePerSample = 0.02500000037; EpochTime=200.45244 + Epoch[ 2 of 2]-Minibatch[ 1- 1 of 128]: SamplesSeen = 508; TrainLossPerSample = 4.26512518; EvalErr[0]PerSample = 0.85433071; TotalTime = 47.12508s; TotalTimePerSample = 92.76590ms; SamplesPerSecond = 10 + Epoch[ 2 of 2]-Minibatch[ 2- 2 of 128]: SamplesSeen = 228; TrainLossPerSample = 3.77295993; EvalErr[0]PerSample = 0.82456140; TotalTime = 20.64559s; TotalTimePerSample = 90.55082ms; SamplesPerSecond = 11 + Epoch[ 2 of 2]-Minibatch[ 3- 3 of 128]: SamplesSeen = 88; TrainLossPerSample = 3.83270264; EvalErr[0]PerSample = 0.89772727; TotalTime = 7.97201s; TotalTimePerSample = 90.59103ms; SamplesPerSecond = 11 + Epoch[ 2 of 2]-Minibatch[ 4- 4 of 128]: SamplesSeen = 208; TrainLossPerSample = 4.20982009; EvalErr[0]PerSample = 0.91826923; TotalTime = 16.50900s; TotalTimePerSample = 79.37017ms; SamplesPerSecond = 12 + Epoch[ 2 of 2]-Minibatch[ 5- 5 of 128]: SamplesSeen = 198; TrainLossPerSample = 4.20819523; EvalErr[0]PerSample = 0.91919192; TotalTime = 16.28937s; TotalTimePerSample = 82.26952ms; SamplesPerSecond = 12 + Epoch[ 2 of 2]-Minibatch[ 6- 6 of 128]: SamplesSeen = 458; TrainLossPerSample = 3.93088581; EvalErr[0]PerSample = 0.93231441; TotalTime = 40.95506s; TotalTimePerSample = 89.42153ms; SamplesPerSecond = 11 + Epoch[ 2 of 2]-Minibatch[ 7- 7 of 128]: SamplesSeen = 258; TrainLossPerSample = 3.87346892; EvalErr[0]PerSample = 0.91860465; TotalTime = 21.19852s; TotalTimePerSample = 82.16480ms; SamplesPerSecond = 12 + Epoch[ 2 of 2]-Minibatch[ 8- 8 of 128]: SamplesSeen = 218; TrainLossPerSample = 3.73194927; EvalErr[0]PerSample = 0.79816514; TotalTime = 18.45402s; TotalTimePerSample = 84.65144ms; SamplesPerSecond = 11 + Epoch[ 2 of 2]-Minibatch[ 9- 9 of 128]: SamplesSeen = 238; TrainLossPerSample = 3.93201402; EvalErr[0]PerSample = 
0.81932773; TotalTime = 19.20098s; TotalTimePerSample = 80.67637ms; SamplesPerSecond = 12 + Epoch[ 2 of 2]-Minibatch[ 10- 10 of 128]: SamplesSeen = 248; TrainLossPerSample = 4.68575164; EvalErr[0]PerSample = 0.92741935; TotalTime = 23.74306s; TotalTimePerSample = 95.73816ms; SamplesPerSecond = 10 +Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 4.0695143; EvalErrPerSample = 0.88188678; Ave LearnRatePerSample = 0.02500000037; EpochTime=232.15937 CNTKCommandTrainEnd: speechTrain COMPLETED diff --git a/Tests/Speech/LSTM/Truncated/baseline.gpu.txt b/Tests/Speech/LSTM/Truncated/baseline.gpu.txt index bad10f355..4f3677b81 100644 --- a/Tests/Speech/LSTM/Truncated/baseline.gpu.txt +++ b/Tests/Speech/LSTM/Truncated/baseline.gpu.txt @@ -1,7 +1,7 @@ -=== Running /home/mluser/src/cplx_master/build/release/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/cntk.config RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM -running on localhost at 2015/09/08 12:56:03 -command line options: -configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/cntk.config RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM +=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/../cntk.config RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/.. 
DeviceId=0 +running on localhost at 2015/10/24 13:03:38 +command line: +/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/../cntk.config RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/.. DeviceId=0 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float @@ -15,9 +15,6 @@ speechTrain=[ modelPath=$RunDir$/models/cntkSpeech.dnn deviceId=$DeviceId$ traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$NDLDir$/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -191,10 +188,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output') // sadly we can't say x - y since we want to assign a tag ] ] -RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu +RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/.. 
DeviceId=0 -NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< @@ -207,12 +204,9 @@ frameMode=false Truncated=true speechTrain=[ action=train - modelPath=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu/models/cntkSpeech.dnn + modelPath=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn deviceId=0 traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/LSTM/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -386,30 +380,27 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output') // sadly we can't say x - y since we want to assign a tag ] ] -RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu +RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/.. DeviceId=0 -NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> configparameters: cntk.config:command=speechTrain +configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/.. 
configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data configparameters: cntk.config:deviceId=0 configparameters: cntk.config:frameMode=false -configparameters: cntk.config:NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu +configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu configparameters: cntk.config:speechTrain=[ action=train - modelPath=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu/models/cntkSpeech.dnn + modelPath=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn deviceId=0 traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=/home/mluser/src/cplx_master/Tests/Speech/LSTM/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -588,7 +579,11 @@ configparameters: cntk.config:Truncated=true <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: speechTrain precision = float -NDLBuilder Using GPU 0 +CNTKModelPath: /tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn +CNTKCommandTrainInfo: speechTrain : 4 +CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +CNTKCommandTrainBegin: speechTrain +ExperimentalNetworkBuilder using GPU 0 reading script file /home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.scp ... 
948 entries trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion total 132 state names in state list /home/mluser/src/cplx_master/Tests/Speech/Data/state.list @@ -596,1781 +591,3382 @@ htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/gl ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames - nodes in the recurrent loops : -LSTMoutput1.unnamed174 LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.bit LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.unnamed224 LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.bit LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 
LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.unnamed274 LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.bit LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.unnamed174 LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.bit LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.unnamed224 LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.bit LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 
LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.unnamed274 LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.bit LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Printing Gradient Computation Node Order ... 
- -cr[0, 0] = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[0, 0]) -LSTMoutputW[0, 0] = Plus(unnamed283[0, 0], b[132, 1]) -b[132, 1] = LearnableParameter -unnamed283[0, 0] = Times(W[132, 256], unnamed284[0, 0]) -unnamed284[0, 0] = Scale(expsW[0, 0], LSTMoutput3.output[0, 0]) -LSTMoutput3.output[0, 0] = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[0, 0]) -LSTMoutput3.unnamed275[0, 0] = Scale(LSTMoutput3.expsWmr[0, 0], LSTMoutput3.mt[0, 0]) -LSTMoutput3.mt[0, 0] = ElementTimes(LSTMoutput3.ot[0, 0], LSTMoutput3.unnamed274[0, 0]) -LSTMoutput3.unnamed274[0, 0] = Tanh(LSTMoutput3.ct[0, 0]) -LSTMoutput3.ot[0, 0] = Sigmoid(LSTMoutput3.unnamed271[0, 0]) -LSTMoutput3.unnamed271[0, 0] = Plus(LSTMoutput3.unnamed272[0, 0], LSTMoutput3.Wcoct[0, 0]) -LSTMoutput3.Wcoct[0, 0] = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[0, 0]) -LSTMoutput3.unnamed270[0, 0] = Scale(LSTMoutput3.expsWco[0, 0], LSTMoutput3.ct[0, 0]) -LSTMoutput3.ct[0, 0] = Plus(LSTMoutput3.bft[0, 0], LSTMoutput3.bit[0, 0]) -LSTMoutput3.bit[0, 0] = ElementTimes(LSTMoutput3.it[0, 0], LSTMoutput3.unnamed259[0, 0]) -LSTMoutput3.unnamed259[0, 0] = Tanh(LSTMoutput3.unnamed260[0, 0]) -LSTMoutput3.unnamed260[0, 0] = Plus(LSTMoutput3.Wxcx[0, 0], LSTMoutput3.unnamed261[0, 0]) -LSTMoutput3.unnamed261[0, 0] = Plus(LSTMoutput3.Whcdh[0, 0], LSTMoutput3.bc[1024, 1]) -LSTMoutput3.Whcdh[0, 0] = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[0, 0]) -LSTMoutput3.unnamed258[0, 0] = Scale(LSTMoutput3.expsWhc[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.it[0, 0] = Sigmoid(LSTMoutput3.unnamed254[0, 0]) -LSTMoutput3.unnamed254[0, 0] = Plus(LSTMoutput3.unnamed255[0, 0], LSTMoutput3.Wcidc[0, 0]) -LSTMoutput3.Wcidc[0, 0] = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[0, 0]) -LSTMoutput3.unnamed253[0, 0] = Scale(LSTMoutput3.expsWci[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.unnamed255[0, 0] = Plus(LSTMoutput3.unnamed256[0, 0], LSTMoutput3.Whidh[0, 0]) -LSTMoutput3.Whidh[0, 0] = 
Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[0, 0]) -LSTMoutput3.unnamed252[0, 0] = Scale(LSTMoutput3.expsWhi[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.bft[0, 0] = ElementTimes(LSTMoutput3.ft[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.ft[0, 0] = Sigmoid(LSTMoutput3.unnamed265[0, 0]) -LSTMoutput3.unnamed265[0, 0] = Plus(LSTMoutput3.unnamed266[0, 0], LSTMoutput3.Wcfdc[0, 0]) -LSTMoutput3.Wcfdc[0, 0] = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[0, 0]) -LSTMoutput3.unnamed264[0, 0] = Scale(LSTMoutput3.expsWcf[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.dc[1024, 1] = PastValue(LSTMoutput3.ct[0, 0]) -LSTMoutput3.unnamed266[0, 0] = Plus(LSTMoutput3.unnamed267[0, 0], LSTMoutput3.Whfdh[0, 0]) -LSTMoutput3.Whfdh[0, 0] = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[0, 0]) -LSTMoutput3.unnamed263[0, 0] = Scale(LSTMoutput3.expsWhf[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.unnamed272[0, 0] = Plus(LSTMoutput3.unnamed273[0, 0], LSTMoutput3.Whodh[0, 0]) -LSTMoutput3.Whodh[0, 0] = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[0, 0]) -LSTMoutput3.unnamed269[0, 0] = Scale(LSTMoutput3.expsWho[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.dh[256, 1] = PastValue(LSTMoutput3.output[0, 0]) -LSTMoutput3.bc[1024, 1] = LearnableParameter -LSTMoutput3.expsWhc[0, 0] = Exp(LSTMoutput3.sWhc[1, 1]) -LSTMoutput3.sWhc[1, 1] = LearnableParameter -LSTMoutput3.Whc[1024, 256] = LearnableParameter -LSTMoutput3.Wxcx[0, 0] = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[0, 0]) -LSTMoutput3.unnamed257[0, 0] = Scale(LSTMoutput3.expsWxc[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxc[0, 0] = Exp(LSTMoutput3.sWxc[1, 1]) -LSTMoutput3.sWxc[1, 1] = LearnableParameter -LSTMoutput3.Wxc[1024, 256] = LearnableParameter -LSTMoutput3.expsWci[0, 0] = Exp(LSTMoutput3.sWci[1, 1]) -LSTMoutput3.sWci[1, 1] = LearnableParameter -LSTMoutput3.Wci[1024, 1] = LearnableParameter -LSTMoutput3.expsWhi[0, 0] = Exp(LSTMoutput3.sWhi[1, 1]) -LSTMoutput3.sWhi[1, 1] 
= LearnableParameter -LSTMoutput3.Whi[1024, 256] = LearnableParameter -LSTMoutput3.unnamed256[0, 0] = Plus(LSTMoutput3.Wxix[0, 0], LSTMoutput3.bi[1024, 1]) -LSTMoutput3.bi[1024, 1] = LearnableParameter -LSTMoutput3.Wxix[0, 0] = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[0, 0]) -LSTMoutput3.unnamed251[0, 0] = Scale(LSTMoutput3.expsWxi[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxi[0, 0] = Exp(LSTMoutput3.sWxi[1, 1]) -LSTMoutput3.sWxi[1, 1] = LearnableParameter -LSTMoutput3.Wxi[1024, 256] = LearnableParameter -LSTMoutput3.expsWcf[0, 0] = Exp(LSTMoutput3.sWcf[1, 1]) -LSTMoutput3.sWcf[1, 1] = LearnableParameter -LSTMoutput3.Wcf[1024, 1] = LearnableParameter -LSTMoutput3.expsWhf[0, 0] = Exp(LSTMoutput3.sWhf[1, 1]) -LSTMoutput3.sWhf[1, 1] = LearnableParameter -LSTMoutput3.Whf[1024, 256] = LearnableParameter -LSTMoutput3.unnamed267[0, 0] = Plus(LSTMoutput3.Wxfx[0, 0], LSTMoutput3.bf[1024, 1]) -LSTMoutput3.bf[1024, 1] = LearnableParameter -LSTMoutput3.Wxfx[0, 0] = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[0, 0]) -LSTMoutput3.unnamed262[0, 0] = Scale(LSTMoutput3.expsWxf[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxf[0, 0] = Exp(LSTMoutput3.sWxf[1, 1]) -LSTMoutput3.sWxf[1, 1] = LearnableParameter -LSTMoutput3.Wxf[1024, 256] = LearnableParameter -LSTMoutput3.expsWco[0, 0] = Exp(LSTMoutput3.sWco[1, 1]) -LSTMoutput3.sWco[1, 1] = LearnableParameter -LSTMoutput3.Wco[1024, 1] = LearnableParameter -LSTMoutput3.expsWho[0, 0] = Exp(LSTMoutput3.sWho[1, 1]) -LSTMoutput3.sWho[1, 1] = LearnableParameter -LSTMoutput3.Who[1024, 256] = LearnableParameter -LSTMoutput3.unnamed273[0, 0] = Plus(LSTMoutput3.Wxox[0, 0], LSTMoutput3.bo[1024, 1]) -LSTMoutput3.bo[1024, 1] = LearnableParameter -LSTMoutput3.Wxox[0, 0] = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[0, 0]) -LSTMoutput3.unnamed268[0, 0] = Scale(LSTMoutput3.expsWxo[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput2.output[0, 0] = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[0, 
0]) -LSTMoutput2.unnamed225[0, 0] = Scale(LSTMoutput2.expsWmr[0, 0], LSTMoutput2.mt[0, 0]) -LSTMoutput2.mt[0, 0] = ElementTimes(LSTMoutput2.ot[0, 0], LSTMoutput2.unnamed224[0, 0]) -LSTMoutput2.unnamed224[0, 0] = Tanh(LSTMoutput2.ct[0, 0]) -LSTMoutput2.ot[0, 0] = Sigmoid(LSTMoutput2.unnamed221[0, 0]) -LSTMoutput2.unnamed221[0, 0] = Plus(LSTMoutput2.unnamed222[0, 0], LSTMoutput2.Wcoct[0, 0]) -LSTMoutput2.Wcoct[0, 0] = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[0, 0]) -LSTMoutput2.unnamed220[0, 0] = Scale(LSTMoutput2.expsWco[0, 0], LSTMoutput2.ct[0, 0]) -LSTMoutput2.ct[0, 0] = Plus(LSTMoutput2.bft[0, 0], LSTMoutput2.bit[0, 0]) -LSTMoutput2.bit[0, 0] = ElementTimes(LSTMoutput2.it[0, 0], LSTMoutput2.unnamed209[0, 0]) -LSTMoutput2.unnamed209[0, 0] = Tanh(LSTMoutput2.unnamed210[0, 0]) -LSTMoutput2.unnamed210[0, 0] = Plus(LSTMoutput2.Wxcx[0, 0], LSTMoutput2.unnamed211[0, 0]) -LSTMoutput2.unnamed211[0, 0] = Plus(LSTMoutput2.Whcdh[0, 0], LSTMoutput2.bc[1024, 1]) -LSTMoutput2.Whcdh[0, 0] = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[0, 0]) -LSTMoutput2.unnamed208[0, 0] = Scale(LSTMoutput2.expsWhc[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.it[0, 0] = Sigmoid(LSTMoutput2.unnamed204[0, 0]) -LSTMoutput2.unnamed204[0, 0] = Plus(LSTMoutput2.unnamed205[0, 0], LSTMoutput2.Wcidc[0, 0]) -LSTMoutput2.Wcidc[0, 0] = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[0, 0]) -LSTMoutput2.unnamed203[0, 0] = Scale(LSTMoutput2.expsWci[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.unnamed205[0, 0] = Plus(LSTMoutput2.unnamed206[0, 0], LSTMoutput2.Whidh[0, 0]) -LSTMoutput2.Whidh[0, 0] = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[0, 0]) -LSTMoutput2.unnamed202[0, 0] = Scale(LSTMoutput2.expsWhi[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.bft[0, 0] = ElementTimes(LSTMoutput2.ft[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.ft[0, 0] = Sigmoid(LSTMoutput2.unnamed215[0, 0]) -LSTMoutput2.unnamed215[0, 0] = Plus(LSTMoutput2.unnamed216[0, 0], 
LSTMoutput2.Wcfdc[0, 0]) -LSTMoutput2.Wcfdc[0, 0] = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[0, 0]) -LSTMoutput2.unnamed214[0, 0] = Scale(LSTMoutput2.expsWcf[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.dc[1024, 1] = PastValue(LSTMoutput2.ct[0, 0]) -LSTMoutput2.unnamed216[0, 0] = Plus(LSTMoutput2.unnamed217[0, 0], LSTMoutput2.Whfdh[0, 0]) -LSTMoutput2.Whfdh[0, 0] = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[0, 0]) -LSTMoutput2.unnamed213[0, 0] = Scale(LSTMoutput2.expsWhf[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.unnamed222[0, 0] = Plus(LSTMoutput2.unnamed223[0, 0], LSTMoutput2.Whodh[0, 0]) -LSTMoutput2.Whodh[0, 0] = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[0, 0]) -LSTMoutput2.unnamed219[0, 0] = Scale(LSTMoutput2.expsWho[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.dh[256, 1] = PastValue(LSTMoutput2.output[0, 0]) -LSTMoutput2.bc[1024, 1] = LearnableParameter -LSTMoutput2.expsWhc[0, 0] = Exp(LSTMoutput2.sWhc[1, 1]) -LSTMoutput2.sWhc[1, 1] = LearnableParameter -LSTMoutput2.Whc[1024, 256] = LearnableParameter -LSTMoutput2.Wxcx[0, 0] = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[0, 0]) -LSTMoutput2.unnamed207[0, 0] = Scale(LSTMoutput2.expsWxc[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxc[0, 0] = Exp(LSTMoutput2.sWxc[1, 1]) -LSTMoutput2.sWxc[1, 1] = LearnableParameter -LSTMoutput2.Wxc[1024, 256] = LearnableParameter -LSTMoutput2.expsWci[0, 0] = Exp(LSTMoutput2.sWci[1, 1]) -LSTMoutput2.sWci[1, 1] = LearnableParameter -LSTMoutput2.Wci[1024, 1] = LearnableParameter -LSTMoutput2.expsWhi[0, 0] = Exp(LSTMoutput2.sWhi[1, 1]) -LSTMoutput2.sWhi[1, 1] = LearnableParameter -LSTMoutput2.Whi[1024, 256] = LearnableParameter -LSTMoutput2.unnamed206[0, 0] = Plus(LSTMoutput2.Wxix[0, 0], LSTMoutput2.bi[1024, 1]) -LSTMoutput2.bi[1024, 1] = LearnableParameter -LSTMoutput2.Wxix[0, 0] = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[0, 0]) -LSTMoutput2.unnamed201[0, 0] = Scale(LSTMoutput2.expsWxi[0, 0], 
LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxi[0, 0] = Exp(LSTMoutput2.sWxi[1, 1]) -LSTMoutput2.sWxi[1, 1] = LearnableParameter -LSTMoutput2.Wxi[1024, 256] = LearnableParameter -LSTMoutput2.expsWcf[0, 0] = Exp(LSTMoutput2.sWcf[1, 1]) -LSTMoutput2.sWcf[1, 1] = LearnableParameter -LSTMoutput2.Wcf[1024, 1] = LearnableParameter -LSTMoutput2.expsWhf[0, 0] = Exp(LSTMoutput2.sWhf[1, 1]) -LSTMoutput2.sWhf[1, 1] = LearnableParameter -LSTMoutput2.Whf[1024, 256] = LearnableParameter -LSTMoutput2.unnamed217[0, 0] = Plus(LSTMoutput2.Wxfx[0, 0], LSTMoutput2.bf[1024, 1]) -LSTMoutput2.bf[1024, 1] = LearnableParameter -LSTMoutput2.Wxfx[0, 0] = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[0, 0]) -LSTMoutput2.unnamed212[0, 0] = Scale(LSTMoutput2.expsWxf[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxf[0, 0] = Exp(LSTMoutput2.sWxf[1, 1]) -LSTMoutput2.sWxf[1, 1] = LearnableParameter -LSTMoutput2.Wxf[1024, 256] = LearnableParameter -LSTMoutput2.expsWco[0, 0] = Exp(LSTMoutput2.sWco[1, 1]) -LSTMoutput2.sWco[1, 1] = LearnableParameter -LSTMoutput2.Wco[1024, 1] = LearnableParameter -LSTMoutput2.expsWho[0, 0] = Exp(LSTMoutput2.sWho[1, 1]) -LSTMoutput2.sWho[1, 1] = LearnableParameter -LSTMoutput2.Who[1024, 256] = LearnableParameter -LSTMoutput2.unnamed223[0, 0] = Plus(LSTMoutput2.Wxox[0, 0], LSTMoutput2.bo[1024, 1]) -LSTMoutput2.bo[1024, 1] = LearnableParameter -LSTMoutput2.Wxox[0, 0] = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[0, 0]) -LSTMoutput2.unnamed218[0, 0] = Scale(LSTMoutput2.expsWxo[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput1.output[0, 0] = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[0, 0]) -LSTMoutput1.unnamed175[0, 0] = Scale(LSTMoutput1.expsWmr[0, 0], LSTMoutput1.mt[0, 0]) -LSTMoutput1.mt[0, 0] = ElementTimes(LSTMoutput1.ot[0, 0], LSTMoutput1.unnamed174[0, 0]) -LSTMoutput1.unnamed174[0, 0] = Tanh(LSTMoutput1.ct[0, 0]) -LSTMoutput1.ot[0, 0] = Sigmoid(LSTMoutput1.unnamed171[0, 0]) -LSTMoutput1.unnamed171[0, 0] = 
Plus(LSTMoutput1.unnamed172[0, 0], LSTMoutput1.Wcoct[0, 0]) -LSTMoutput1.Wcoct[0, 0] = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[0, 0]) -LSTMoutput1.unnamed170[0, 0] = Scale(LSTMoutput1.expsWco[0, 0], LSTMoutput1.ct[0, 0]) -LSTMoutput1.ct[0, 0] = Plus(LSTMoutput1.bft[0, 0], LSTMoutput1.bit[0, 0]) -LSTMoutput1.bit[0, 0] = ElementTimes(LSTMoutput1.it[0, 0], LSTMoutput1.unnamed159[0, 0]) -LSTMoutput1.unnamed159[0, 0] = Tanh(LSTMoutput1.unnamed160[0, 0]) -LSTMoutput1.unnamed160[0, 0] = Plus(LSTMoutput1.Wxcx[0, 0], LSTMoutput1.unnamed161[0, 0]) -LSTMoutput1.unnamed161[0, 0] = Plus(LSTMoutput1.Whcdh[0, 0], LSTMoutput1.bc[1024, 1]) -LSTMoutput1.Whcdh[0, 0] = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[0, 0]) -LSTMoutput1.unnamed158[0, 0] = Scale(LSTMoutput1.expsWhc[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.it[0, 0] = Sigmoid(LSTMoutput1.unnamed154[0, 0]) -LSTMoutput1.unnamed154[0, 0] = Plus(LSTMoutput1.unnamed155[0, 0], LSTMoutput1.Wcidc[0, 0]) -LSTMoutput1.Wcidc[0, 0] = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[0, 0]) -LSTMoutput1.unnamed153[0, 0] = Scale(LSTMoutput1.expsWci[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.unnamed155[0, 0] = Plus(LSTMoutput1.unnamed156[0, 0], LSTMoutput1.Whidh[0, 0]) -LSTMoutput1.Whidh[0, 0] = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[0, 0]) -LSTMoutput1.unnamed152[0, 0] = Scale(LSTMoutput1.expsWhi[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.bft[0, 0] = ElementTimes(LSTMoutput1.ft[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.ft[0, 0] = Sigmoid(LSTMoutput1.unnamed165[0, 0]) -LSTMoutput1.unnamed165[0, 0] = Plus(LSTMoutput1.unnamed166[0, 0], LSTMoutput1.Wcfdc[0, 0]) -LSTMoutput1.Wcfdc[0, 0] = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[0, 0]) -LSTMoutput1.unnamed164[0, 0] = Scale(LSTMoutput1.expsWcf[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.dc[1024, 1] = PastValue(LSTMoutput1.ct[0, 0]) -LSTMoutput1.unnamed166[0, 0] = Plus(LSTMoutput1.unnamed167[0, 0], 
LSTMoutput1.Whfdh[0, 0]) -LSTMoutput1.Whfdh[0, 0] = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[0, 0]) -LSTMoutput1.unnamed163[0, 0] = Scale(LSTMoutput1.expsWhf[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.unnamed172[0, 0] = Plus(LSTMoutput1.unnamed173[0, 0], LSTMoutput1.Whodh[0, 0]) -LSTMoutput1.Whodh[0, 0] = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[0, 0]) -LSTMoutput1.unnamed169[0, 0] = Scale(LSTMoutput1.expsWho[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.dh[256, 1] = PastValue(LSTMoutput1.output[0, 0]) -LSTMoutput1.bc[1024, 1] = LearnableParameter -LSTMoutput1.expsWhc[0, 0] = Exp(LSTMoutput1.sWhc[1, 1]) -LSTMoutput1.sWhc[1, 1] = LearnableParameter -LSTMoutput1.Whc[1024, 256] = LearnableParameter -LSTMoutput1.Wxcx[0, 0] = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[0, 0]) -LSTMoutput1.unnamed157[0, 0] = Scale(LSTMoutput1.expsWxc[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxc[0, 0] = Exp(LSTMoutput1.sWxc[1, 1]) -LSTMoutput1.sWxc[1, 1] = LearnableParameter -LSTMoutput1.Wxc[1024, 33] = LearnableParameter -LSTMoutput1.expsWci[0, 0] = Exp(LSTMoutput1.sWci[1, 1]) -LSTMoutput1.sWci[1, 1] = LearnableParameter -LSTMoutput1.Wci[1024, 1] = LearnableParameter -LSTMoutput1.expsWhi[0, 0] = Exp(LSTMoutput1.sWhi[1, 1]) -LSTMoutput1.sWhi[1, 1] = LearnableParameter -LSTMoutput1.Whi[1024, 256] = LearnableParameter -LSTMoutput1.unnamed156[0, 0] = Plus(LSTMoutput1.Wxix[0, 0], LSTMoutput1.bi[1024, 1]) -LSTMoutput1.bi[1024, 1] = LearnableParameter -LSTMoutput1.Wxix[0, 0] = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[0, 0]) -LSTMoutput1.unnamed151[0, 0] = Scale(LSTMoutput1.expsWxi[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxi[0, 0] = Exp(LSTMoutput1.sWxi[1, 1]) -LSTMoutput1.sWxi[1, 1] = LearnableParameter -LSTMoutput1.Wxi[1024, 33] = LearnableParameter -LSTMoutput1.expsWcf[0, 0] = Exp(LSTMoutput1.sWcf[1, 1]) -LSTMoutput1.sWcf[1, 1] = LearnableParameter -LSTMoutput1.Wcf[1024, 1] = LearnableParameter -LSTMoutput1.expsWhf[0, 0] = 
Exp(LSTMoutput1.sWhf[1, 1]) -LSTMoutput1.sWhf[1, 1] = LearnableParameter -LSTMoutput1.Whf[1024, 256] = LearnableParameter -LSTMoutput1.unnamed167[0, 0] = Plus(LSTMoutput1.Wxfx[0, 0], LSTMoutput1.bf[1024, 1]) -LSTMoutput1.bf[1024, 1] = LearnableParameter -LSTMoutput1.Wxfx[0, 0] = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[0, 0]) -LSTMoutput1.unnamed162[0, 0] = Scale(LSTMoutput1.expsWxf[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxf[0, 0] = Exp(LSTMoutput1.sWxf[1, 1]) -LSTMoutput1.sWxf[1, 1] = LearnableParameter -LSTMoutput1.Wxf[1024, 33] = LearnableParameter -LSTMoutput1.expsWco[0, 0] = Exp(LSTMoutput1.sWco[1, 1]) -LSTMoutput1.sWco[1, 1] = LearnableParameter -LSTMoutput1.Wco[1024, 1] = LearnableParameter -LSTMoutput1.expsWho[0, 0] = Exp(LSTMoutput1.sWho[1, 1]) -LSTMoutput1.sWho[1, 1] = LearnableParameter -LSTMoutput1.Who[1024, 256] = LearnableParameter -LSTMoutput1.unnamed173[0, 0] = Plus(LSTMoutput1.Wxox[0, 0], LSTMoutput1.bo[1024, 1]) -LSTMoutput1.bo[1024, 1] = LearnableParameter -LSTMoutput1.Wxox[0, 0] = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[0, 0]) -LSTMoutput1.unnamed168[0, 0] = Scale(LSTMoutput1.expsWxo[0, 0], featNorm.xNorm[0, 0]) -featNorm.xNorm[0, 0] = PerDimMeanVarNormalization(feashift[0, 0], featNorm.xMean[0, 0], featNorm.xStdDev[0, 0]) -featNorm.xStdDev[0, 0] = InvStdDev(feashift[0, 0]) -featNorm.xMean[0, 0] = Mean(feashift[0, 0]) -feashift[0, 0] = RowSlice(features[363, 1]) -features[363, 1] = InputValue -LSTMoutput1.expsWxo[0, 0] = Exp(LSTMoutput1.sWxo[1, 1]) -LSTMoutput1.sWxo[1, 1] = LearnableParameter -LSTMoutput1.Wxo[1024, 33] = LearnableParameter -LSTMoutput1.expsWmr[0, 0] = Exp(LSTMoutput1.sWmr[1, 1]) -LSTMoutput1.sWmr[1, 1] = LearnableParameter -LSTMoutput1.Wmr[256, 1024] = LearnableParameter -LSTMoutput2.expsWxo[0, 0] = Exp(LSTMoutput2.sWxo[1, 1]) -LSTMoutput2.sWxo[1, 1] = LearnableParameter -LSTMoutput2.Wxo[1024, 256] = LearnableParameter -LSTMoutput2.expsWmr[0, 0] = Exp(LSTMoutput2.sWmr[1, 1]) 
-LSTMoutput2.sWmr[1, 1] = LearnableParameter -LSTMoutput2.Wmr[256, 1024] = LearnableParameter -LSTMoutput3.expsWxo[0, 0] = Exp(LSTMoutput3.sWxo[1, 1]) -LSTMoutput3.sWxo[1, 1] = LearnableParameter -LSTMoutput3.Wxo[1024, 256] = LearnableParameter -LSTMoutput3.expsWmr[0, 0] = Exp(LSTMoutput3.sWmr[1, 1]) -LSTMoutput3.sWmr[1, 1] = LearnableParameter -LSTMoutput3.Wmr[256, 1024] = LearnableParameter -expsW[0, 0] = Exp(sW[1, 1]) -sW[1, 1] = LearnableParameter -W[132, 256] = LearnableParameter -labels[132, 1] = InputValue - -Validating node cr - -Validating --> labels = InputValue -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter -Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 1]) -Validating --> featNorm.xMean = Mean(feashift[33, 1]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1]) -Validating --> featNorm.xNorm = 
PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1]) -Validating --> LSTMoutput1.bf = LearnableParameter -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter -Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], 
LSTMoutput1.unnamed151[33, 1]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[0 {W=472446402560, H=14145, C=120}, 0]) -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256 {W=472446402560, H=14145, C=120}, 1]) -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256 {W=472446402560, H=14145, C=120}, 1]) -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256 {W=472446402560, H=14145, C=120}, 1]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256 {W=472446402560, H=14145, C=120}, 1]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1]) -Validating --> LSTMoutput1.dc 
= PastValue(LSTMoutput1.ct[0 {W=472446402560, H=24545, C=120}, 0]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256 {W=472446402560, H=14145, C=120}, 1]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256 {W=472446402560, H=14145, C=120}, 1]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256 {W=472446402560, H=14145, C=120}, 1]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256 {W=472446402560, H=14145, C=120}, 1]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 
1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1]) -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed159[1024, 1]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter -Validating --> LSTMoutput2.sWho = 
LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter -Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = LearnableParameter -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 
1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[0 {W=472446402560, H=31873, C=120}, 0]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256 {W=472446402560, H=31873, C=120}, 1]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256 {W=472446402560, H=31873, C=120}, 1]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256 {W=472446402560, H=31873, C=120}, 1]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256 {W=472446402560, H=31873, C=120}, 1]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[0 {W=472446402560, H=42273, C=120}, 0]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.ft = 
Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256 {W=472446402560, H=31873, C=120}, 1]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256 {W=472446402560, H=31873, C=120}, 1]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256 {W=472446402560, H=31873, C=120}, 1]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256 {W=472446402560, H=31873, C=120}, 1]) -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed209[1024, 1]) -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> 
LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], 
LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> 
LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[0 {W=472446402560, H=51281, C=120}, 0]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256 {W=472446402560, H=51281, C=120}, 1]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256 {W=472446402560, H=51281, C=120}, 1]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256 {W=472446402560, H=51281, C=120}, 1]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256 {W=472446402560, H=51281, C=120}, 1]) -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[0 {W=472446402560, H=61793, C=120}, 0]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256 {W=472446402560, H=51281, C=120}, 1]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256 {W=472446402560, H=51281, C=120}, 1]) 
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1]) -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256 {W=472446402560, H=51281, C=120}, 1]) -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256 {W=472446402560, H=51281, C=120}, 1]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed259[1024, 1]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating 
--> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1]) -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1]) -Validating --> b = LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1]) -Validating --> cr = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[132, 1]) - - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc 
LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc 
LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node ScaledLogLikelihood - -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter 
-Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 1]) -Validating --> featNorm.xMean = Mean(feashift[33, 1]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1]) -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1]) -Validating --> LSTMoutput1.bf = LearnableParameter -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], 
LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter -Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1]) 
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1]) -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1]) -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=472446402560, 
H=24545, C=120}, 1]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1]) -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed159[1024, 1]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> 
LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter -Validating --> LSTMoutput2.sWho = LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter -Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = LearnableParameter -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> 
LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcfdc = 
DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1]) -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed209[1024, 1]) -Validating --> LSTMoutput2.ct = 
Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter 
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], 
LSTMoutput3.unnamed257[256, 1]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1]) -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1]) -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 
1], LSTMoutput3.Whidh[1024, 1]) -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed259[1024, 1]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.mt = 
ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1]) -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1]) -Validating --> b = LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1]) -Validating --> labels = InputValue -Validating --> logPrior.Prior = Mean(labels[132, 1]) -Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, 1], logPrior.LogPrior[132, 1]) - - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 
LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh 
LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node Err - -Validating --> labels = InputValue -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 
1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter -Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 1]) -Validating --> featNorm.xMean = Mean(feashift[33, 1]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1]) -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1]) -Validating --> LSTMoutput1.bf = LearnableParameter -Validating --> 
LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter -Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput1.unnamed169 = 
Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1]) -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1]) -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed154 = 
Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1]) -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed159[1024, 1]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], 
LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter -Validating --> LSTMoutput2.sWho = LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter -Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = LearnableParameter -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], 
LSTMoutput2.unnamed201[256, 1]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], 
LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1]) -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, 
H=42273, C=120}, 1], LSTMoutput2.unnamed209[1024, 1]) -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> 
LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 
1]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1]) -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1]) 
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1]) -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed259[1024, 1]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 
{W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1]) -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1]) -Validating --> b = LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1]) -Validating --> Err = ErrorPrediction(labels[132, 1], LSTMoutputW[132, 1]) - +Node --> B = LearnableParameter +Node --> labels = InputValue +Node --> LSTMoutputW./*+*/left./***/left = LearnableParameter +Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].Wmr = LearnableParameter +Node --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].Wmr = LearnableParameter +Node --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> 
LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].Wmr = LearnableParameter +Node --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> features = InputValue +Node --> feashift = RowSlice +Node --> featNorm.meanVector = Mean +Node --> featNorm.invStdDevVector = InvStdDev +Node --> featNorm = PerDimMeanVarNormalization +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].dh = PastValue +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].ot.z./*+*/left = Plus +Node --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = 
LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].ft.z./*+*/left = Plus +Node --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].dc = PastValue +Node --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].ft.z = Plus +Node --> LSTMoutput[1].ft = Sigmoid +Node --> LSTMoutput[1].bft = ElementTimes +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].it.z./*+*/left = Plus +Node --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].it.z = Plus +Node --> LSTMoutput[1].it = Sigmoid +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[1].bit./*.**/right.z = Plus +Node --> LSTMoutput[1].bit./*.**/right = Tanh +Node --> LSTMoutput[1].bit = ElementTimes +Node --> LSTMoutput[1].ct 
= Plus +Node --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].ot.z = Plus +Node --> LSTMoutput[1].ot = Sigmoid +Node --> LSTMoutput[1].mt./*.**/right = Tanh +Node --> LSTMoutput[1].mt = ElementTimes +Node --> LSTMoutput[1].output./***/right = Scale +Node --> LSTMoutput[1].output = Times +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].dh = PastValue +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].ot.z./*+*/left = Plus +Node --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus +Node --> 
LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].ft.z./*+*/left = Plus +Node --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].dc = PastValue +Node --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].ft.z = Plus +Node --> LSTMoutput[2].ft = Sigmoid +Node --> LSTMoutput[2].bft = ElementTimes +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].it.z./*+*/left = Plus +Node --> 
LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].it.z = Plus +Node --> LSTMoutput[2].it = Sigmoid +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[2].bit./*.**/right.z = Plus +Node --> LSTMoutput[2].bit./*.**/right = Tanh +Node --> LSTMoutput[2].bit = ElementTimes +Node --> LSTMoutput[2].ct = Plus +Node --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].ot.z = Plus +Node --> LSTMoutput[2].ot = Sigmoid +Node --> LSTMoutput[2].mt./*.**/right = Tanh +Node --> LSTMoutput[2].mt = ElementTimes +Node --> LSTMoutput[2].output./***/right = Scale +Node --> LSTMoutput[2].output = Times +Node --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].dh = PastValue +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[3].ot.z./*+*/left = Plus +Node --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times +Node --> 
LSTMoutput[3].ft.z./*+*/left = Plus +Node --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].dc = PastValue +Node --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].ft.z = Plus +Node --> LSTMoutput[3].ft = Sigmoid +Node --> LSTMoutput[3].bft = ElementTimes +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[3].it.z./*+*/left = Plus +Node --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].it.z = Plus +Node --> LSTMoutput[3].it 
= Sigmoid +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[3].bit./*.**/right.z = Plus +Node --> LSTMoutput[3].bit./*.**/right = Tanh +Node --> LSTMoutput[3].bit = ElementTimes +Node --> LSTMoutput[3].ct = Plus +Node --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].ot.z = Plus +Node --> LSTMoutput[3].ot = Sigmoid +Node --> LSTMoutput[3].mt./*.**/right = Tanh +Node --> LSTMoutput[3].mt = ElementTimes +Node --> LSTMoutput[3].output./***/right = Scale +Node --> LSTMoutput[3].output = Times +Node --> LSTMoutputW./*+*/left./***/right = Scale +Node --> LSTMoutputW./*+*/left = Times +Node --> LSTMoutputW = Plus +Node --> Err = ErrorPrediction +Node --> logPrior.x = Mean +Node --> logPrior = Log +Node --> ScaledLogLikelihood = Minus +Node --> cr = CrossEntropyWithSoftmax +N9Microsoft3MSR4CNTK18ComputationNetworkE [ + B : LearnableParameter 132 x 1 () + cr : CrossEntropyWithSoftmax 0 x 0 ( + labels + LSTMoutputW + ) + Err : 
ErrorPrediction 0 x 0 ( + labels + LSTMoutputW + ) + feashift : RowSlice 0 x 0 ( + features + ) + featNorm : PerDimMeanVarNormalization 0 x 0 ( + feashift + featNorm.meanVector + featNorm.invStdDevVector + ) + featNorm.invStdDevVector : InvStdDev 0 x 0 ( + feashift + ) + featNorm.meanVector : Mean 0 x 0 ( + feashift + ) + features : InputValue 363 x 1 () + labels : InputValue 132 x 1 () + logPrior : Log 0 x 0 ( + logPrior.x + ) + logPrior.x : Mean 0 x 0 ( + labels + ) + LSTMoutput[1].bft : ElementTimes 0 x 0 ( + LSTMoutput[1].ft + LSTMoutput[1].dc + ) + LSTMoutput[1].bit : ElementTimes 0 x 0 ( + LSTMoutput[1].it + LSTMoutput[1].bit./*.**/right + ) + LSTMoutput[1].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z + ) + LSTMoutput[1].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left + LSTMoutput[1].bit./*.**/right.z./*+*/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ct : Plus 0 x 0 ( + LSTMoutput[1].bft + LSTMoutput[1].bit + ) + LSTMoutput[1].dc : PastValue 1024 x 1 ( + LSTMoutput[1].ct + ) + LSTMoutput[1].dh : PastValue 256 x 1 ( + LSTMoutput[1].output + ) + LSTMoutput[1].ft : Sigmoid 0 x 0 ( + LSTMoutput[1].ft.z + ) + LSTMoutput[1].ft.z : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left + LSTMoutput[1].ft.z./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left + LSTMoutput[1].ft.z./*+*/left./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + 
LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].ft.z./*+*/right.matrix + ) + LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].dc + ) + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it : Sigmoid 0 x 0 ( + LSTMoutput[1].it.z + ) + LSTMoutput[1].it.z : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left + LSTMoutput[1].it.z./*+*/right + ) + LSTMoutput[1].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left + LSTMoutput[1].it.z./*+*/left./*+*/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left + 
LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/left + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].it.z./*+*/right.matrix + ) + LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].dc + ) + 
LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].mt : ElementTimes 0 x 0 ( + LSTMoutput[1].ot + LSTMoutput[1].mt./*.**/right + ) + LSTMoutput[1].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[1].ct + ) + LSTMoutput[1].ot : Sigmoid 0 x 0 ( + LSTMoutput[1].ot.z + ) + LSTMoutput[1].ot.z : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left + LSTMoutput[1].ot.z./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left + LSTMoutput[1].ot.z./*+*/left./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + 
LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].ot.z./*+*/right.matrix + ) + LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].ct + ) + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].output : Times 0 x 0 ( + LSTMoutput[1].Wmr + LSTMoutput[1].output./***/right + ) + LSTMoutput[1].output./***/right : Scale 0 x 0 ( + LSTMoutput[1].output./***/right.scalarScalingFactor + LSTMoutput[1].mt + ) + LSTMoutput[1].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].Wmr : LearnableParameter 256 x 1024 () + LSTMoutput[2].bft : ElementTimes 0 x 0 ( + LSTMoutput[2].ft + LSTMoutput[2].dc + ) + LSTMoutput[2].bit : ElementTimes 0 x 0 ( + LSTMoutput[2].it + LSTMoutput[2].bit./*.**/right + ) + LSTMoutput[2].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z + ) + LSTMoutput[2].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left + LSTMoutput[2].bit./*.**/right.z./*+*/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left : 
Times 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ct : Plus 0 x 0 ( + LSTMoutput[2].bft + LSTMoutput[2].bit + ) + LSTMoutput[2].dc : PastValue 1024 x 1 ( + LSTMoutput[2].ct + ) + LSTMoutput[2].dh : PastValue 256 x 1 ( + LSTMoutput[2].output + ) + LSTMoutput[2].ft : Sigmoid 0 x 0 ( + 
LSTMoutput[2].ft.z + ) + LSTMoutput[2].ft.z : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left + LSTMoutput[2].ft.z./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left + LSTMoutput[2].ft.z./*+*/left./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + 
LSTMoutput[2].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].ft.z./*+*/right.matrix + ) + LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].dc + ) + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it : Sigmoid 0 x 0 ( + LSTMoutput[2].it.z + ) + LSTMoutput[2].it.z : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left + LSTMoutput[2].it.z./*+*/right + ) + LSTMoutput[2].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left + LSTMoutput[2].it.z./*+*/left./*+*/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].it.z./*+*/left./*+*/right : Times 0 x 0 ( + 
LSTMoutput[2].it.z./*+*/left./*+*/right./***/left + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].it.z./*+*/right.matrix + ) + LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].dc + ) + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].mt : ElementTimes 0 x 0 ( + LSTMoutput[2].ot + LSTMoutput[2].mt./*.**/right + ) + LSTMoutput[2].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[2].ct + ) + LSTMoutput[2].ot : Sigmoid 0 x 0 ( + LSTMoutput[2].ot.z + ) + LSTMoutput[2].ot.z : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left + LSTMoutput[2].ot.z./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left + LSTMoutput[2].ot.z./*+*/left./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left 
+ LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].ot.z./*+*/right.matrix + ) + LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].ct + ) + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + 
LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].output : Times 0 x 0 ( + LSTMoutput[2].Wmr + LSTMoutput[2].output./***/right + ) + LSTMoutput[2].output./***/right : Scale 0 x 0 ( + LSTMoutput[2].output./***/right.scalarScalingFactor + LSTMoutput[2].mt + ) + LSTMoutput[2].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].Wmr : LearnableParameter 256 x 1024 () + LSTMoutput[3].bft : ElementTimes 0 x 0 ( + LSTMoutput[3].ft + LSTMoutput[3].dc + ) + LSTMoutput[3].bit : ElementTimes 0 x 0 ( + LSTMoutput[3].it + LSTMoutput[3].bit./*.**/right + ) + LSTMoutput[3].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z + ) + LSTMoutput[3].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left + LSTMoutput[3].bit./*.**/right.z./*+*/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ct : Plus 0 x 0 ( + LSTMoutput[3].bft + LSTMoutput[3].bit + ) + LSTMoutput[3].dc : PastValue 1024 x 1 ( + LSTMoutput[3].ct + ) + LSTMoutput[3].dh : PastValue 256 x 1 ( + LSTMoutput[3].output + ) + LSTMoutput[3].ft : Sigmoid 0 x 0 ( + LSTMoutput[3].ft.z + ) + LSTMoutput[3].ft.z : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left + LSTMoutput[3].ft.z./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left + LSTMoutput[3].ft.z./*+*/left./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + 
LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].ft.z./*+*/right.matrix + ) + LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].dc + ) + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it : Sigmoid 0 x 0 ( + LSTMoutput[3].it.z + ) + LSTMoutput[3].it.z : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left + LSTMoutput[3].it.z./*+*/right + ) + LSTMoutput[3].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left + LSTMoutput[3].it.z./*+*/left./*+*/right + ) + 
LSTMoutput[3].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/left + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].it.z./*+*/right.matrix + ) + LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].it.z./*+*/right.matrix 
: Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].dc + ) + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].mt : ElementTimes 0 x 0 ( + LSTMoutput[3].ot + LSTMoutput[3].mt./*.**/right + ) + LSTMoutput[3].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[3].ct + ) + LSTMoutput[3].ot : Sigmoid 0 x 0 ( + LSTMoutput[3].ot.z + ) + LSTMoutput[3].ot.z : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left + LSTMoutput[3].ot.z./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left + LSTMoutput[3].ot.z./*+*/left./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right + ) + 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].ot.z./*+*/right.matrix + ) + LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].ct + ) + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].output : Times 0 x 0 ( + LSTMoutput[3].Wmr + LSTMoutput[3].output./***/right + ) + LSTMoutput[3].output./***/right : Scale 0 x 0 ( + LSTMoutput[3].output./***/right.scalarScalingFactor + LSTMoutput[3].mt + ) + LSTMoutput[3].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].Wmr : LearnableParameter 256 x 1024 () + LSTMoutputW : Plus 0 x 0 ( + LSTMoutputW./*+*/left + B + ) + LSTMoutputW./*+*/left : Times 0 x 0 ( + LSTMoutputW./*+*/left./***/left + LSTMoutputW./*+*/left./***/right + ) + LSTMoutputW./*+*/left./***/left : LearnableParameter 132 x 256 () + LSTMoutputW./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutputW./*+*/left./***/right.scalarScalingFactor + 
LSTMoutput[3].output + ) + LSTMoutputW./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + ScaledLogLikelihood : Minus 0 x 0 ( + LSTMoutputW + logPrior + ) +] GetTrainCriterionNodes ... GetEvalCriterionNodes ... nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 
LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 
LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].mt./*.**/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].mt./*.**/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].mt./*.**/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left 
LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].mt./*.**/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].mt./*.**/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft 
LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].mt./*.**/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node cr +Validating for node cr. 272 nodes to process in pass 1. 
-Validating --> labels = InputValue -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter -Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 1]) -Validating --> featNorm.xMean = Mean(feashift[33, 1]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1]) -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> 
LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1]) -Validating --> LSTMoutput1.bf = LearnableParameter -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter -Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = 
LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1]) -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1]) -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.dc[1024 
{W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1]) -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed159[1024, 1]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], 
LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 1]) -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter -Validating --> LSTMoutput2.sWho = LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter 
-Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = LearnableParameter -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 
256], LSTMoutput2.unnamed219[256, 1]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.it = 
Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1]) -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed209[1024, 1]) -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 1]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], 
LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], 
LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1]) -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 
{W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1]) -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1]) -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed259[1024, 1]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 1], 
LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 1]) -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1]) -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1]) -Validating --> b = LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1]) -Validating --> cr = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[132, 1]) +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter 
-> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = 
LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 
1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = 
Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, 
MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], 
LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], 
LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], 
LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 
1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = 
DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = 
Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) 
-> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] -Found 6 PreCompute nodes - NodeName: featNorm.xMean - NodeName: featNorm.xStdDev - NodeName: logPrior.Prior - NodeName: featNorm.xMean - NodeName: featNorm.xStdDev - NodeName: logPrior.Prior +Validating for node cr. 183 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 
MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating 
--> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, 
MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 
1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = 
Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = 
DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = 
PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 
1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 
1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr. 60 nodes to process in pass 3. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = 
Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = 
Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = 
Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], 
LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = 
PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> 
[1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> 
LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = 
Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 
1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> 
[1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = 
Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 
MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating 
--> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, 
MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 
1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = 
Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = 
DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = 
PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 
1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 
1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +127 out of 272 nodes do not share the minibatch layout with the input data. + + +Precomputing --> 3 PreCompute nodes found. + + NodeName: featNorm.invStdDevVector + NodeName: featNorm.meanVector + NodeName: logPrior.x minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 
LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 
LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right 
LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node featNorm.xMean 
+Validating for node featNorm.invStdDevVector. 3 nodes to process in pass 1. -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 640]) -Validating --> featNorm.xMean = Mean(feashift[33, 640]) +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] + +Validating for node featNorm.invStdDevVector, final verification. + +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] + +1 out of 3 nodes do not share the minibatch layout with the input data. nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it 
LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc 
LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left 
LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left 
LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc 
LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node featNorm.xStdDev +Validating for node featNorm.meanVector. 
3 nodes to process in pass 1. -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 640]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 640]) +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] + +Validating for node featNorm.meanVector, final verification. + +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] + +1 out of 3 nodes do not share the minibatch layout with the input data. nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 
LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh 
LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit 
LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft 
LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node logPrior.Prior +Validating for node logPrior.x. 2 nodes to process in pass 1. 
-Validating --> labels = InputValue -Validating --> logPrior.Prior = Mean(labels[132, 640]) +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] + +Validating for node logPrior.x. 1 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] + +Validating for node logPrior.x, final verification. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] + +1 out of 2 nodes do not share the minibatch layout with the input data. + +EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. + +Precomputing --> Completed. Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.000781 momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.000781 effective momentum = 0.000000 minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses + nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z 
LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right 
LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z 
LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output + +Validating for node Err. 272 nodes to process in pass 1. 
+ +Validating --> labels = InputValue -> [132, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 640] +Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] 
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = 
Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> 
[256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] 
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 
640] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] 
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = 
Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> 
[256, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640] +Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1] + +Validating for node Err. 180 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 640] +Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, 
MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) 
-> [33, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, 
MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left = 
Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], 
LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], 
LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating 
--> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh 
= PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft = 
Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], 
LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = 
Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], 
LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, 
MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640] +Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1] + +Validating for node Err. 6 nodes to process in pass 3. 
+ +Validating --> labels = InputValue -> [132, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 640] +Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z = 
Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = 
Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output./***/right = 
Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> 
[1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 
640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640] +Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1] + +Validating for node Err, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 640] +Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z = 
Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = 
Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output./***/right = 
Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> 
[1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 
640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640] +Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1] + +127 out of 272 nodes do not share the minibatch layout with the input data. + Starting minibatch loop. 
- nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 
LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 
LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node Err - -Validating --> labels = InputValue -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter -Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 640]) -Validating --> featNorm.xMean = Mean(feashift[33, 640]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 640]) -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 640], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = 
Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 640]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 640]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 640], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 640]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 640]) -Validating --> LSTMoutput1.bf = LearnableParameter -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 640], LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter -Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 640]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 640]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = 
Plus(LSTMoutput1.Wxix[1024, 640], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 640]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 640]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 640]) -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 640]) -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 640]) -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 640], LSTMoutput1.Whodh[1024, 640]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 640]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 640]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 640], LSTMoutput1.Whfdh[1024, 640]) -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.Wcfdc = 
DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 640], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 640], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 640]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 640]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 640], LSTMoutput1.Whidh[1024, 640]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 640], LSTMoutput1.Wcidc[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 640]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 640]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 640], LSTMoutput1.bc[1024, 1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 640], LSTMoutput1.unnamed161[1024, 640]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 640]) -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 640], LSTMoutput1.unnamed159[1024, 
640]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 640], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 640], LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 640], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 640]) -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 640]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 640]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 640], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter -Validating --> LSTMoutput2.sWho = LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = 
LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 640]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 640]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 640], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter -Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = LearnableParameter -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 640]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 640]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 640], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 640]) 
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 640]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 640]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 640]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 640]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 640], LSTMoutput2.Whodh[1024, 640]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 640]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 640]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 640], LSTMoutput2.Whfdh[1024, 640]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 640], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 640], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 640]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], 
LSTMoutput2.unnamed202[256, 640]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 640], LSTMoutput2.Whidh[1024, 640]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 640], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 640]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 640]) -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 640], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 640], LSTMoutput2.unnamed211[1024, 640]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 640]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, H=42273, C=120}, 640], LSTMoutput2.unnamed209[1024, 640]) -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 640], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 640], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 640]) 
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 640], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 640]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 640]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 640]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 640], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 640]) -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 640]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 640], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = 
LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 640]) -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 640]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 640], LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 640]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 640]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 640]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 640]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 640]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 640], 
LSTMoutput3.Whodh[1024, 640]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 640]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 640]) -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 640], LSTMoutput3.Whfdh[1024, 640]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 640], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 640], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 640]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 640]) -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 640], LSTMoutput3.Whidh[1024, 640]) -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 640], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> 
LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 640]) -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 640]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 640], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 640], LSTMoutput3.unnamed261[1024, 640]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 640]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 640], LSTMoutput3.unnamed259[1024, 640]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 640], LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 640], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 640], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 640]) -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 640]) -Validating --> unnamed283 = Times(W[132, 256], 
unnamed284[256, 640]) -Validating --> b = LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 640], b[132, 1]) -Validating --> Err = ErrorPrediction(labels[132, 640], LSTMoutputW[132, 640]) - - Epoch[ 1 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.80573893; EvalErr[0]PerSample = 0.90281248; TotalTime = 2.72155s; TotalTimePerSample = 0.42524ms; SamplesPerSecond = 2351 - Epoch[ 1 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.59921312; EvalErr[0]PerSample = 0.85390627; TotalTime = 2.71606s; TotalTimePerSample = 0.42438ms; SamplesPerSecond = 2356 - Epoch[ 1 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 5.29241562; EvalErr[0]PerSample = 0.87921876; TotalTime = 2.70903s; TotalTimePerSample = 0.42329ms; SamplesPerSecond = 2362 -Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 4.8512392; EvalErrPerSample = 0.86728519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=8.7031 -Starting Epoch 2: learning rate per sample = 0.000781 momentum = 0.899991 + Epoch[ 1 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.80573914; EvalErr[0]PerSample = 0.90281250; TotalTime = 3.70464s; TotalTimePerSample = 0.57885ms; SamplesPerSecond = 1727 + Epoch[ 1 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.59921326; EvalErr[0]PerSample = 0.85390625; TotalTime = 3.83160s; TotalTimePerSample = 0.59869ms; SamplesPerSecond = 1670 + Epoch[ 1 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 5.29241577; EvalErr[0]PerSample = 0.87921875; TotalTime = 3.76323s; TotalTimePerSample = 0.58800ms; SamplesPerSecond = 1700 +Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 4.8512392; EvalErrPerSample = 0.86728519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=12.253369 +Starting Epoch 2: learning rate per sample = 0.000781 effective momentum = 0.900000 minibatchiterator: epoch 1: frames 
[20480..40960] (first utterance at frame 20546), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 2 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.39003801; EvalErr[0]PerSample = 0.85187501; TotalTime = 2.68673s; TotalTimePerSample = 0.41980ms; SamplesPerSecond = 2382 - Epoch[ 2 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.25110769; EvalErr[0]PerSample = 0.84484375; TotalTime = 2.70369s; TotalTimePerSample = 0.42245ms; SamplesPerSecond = 2367 - Epoch[ 2 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.78259087; EvalErr[0]PerSample = 0.74578124; TotalTime = 2.71281s; TotalTimePerSample = 0.42388ms; SamplesPerSecond = 2359 -Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 4.0735416; EvalErrPerSample = 0.79853517; Ave LearnRatePerSample = 0.0007812500116; EpochTime=8.653936 -Starting Epoch 3: learning rate per sample = 0.000781 momentum = 0.899991 + Epoch[ 2 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.39003784; EvalErr[0]PerSample = 0.85187500; TotalTime = 3.68450s; TotalTimePerSample = 0.57570ms; SamplesPerSecond = 1737 + Epoch[ 2 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.25110718; EvalErr[0]PerSample = 0.84484375; TotalTime = 3.83342s; TotalTimePerSample = 0.59897ms; SamplesPerSecond = 1669 + Epoch[ 2 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.78258545; EvalErr[0]PerSample = 0.74578125; TotalTime = 3.75383s; TotalTimePerSample = 0.58654ms; SamplesPerSecond = 1704 +Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 4.0735388; EvalErrPerSample = 0.79853517; Ave LearnRatePerSample = 0.0007812500116; EpochTime=12.144325 +Starting Epoch 3: learning rate per sample = 0.000781 effective momentum = 0.900000 minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40980), data subset 0 of 1, with 1 datapasses Starting 
minibatch loop. - Epoch[ 3 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.11762667; EvalErr[0]PerSample = 0.83671874; TotalTime = 2.69289s; TotalTimePerSample = 0.42076ms; SamplesPerSecond = 2376 - Epoch[ 3 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.18654823; EvalErr[0]PerSample = 0.86468750; TotalTime = 2.70456s; TotalTimePerSample = 0.42259ms; SamplesPerSecond = 2366 - Epoch[ 3 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.90151119; EvalErr[0]PerSample = 0.83328128; TotalTime = 2.71127s; TotalTimePerSample = 0.42364ms; SamplesPerSecond = 2360 -Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 4.0097828; EvalErrPerSample = 0.83603519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=8.661351 -Starting Epoch 4: learning rate per sample = 0.000781 momentum = 0.899991 + Epoch[ 3 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.11762695; EvalErr[0]PerSample = 0.83671875; TotalTime = 3.71971s; TotalTimePerSample = 0.58120ms; SamplesPerSecond = 1720 + Epoch[ 3 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.18655273; EvalErr[0]PerSample = 0.86468750; TotalTime = 3.81275s; TotalTimePerSample = 0.59574ms; SamplesPerSecond = 1678 + Epoch[ 3 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.90151001; EvalErr[0]PerSample = 0.83328125; TotalTime = 3.76173s; TotalTimePerSample = 0.58777ms; SamplesPerSecond = 1701 +Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 4.0097837; EvalErrPerSample = 0.83603519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=12.173884 +Starting Epoch 4: learning rate per sample = 0.000781 effective momentum = 0.900000 minibatchiterator: epoch 3: frames [61440..81920] (first utterance at frame 61662), data subset 0 of 1, with 1 datapasses Starting minibatch loop. 
- Epoch[ 4 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.06626415; EvalErr[0]PerSample = 0.85124999; TotalTime = 2.68899s; TotalTimePerSample = 0.42015ms; SamplesPerSecond = 2380 - Epoch[ 4 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.13874769; EvalErr[0]PerSample = 0.87437499; TotalTime = 2.70160s; TotalTimePerSample = 0.42213ms; SamplesPerSecond = 2368 - Epoch[ 4 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.94609857; EvalErr[0]PerSample = 0.81968749; TotalTime = 2.71265s; TotalTimePerSample = 0.42385ms; SamplesPerSecond = 2359 -Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 3.9959295; EvalErrPerSample = 0.83603519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=8.661498 + Epoch[ 4 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.06626434; EvalErr[0]PerSample = 0.85125000; TotalTime = 3.68964s; TotalTimePerSample = 0.57651ms; SamplesPerSecond = 1734 + Epoch[ 4 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.13874786; EvalErr[0]PerSample = 0.87437500; TotalTime = 3.79460s; TotalTimePerSample = 0.59291ms; SamplesPerSecond = 1686 + Epoch[ 4 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.94609985; EvalErr[0]PerSample = 0.81968750; TotalTime = 3.77592s; TotalTimePerSample = 0.58999ms; SamplesPerSecond = 1694 +Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 3.9959297; EvalErrPerSample = 0.83603519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=12.162679 +CNTKCommandTrainEnd: speechTrain COMPLETED diff --git a/Tests/Speech/LSTM/Truncated/baseline.windows.gpu.txt b/Tests/Speech/LSTM/Truncated/baseline.windows.gpu.txt index ad2d630f0..3aaebe062 100644 --- a/Tests/Speech/LSTM/Truncated/baseline.windows.gpu.txt +++ b/Tests/Speech/LSTM/Truncated/baseline.windows.gpu.txt @@ -1,16 +1,16 @@ -=== Running 
/cygdrive/e/NetScale/CNTK/git_repos/public_master/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data DeviceId=0 NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM +=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 ------------------------------------------------------------------- Build info: - Built time: Sep 8 2015 13:07:27 - Last modified date: Tue Sep 8 13:07:20 2015 + Built time: Oct 24 2015 13:33:25 + Last modified date: Thu Oct 22 16:00:27 2015 Built by amitaga on Amitaga-Win-DT3 - Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\ + Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 ------------------------------------------------------------------- -running on Amitaga-Win-DT3 at 2015/09/08 21:08:21 -command line options: -configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data DeviceId=0 NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM +running on Amitaga-Win-DT3 at 2015/10/24 21:55:28 +command line: +E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM/cntk.config 
RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float @@ -24,9 +24,6 @@ speechTrain=[ modelPath=$RunDir$/models/cntkSpeech.dnn deviceId=$DeviceId$ traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$NDLDir$/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -200,10 +197,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output') // sadly we can't say x - y since we want to assign a tag ] ] -RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu -DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data +RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu +DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 -NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< @@ -216,12 +213,9 @@ frameMode=false Truncated=true speechTrain=[ action=train - modelPath=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu/models/cntkSpeech.dnn + modelPath=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn deviceId=0 traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -241,11 +235,11 @@ speechTrain=[ features=[ dim=363 type=Real - scpFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.scp + 
scpFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp ] labels=[ - mlfFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.mlf - labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/state.list + mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf + labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list labelDim=132 labelType=Category ] @@ -395,30 +389,27 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output') // sadly we can't say x - y since we want to assign a tag ] ] -RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu -DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data +RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu +DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 -NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> configparameters: cntk.config:command=speechTrain -configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data +configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM +configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data configparameters: cntk.config:deviceId=0 configparameters: cntk.config:frameMode=false -configparameters: cntk.config:NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: 
cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu +configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu configparameters: cntk.config:speechTrain=[ action=train - modelPath=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu/models/cntkSpeech.dnn + modelPath=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn deviceId=0 traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM/lstmp-3layer_WithSelfStab.ndl - ] SGD=[ epochSize=20480 minibatchSize=20 @@ -438,11 +429,11 @@ configparameters: cntk.config:speechTrain=[ features=[ dim=363 type=Real - scpFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.scp + scpFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp ] labels=[ - mlfFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.mlf - labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/state.list + mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf + labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list labelDim=132 labelType=Category ] @@ -597,1789 +588,3393 @@ configparameters: cntk.config:Truncated=true <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: speechTrain precision = float -NDLBuilder Using GPU 0 -reading script file E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.scp ... 
948 entries +CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn +CNTKCommandTrainInfo: speechTrain : 4 +CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4 +CNTKCommandTrainBegin: speechTrain +ExperimentalNetworkBuilder using GPU 0 +reading script file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp ... 948 entries trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion -total 132 state names in state list E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/state.list -htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries +total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list +htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. 
chunk size: 316.0 utterances, 84244.7 frames - nodes in the recurrent loops : -LSTMoutput1.unnamed174 LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.bit LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.unnamed224 LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.bit LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.unnamed274 LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.bit LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.unnamed265 LSTMoutput3.ft 
LSTMoutput3.bft LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.unnamed174 LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.bit LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.unnamed224 LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.bit LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.unnamed274 LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed254 
LSTMoutput3.it LSTMoutput3.bit LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Printing Gradient Computation Node Order ... - -cr[0, 0] = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[0, 0]) -LSTMoutputW[0, 0] = Plus(unnamed283[0, 0], b[132, 1]) -b[132, 1] = LearnableParameter -unnamed283[0, 0] = Times(W[132, 256], unnamed284[0, 0]) -unnamed284[0, 0] = Scale(expsW[0, 0], LSTMoutput3.output[0, 0]) -LSTMoutput3.output[0, 0] = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[0, 0]) -LSTMoutput3.unnamed275[0, 0] = Scale(LSTMoutput3.expsWmr[0, 0], LSTMoutput3.mt[0, 0]) -LSTMoutput3.mt[0, 0] = ElementTimes(LSTMoutput3.ot[0, 0], LSTMoutput3.unnamed274[0, 0]) -LSTMoutput3.unnamed274[0, 0] = Tanh(LSTMoutput3.ct[0, 0]) -LSTMoutput3.ot[0, 0] = Sigmoid(LSTMoutput3.unnamed271[0, 0]) -LSTMoutput3.unnamed271[0, 0] = Plus(LSTMoutput3.unnamed272[0, 0], LSTMoutput3.Wcoct[0, 0]) -LSTMoutput3.Wcoct[0, 0] = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[0, 0]) -LSTMoutput3.unnamed270[0, 0] = Scale(LSTMoutput3.expsWco[0, 0], LSTMoutput3.ct[0, 0]) -LSTMoutput3.ct[0, 0] = Plus(LSTMoutput3.bft[0, 0], LSTMoutput3.bit[0, 0]) -LSTMoutput3.bit[0, 0] = ElementTimes(LSTMoutput3.it[0, 0], LSTMoutput3.unnamed259[0, 0]) -LSTMoutput3.unnamed259[0, 0] = Tanh(LSTMoutput3.unnamed260[0, 0]) -LSTMoutput3.unnamed260[0, 0] = Plus(LSTMoutput3.Wxcx[0, 0], LSTMoutput3.unnamed261[0, 0]) -LSTMoutput3.unnamed261[0, 0] = Plus(LSTMoutput3.Whcdh[0, 0], LSTMoutput3.bc[1024, 1]) -LSTMoutput3.Whcdh[0, 0] = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[0, 0]) -LSTMoutput3.unnamed258[0, 0] = Scale(LSTMoutput3.expsWhc[0, 0], 
LSTMoutput3.dh[256, 1]) -LSTMoutput3.it[0, 0] = Sigmoid(LSTMoutput3.unnamed254[0, 0]) -LSTMoutput3.unnamed254[0, 0] = Plus(LSTMoutput3.unnamed255[0, 0], LSTMoutput3.Wcidc[0, 0]) -LSTMoutput3.Wcidc[0, 0] = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[0, 0]) -LSTMoutput3.unnamed253[0, 0] = Scale(LSTMoutput3.expsWci[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.unnamed255[0, 0] = Plus(LSTMoutput3.unnamed256[0, 0], LSTMoutput3.Whidh[0, 0]) -LSTMoutput3.Whidh[0, 0] = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[0, 0]) -LSTMoutput3.unnamed252[0, 0] = Scale(LSTMoutput3.expsWhi[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.bft[0, 0] = ElementTimes(LSTMoutput3.ft[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.ft[0, 0] = Sigmoid(LSTMoutput3.unnamed265[0, 0]) -LSTMoutput3.unnamed265[0, 0] = Plus(LSTMoutput3.unnamed266[0, 0], LSTMoutput3.Wcfdc[0, 0]) -LSTMoutput3.Wcfdc[0, 0] = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[0, 0]) -LSTMoutput3.unnamed264[0, 0] = Scale(LSTMoutput3.expsWcf[0, 0], LSTMoutput3.dc[1024, 1]) -LSTMoutput3.dc[1024, 1] = PastValue(LSTMoutput3.ct[0, 0]) -LSTMoutput3.unnamed266[0, 0] = Plus(LSTMoutput3.unnamed267[0, 0], LSTMoutput3.Whfdh[0, 0]) -LSTMoutput3.Whfdh[0, 0] = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[0, 0]) -LSTMoutput3.unnamed263[0, 0] = Scale(LSTMoutput3.expsWhf[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.unnamed272[0, 0] = Plus(LSTMoutput3.unnamed273[0, 0], LSTMoutput3.Whodh[0, 0]) -LSTMoutput3.Whodh[0, 0] = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[0, 0]) -LSTMoutput3.unnamed269[0, 0] = Scale(LSTMoutput3.expsWho[0, 0], LSTMoutput3.dh[256, 1]) -LSTMoutput3.dh[256, 1] = PastValue(LSTMoutput3.output[0, 0]) -LSTMoutput3.bc[1024, 1] = LearnableParameter -LSTMoutput3.expsWhc[0, 0] = Exp(LSTMoutput3.sWhc[1, 1]) -LSTMoutput3.sWhc[1, 1] = LearnableParameter -LSTMoutput3.Whc[1024, 256] = LearnableParameter -LSTMoutput3.Wxcx[0, 0] = Times(LSTMoutput3.Wxc[1024, 256], 
LSTMoutput3.unnamed257[0, 0]) -LSTMoutput3.unnamed257[0, 0] = Scale(LSTMoutput3.expsWxc[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxc[0, 0] = Exp(LSTMoutput3.sWxc[1, 1]) -LSTMoutput3.sWxc[1, 1] = LearnableParameter -LSTMoutput3.Wxc[1024, 256] = LearnableParameter -LSTMoutput3.expsWci[0, 0] = Exp(LSTMoutput3.sWci[1, 1]) -LSTMoutput3.sWci[1, 1] = LearnableParameter -LSTMoutput3.Wci[1024, 1] = LearnableParameter -LSTMoutput3.expsWhi[0, 0] = Exp(LSTMoutput3.sWhi[1, 1]) -LSTMoutput3.sWhi[1, 1] = LearnableParameter -LSTMoutput3.Whi[1024, 256] = LearnableParameter -LSTMoutput3.unnamed256[0, 0] = Plus(LSTMoutput3.Wxix[0, 0], LSTMoutput3.bi[1024, 1]) -LSTMoutput3.bi[1024, 1] = LearnableParameter -LSTMoutput3.Wxix[0, 0] = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[0, 0]) -LSTMoutput3.unnamed251[0, 0] = Scale(LSTMoutput3.expsWxi[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxi[0, 0] = Exp(LSTMoutput3.sWxi[1, 1]) -LSTMoutput3.sWxi[1, 1] = LearnableParameter -LSTMoutput3.Wxi[1024, 256] = LearnableParameter -LSTMoutput3.expsWcf[0, 0] = Exp(LSTMoutput3.sWcf[1, 1]) -LSTMoutput3.sWcf[1, 1] = LearnableParameter -LSTMoutput3.Wcf[1024, 1] = LearnableParameter -LSTMoutput3.expsWhf[0, 0] = Exp(LSTMoutput3.sWhf[1, 1]) -LSTMoutput3.sWhf[1, 1] = LearnableParameter -LSTMoutput3.Whf[1024, 256] = LearnableParameter -LSTMoutput3.unnamed267[0, 0] = Plus(LSTMoutput3.Wxfx[0, 0], LSTMoutput3.bf[1024, 1]) -LSTMoutput3.bf[1024, 1] = LearnableParameter -LSTMoutput3.Wxfx[0, 0] = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[0, 0]) -LSTMoutput3.unnamed262[0, 0] = Scale(LSTMoutput3.expsWxf[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput3.expsWxf[0, 0] = Exp(LSTMoutput3.sWxf[1, 1]) -LSTMoutput3.sWxf[1, 1] = LearnableParameter -LSTMoutput3.Wxf[1024, 256] = LearnableParameter -LSTMoutput3.expsWco[0, 0] = Exp(LSTMoutput3.sWco[1, 1]) -LSTMoutput3.sWco[1, 1] = LearnableParameter -LSTMoutput3.Wco[1024, 1] = LearnableParameter -LSTMoutput3.expsWho[0, 0] = 
Exp(LSTMoutput3.sWho[1, 1]) -LSTMoutput3.sWho[1, 1] = LearnableParameter -LSTMoutput3.Who[1024, 256] = LearnableParameter -LSTMoutput3.unnamed273[0, 0] = Plus(LSTMoutput3.Wxox[0, 0], LSTMoutput3.bo[1024, 1]) -LSTMoutput3.bo[1024, 1] = LearnableParameter -LSTMoutput3.Wxox[0, 0] = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[0, 0]) -LSTMoutput3.unnamed268[0, 0] = Scale(LSTMoutput3.expsWxo[0, 0], LSTMoutput2.output[0, 0]) -LSTMoutput2.output[0, 0] = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[0, 0]) -LSTMoutput2.unnamed225[0, 0] = Scale(LSTMoutput2.expsWmr[0, 0], LSTMoutput2.mt[0, 0]) -LSTMoutput2.mt[0, 0] = ElementTimes(LSTMoutput2.ot[0, 0], LSTMoutput2.unnamed224[0, 0]) -LSTMoutput2.unnamed224[0, 0] = Tanh(LSTMoutput2.ct[0, 0]) -LSTMoutput2.ot[0, 0] = Sigmoid(LSTMoutput2.unnamed221[0, 0]) -LSTMoutput2.unnamed221[0, 0] = Plus(LSTMoutput2.unnamed222[0, 0], LSTMoutput2.Wcoct[0, 0]) -LSTMoutput2.Wcoct[0, 0] = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[0, 0]) -LSTMoutput2.unnamed220[0, 0] = Scale(LSTMoutput2.expsWco[0, 0], LSTMoutput2.ct[0, 0]) -LSTMoutput2.ct[0, 0] = Plus(LSTMoutput2.bft[0, 0], LSTMoutput2.bit[0, 0]) -LSTMoutput2.bit[0, 0] = ElementTimes(LSTMoutput2.it[0, 0], LSTMoutput2.unnamed209[0, 0]) -LSTMoutput2.unnamed209[0, 0] = Tanh(LSTMoutput2.unnamed210[0, 0]) -LSTMoutput2.unnamed210[0, 0] = Plus(LSTMoutput2.Wxcx[0, 0], LSTMoutput2.unnamed211[0, 0]) -LSTMoutput2.unnamed211[0, 0] = Plus(LSTMoutput2.Whcdh[0, 0], LSTMoutput2.bc[1024, 1]) -LSTMoutput2.Whcdh[0, 0] = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[0, 0]) -LSTMoutput2.unnamed208[0, 0] = Scale(LSTMoutput2.expsWhc[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.it[0, 0] = Sigmoid(LSTMoutput2.unnamed204[0, 0]) -LSTMoutput2.unnamed204[0, 0] = Plus(LSTMoutput2.unnamed205[0, 0], LSTMoutput2.Wcidc[0, 0]) -LSTMoutput2.Wcidc[0, 0] = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[0, 0]) -LSTMoutput2.unnamed203[0, 0] = Scale(LSTMoutput2.expsWci[0, 0], 
LSTMoutput2.dc[1024, 1]) -LSTMoutput2.unnamed205[0, 0] = Plus(LSTMoutput2.unnamed206[0, 0], LSTMoutput2.Whidh[0, 0]) -LSTMoutput2.Whidh[0, 0] = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[0, 0]) -LSTMoutput2.unnamed202[0, 0] = Scale(LSTMoutput2.expsWhi[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.bft[0, 0] = ElementTimes(LSTMoutput2.ft[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.ft[0, 0] = Sigmoid(LSTMoutput2.unnamed215[0, 0]) -LSTMoutput2.unnamed215[0, 0] = Plus(LSTMoutput2.unnamed216[0, 0], LSTMoutput2.Wcfdc[0, 0]) -LSTMoutput2.Wcfdc[0, 0] = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[0, 0]) -LSTMoutput2.unnamed214[0, 0] = Scale(LSTMoutput2.expsWcf[0, 0], LSTMoutput2.dc[1024, 1]) -LSTMoutput2.dc[1024, 1] = PastValue(LSTMoutput2.ct[0, 0]) -LSTMoutput2.unnamed216[0, 0] = Plus(LSTMoutput2.unnamed217[0, 0], LSTMoutput2.Whfdh[0, 0]) -LSTMoutput2.Whfdh[0, 0] = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[0, 0]) -LSTMoutput2.unnamed213[0, 0] = Scale(LSTMoutput2.expsWhf[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.unnamed222[0, 0] = Plus(LSTMoutput2.unnamed223[0, 0], LSTMoutput2.Whodh[0, 0]) -LSTMoutput2.Whodh[0, 0] = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[0, 0]) -LSTMoutput2.unnamed219[0, 0] = Scale(LSTMoutput2.expsWho[0, 0], LSTMoutput2.dh[256, 1]) -LSTMoutput2.dh[256, 1] = PastValue(LSTMoutput2.output[0, 0]) -LSTMoutput2.bc[1024, 1] = LearnableParameter -LSTMoutput2.expsWhc[0, 0] = Exp(LSTMoutput2.sWhc[1, 1]) -LSTMoutput2.sWhc[1, 1] = LearnableParameter -LSTMoutput2.Whc[1024, 256] = LearnableParameter -LSTMoutput2.Wxcx[0, 0] = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[0, 0]) -LSTMoutput2.unnamed207[0, 0] = Scale(LSTMoutput2.expsWxc[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxc[0, 0] = Exp(LSTMoutput2.sWxc[1, 1]) -LSTMoutput2.sWxc[1, 1] = LearnableParameter -LSTMoutput2.Wxc[1024, 256] = LearnableParameter -LSTMoutput2.expsWci[0, 0] = Exp(LSTMoutput2.sWci[1, 1]) -LSTMoutput2.sWci[1, 1] = 
LearnableParameter -LSTMoutput2.Wci[1024, 1] = LearnableParameter -LSTMoutput2.expsWhi[0, 0] = Exp(LSTMoutput2.sWhi[1, 1]) -LSTMoutput2.sWhi[1, 1] = LearnableParameter -LSTMoutput2.Whi[1024, 256] = LearnableParameter -LSTMoutput2.unnamed206[0, 0] = Plus(LSTMoutput2.Wxix[0, 0], LSTMoutput2.bi[1024, 1]) -LSTMoutput2.bi[1024, 1] = LearnableParameter -LSTMoutput2.Wxix[0, 0] = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[0, 0]) -LSTMoutput2.unnamed201[0, 0] = Scale(LSTMoutput2.expsWxi[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxi[0, 0] = Exp(LSTMoutput2.sWxi[1, 1]) -LSTMoutput2.sWxi[1, 1] = LearnableParameter -LSTMoutput2.Wxi[1024, 256] = LearnableParameter -LSTMoutput2.expsWcf[0, 0] = Exp(LSTMoutput2.sWcf[1, 1]) -LSTMoutput2.sWcf[1, 1] = LearnableParameter -LSTMoutput2.Wcf[1024, 1] = LearnableParameter -LSTMoutput2.expsWhf[0, 0] = Exp(LSTMoutput2.sWhf[1, 1]) -LSTMoutput2.sWhf[1, 1] = LearnableParameter -LSTMoutput2.Whf[1024, 256] = LearnableParameter -LSTMoutput2.unnamed217[0, 0] = Plus(LSTMoutput2.Wxfx[0, 0], LSTMoutput2.bf[1024, 1]) -LSTMoutput2.bf[1024, 1] = LearnableParameter -LSTMoutput2.Wxfx[0, 0] = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[0, 0]) -LSTMoutput2.unnamed212[0, 0] = Scale(LSTMoutput2.expsWxf[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput2.expsWxf[0, 0] = Exp(LSTMoutput2.sWxf[1, 1]) -LSTMoutput2.sWxf[1, 1] = LearnableParameter -LSTMoutput2.Wxf[1024, 256] = LearnableParameter -LSTMoutput2.expsWco[0, 0] = Exp(LSTMoutput2.sWco[1, 1]) -LSTMoutput2.sWco[1, 1] = LearnableParameter -LSTMoutput2.Wco[1024, 1] = LearnableParameter -LSTMoutput2.expsWho[0, 0] = Exp(LSTMoutput2.sWho[1, 1]) -LSTMoutput2.sWho[1, 1] = LearnableParameter -LSTMoutput2.Who[1024, 256] = LearnableParameter -LSTMoutput2.unnamed223[0, 0] = Plus(LSTMoutput2.Wxox[0, 0], LSTMoutput2.bo[1024, 1]) -LSTMoutput2.bo[1024, 1] = LearnableParameter -LSTMoutput2.Wxox[0, 0] = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[0, 0]) -LSTMoutput2.unnamed218[0, 0] = 
Scale(LSTMoutput2.expsWxo[0, 0], LSTMoutput1.output[0, 0]) -LSTMoutput1.output[0, 0] = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[0, 0]) -LSTMoutput1.unnamed175[0, 0] = Scale(LSTMoutput1.expsWmr[0, 0], LSTMoutput1.mt[0, 0]) -LSTMoutput1.mt[0, 0] = ElementTimes(LSTMoutput1.ot[0, 0], LSTMoutput1.unnamed174[0, 0]) -LSTMoutput1.unnamed174[0, 0] = Tanh(LSTMoutput1.ct[0, 0]) -LSTMoutput1.ot[0, 0] = Sigmoid(LSTMoutput1.unnamed171[0, 0]) -LSTMoutput1.unnamed171[0, 0] = Plus(LSTMoutput1.unnamed172[0, 0], LSTMoutput1.Wcoct[0, 0]) -LSTMoutput1.Wcoct[0, 0] = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[0, 0]) -LSTMoutput1.unnamed170[0, 0] = Scale(LSTMoutput1.expsWco[0, 0], LSTMoutput1.ct[0, 0]) -LSTMoutput1.ct[0, 0] = Plus(LSTMoutput1.bft[0, 0], LSTMoutput1.bit[0, 0]) -LSTMoutput1.bit[0, 0] = ElementTimes(LSTMoutput1.it[0, 0], LSTMoutput1.unnamed159[0, 0]) -LSTMoutput1.unnamed159[0, 0] = Tanh(LSTMoutput1.unnamed160[0, 0]) -LSTMoutput1.unnamed160[0, 0] = Plus(LSTMoutput1.Wxcx[0, 0], LSTMoutput1.unnamed161[0, 0]) -LSTMoutput1.unnamed161[0, 0] = Plus(LSTMoutput1.Whcdh[0, 0], LSTMoutput1.bc[1024, 1]) -LSTMoutput1.Whcdh[0, 0] = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[0, 0]) -LSTMoutput1.unnamed158[0, 0] = Scale(LSTMoutput1.expsWhc[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.it[0, 0] = Sigmoid(LSTMoutput1.unnamed154[0, 0]) -LSTMoutput1.unnamed154[0, 0] = Plus(LSTMoutput1.unnamed155[0, 0], LSTMoutput1.Wcidc[0, 0]) -LSTMoutput1.Wcidc[0, 0] = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[0, 0]) -LSTMoutput1.unnamed153[0, 0] = Scale(LSTMoutput1.expsWci[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.unnamed155[0, 0] = Plus(LSTMoutput1.unnamed156[0, 0], LSTMoutput1.Whidh[0, 0]) -LSTMoutput1.Whidh[0, 0] = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[0, 0]) -LSTMoutput1.unnamed152[0, 0] = Scale(LSTMoutput1.expsWhi[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.bft[0, 0] = ElementTimes(LSTMoutput1.ft[0, 0], LSTMoutput1.dc[1024, 
1]) -LSTMoutput1.ft[0, 0] = Sigmoid(LSTMoutput1.unnamed165[0, 0]) -LSTMoutput1.unnamed165[0, 0] = Plus(LSTMoutput1.unnamed166[0, 0], LSTMoutput1.Wcfdc[0, 0]) -LSTMoutput1.Wcfdc[0, 0] = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[0, 0]) -LSTMoutput1.unnamed164[0, 0] = Scale(LSTMoutput1.expsWcf[0, 0], LSTMoutput1.dc[1024, 1]) -LSTMoutput1.dc[1024, 1] = PastValue(LSTMoutput1.ct[0, 0]) -LSTMoutput1.unnamed166[0, 0] = Plus(LSTMoutput1.unnamed167[0, 0], LSTMoutput1.Whfdh[0, 0]) -LSTMoutput1.Whfdh[0, 0] = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[0, 0]) -LSTMoutput1.unnamed163[0, 0] = Scale(LSTMoutput1.expsWhf[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.unnamed172[0, 0] = Plus(LSTMoutput1.unnamed173[0, 0], LSTMoutput1.Whodh[0, 0]) -LSTMoutput1.Whodh[0, 0] = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[0, 0]) -LSTMoutput1.unnamed169[0, 0] = Scale(LSTMoutput1.expsWho[0, 0], LSTMoutput1.dh[256, 1]) -LSTMoutput1.dh[256, 1] = PastValue(LSTMoutput1.output[0, 0]) -LSTMoutput1.bc[1024, 1] = LearnableParameter -LSTMoutput1.expsWhc[0, 0] = Exp(LSTMoutput1.sWhc[1, 1]) -LSTMoutput1.sWhc[1, 1] = LearnableParameter -LSTMoutput1.Whc[1024, 256] = LearnableParameter -LSTMoutput1.Wxcx[0, 0] = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[0, 0]) -LSTMoutput1.unnamed157[0, 0] = Scale(LSTMoutput1.expsWxc[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxc[0, 0] = Exp(LSTMoutput1.sWxc[1, 1]) -LSTMoutput1.sWxc[1, 1] = LearnableParameter -LSTMoutput1.Wxc[1024, 33] = LearnableParameter -LSTMoutput1.expsWci[0, 0] = Exp(LSTMoutput1.sWci[1, 1]) -LSTMoutput1.sWci[1, 1] = LearnableParameter -LSTMoutput1.Wci[1024, 1] = LearnableParameter -LSTMoutput1.expsWhi[0, 0] = Exp(LSTMoutput1.sWhi[1, 1]) -LSTMoutput1.sWhi[1, 1] = LearnableParameter -LSTMoutput1.Whi[1024, 256] = LearnableParameter -LSTMoutput1.unnamed156[0, 0] = Plus(LSTMoutput1.Wxix[0, 0], LSTMoutput1.bi[1024, 1]) -LSTMoutput1.bi[1024, 1] = LearnableParameter -LSTMoutput1.Wxix[0, 0] = 
Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[0, 0]) -LSTMoutput1.unnamed151[0, 0] = Scale(LSTMoutput1.expsWxi[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxi[0, 0] = Exp(LSTMoutput1.sWxi[1, 1]) -LSTMoutput1.sWxi[1, 1] = LearnableParameter -LSTMoutput1.Wxi[1024, 33] = LearnableParameter -LSTMoutput1.expsWcf[0, 0] = Exp(LSTMoutput1.sWcf[1, 1]) -LSTMoutput1.sWcf[1, 1] = LearnableParameter -LSTMoutput1.Wcf[1024, 1] = LearnableParameter -LSTMoutput1.expsWhf[0, 0] = Exp(LSTMoutput1.sWhf[1, 1]) -LSTMoutput1.sWhf[1, 1] = LearnableParameter -LSTMoutput1.Whf[1024, 256] = LearnableParameter -LSTMoutput1.unnamed167[0, 0] = Plus(LSTMoutput1.Wxfx[0, 0], LSTMoutput1.bf[1024, 1]) -LSTMoutput1.bf[1024, 1] = LearnableParameter -LSTMoutput1.Wxfx[0, 0] = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[0, 0]) -LSTMoutput1.unnamed162[0, 0] = Scale(LSTMoutput1.expsWxf[0, 0], featNorm.xNorm[0, 0]) -LSTMoutput1.expsWxf[0, 0] = Exp(LSTMoutput1.sWxf[1, 1]) -LSTMoutput1.sWxf[1, 1] = LearnableParameter -LSTMoutput1.Wxf[1024, 33] = LearnableParameter -LSTMoutput1.expsWco[0, 0] = Exp(LSTMoutput1.sWco[1, 1]) -LSTMoutput1.sWco[1, 1] = LearnableParameter -LSTMoutput1.Wco[1024, 1] = LearnableParameter -LSTMoutput1.expsWho[0, 0] = Exp(LSTMoutput1.sWho[1, 1]) -LSTMoutput1.sWho[1, 1] = LearnableParameter -LSTMoutput1.Who[1024, 256] = LearnableParameter -LSTMoutput1.unnamed173[0, 0] = Plus(LSTMoutput1.Wxox[0, 0], LSTMoutput1.bo[1024, 1]) -LSTMoutput1.bo[1024, 1] = LearnableParameter -LSTMoutput1.Wxox[0, 0] = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[0, 0]) -LSTMoutput1.unnamed168[0, 0] = Scale(LSTMoutput1.expsWxo[0, 0], featNorm.xNorm[0, 0]) -featNorm.xNorm[0, 0] = PerDimMeanVarNormalization(feashift[0, 0], featNorm.xMean[0, 0], featNorm.xStdDev[0, 0]) -featNorm.xStdDev[0, 0] = InvStdDev(feashift[0, 0]) -featNorm.xMean[0, 0] = Mean(feashift[0, 0]) -feashift[0, 0] = RowSlice(features[363, 1]) -features[363, 1] = InputValue -LSTMoutput1.expsWxo[0, 0] = 
Exp(LSTMoutput1.sWxo[1, 1]) -LSTMoutput1.sWxo[1, 1] = LearnableParameter -LSTMoutput1.Wxo[1024, 33] = LearnableParameter -LSTMoutput1.expsWmr[0, 0] = Exp(LSTMoutput1.sWmr[1, 1]) -LSTMoutput1.sWmr[1, 1] = LearnableParameter -LSTMoutput1.Wmr[256, 1024] = LearnableParameter -LSTMoutput2.expsWxo[0, 0] = Exp(LSTMoutput2.sWxo[1, 1]) -LSTMoutput2.sWxo[1, 1] = LearnableParameter -LSTMoutput2.Wxo[1024, 256] = LearnableParameter -LSTMoutput2.expsWmr[0, 0] = Exp(LSTMoutput2.sWmr[1, 1]) -LSTMoutput2.sWmr[1, 1] = LearnableParameter -LSTMoutput2.Wmr[256, 1024] = LearnableParameter -LSTMoutput3.expsWxo[0, 0] = Exp(LSTMoutput3.sWxo[1, 1]) -LSTMoutput3.sWxo[1, 1] = LearnableParameter -LSTMoutput3.Wxo[1024, 256] = LearnableParameter -LSTMoutput3.expsWmr[0, 0] = Exp(LSTMoutput3.sWmr[1, 1]) -LSTMoutput3.sWmr[1, 1] = LearnableParameter -LSTMoutput3.Wmr[256, 1024] = LearnableParameter -expsW[0, 0] = Exp(sW[1, 1]) -sW[1, 1] = LearnableParameter -W[132, 256] = LearnableParameter -labels[132, 1] = InputValue - -Validating node cr - -Validating --> labels = InputValue -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter -Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> 
LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 1]) -Validating --> featNorm.xMean = Mean(feashift[33, 1]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1]) -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1]) -Validating --> LSTMoutput1.bf = LearnableParameter -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter 
-Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[0 {W=3452816845, H=3452816845, C=3452816845}, 0]) -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> 
LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1]) -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[0 {W=3452816845, H=3452816845, C=3452816845}, 0]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, 
C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1]) -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed159[1024, 1]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed174[1024 
{W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter -Validating --> LSTMoutput2.sWho = LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter -Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = 
LearnableParameter -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[0 {W=3452816845, H=3452816845, C=3452816845}, 0]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256 {W=3452816845, 
H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[0 {W=3452816845, H=3452816845, C=3452816845}, 0]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.it = 
Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed209[1024, 1]) -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> 
LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1]) -Validating --> 
LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[0 {W=3452816845, H=3452816845, C=3452816845}, 0]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> 
LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[0 {W=3452816845, H=3452816845, C=3452816845}, 0]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1]) -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256 {W=3452816845, 
H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed259[1024, 1]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> unnamed284 = Scale(expsW[1, 1], 
LSTMoutput3.output[256, 1]) -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1]) -Validating --> b = LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1]) -Validating --> cr = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[132, 1]) - - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 
LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 
LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node ScaledLogLikelihood - -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter -Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 1]) -Validating --> featNorm.xMean = Mean(feashift[33, 
1]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1]) -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1]) -Validating --> LSTMoutput1.bf = LearnableParameter -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter -Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], 
featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1]) -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1]) -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, 
C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], 
LSTMoutput1.unnamed161[1024, 1]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1]) -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed159[1024, 1]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter 
-Validating --> LSTMoutput2.sWho = LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter -Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = LearnableParameter -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> 
LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 
1]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1]) -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed209[1024, 1]) -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 
1]) -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1]) -Validating 
--> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> LSTMoutput3.expsWhc = 
Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1]) -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1]) -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1]) -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], 
LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed259[1024, 1]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.mt = 
ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1]) -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1]) -Validating --> b = LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1]) -Validating --> labels = InputValue -Validating --> logPrior.Prior = Mean(labels[132, 1]) -Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, 1], logPrior.LogPrior[132, 1]) - - nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh 
LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 
LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node Err - -Validating --> labels = InputValue -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = 
Exp(LSTMoutput2.sWmr[1, 1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter -Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 1]) -Validating --> featNorm.xMean = Mean(feashift[33, 1]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1]) -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1]) -Validating --> LSTMoutput1.bf = LearnableParameter 
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter -Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1]) -Validating --> 
LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1]) -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1]) -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 
{W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1]) -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed159[1024, 1]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> 
LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter -Validating --> LSTMoutput2.sWho = LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter -Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = LearnableParameter -Validating --> LSTMoutput2.expsWxi = 
Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], 
LSTMoutput2.Whfdh[1024, 1]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1]) -Validating --> LSTMoutput2.unnamed211 = 
Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed209[1024, 1]) -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> 
LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) 
-Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1]) -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, 
C=3452816845}, 1]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1]) -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1]) -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed259[1024, 1]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) 
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1]) -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1]) -Validating --> b = LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1]) -Validating --> Err = ErrorPrediction(labels[132, 1], LSTMoutputW[132, 1]) - +Node --> B = LearnableParameter +Node --> labels = InputValue +Node --> LSTMoutputW./*+*/left./***/left = LearnableParameter +Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].Wmr = LearnableParameter +Node --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp +Node --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].Wmr = LearnableParameter +Node --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].Wmr = LearnableParameter +Node --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> features = InputValue +Node --> feashift = RowSlice +Node --> featNorm.meanVector = Mean +Node --> featNorm.invStdDevVector = InvStdDev +Node --> featNorm = PerDimMeanVarNormalization +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> 
LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].dh = PastValue +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].ot.z./*+*/left = Plus +Node --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].ft.z./*+*/left = Plus +Node --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].dc = PastValue +Node --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].ft.z = 
Plus +Node --> LSTMoutput[1].ft = Sigmoid +Node --> LSTMoutput[1].bft = ElementTimes +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[1].it.z./*+*/left = Plus +Node --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[1].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].it.z = Plus +Node --> LSTMoutput[1].it = Sigmoid +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times +Node --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[1].bit./*.**/right.z = Plus +Node --> LSTMoutput[1].bit./*.**/right = Tanh +Node --> LSTMoutput[1].bit = ElementTimes +Node --> LSTMoutput[1].ct = Plus +Node --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[1].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[1].ot.z = Plus +Node --> LSTMoutput[1].ot = Sigmoid +Node --> LSTMoutput[1].mt./*.**/right = Tanh +Node --> LSTMoutput[1].mt = ElementTimes +Node --> LSTMoutput[1].output./***/right = Scale +Node --> LSTMoutput[1].output = Times +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].dh = PastValue +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].ot.z./*+*/left = Plus +Node --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> 
LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].ft.z./*+*/left = Plus +Node --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].dc = PastValue +Node --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].ft.z = Plus +Node --> LSTMoutput[2].ft = Sigmoid +Node --> LSTMoutput[2].bft = ElementTimes +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[2].it.z./*+*/left = Plus +Node --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[2].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].it.z = Plus +Node --> LSTMoutput[2].it = Sigmoid +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> 
LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[2].bit./*.**/right.z = Plus +Node --> LSTMoutput[2].bit./*.**/right = Tanh +Node --> LSTMoutput[2].bit = ElementTimes +Node --> LSTMoutput[2].ct = Plus +Node --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[2].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[2].ot.z = Plus +Node --> LSTMoutput[2].ot = Sigmoid +Node --> LSTMoutput[2].mt./*.**/right = Tanh +Node --> LSTMoutput[2].mt = ElementTimes +Node --> LSTMoutput[2].output./***/right = Scale +Node --> LSTMoutput[2].output = Times +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].dh = PastValue +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[3].ot.z./*+*/left = Plus +Node --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> 
LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[3].ft.z./*+*/left = Plus +Node --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].dc = PastValue +Node --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].ft.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].ft.z = Plus +Node --> LSTMoutput[3].ft = Sigmoid +Node --> LSTMoutput[3].bft = ElementTimes +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter +Node --> 
LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale +Node --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times +Node --> LSTMoutput[3].it.z./*+*/left = Plus +Node --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp +Node --> LSTMoutput[3].it.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].it.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].it.z = Plus +Node --> LSTMoutput[3].it = Sigmoid +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter +Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus +Node --> LSTMoutput[3].bit./*.**/right.z = Plus +Node --> LSTMoutput[3].bit./*.**/right = Tanh +Node --> LSTMoutput[3].bit = ElementTimes +Node --> LSTMoutput[3].ct = Plus +Node --> 
LSTMoutput[3].ot.z./*+*/right.matrix = Scale +Node --> LSTMoutput[3].ot.z./*+*/right = DiagTimes +Node --> LSTMoutput[3].ot.z = Plus +Node --> LSTMoutput[3].ot = Sigmoid +Node --> LSTMoutput[3].mt./*.**/right = Tanh +Node --> LSTMoutput[3].mt = ElementTimes +Node --> LSTMoutput[3].output./***/right = Scale +Node --> LSTMoutput[3].output = Times +Node --> LSTMoutputW./*+*/left./***/right = Scale +Node --> LSTMoutputW./*+*/left = Times +Node --> LSTMoutputW = Plus +Node --> Err = ErrorPrediction +Node --> logPrior.x = Mean +Node --> logPrior = Log +Node --> ScaledLogLikelihood = Minus +Node --> cr = CrossEntropyWithSoftmax +class Microsoft::MSR::CNTK::ComputationNetwork [ + B : LearnableParameter 132 x 1 () + cr : CrossEntropyWithSoftmax 0 x 0 ( + labels + LSTMoutputW + ) + Err : ErrorPrediction 0 x 0 ( + labels + LSTMoutputW + ) + feashift : RowSlice 0 x 0 ( + features + ) + featNorm : PerDimMeanVarNormalization 0 x 0 ( + feashift + featNorm.meanVector + featNorm.invStdDevVector + ) + featNorm.invStdDevVector : InvStdDev 0 x 0 ( + feashift + ) + featNorm.meanVector : Mean 0 x 0 ( + feashift + ) + features : InputValue 363 x 1 () + labels : InputValue 132 x 1 () + logPrior : Log 0 x 0 ( + logPrior.x + ) + logPrior.x : Mean 0 x 0 ( + labels + ) + LSTMoutput[1].bft : ElementTimes 0 x 0 ( + LSTMoutput[1].ft + LSTMoutput[1].dc + ) + LSTMoutput[1].bit : ElementTimes 0 x 0 ( + LSTMoutput[1].it + LSTMoutput[1].bit./*.**/right + ) + LSTMoutput[1].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z + ) + LSTMoutput[1].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left + LSTMoutput[1].bit./*.**/right.z./*+*/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 
0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ct : Plus 0 x 0 ( + LSTMoutput[1].bft + LSTMoutput[1].bit + ) + LSTMoutput[1].dc : PastValue 1024 x 1 ( + LSTMoutput[1].ct + ) + LSTMoutput[1].dh : PastValue 256 x 1 ( + LSTMoutput[1].output + ) + LSTMoutput[1].ft : Sigmoid 0 x 0 ( + LSTMoutput[1].ft.z + ) + LSTMoutput[1].ft.z : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left + LSTMoutput[1].ft.z./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left + LSTMoutput[1].ft.z./*+*/left./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left : Plus 
0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].ft.z./*+*/right.matrix + ) + LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].ft.z./*+*/right.matrix : Scale 0 x 0 ( + 
LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].dc + ) + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it : Sigmoid 0 x 0 ( + LSTMoutput[1].it.z + ) + LSTMoutput[1].it.z : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left + LSTMoutput[1].it.z./*+*/right + ) + LSTMoutput[1].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left + LSTMoutput[1].it.z./*+*/left./*+*/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/left + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + 
LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].it.z./*+*/right.matrix + ) + LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].dc + ) + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].mt : ElementTimes 0 x 0 ( + LSTMoutput[1].ot + LSTMoutput[1].mt./*.**/right + ) + LSTMoutput[1].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[1].ct + ) + LSTMoutput[1].ot : Sigmoid 0 x 0 ( + LSTMoutput[1].ot.z + ) + LSTMoutput[1].ot.z : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left + LSTMoutput[1].ot.z./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left + LSTMoutput[1].ot.z./*+*/left./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 () + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + 
LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + featNorm + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[1].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[1].dh + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[1].ot.z./*+*/right.matrix + ) + LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[1].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[1].ct + ) + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].output : Times 0 x 0 ( + LSTMoutput[1].Wmr + LSTMoutput[1].output./***/right + ) + LSTMoutput[1].output./***/right : Scale 0 x 0 ( + LSTMoutput[1].output./***/right.scalarScalingFactor + 
LSTMoutput[1].mt + ) + LSTMoutput[1].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[1].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[1].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[1].Wmr : LearnableParameter 256 x 1024 () + LSTMoutput[2].bft : ElementTimes 0 x 0 ( + LSTMoutput[2].ft + LSTMoutput[2].dc + ) + LSTMoutput[2].bit : ElementTimes 0 x 0 ( + LSTMoutput[2].it + LSTMoutput[2].bit./*.**/right + ) + LSTMoutput[2].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z + ) + LSTMoutput[2].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left + LSTMoutput[2].bit./*.**/right.z./*+*/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + 
LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ct : Plus 0 x 0 ( + LSTMoutput[2].bft + LSTMoutput[2].bit + ) + LSTMoutput[2].dc : PastValue 1024 x 1 ( + LSTMoutput[2].ct + ) + LSTMoutput[2].dh : PastValue 256 x 1 ( + LSTMoutput[2].output + ) + LSTMoutput[2].ft : Sigmoid 0 x 0 ( + LSTMoutput[2].ft.z + ) + LSTMoutput[2].ft.z : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left + LSTMoutput[2].ft.z./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left + LSTMoutput[2].ft.z./*+*/left./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].ft.z./*+*/right.matrix + ) + LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].dc + ) + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it : Sigmoid 0 x 0 ( + LSTMoutput[2].it.z + ) + LSTMoutput[2].it.z : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left + LSTMoutput[2].it.z./*+*/right + ) + LSTMoutput[2].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left + LSTMoutput[2].it.z./*+*/left./*+*/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left + 
LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/left + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].it.z./*+*/right.matrix + ) + LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].dc + ) + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + 
LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].mt : ElementTimes 0 x 0 ( + LSTMoutput[2].ot + LSTMoutput[2].mt./*.**/right + ) + LSTMoutput[2].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[2].ct + ) + LSTMoutput[2].ot : Sigmoid 0 x 0 ( + LSTMoutput[2].ot.z + ) + LSTMoutput[2].ot.z : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left + LSTMoutput[2].ot.z./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left + LSTMoutput[2].ot.z./*+*/left./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[1].output + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[2].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[2].dh + ) + 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[2].ot.z./*+*/right.matrix + ) + LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[2].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[2].ct + ) + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].output : Times 0 x 0 ( + LSTMoutput[2].Wmr + LSTMoutput[2].output./***/right + ) + LSTMoutput[2].output./***/right : Scale 0 x 0 ( + LSTMoutput[2].output./***/right.scalarScalingFactor + LSTMoutput[2].mt + ) + LSTMoutput[2].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[2].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[2].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[2].Wmr : LearnableParameter 256 x 1024 () + LSTMoutput[3].bft : ElementTimes 0 x 0 ( + LSTMoutput[3].ft + LSTMoutput[3].dc + ) + LSTMoutput[3].bit : ElementTimes 0 x 0 ( + LSTMoutput[3].it + LSTMoutput[3].bit./*.**/right + ) + LSTMoutput[3].bit./*.**/right : Tanh 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z + ) + LSTMoutput[3].bit./*.**/right.z : Plus 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left + LSTMoutput[3].bit./*.**/right.z./*+*/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left : Times 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right + ) + 
LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].bit./*.**/right.z./*+*/right : Plus 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ct : Plus 0 x 0 ( + LSTMoutput[3].bft + LSTMoutput[3].bit + ) + LSTMoutput[3].dc : PastValue 1024 x 1 ( + LSTMoutput[3].ct + ) + LSTMoutput[3].dh : PastValue 256 x 1 ( + LSTMoutput[3].output + ) + LSTMoutput[3].ft : Sigmoid 0 x 0 ( + LSTMoutput[3].ft.z + ) + LSTMoutput[3].ft.z : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left + LSTMoutput[3].ft.z./*+*/right + ) + 
LSTMoutput[3].ft.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left + LSTMoutput[3].ft.z./*+*/left./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ft.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ft.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].ft.z./*+*/right.matrix + ) 
+ LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].ft.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].dc + ) + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it : Sigmoid 0 x 0 ( + LSTMoutput[3].it.z + ) + LSTMoutput[3].it.z : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left + LSTMoutput[3].it.z./*+*/right + ) + LSTMoutput[3].it.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left + LSTMoutput[3].it.z./*+*/left./*+*/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].it.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/left + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/left 
: LearnableParameter 1024 x 256 () + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].it.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].it.z./*+*/right.matrix + ) + LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].it.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].dc + ) + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].mt : ElementTimes 0 x 0 ( + LSTMoutput[3].ot + LSTMoutput[3].mt./*.**/right + ) + LSTMoutput[3].mt./*.**/right : Tanh 0 x 0 ( + LSTMoutput[3].ct + ) + LSTMoutput[3].ot : Sigmoid 0 x 0 ( + LSTMoutput[3].ot.z + ) + LSTMoutput[3].ot.z : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left + LSTMoutput[3].ot.z./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left + LSTMoutput[3].ot.z./*+*/left./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left : Plus 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 
() + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor + LSTMoutput[2].output + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 () + LSTMoutput[3].ot.z./*+*/left./*+*/right : Times 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 () + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor + LSTMoutput[3].dh + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].ot.z./*+*/right : DiagTimes 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector + LSTMoutput[3].ot.z./*+*/right.matrix + ) + LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 () + LSTMoutput[3].ot.z./*+*/right.matrix : Scale 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor + LSTMoutput[3].ct + ) + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x + ) + LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].output : Times 0 x 0 ( + LSTMoutput[3].Wmr + LSTMoutput[3].output./***/right + ) + 
LSTMoutput[3].output./***/right : Scale 0 x 0 ( + LSTMoutput[3].output./***/right.scalarScalingFactor + LSTMoutput[3].mt + ) + LSTMoutput[3].output./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutput[3].output./***/right.scalarScalingFactor.x + ) + LSTMoutput[3].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + LSTMoutput[3].Wmr : LearnableParameter 256 x 1024 () + LSTMoutputW : Plus 0 x 0 ( + LSTMoutputW./*+*/left + B + ) + LSTMoutputW./*+*/left : Times 0 x 0 ( + LSTMoutputW./*+*/left./***/left + LSTMoutputW./*+*/left./***/right + ) + LSTMoutputW./*+*/left./***/left : LearnableParameter 132 x 256 () + LSTMoutputW./*+*/left./***/right : Scale 0 x 0 ( + LSTMoutputW./*+*/left./***/right.scalarScalingFactor + LSTMoutput[3].output + ) + LSTMoutputW./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 ( + LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x + ) + LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 () + ScaledLogLikelihood : Minus 0 x 0 ( + LSTMoutputW + logPrior + ) +] GetTrainCriterionNodes ... GetEvalCriterionNodes ... 
nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 
LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 
LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].mt./*.**/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].mt./*.**/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit LSTMoutput[2].dc 
LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].mt./*.**/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].mt./*.**/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].it.z./*+*/right.matrix 
LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].mt./*.**/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].mt./*.**/right 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node cr +Validating for node cr. 272 nodes to process in pass 1. 
-Validating --> labels = InputValue -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter -Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 1]) -Validating --> featNorm.xMean = Mean(feashift[33, 1]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1]) -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> 
LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1]) -Validating --> LSTMoutput1.bf = LearnableParameter -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter -Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = 
LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1]) -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1]) -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, 
H=3452816845, C=3452816845}, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1]) -Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed159[1024, 1]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], 
LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter -Validating --> LSTMoutput2.sWho = LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], 
LSTMoutput2.unnamed212[256, 1]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter -Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = LearnableParameter -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1]) -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = 
LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 
1]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1]) -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed209[1024, 1]) -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, 
C=3452816845}, 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = 
LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], 
LSTMoutput3.unnamed263[256, 1]) -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1]) -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1]) -Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1]) -Validating --> LSTMoutput3.Whcdh = 
Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed259[1024, 1]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1]) -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1]) -Validating --> b = 
LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1]) -Validating --> cr = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[132, 1]) +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], 
LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], 
LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], 
LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = 
Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = 
Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = 
Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = 
Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = 
Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], 
LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 
1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = 
DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] -Found 6 PreCompute nodes - NodeName: featNorm.xMean - NodeName: featNorm.xStdDev - NodeName: logPrior.Prior - NodeName: featNorm.xMean - NodeName: featNorm.xStdDev - NodeName: logPrior.Prior +Validating for node cr. 183 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 
MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating 
--> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, 
MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 
1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = 
Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = 
DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = 
PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 
1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 
1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr. 60 nodes to process in pass 3. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = 
Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = 
Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = 
Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], 
LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = 
PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> 
[1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> 
LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = 
Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 
1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> 
[1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> 
LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = 
Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +Validating for node cr, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 
MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating 
--> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, 
MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 
1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = 
Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1] 
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z./*+*/right = 
DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = 
PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, 
MBSize 1] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 
1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 
1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1] +Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1] + +127 out of 272 nodes do not share the minibatch layout with the input data. + + +Precomputing --> 3 PreCompute nodes found. + + NodeName: featNorm.invStdDevVector + NodeName: featNorm.meanVector + NodeName: logPrior.x minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 
LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 
LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right 
LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node featNorm.xMean 
+Validating for node featNorm.invStdDevVector. 3 nodes to process in pass 1. -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 640]) -Validating --> featNorm.xMean = Mean(feashift[33, 640]) +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] + +Validating for node featNorm.invStdDevVector, final verification. + +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1] + +1 out of 3 nodes do not share the minibatch layout with the input data. nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it 
LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc 
LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left 
LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left 
LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc 
LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node featNorm.xStdDev +Validating for node featNorm.meanVector. 
3 nodes to process in pass 1. -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 640]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 640]) +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] + +Validating for node featNorm.meanVector, final verification. + +Validating --> features = InputValue -> [363, MBSize 1] +Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1] + +1 out of 3 nodes do not share the minibatch layout with the input data. nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 
LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh 
LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit 
LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft 
LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output -Validating node logPrior.Prior +Validating for node logPrior.x. 2 nodes to process in pass 1. 
-Validating --> labels = InputValue -Validating --> logPrior.Prior = Mean(labels[132, 640]) +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] + +Validating for node logPrior.x. 1 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] + +Validating for node logPrior.x, final verification. + +Validating --> labels = InputValue -> [132, MBSize 1] +Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1] + +1 out of 2 nodes do not share the minibatch layout with the input data. + + +Precomputing --> Completed. Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.000781 momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.000781 effective momentum = 0.000000 minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses + nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix 
LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output nodes in the recurrent loops : +LSTMoutput[1].dh LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right LSTMoutput[1].ot.z./*+*/left./*+*/right LSTMoutput[1].ot.z./*+*/left LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right LSTMoutput[1].ft.z./*+*/left./*+*/right LSTMoutput[1].ft.z./*+*/left LSTMoutput[1].dc LSTMoutput[1].ft.z./*+*/right.matrix LSTMoutput[1].ft.z./*+*/right LSTMoutput[1].ft.z LSTMoutput[1].ft LSTMoutput[1].bft LSTMoutput[1].it.z./*+*/left./*+*/right./***/right LSTMoutput[1].it.z./*+*/left./*+*/right LSTMoutput[1].it.z./*+*/left LSTMoutput[1].it.z./*+*/right.matrix LSTMoutput[1].it.z./*+*/right LSTMoutput[1].it.z LSTMoutput[1].it LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[1].bit./*.**/right.z./*+*/right LSTMoutput[1].bit./*.**/right.z LSTMoutput[1].bit./*.**/right LSTMoutput[1].bit LSTMoutput[1].ct LSTMoutput[1].ot.z./*+*/right.matrix LSTMoutput[1].ot.z./*+*/right LSTMoutput[1].ot.z LSTMoutput[1].ot LSTMoutput[1].mt./*.**/right LSTMoutput[1].mt LSTMoutput[1].output./***/right LSTMoutput[1].output nodes in the recurrent loops : +LSTMoutput[2].dh LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right LSTMoutput[2].ot.z./*+*/left./*+*/right LSTMoutput[2].ot.z./*+*/left LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right LSTMoutput[2].ft.z./*+*/left./*+*/right LSTMoutput[2].ft.z./*+*/left LSTMoutput[2].dc LSTMoutput[2].ft.z./*+*/right.matrix LSTMoutput[2].ft.z./*+*/right LSTMoutput[2].ft.z LSTMoutput[2].ft LSTMoutput[2].bft LSTMoutput[2].it.z./*+*/left./*+*/right./***/right 
LSTMoutput[2].it.z./*+*/left./*+*/right LSTMoutput[2].it.z./*+*/left LSTMoutput[2].it.z./*+*/right.matrix LSTMoutput[2].it.z./*+*/right LSTMoutput[2].it.z LSTMoutput[2].it LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[2].bit./*.**/right.z./*+*/right LSTMoutput[2].bit./*.**/right.z LSTMoutput[2].bit./*.**/right LSTMoutput[2].bit LSTMoutput[2].ct LSTMoutput[2].ot.z./*+*/right.matrix LSTMoutput[2].ot.z./*+*/right LSTMoutput[2].ot.z LSTMoutput[2].ot LSTMoutput[2].mt./*.**/right LSTMoutput[2].mt LSTMoutput[2].output./***/right LSTMoutput[2].output nodes in the recurrent loops : +LSTMoutput[3].dh LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right LSTMoutput[3].ot.z./*+*/left./*+*/right LSTMoutput[3].ot.z./*+*/left LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right LSTMoutput[3].ft.z./*+*/left./*+*/right LSTMoutput[3].ft.z./*+*/left LSTMoutput[3].dc LSTMoutput[3].ft.z./*+*/right.matrix LSTMoutput[3].ft.z./*+*/right LSTMoutput[3].ft.z LSTMoutput[3].ft LSTMoutput[3].bft LSTMoutput[3].it.z./*+*/left./*+*/right./***/right LSTMoutput[3].it.z./*+*/left./*+*/right LSTMoutput[3].it.z./*+*/left LSTMoutput[3].it.z./*+*/right.matrix LSTMoutput[3].it.z./*+*/right LSTMoutput[3].it.z LSTMoutput[3].it LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left LSTMoutput[3].bit./*.**/right.z./*+*/right LSTMoutput[3].bit./*.**/right.z LSTMoutput[3].bit./*.**/right LSTMoutput[3].bit LSTMoutput[3].ct LSTMoutput[3].ot.z./*+*/right.matrix LSTMoutput[3].ot.z./*+*/right LSTMoutput[3].ot.z LSTMoutput[3].ot LSTMoutput[3].mt./*.**/right LSTMoutput[3].mt LSTMoutput[3].output./***/right LSTMoutput[3].output + +Validating for node Err. 272 nodes to process in pass 1. 
+ +Validating --> labels = InputValue -> [132, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 640] +Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] 
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = 
Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> 
[256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] 
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 
640] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] 
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = 
Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> 
[256, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640] +Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1] + +Validating for node Err. 180 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 640] +Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, 
MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) 
-> [33, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, 
MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> 
LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left = 
Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], 
LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], 
LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating 
--> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] 
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh 
= PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft = 
Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], 
LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = 
Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = 
Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], 
LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, 
MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640] +Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1] + +Validating for node Err. 6 nodes to process in pass 3. 
+ +Validating --> labels = InputValue -> [132, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 640] +Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z = 
Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = 
Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output./***/right = 
Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> 
[1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 
640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640] +Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1] + +Validating for node Err, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024] +Validating --> 
LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> features = InputValue -> [363, MBSize 640] +Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640] +Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1] +Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = 
LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] 
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft.z = 
Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = 
Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output./***/right = 
Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating 
--> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> 
[1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 
640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating --> 
LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] +Validating 
--> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1] 
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1] +Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], 
LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> 
LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640] +Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640] +Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640] +Validating --> B = LearnableParameter -> [132, 1] +Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640] +Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1] + +127 out of 272 nodes do not share the minibatch layout with the input data. + Starting minibatch loop. 
- nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 
LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output nodes in the recurrent loops : -LSTMoutput1.dh LSTMoutput1.unnamed169 LSTMoutput1.Whodh LSTMoutput1.unnamed172 LSTMoutput1.unnamed163 LSTMoutput1.Whfdh LSTMoutput1.unnamed166 LSTMoutput1.dc LSTMoutput1.unnamed164 LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft LSTMoutput1.bft LSTMoutput1.unnamed152 LSTMoutput1.Whidh LSTMoutput1.unnamed155 LSTMoutput1.unnamed153 LSTMoutput1.Wcidc LSTMoutput1.unnamed154 LSTMoutput1.it LSTMoutput1.unnamed158 LSTMoutput1.Whcdh LSTMoutput1.unnamed161 LSTMoutput1.unnamed160 LSTMoutput1.unnamed159 LSTMoutput1.bit LSTMoutput1.ct LSTMoutput1.unnamed170 LSTMoutput1.Wcoct LSTMoutput1.unnamed171 LSTMoutput1.ot LSTMoutput1.unnamed174 LSTMoutput1.mt LSTMoutput1.unnamed175 LSTMoutput1.output nodes in the recurrent loops : -LSTMoutput2.dh LSTMoutput2.unnamed219 LSTMoutput2.Whodh LSTMoutput2.unnamed222 LSTMoutput2.unnamed213 LSTMoutput2.Whfdh LSTMoutput2.unnamed216 LSTMoutput2.dc LSTMoutput2.unnamed214 LSTMoutput2.Wcfdc LSTMoutput2.unnamed215 LSTMoutput2.ft LSTMoutput2.bft LSTMoutput2.unnamed202 LSTMoutput2.Whidh LSTMoutput2.unnamed205 LSTMoutput2.unnamed203 LSTMoutput2.Wcidc LSTMoutput2.unnamed204 LSTMoutput2.it LSTMoutput2.unnamed208 LSTMoutput2.Whcdh LSTMoutput2.unnamed211 LSTMoutput2.unnamed210 LSTMoutput2.unnamed209 LSTMoutput2.bit LSTMoutput2.ct LSTMoutput2.unnamed220 LSTMoutput2.Wcoct LSTMoutput2.unnamed221 LSTMoutput2.ot LSTMoutput2.unnamed224 LSTMoutput2.mt LSTMoutput2.unnamed225 LSTMoutput2.output nodes in the recurrent loops : -LSTMoutput3.dh LSTMoutput3.unnamed269 LSTMoutput3.Whodh LSTMoutput3.unnamed272 LSTMoutput3.unnamed263 LSTMoutput3.Whfdh LSTMoutput3.unnamed266 LSTMoutput3.dc LSTMoutput3.unnamed264 LSTMoutput3.Wcfdc LSTMoutput3.unnamed265 LSTMoutput3.ft LSTMoutput3.bft LSTMoutput3.unnamed252 LSTMoutput3.Whidh LSTMoutput3.unnamed255 
LSTMoutput3.unnamed253 LSTMoutput3.Wcidc LSTMoutput3.unnamed254 LSTMoutput3.it LSTMoutput3.unnamed258 LSTMoutput3.Whcdh LSTMoutput3.unnamed261 LSTMoutput3.unnamed260 LSTMoutput3.unnamed259 LSTMoutput3.bit LSTMoutput3.ct LSTMoutput3.unnamed270 LSTMoutput3.Wcoct LSTMoutput3.unnamed271 LSTMoutput3.ot LSTMoutput3.unnamed274 LSTMoutput3.mt LSTMoutput3.unnamed275 LSTMoutput3.output - -Validating node Err - -Validating --> labels = InputValue -Validating --> W = LearnableParameter -Validating --> sW = LearnableParameter -Validating --> expsW = Exp(sW[1, 1]) -Validating --> LSTMoutput3.Wmr = LearnableParameter -Validating --> LSTMoutput3.sWmr = LearnableParameter -Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -Validating --> LSTMoutput3.Wxo = LearnableParameter -Validating --> LSTMoutput3.sWxo = LearnableParameter -Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -Validating --> LSTMoutput2.Wmr = LearnableParameter -Validating --> LSTMoutput2.sWmr = LearnableParameter -Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -Validating --> LSTMoutput2.Wxo = LearnableParameter -Validating --> LSTMoutput2.sWxo = LearnableParameter -Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -Validating --> LSTMoutput1.Wmr = LearnableParameter -Validating --> LSTMoutput1.sWmr = LearnableParameter -Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -Validating --> LSTMoutput1.Wxo = LearnableParameter -Validating --> LSTMoutput1.sWxo = LearnableParameter -Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -Validating --> features = InputValue -Validating --> feashift = RowSlice(features[363, 640]) -Validating --> featNorm.xMean = Mean(feashift[33, 640]) -Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 640]) -Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 640], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -Validating --> LSTMoutput1.unnamed168 = 
Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 640]) -Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 640]) -Validating --> LSTMoutput1.bo = LearnableParameter -Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 640], LSTMoutput1.bo[1024, 1]) -Validating --> LSTMoutput1.Who = LearnableParameter -Validating --> LSTMoutput1.sWho = LearnableParameter -Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -Validating --> LSTMoutput1.Wco = LearnableParameter -Validating --> LSTMoutput1.sWco = LearnableParameter -Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -Validating --> LSTMoutput1.Wxf = LearnableParameter -Validating --> LSTMoutput1.sWxf = LearnableParameter -Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 640]) -Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 640]) -Validating --> LSTMoutput1.bf = LearnableParameter -Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 640], LSTMoutput1.bf[1024, 1]) -Validating --> LSTMoutput1.Whf = LearnableParameter -Validating --> LSTMoutput1.sWhf = LearnableParameter -Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -Validating --> LSTMoutput1.Wcf = LearnableParameter -Validating --> LSTMoutput1.sWcf = LearnableParameter -Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -Validating --> LSTMoutput1.Wxi = LearnableParameter -Validating --> LSTMoutput1.sWxi = LearnableParameter -Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 640]) -Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 640]) -Validating --> LSTMoutput1.bi = LearnableParameter -Validating --> LSTMoutput1.unnamed156 = 
Plus(LSTMoutput1.Wxix[1024, 640], LSTMoutput1.bi[1024, 1]) -Validating --> LSTMoutput1.Whi = LearnableParameter -Validating --> LSTMoutput1.sWhi = LearnableParameter -Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -Validating --> LSTMoutput1.Wci = LearnableParameter -Validating --> LSTMoutput1.sWci = LearnableParameter -Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -Validating --> LSTMoutput1.Wxc = LearnableParameter -Validating --> LSTMoutput1.sWxc = LearnableParameter -Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 640]) -Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 640]) -Validating --> LSTMoutput1.Whc = LearnableParameter -Validating --> LSTMoutput1.sWhc = LearnableParameter -Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -Validating --> LSTMoutput1.bc = LearnableParameter -Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 640]) -Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 640]) -Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 640]) -Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 640], LSTMoutput1.Whodh[1024, 640]) -Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 640]) -Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 640]) -Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 640], LSTMoutput1.Whfdh[1024, 640]) -Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.Wcfdc = 
DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 640], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 640]) -Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 640]) -Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 640], LSTMoutput1.Whidh[1024, 640]) -Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 640], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 640]) -Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 640]) -Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 640], LSTMoutput1.bc[1024, 1]) -Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 640], LSTMoutput1.unnamed161[1024, 640]) -Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 640]) -Validating --> LSTMoutput1.bit = 
ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput1.unnamed159[1024, 640]) -Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 640], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 640]) -Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 640]) -Validating --> LSTMoutput2.bo = LearnableParameter -Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 640], LSTMoutput2.bo[1024, 1]) -Validating --> LSTMoutput2.Who = LearnableParameter -Validating --> LSTMoutput2.sWho = LearnableParameter -Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) 
-Validating --> LSTMoutput2.Wco = LearnableParameter -Validating --> LSTMoutput2.sWco = LearnableParameter -Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -Validating --> LSTMoutput2.Wxf = LearnableParameter -Validating --> LSTMoutput2.sWxf = LearnableParameter -Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 640]) -Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 640]) -Validating --> LSTMoutput2.bf = LearnableParameter -Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 640], LSTMoutput2.bf[1024, 1]) -Validating --> LSTMoutput2.Whf = LearnableParameter -Validating --> LSTMoutput2.sWhf = LearnableParameter -Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -Validating --> LSTMoutput2.Wcf = LearnableParameter -Validating --> LSTMoutput2.sWcf = LearnableParameter -Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -Validating --> LSTMoutput2.Wxi = LearnableParameter -Validating --> LSTMoutput2.sWxi = LearnableParameter -Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 640]) -Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 640]) -Validating --> LSTMoutput2.bi = LearnableParameter -Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 640], LSTMoutput2.bi[1024, 1]) -Validating --> LSTMoutput2.Whi = LearnableParameter -Validating --> LSTMoutput2.sWhi = LearnableParameter -Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -Validating --> LSTMoutput2.Wci = LearnableParameter -Validating --> LSTMoutput2.sWci = LearnableParameter -Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -Validating --> LSTMoutput2.Wxc = LearnableParameter -Validating --> 
LSTMoutput2.sWxc = LearnableParameter -Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 640]) -Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 640]) -Validating --> LSTMoutput2.Whc = LearnableParameter -Validating --> LSTMoutput2.sWhc = LearnableParameter -Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -Validating --> LSTMoutput2.bc = LearnableParameter -Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 640]) -Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 640]) -Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 640]) -Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 640], LSTMoutput2.Whodh[1024, 640]) -Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 640]) -Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 640]) -Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 640], LSTMoutput2.Whfdh[1024, 640]) -Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 640], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 
{W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 640]) -Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 640]) -Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 640], LSTMoutput2.Whidh[1024, 640]) -Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 640], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 640]) -Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 640]) -Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 640], LSTMoutput2.bc[1024, 1]) -Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 640], LSTMoutput2.unnamed211[1024, 640]) -Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 640]) -Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput2.unnamed209[1024, 640]) -Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> 
LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 640], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 640]) -Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 640]) -Validating --> LSTMoutput3.bo = LearnableParameter -Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 640], LSTMoutput3.bo[1024, 1]) -Validating --> LSTMoutput3.Who = LearnableParameter -Validating --> LSTMoutput3.sWho = LearnableParameter -Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -Validating --> LSTMoutput3.Wco = LearnableParameter -Validating --> LSTMoutput3.sWco = LearnableParameter -Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -Validating --> LSTMoutput3.Wxf = LearnableParameter -Validating --> LSTMoutput3.sWxf = LearnableParameter -Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 640]) 
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 640]) -Validating --> LSTMoutput3.bf = LearnableParameter -Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 640], LSTMoutput3.bf[1024, 1]) -Validating --> LSTMoutput3.Whf = LearnableParameter -Validating --> LSTMoutput3.sWhf = LearnableParameter -Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -Validating --> LSTMoutput3.Wcf = LearnableParameter -Validating --> LSTMoutput3.sWcf = LearnableParameter -Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -Validating --> LSTMoutput3.Wxi = LearnableParameter -Validating --> LSTMoutput3.sWxi = LearnableParameter -Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 640]) -Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 640]) -Validating --> LSTMoutput3.bi = LearnableParameter -Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 640], LSTMoutput3.bi[1024, 1]) -Validating --> LSTMoutput3.Whi = LearnableParameter -Validating --> LSTMoutput3.sWhi = LearnableParameter -Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -Validating --> LSTMoutput3.Wci = LearnableParameter -Validating --> LSTMoutput3.sWci = LearnableParameter -Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -Validating --> LSTMoutput3.Wxc = LearnableParameter -Validating --> LSTMoutput3.sWxc = LearnableParameter -Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 640]) -Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 640]) -Validating --> LSTMoutput3.Whc = LearnableParameter -Validating --> LSTMoutput3.sWhc = LearnableParameter -Validating --> 
LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -Validating --> LSTMoutput3.bc = LearnableParameter -Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 640]) -Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 640]) -Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 640]) -Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 640], LSTMoutput3.Whodh[1024, 640]) -Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 640]) -Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 640]) -Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 640], LSTMoutput3.Whfdh[1024, 640]) -Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 640], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 640]) -Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 640]) -Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 640], LSTMoutput3.Whidh[1024, 640]) -Validating 
--> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 640], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 640]) -Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 640]) -Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 640], LSTMoutput3.bc[1024, 1]) -Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 640], LSTMoutput3.unnamed261[1024, 640]) -Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 640]) -Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput3.unnamed259[1024, 640]) -Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 640], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed274 = 
Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 640]) -Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 640]) -Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 640]) -Validating --> b = LearnableParameter -Validating --> LSTMoutputW = Plus(unnamed283[132, 640], b[132, 1]) -Validating --> Err = ErrorPrediction(labels[132, 640], LSTMoutputW[132, 640]) - - Epoch[ 1 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.78817415; EvalErr[0]PerSample = 0.89125001; TotalTime = 17.48173s; TotalTimePerSample = 2.73152ms; SamplesPerSecond = 366 - Epoch[ 1 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.59419441; EvalErr[0]PerSample = 0.86328125; TotalTime = 18.07901s; TotalTimePerSample = 2.82485ms; SamplesPerSecond = 354 - Epoch[ 1 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.52217722; EvalErr[0]PerSample = 0.81859374; TotalTime = 15.52239s; TotalTimePerSample = 2.42537ms; SamplesPerSecond = 412 -Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 4.5853896; EvalErrPerSample = 0.84082031; Ave LearnRatePerSample = 0.0007812500116; EpochTime=54.814574 -Starting Epoch 2: learning rate per sample = 0.000781 momentum = 0.899991 + Epoch[ 1 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.78817383; EvalErr[0]PerSample = 0.89125000; TotalTime = 20.56791s; TotalTimePerSample = 3.21374ms; SamplesPerSecond = 311 + Epoch[ 1 of 
4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.59419434; EvalErr[0]PerSample = 0.86328125; TotalTime = 20.21589s; TotalTimePerSample = 3.15873ms; SamplesPerSecond = 316 + Epoch[ 1 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.52217773; EvalErr[0]PerSample = 0.81859375; TotalTime = 22.25449s; TotalTimePerSample = 3.47726ms; SamplesPerSecond = 287 +Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 4.5853896; EvalErrPerSample = 0.84082031; Ave LearnRatePerSample = 0.0007812500116; EpochTime=68.996574 +Starting Epoch 2: learning rate per sample = 0.000781 effective momentum = 0.900000 minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20632), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 2 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.29597616; EvalErr[0]PerSample = 0.82859373; TotalTime = 16.34016s; TotalTimePerSample = 2.55315ms; SamplesPerSecond = 391 - Epoch[ 2 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.27295351; EvalErr[0]PerSample = 0.87312502; TotalTime = 17.48450s; TotalTimePerSample = 2.73195ms; SamplesPerSecond = 366 - Epoch[ 2 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.95423460; EvalErr[0]PerSample = 0.82499999; TotalTime = 17.16935s; TotalTimePerSample = 2.68271ms; SamplesPerSecond = 372 -Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 4.1132793; EvalErrPerSample = 0.83588868; Ave LearnRatePerSample = 0.0007812500116; EpochTime=55.11008 -Starting Epoch 3: learning rate per sample = 0.000781 momentum = 0.899991 + Epoch[ 2 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.29597595; EvalErr[0]PerSample = 0.82859375; TotalTime = 20.96682s; TotalTimePerSample = 3.27607ms; SamplesPerSecond = 305 + Epoch[ 2 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.27295776; EvalErr[0]PerSample 
= 0.87312500; TotalTime = 20.34551s; TotalTimePerSample = 3.17899ms; SamplesPerSecond = 314 + Epoch[ 2 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.95423523; EvalErr[0]PerSample = 0.82500000; TotalTime = 20.81350s; TotalTimePerSample = 3.25211ms; SamplesPerSecond = 307 +Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 4.1132798; EvalErrPerSample = 0.83588868; Ave LearnRatePerSample = 0.0007812500116; EpochTime=66.807404 +Starting Epoch 3: learning rate per sample = 0.000781 effective momentum = 0.900000 minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40962), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 3 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.17982197; EvalErr[0]PerSample = 0.85281253; TotalTime = 16.15247s; TotalTimePerSample = 2.52382ms; SamplesPerSecond = 396 - Epoch[ 3 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.16644049; EvalErr[0]PerSample = 0.86703128; TotalTime = 15.53962s; TotalTimePerSample = 2.42807ms; SamplesPerSecond = 411 - Epoch[ 3 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.95540762; EvalErr[0]PerSample = 0.83859372; TotalTime = 18.71239s; TotalTimePerSample = 2.92381ms; SamplesPerSecond = 342 -Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 4.0661387; EvalErrPerSample = 0.84653324; Ave LearnRatePerSample = 0.0007812500116; EpochTime=54.14235 -Starting Epoch 4: learning rate per sample = 0.000781 momentum = 0.899991 + Epoch[ 3 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.17982239; EvalErr[0]PerSample = 0.85281250; TotalTime = 18.89055s; TotalTimePerSample = 2.95165ms; SamplesPerSecond = 338 + Epoch[ 3 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.16644226; EvalErr[0]PerSample = 0.86703125; TotalTime = 20.64840s; TotalTimePerSample = 3.22631ms; SamplesPerSecond = 309 + Epoch[ 3 
of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.95540649; EvalErr[0]PerSample = 0.83859375; TotalTime = 20.57245s; TotalTimePerSample = 3.21444ms; SamplesPerSecond = 311 +Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 4.0661392; EvalErrPerSample = 0.84653324; Ave LearnRatePerSample = 0.0007812500116; EpochTime=64.052172 +Starting Epoch 4: learning rate per sample = 0.000781 effective momentum = 0.900000 minibatchiterator: epoch 3: frames [61440..81920] (first utterance at frame 61554), data subset 0 of 1, with 1 datapasses Starting minibatch loop. - Epoch[ 4 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.06800747; EvalErr[0]PerSample = 0.82734376; TotalTime = 17.96433s; TotalTimePerSample = 2.80693ms; SamplesPerSecond = 356 - Epoch[ 4 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.10716391; EvalErr[0]PerSample = 0.88249999; TotalTime = 15.48745s; TotalTimePerSample = 2.41991ms; SamplesPerSecond = 413 - Epoch[ 4 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.91763616; EvalErr[0]PerSample = 0.82390624; TotalTime = 16.49760s; TotalTimePerSample = 2.57775ms; SamplesPerSecond = 387 -Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 3.9796886; EvalErrPerSample = 0.82807618; Ave LearnRatePerSample = 0.0007812500116; EpochTime=63.545066 + Epoch[ 4 of 4]-Minibatch[ 1- 10 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.06800842; EvalErr[0]PerSample = 0.82734375; TotalTime = 18.79745s; TotalTimePerSample = 2.93710ms; SamplesPerSecond = 340 + Epoch[ 4 of 4]-Minibatch[ 11- 20 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 4.10716370; EvalErr[0]PerSample = 0.88250000; TotalTime = 18.98044s; TotalTimePerSample = 2.96569ms; SamplesPerSecond = 337 + Epoch[ 4 of 4]-Minibatch[ 21- 30 of 1024]: SamplesSeen = 6400; TrainLossPerSample = 3.91763550; EvalErr[0]PerSample = 0.82390625; TotalTime = 18.64471s; TotalTimePerSample = 2.91324ms; 
SamplesPerSecond = 343 +Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 3.9796886; EvalErrPerSample = 0.82807618; Ave LearnRatePerSample = 0.0007812500116; EpochTime=60.617335 +CNTKCommandTrainEnd: speechTrain COMPLETED diff --git a/Tests/Speech/QuickE2E/baseline.gpu.txt b/Tests/Speech/QuickE2E/baseline.gpu.txt index e66a6fb45..f37d41184 100644 --- a/Tests/Speech/QuickE2E/baseline.gpu.txt +++ b/Tests/Speech/QuickE2E/baseline.gpu.txt @@ -1,7 +1,7 @@ -=== Running /home/vlivan/cntk/bin/x86_64.gpu.release.acml/cntk configFile=/home/vlivan/cntk/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu DataDir=/home/vlivan/cntk/Tests/Speech/Data DeviceId=Auto -running on localhost at 2015/07/29 19:11:08 -command line options: -configFile=/home/vlivan/cntk/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu DataDir=/home/vlivan/cntk/Tests/Speech/Data DeviceId=Auto +=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E DeviceId=0 +running on localhost at 2015/10/24 12:49:00 +command line: +/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E DeviceId=0 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float @@ -23,6 +23,31 @@ speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + 
evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? + ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -61,21 +86,22 @@ speechTrain=[ ] ] ] -RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu -DataDir=/home/vlivan/cntk/Tests/Speech/Data -DeviceId=Auto +RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu +DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E +DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> precision=float command=speechTrain -deviceId=Auto +deviceId=0 parallelTrain=false speechTrain=[ action=train - modelPath=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn - deviceId=Auto + modelPath=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn + deviceId=0 traceLevel=1 SimpleNetworkBuilder=[ 
layerSizes=363:512:512:132 @@ -87,6 +113,31 @@ speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? 
+ ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -118,30 +169,32 @@ speechTrain=[ scpFile=glob_0000.scp ] labels=[ - mlfFile=/home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf - labelMappingFile=/home/vlivan/cntk/Tests/Speech/Data/state.list + mlfFile=/home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf + labelMappingFile=/home/mluser/src/cplx_master/Tests/Speech/Data/state.list labelDim=132 labelType=Category ] ] ] -RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu -DataDir=/home/vlivan/cntk/Tests/Speech/Data -DeviceId=Auto +RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu +DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E +DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> configparameters: cntk.config:command=speechTrain -configparameters: cntk.config:DataDir=/home/vlivan/cntk/Tests/Speech/Data -configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E +configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +configparameters: cntk.config:deviceId=0 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu +configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu configparameters: cntk.config:speechTrain=[ action=train - modelPath=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn - deviceId=Auto + modelPath=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn + deviceId=0 traceLevel=1 
SimpleNetworkBuilder=[ layerSizes=363:512:512:132 @@ -153,6 +206,31 @@ configparameters: cntk.config:speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? 
+ ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -184,8 +262,8 @@ configparameters: cntk.config:speechTrain=[ scpFile=glob_0000.scp ] labels=[ - mlfFile=/home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf - labelMappingFile=/home/vlivan/cntk/Tests/Speech/Data/state.list + mlfFile=/home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf + labelMappingFile=/home/mluser/src/cplx_master/Tests/Speech/Data/state.list labelDim=132 labelType=Category ] @@ -195,178 +273,293 @@ configparameters: cntk.config:speechTrain=[ <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: speechTrain precision = float -lsof: WARNING: can't stat() ext4 file system /var/lib/docker/aufs - Output information may be incomplete. -LockDevice: Capture device 0 and lock it for exclusive use -LockDevice: Capture device 0 and lock it for exclusive use +CNTKModelPath: /tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn +CNTKCommandTrainInfo: speechTrain : 3 +CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +CNTKCommandTrainBegin: speechTrain SimpleNetworkBuilder Using GPU 0 reading script file glob_0000.scp ... 948 entries -total 132 state names in state list /home/vlivan/cntk/Tests/Speech/Data/state.list -htkmlfreader: reading MLF file /home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf ...parse the line 55130 - total 948 entries +trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion +total 132 state names in state list /home/mluser/src/cplx_master/Tests/Speech/Data/state.list +htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf ... 
total 948 entries ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +SetUniformRandomValue (GPU): creating curand object with seed 1 GetTrainCriterionNodes ... GetEvalCriterionNodes ... -Validating node CrossEntropyWithSoftmax +Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. -Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 3]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 3]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 3]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 3], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 3]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 3]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 3], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 3]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 3]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 3], B2[132, 1]) -Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 3], HLast[132, 3]) +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = 
Mean(features[363, MBSize 3]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. + + +Precomputing --> 3 PreCompute nodes found. 
-Found 3 PreCompute nodes NodeName: InvStdOfFeatures NodeName: MeanOfFeatures NodeName: Prior -minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0) with 1 datapasses +minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms -Validating node InvStdOfFeatures +Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. -Validating --> features = InputValue -Validating --> InvStdOfFeatures = InvStdDev(features[363, 64]) +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] + +Validating for node InvStdOfFeatures, final verification. + +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] + +1 out of 2 nodes do not share the minibatch layout with the input data. -Validating node MeanOfFeatures +Validating for node MeanOfFeatures. 2 nodes to process in pass 1. -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 64]) +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] + +Validating for node MeanOfFeatures, final verification. + +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] + +1 out of 2 nodes do not share the minibatch layout with the input data. -Validating node Prior +Validating for node Prior. 2 nodes to process in pass 1. -Validating --> labels = InputValue -Validating --> Prior = Mean(labels[132, 64]) +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] + +Validating for node Prior. 1 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] + +Validating for node Prior, final verification. + +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] + +1 out of 2 nodes do not share the minibatch layout with the input data. + +EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. + +Precomputing --> Completed. Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000 -minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0) with 1 datapasses +Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 +minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses -Validating node EvalErrorPrediction +Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
-Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 64]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 64]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 64], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 64]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 64], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 64]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 64]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 64], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 64]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 64]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 64], B2[132, 1]) -Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 64], HLast[132, 64]) +Validating --> labels = InputValue -> [132, MBSize 62] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 62] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] 
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] - Epoch[1 of 3]-Minibatch[1-10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.3213539; EvalErr[0]PerSample = 0.89999998; TotalTime=0.064294; TotalTimePerSample=0.00010045938, SamplesPerSecond=9954 - Epoch[1 of 3]-Minibatch[11-20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.1507101; EvalErr[0]PerSample = 0.8671875; TotalTime=0.055813; TotalTimePerSample=8.7207812e-05, SamplesPerSecond=11466 - Epoch[1 of 3]-Minibatch[21-30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.9990096; EvalErr[0]PerSample = 0.87656248; TotalTime=0.062703; TotalTimePerSample=9.7973437e-05, SamplesPerSecond=10206 - Epoch[1 of 3]-Minibatch[31-40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.8694596; EvalErr[0]PerSample = 0.87656248; TotalTime=0.059923; TotalTimePerSample=9.3629687e-05, SamplesPerSecond=10680 - Epoch[1 of 3]-Minibatch[41-50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.8021927; EvalErr[0]PerSample = 0.87812501; TotalTime=0.061061; TotalTimePerSample=9.5407812e-05, SamplesPerSecond=10481 - Epoch[1 of 3]-Minibatch[51-60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.7289093; EvalErr[0]PerSample = 0.86874998; TotalTime=0.062101; TotalTimePerSample=9.7032813e-05, SamplesPerSecond=10305 - Epoch[1 of 3]-Minibatch[61-70 of 320]: SamplesSeen = 640; 
TrainLossPerSample = 3.5618699; EvalErr[0]PerSample = 0.82343751; TotalTime=0.056094; TotalTimePerSample=8.7646875e-05, SamplesPerSecond=11409 - Epoch[1 of 3]-Minibatch[71-80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.4279053; EvalErr[0]PerSample = 0.80781251; TotalTime=0.063459; TotalTimePerSample=9.9154687e-05, SamplesPerSecond=10085 - Epoch[1 of 3]-Minibatch[81-90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.3392854; EvalErr[0]PerSample = 0.7734375; TotalTime=0.062265; TotalTimePerSample=9.7289063e-05, SamplesPerSecond=10278 - Epoch[1 of 3]-Minibatch[91-100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.3639894; EvalErr[0]PerSample = 0.84375; TotalTime=0.059843; TotalTimePerSample=9.3504687e-05, SamplesPerSecond=10694 +Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 62] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 62] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +Validating --> 
H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] + +Validating for node EvalErrorPrediction, final verification. + +Validating --> labels = InputValue -> [132, MBSize 62] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 62] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, 
MBSize 62]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. + + +Starting minibatch loop. + Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.32135277; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.05742s; TotalTimePerSample = 0.08972ms; SamplesPerSecond = 11145 + Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.15070992; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.05557s; TotalTimePerSample = 0.08682ms; SamplesPerSecond = 11517 + Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.99901123; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.05549s; TotalTimePerSample = 0.08671ms; SamplesPerSecond = 11532 + Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.86945953; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.05588s; TotalTimePerSample = 0.08732ms; SamplesPerSecond = 11452 + Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.80219574; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.05549s; TotalTimePerSample = 0.08670ms; SamplesPerSecond = 11534 + Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72890930; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.05552s; TotalTimePerSample = 0.08675ms; SamplesPerSecond = 11526 + Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.56186981; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.05571s; TotalTimePerSample = 0.08705ms; SamplesPerSecond = 11488 + Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.42790527; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.05550s; TotalTimePerSample = 0.08672ms; SamplesPerSecond = 11531 + Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33928528; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.05557s; TotalTimePerSample = 0.08683ms; SamplesPerSecond 
= 11517 + Epoch[ 1 of 3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.36398926; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.05550s; TotalTimePerSample = 0.08671ms; SamplesPerSecond = 11532 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[1 of 3]-Minibatch[101-110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.2122345; EvalErr[0]PerSample = 0.75312501; TotalTime=0.062375; TotalTimePerSample=9.7460937e-05, SamplesPerSecond=10260 - Epoch[1 of 3]-Minibatch[111-120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.3126526; EvalErr[0]PerSample = 0.78750002; TotalTime=0.061085; TotalTimePerSample=9.5445313e-05, SamplesPerSecond=10477 - Epoch[1 of 3]-Minibatch[121-130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.1408203; EvalErr[0]PerSample = 0.74687499; TotalTime=0.064562; TotalTimePerSample=0.00010087812, SamplesPerSecond=9912 - Epoch[1 of 3]-Minibatch[131-140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.006897; EvalErr[0]PerSample = 0.69687498; TotalTime=0.0575; TotalTimePerSample=8.984375e-05, SamplesPerSecond=11130 - Epoch[1 of 3]-Minibatch[141-150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.0049591; EvalErr[0]PerSample = 0.72343749; TotalTime=0.058338; TotalTimePerSample=9.1153125e-05, SamplesPerSecond=10970 - Epoch[1 of 3]-Minibatch[151-160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.9785829; EvalErr[0]PerSample = 0.73906249; TotalTime=0.064603; TotalTimePerSample=0.00010094219, SamplesPerSecond=9906 - Epoch[1 of 3]-Minibatch[161-170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.8568604; EvalErr[0]PerSample = 0.70781249; TotalTime=0.060368; TotalTimePerSample=9.4325e-05, SamplesPerSecond=10601 - Epoch[1 of 3]-Minibatch[171-180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.6905334; EvalErr[0]PerSample = 0.671875; TotalTime=0.059125; TotalTimePerSample=9.2382812e-05, SamplesPerSecond=10824 - Epoch[1 of 3]-Minibatch[181-190 of 320]: 
SamplesSeen = 640; TrainLossPerSample = 2.7865357; EvalErr[0]PerSample = 0.70468748; TotalTime=0.056113; TotalTimePerSample=8.7676563e-05, SamplesPerSecond=11405 - Epoch[1 of 3]-Minibatch[191-200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.5770202; EvalErr[0]PerSample = 0.6484375; TotalTime=0.060745; TotalTimePerSample=9.4914062e-05, SamplesPerSecond=10535 - Epoch[1 of 3]-Minibatch[201-210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.6157165; EvalErr[0]PerSample = 0.6640625; TotalTime=0.059709; TotalTimePerSample=9.3295312e-05, SamplesPerSecond=10718 - Epoch[1 of 3]-Minibatch[211-220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.552362; EvalErr[0]PerSample = 0.65781248; TotalTime=0.061917; TotalTimePerSample=9.6745313e-05, SamplesPerSecond=10336 - Epoch[1 of 3]-Minibatch[221-230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.4821167; EvalErr[0]PerSample = 0.625; TotalTime=0.053813; TotalTimePerSample=8.4082813e-05, SamplesPerSecond=11893 - Epoch[1 of 3]-Minibatch[231-240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.3877869; EvalErr[0]PerSample = 0.62812501; TotalTime=0.061932; TotalTimePerSample=9.676875e-05, SamplesPerSecond=10333 - Epoch[1 of 3]-Minibatch[241-250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.3690064; EvalErr[0]PerSample = 0.6484375; TotalTime=0.059294; TotalTimePerSample=9.2646875e-05, SamplesPerSecond=10793 - Epoch[1 of 3]-Minibatch[251-260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.4396729; EvalErr[0]PerSample = 0.6328125; TotalTime=0.060513; TotalTimePerSample=9.4551562e-05, SamplesPerSecond=10576 - Epoch[1 of 3]-Minibatch[261-270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.3028197; EvalErr[0]PerSample = 0.61250001; TotalTime=0.06037; TotalTimePerSample=9.4328125e-05, SamplesPerSecond=10601 - Epoch[1 of 3]-Minibatch[271-280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.1966858; EvalErr[0]PerSample = 0.55937499; TotalTime=0.056485; TotalTimePerSample=8.8257812e-05, SamplesPerSecond=11330 - 
Epoch[1 of 3]-Minibatch[281-290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.2898011; EvalErr[0]PerSample = 0.60468751; TotalTime=0.059356; TotalTimePerSample=9.274375e-05, SamplesPerSecond=10782 - Epoch[1 of 3]-Minibatch[291-300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.1775086; EvalErr[0]PerSample = 0.62187499; TotalTime=0.059501; TotalTimePerSample=9.2970312e-05, SamplesPerSecond=10756 - Epoch[1 of 3]-Minibatch[301-310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.2626343; EvalErr[0]PerSample = 0.59687501; TotalTime=0.064342; TotalTimePerSample=0.00010053437, SamplesPerSecond=9946 - Epoch[1 of 3]-Minibatch[311-320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.1507263; EvalErr[0]PerSample = 0.5625; TotalTime=0.064522; TotalTimePerSample=0.00010081563, SamplesPerSecond=9919 -Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799569; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=1.935613 -Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119 -minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480) with 1 datapasses - Epoch[2 of 3]-Minibatch[1-10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.0159853; EvalErr[0]PerSample = 0.54140627; TotalTime=0.102487; TotalTimePerSample=4.0033984e-05, SamplesPerSecond=24978 - Epoch[2 of 3]-Minibatch[11-20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.9881856; EvalErr[0]PerSample = 0.54296875; TotalTime=0.09473; TotalTimePerSample=3.7003906e-05, SamplesPerSecond=27024 - Epoch[2 of 3]-Minibatch[21-30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.9869812; EvalErr[0]PerSample = 0.54140627; TotalTime=0.091318; TotalTimePerSample=3.5671094e-05, SamplesPerSecond=28033 - Epoch[2 of 3]-Minibatch[31-40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.9312614; EvalErr[0]PerSample = 0.5277344; TotalTime=0.092408; TotalTimePerSample=3.6096875e-05, SamplesPerSecond=27703 - Epoch[2 of 3]-Minibatch[41-50 of 80]: 
SamplesSeen = 2560; TrainLossPerSample = 1.9006774; EvalErr[0]PerSample = 0.52656251; TotalTime=0.098698; TotalTimePerSample=3.8553906e-05, SamplesPerSecond=25937 - Epoch[2 of 3]-Minibatch[51-60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.9711578; EvalErr[0]PerSample = 0.54140627; TotalTime=0.0896; TotalTimePerSample=3.5e-05, SamplesPerSecond=28571 - Epoch[2 of 3]-Minibatch[61-70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.8951813; EvalErr[0]PerSample = 0.52031249; TotalTime=0.092477; TotalTimePerSample=3.6123828e-05, SamplesPerSecond=27682 - Epoch[2 of 3]-Minibatch[71-80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.904506; EvalErr[0]PerSample = 0.53164065; TotalTime=0.091179; TotalTimePerSample=3.5616797e-05, SamplesPerSecond=28076 -Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.949242; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=0.753703 -Starting Epoch 3: learning rate per sample = 0.000098 momentum = 0.656119 -minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses - Epoch[3 of 3]-Minibatch[1-10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.8735985; EvalErr[0]PerSample = 0.51933593; TotalTime=0.27395; TotalTimePerSample=2.675293e-05, SamplesPerSecond=37379 - Epoch[3 of 3]-Minibatch[11-20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.8665626; EvalErr[0]PerSample = 0.51748049; TotalTime=0.261453; TotalTimePerSample=2.553252e-05, SamplesPerSecond=39165 -Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.51840824; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.537273 + Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.21223450; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.05582s; TotalTimePerSample = 0.08723ms; SamplesPerSecond = 11464 + Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.31265259; EvalErr[0]PerSample = 
0.78750000; TotalTime = 0.05591s; TotalTimePerSample = 0.08736ms; SamplesPerSecond = 11446 + Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.14082031; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.05556s; TotalTimePerSample = 0.08680ms; SamplesPerSecond = 11520 + Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00689697; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.05566s; TotalTimePerSample = 0.08696ms; SamplesPerSecond = 11499 + Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.00496216; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.05562s; TotalTimePerSample = 0.08690ms; SamplesPerSecond = 11506 + Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.97858887; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.05559s; TotalTimePerSample = 0.08687ms; SamplesPerSecond = 11512 + Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.85686035; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.05570s; TotalTimePerSample = 0.08703ms; SamplesPerSecond = 11490 + Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.69053345; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.05565s; TotalTimePerSample = 0.08695ms; SamplesPerSecond = 11501 + Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.78653564; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.05552s; TotalTimePerSample = 0.08674ms; SamplesPerSecond = 11528 + Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.57702026; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.05548s; TotalTimePerSample = 0.08669ms; SamplesPerSecond = 11535 + Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.61571655; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.05545s; TotalTimePerSample = 0.08663ms; SamplesPerSecond = 11542 + Epoch[ 
1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.55236206; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.05567s; TotalTimePerSample = 0.08698ms; SamplesPerSecond = 11496 + Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.48211670; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.05560s; TotalTimePerSample = 0.08688ms; SamplesPerSecond = 11510 + Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.38778687; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.05546s; TotalTimePerSample = 0.08666ms; SamplesPerSecond = 11539 + Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.36900635; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.05560s; TotalTimePerSample = 0.08687ms; SamplesPerSecond = 11511 + Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.43967285; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.05553s; TotalTimePerSample = 0.08677ms; SamplesPerSecond = 11524 + Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30281982; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.05553s; TotalTimePerSample = 0.08677ms; SamplesPerSecond = 11525 + Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.19668579; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.05553s; TotalTimePerSample = 0.08677ms; SamplesPerSecond = 11525 + Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.28980103; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.05551s; TotalTimePerSample = 0.08674ms; SamplesPerSecond = 11529 + Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.17750854; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.05574s; TotalTimePerSample = 0.08709ms; SamplesPerSecond = 11482 + Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.26263428; 
EvalErr[0]PerSample = 0.59687500; TotalTime = 0.05555s; TotalTimePerSample = 0.08679ms; SamplesPerSecond = 11521 + Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15072632; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.05427s; TotalTimePerSample = 0.08479ms; SamplesPerSecond = 11793 +Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799573; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=1.785537 +Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 +minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 1, with 1 datapasses + +Starting minibatch loop. + Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.01598530; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.09354s; TotalTimePerSample = 0.03654ms; SamplesPerSecond = 27367 + Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98818569; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.09083s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28184 + Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98698120; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.09109s; TotalTimePerSample = 0.03558ms; SamplesPerSecond = 28103 + Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.93126144; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.09077s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28203 + Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90067749; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28191 + Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.97115784; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.09085s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28179 + Epoch[ 2 of 3]-Minibatch[ 
61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89518127; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.09092s; TotalTimePerSample = 0.03552ms; SamplesPerSecond = 28155 + Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90450592; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.08529s; TotalTimePerSample = 0.03332ms; SamplesPerSecond = 30014 +Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.949242; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=0.732528 +Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 +minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses + +Starting minibatch loop. + Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87359848; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.24564s; TotalTimePerSample = 0.02399ms; SamplesPerSecond = 41687 + Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86656265; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.21814s; TotalTimePerSample = 0.02130ms; SamplesPerSecond = 46943 +Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.51840824; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.493964 +CNTKCommandTrainEnd: speechTrain COMPLETED - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -WARNING: - -You should always run with libnvidia-ml.so that is installed with your -NVIDIA Display Driver. By default it's installed in /usr/lib and /usr/lib64. -libnvidia-ml.so in GDK package is a stub library that is attached only for -build purposes (e.g. machine that you build your application doesn't have -to have Display Driver installed). -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
-Linked to libnvidia-ml library at wrong path : /usr/src/gdk/nvml/lib/libnvidia-ml.so.1 - - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -WARNING: - -You should always run with libnvidia-ml.so that is installed with your -NVIDIA Display Driver. By default it's installed in /usr/lib and /usr/lib64. -libnvidia-ml.so in GDK package is a stub library that is attached only for -build purposes (e.g. machine that you build your application doesn't have -to have Display Driver installed). -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! === Deleting last epoch data ==== Re-running from checkpoint -running on localhost at 2015/07/29 19:11:14 -command line options: -configFile=/home/vlivan/cntk/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu DataDir=/home/vlivan/cntk/Tests/Speech/Data DeviceId=Auto +=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E DeviceId=0 +running on localhost at 2015/10/24 12:49:11 +command line: +/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E DeviceId=0 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float @@ -388,6 +581,31 @@ speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // 
number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? + ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -426,21 +644,22 @@ speechTrain=[ ] ] ] -RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu -DataDir=/home/vlivan/cntk/Tests/Speech/Data -DeviceId=Auto +RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu +DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E +DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> precision=float command=speechTrain -deviceId=Auto +deviceId=0 parallelTrain=false speechTrain=[ action=train - modelPath=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn - deviceId=Auto + modelPath=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn + deviceId=0 traceLevel=1 SimpleNetworkBuilder=[ layerSizes=363:512:512:132 @@ -452,6 +671,31 @@ speechTrain=[ uniformInit=true 
needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? 
+ ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -483,30 +727,32 @@ speechTrain=[ scpFile=glob_0000.scp ] labels=[ - mlfFile=/home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf - labelMappingFile=/home/vlivan/cntk/Tests/Speech/Data/state.list + mlfFile=/home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf + labelMappingFile=/home/mluser/src/cplx_master/Tests/Speech/Data/state.list labelDim=132 labelType=Category ] ] ] -RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu -DataDir=/home/vlivan/cntk/Tests/Speech/Data -DeviceId=Auto +RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu +DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E +DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> configparameters: cntk.config:command=speechTrain -configparameters: cntk.config:DataDir=/home/vlivan/cntk/Tests/Speech/Data -configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E +configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data +configparameters: cntk.config:deviceId=0 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu +configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu configparameters: cntk.config:speechTrain=[ action=train - modelPath=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn - deviceId=Auto + modelPath=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn + deviceId=0 traceLevel=1 
SimpleNetworkBuilder=[ layerSizes=363:512:512:132 @@ -518,6 +764,31 @@ configparameters: cntk.config:speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? 
+ ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -549,8 +820,8 @@ configparameters: cntk.config:speechTrain=[ scpFile=glob_0000.scp ] labels=[ - mlfFile=/home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf - labelMappingFile=/home/vlivan/cntk/Tests/Speech/Data/state.list + mlfFile=/home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf + labelMappingFile=/home/mluser/src/cplx_master/Tests/Speech/Data/state.list labelDim=132 labelType=Category ] @@ -560,24 +831,24 @@ configparameters: cntk.config:speechTrain=[ <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: speechTrain precision = float -lsof: WARNING: can't stat() ext4 file system /var/lib/docker/aufs - Output information may be incomplete. -LockDevice: Capture device 0 and lock it for exclusive use -LockDevice: Capture device 0 and lock it for exclusive use +CNTKModelPath: /tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn +CNTKCommandTrainInfo: speechTrain : 3 +CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +CNTKCommandTrainBegin: speechTrain SimpleNetworkBuilder Using GPU 0 reading script file glob_0000.scp ... 948 entries -total 132 state names in state list /home/vlivan/cntk/Tests/Speech/Data/state.list -htkmlfreader: reading MLF file /home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf ...parse the line 55130 - total 948 entries +trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion +total 132 state names in state list /home/mluser/src/cplx_master/Tests/Speech/Data/state.list +htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf ... 
total 948 entries ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames -Starting from checkpoint. Load Network From File /tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn.2. +Starting from checkpoint. Load Network From File /tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn.2. Printing Gradient Computation Node Order ... -CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 256], HLast[0, 0]) +CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 0], HLast[0, 0]) HLast[0, 0] = Plus(W2*H1[0, 0], B2[132, 1]) B2[132, 1] = LearnableParameter W2*H1[0, 0] = Times(W2[132, 512], H2[0, 0]) @@ -589,170 +860,475 @@ H1[0, 0] = Sigmoid(W0*features+B0[0, 0]) W0*features+B0[0, 0] = Plus(W0*features[0, 0], B0[512, 1]) B0[512, 1] = LearnableParameter W0*features[0, 0] = Times(W0[512, 363], MVNormalizedFeatures[0, 0]) -MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -InvStdOfFeatures[363, 1] = InvStdDev(features[363, 256]) -MeanOfFeatures[363, 1] = Mean(features[363, 256]) -features[363, 256] = InputValue +MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) +InvStdOfFeatures[363, 1] = InvStdDev(features[363, 0]) +MeanOfFeatures[363, 1] = Mean(features[363, 0]) +features[363, 0] = InputValue W0[512, 363] = LearnableParameter W1[512, 512] = LearnableParameter W2[132, 512] = LearnableParameter -labels[132, 256] = InputValue +labels[132, 0] = InputValue -Validating node CrossEntropyWithSoftmax +Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. 
-Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 256]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 256]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 256]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 256]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1]) -Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256]) +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> 
[512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 
0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax, final verification. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +9 out of 20 
nodes do not share the minibatch layout with the input data. -Validating node ScaledLogLikelihood +Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 256]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 256]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 256]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 256]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1]) -Validating --> labels = InputValue -Validating --> Prior = Mean(labels[132, 256]) -Validating --> LogOfPrior = Log(Prior[132, 1]) -Validating --> ScaledLogLikelihood = Minus(HLast[132, 256], LogOfPrior[132, 1]) +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, 
MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax. 9 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. -Validating node EvalErrorPrediction +Validating for node ScaledLogLikelihood. 22 nodes to process in pass 1. 
-Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 256]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 256]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 256]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 256]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1]) -Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 256], HLast[132, 256]) +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, 
MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +Validating for node ScaledLogLikelihood. 11 nodes to process in pass 2. + +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating 
--> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +Validating for node ScaledLogLikelihood, final verification. + +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, 
MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +10 out of 22 nodes do not share the minibatch layout with the input data. + + + +Validating for node ScaledLogLikelihood. 22 nodes to process in pass 1. + +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> 
[132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +Validating for node ScaledLogLikelihood. 9 nodes to process in pass 2. + +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +Validating for node ScaledLogLikelihood, final verification. 
+ +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +10 out of 22 nodes do not share the minibatch layout with the input data. + + + +Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node EvalErrorPrediction. 9 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node EvalErrorPrediction, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. + + + +Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node EvalErrorPrediction. 9 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node EvalErrorPrediction, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. GetTrainCriterionNodes ... GetEvalCriterionNodes ... 
- - -Validating node CrossEntropyWithSoftmax - -Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 256]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 256]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 256]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 256]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1]) -Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256]) - No PreCompute nodes found, skipping PreCompute step Set Max Temp Mem Size For Convolution Nodes to 0 samples. 
-Starting Epoch 3: learning rate per sample = 0.000098 momentum = 0.656119 -minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses +Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 +minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms - -Validating node EvalErrorPrediction - -Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 1024]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 1024]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 1024], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 1024]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 1024], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 1024]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 1024]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 1024], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 1024]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 1024]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 1024], B2[132, 1]) -Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 1024], HLast[132, 1024]) - - Epoch[3 of 3]-Minibatch[1-10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.8735985; EvalErr[0]PerSample = 0.51933593; TotalTime=0.430752; TotalTimePerSample=4.2065625e-05, SamplesPerSecond=23772 - Epoch[3 of 3]-Minibatch[11-20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.8665626; EvalErr[0]PerSample 
= 0.51748049; TotalTime=0.2702; TotalTimePerSample=2.6386719e-05, SamplesPerSecond=37897 -Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.51840824; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.868162 +Starting minibatch loop. +EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. + Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87359848; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.32305s; TotalTimePerSample = 0.03155ms; SamplesPerSecond = 31698 + Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86656265; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.21717s; TotalTimePerSample = 0.02121ms; SamplesPerSecond = 47152 +Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.51840824; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=1.439589 +CNTKCommandTrainEnd: speechTrain COMPLETED - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -WARNING: - -You should always run with libnvidia-ml.so that is installed with your -NVIDIA Display Driver. By default it's installed in /usr/lib and /usr/lib64. -libnvidia-ml.so in GDK package is a stub library that is attached only for -build purposes (e.g. machine that you build your application doesn't have -to have Display Driver installed). -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Linked to libnvidia-ml library at wrong path : /usr/src/gdk/nvml/lib/libnvidia-ml.so.1 - - -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -WARNING: - -You should always run with libnvidia-ml.so that is installed with your -NVIDIA Display Driver. By default it's installed in /usr/lib and /usr/lib64. -libnvidia-ml.so in GDK package is a stub library that is attached only for -build purposes (e.g. 
machine that you build your application doesn't have -to have Display Driver installed). -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! diff --git a/Tests/Speech/QuickE2E/baseline.windows.gpu.txt b/Tests/Speech/QuickE2E/baseline.windows.gpu.txt index 5c336a37b..d47fb54a4 100644 --- a/Tests/Speech/QuickE2E/baseline.windows.gpu.txt +++ b/Tests/Speech/QuickE2E/baseline.windows.gpu.txt @@ -1,18 +1,16 @@ -=== Running /cygdrive/c/Users/svcphil/workspace.vlivan/CNTK-Build-Windows/x64/release/cntk.exe configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto +=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E DeviceId=0 ------------------------------------------------------------------- Build info: - Built time: Aug 11 2015 16:18:17 - Last modified date: Tue Aug 11 16:16:08 2015 - Built by svcphil on dphaim-26-new - Build Path: C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\MachineLearning\CNTK\ + Built time: Oct 24 2015 13:33:25 + Last modified date: Thu Oct 22 16:00:27 2015 + Built by amitaga on Amitaga-Win-DT3 + Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 - Build Branch: master - Build SHA1: 397cc7cc16c00b1c12864d331c0729fde7a1bde3 ------------------------------------------------------------------- -running on dphaim-26-new at 2015/08/11 17:47:26 -command line options: 
-configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto +running on Amitaga-Win-DT3 at 2015/10/24 22:07:22 +command line: +E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E DeviceId=0 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float @@ -34,6 +32,31 @@ speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? 
+ ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -72,21 +95,22 @@ speechTrain=[ ] ] ] -RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu -DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data -DeviceId=Auto +RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu +DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E +DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> precision=float command=speechTrain -deviceId=Auto +deviceId=0 parallelTrain=false speechTrain=[ action=train - modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn - deviceId=Auto + modelPath=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn + deviceId=0 traceLevel=1 SimpleNetworkBuilder=[ layerSizes=363:512:512:132 @@ -98,6 +122,31 @@ speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if 
trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? + ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -129,30 +178,32 @@ speechTrain=[ scpFile=glob_0000.scp ] labels=[ - mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf - labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list + mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf + labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list labelDim=132 labelType=Category ] ] ] -RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu -DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data -DeviceId=Auto +RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu +DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E +DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> configparameters: cntk.config:command=speechTrain -configparameters: cntk.config:DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data -configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E +configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +configparameters: 
cntk.config:deviceId=0 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu +configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu configparameters: cntk.config:speechTrain=[ action=train - modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn - deviceId=Auto + modelPath=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn + deviceId=0 traceLevel=1 SimpleNetworkBuilder=[ layerSizes=363:512:512:132 @@ -164,6 +215,31 @@ configparameters: cntk.config:speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? 
+ ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -195,8 +271,8 @@ configparameters: cntk.config:speechTrain=[ scpFile=glob_0000.scp ] labels=[ - mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf - labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list + mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf + labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list labelDim=132 labelType=Category ] @@ -206,168 +282,301 @@ configparameters: cntk.config:speechTrain=[ <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: speechTrain precision = float -LockDevice: Capture device 1 and lock it for exclusive use -LockDevice: Capture device 2 and lock it for exclusive use -LockDevice: Capture device 3 and lock it for exclusive use -LockDevice: Capture device 0 and lock it for exclusive use -LockDevice: Capture device 1 and lock it for exclusive use -SimpleNetworkBuilder Using GPU 1 +CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn +CNTKCommandTrainInfo: speechTrain : 3 +CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +CNTKCommandTrainBegin: speechTrain +SimpleNetworkBuilder Using GPU 0 reading script file glob_0000.scp ... 948 entries trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion -total 132 state names in state list C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list -htkmlfreader: reading MLF file C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf ... 
total 948 entries +total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list +htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +SetUniformRandomValue (GPU): creating curand object with seed 1 GetTrainCriterionNodes ... GetEvalCriterionNodes ... -Validating node CrossEntropyWithSoftmax +Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. -Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 3]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 3]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 3]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 3], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 3]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 3]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 3], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 3]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 3]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 3], B2[132, 1]) -Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 3], HLast[132, 3]) +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> W2 = 
LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. + + +Precomputing --> 3 PreCompute nodes found. 
-Found 3 PreCompute nodes NodeName: InvStdOfFeatures NodeName: MeanOfFeatures NodeName: Prior -minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0) with 1 datapasses +minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms -Validating node InvStdOfFeatures +Validating for node InvStdOfFeatures. 2 nodes to process in pass 1. -Validating --> features = InputValue -Validating --> InvStdOfFeatures = InvStdDev(features[363, 64]) +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] + +Validating for node InvStdOfFeatures, final verification. + +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1] + +1 out of 2 nodes do not share the minibatch layout with the input data. -Validating node MeanOfFeatures +Validating for node MeanOfFeatures. 2 nodes to process in pass 1. -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 64]) +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] + +Validating for node MeanOfFeatures, final verification. + +Validating --> features = InputValue -> [363, MBSize 3] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1] + +1 out of 2 nodes do not share the minibatch layout with the input data. -Validating node Prior +Validating for node Prior. 2 nodes to process in pass 1. -Validating --> labels = InputValue -Validating --> Prior = Mean(labels[132, 64]) +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] + +Validating for node Prior. 1 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] + +Validating for node Prior, final verification. + +Validating --> labels = InputValue -> [132, MBSize 3] +Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1] + +1 out of 2 nodes do not share the minibatch layout with the input data. + + +Precomputing --> Completed. Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.015625 momentum = 0.900000 -minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0) with 1 datapasses +Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 +minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses -Validating node EvalErrorPrediction +Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. -Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 64]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 64]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 64], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 64]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 64], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 64]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 64]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 64], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 64]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 64]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 64], B2[132, 1]) 
-Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 64], HLast[132, 64]) +Validating --> labels = InputValue -> [132, MBSize 62] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 62] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] - Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45646143; EvalErr[0]PerSample = 0.92500001; TotalTime = 0.03190s; TotalTimePerSample = 0.04985ms; SamplesPerSecond = 20061 - Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315693; EvalErr[0]PerSample = 0.90156251; 
TotalTime = 0.02454s; TotalTimePerSample = 0.03835ms; SamplesPerSecond = 26075 - Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180511; EvalErr[0]PerSample = 0.84687501; TotalTime = 0.02438s; TotalTimePerSample = 0.03809ms; SamplesPerSecond = 26254 - Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94157934; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.02445s; TotalTimePerSample = 0.03820ms; SamplesPerSecond = 26181 - Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668945; EvalErr[0]PerSample = 0.91093749; TotalTime = 0.02429s; TotalTimePerSample = 0.03795ms; SamplesPerSecond = 26352 - Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866368; EvalErr[0]PerSample = 0.89531249; TotalTime = 0.02445s; TotalTimePerSample = 0.03820ms; SamplesPerSecond = 26178 - Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51809072; EvalErr[0]PerSample = 0.82968748; TotalTime = 0.02423s; TotalTimePerSample = 0.03786ms; SamplesPerSecond = 26415 - Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48454905; EvalErr[0]PerSample = 0.80781251; TotalTime = 0.02249s; TotalTimePerSample = 0.03514ms; SamplesPerSecond = 28457 - Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829641; EvalErr[0]PerSample = 0.76875001; TotalTime = 0.02169s; TotalTimePerSample = 0.03390ms; SamplesPerSecond = 29501 - Epoch[ 1 of 3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167227; EvalErr[0]PerSample = 0.79843748; TotalTime = 0.02178s; TotalTimePerSample = 0.03403ms; SamplesPerSecond = 29386 +Validating for node EvalErrorPrediction. 10 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 62] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 62] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] + +Validating for node EvalErrorPrediction, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 62] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 62] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. + + +Starting minibatch loop. 
+ Epoch[ 1 of 3]-Minibatch[ 1- 10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45645981; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.15527s; TotalTimePerSample = 0.24261ms; SamplesPerSecond = 4121 + Epoch[ 1 of 3]-Minibatch[ 11- 20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.22315750; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.17254s; TotalTimePerSample = 0.26960ms; SamplesPerSecond = 3709 + Epoch[ 1 of 3]-Minibatch[ 21- 30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.95180664; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.16283s; TotalTimePerSample = 0.25443ms; SamplesPerSecond = 3930 + Epoch[ 1 of 3]-Minibatch[ 31- 40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.94158020; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.15770s; TotalTimePerSample = 0.24641ms; SamplesPerSecond = 4058 + Epoch[ 1 of 3]-Minibatch[ 41- 50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.85668945; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.17209s; TotalTimePerSample = 0.26889ms; SamplesPerSecond = 3719 + Epoch[ 1 of 3]-Minibatch[ 51- 60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.72866364; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.16186s; TotalTimePerSample = 0.25291ms; SamplesPerSecond = 3954 + Epoch[ 1 of 3]-Minibatch[ 61- 70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.51809235; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.15901s; TotalTimePerSample = 0.24846ms; SamplesPerSecond = 4024 + Epoch[ 1 of 3]-Minibatch[ 71- 80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.48455200; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.15480s; TotalTimePerSample = 0.24188ms; SamplesPerSecond = 4134 + Epoch[ 1 of 3]-Minibatch[ 81- 90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.33829346; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.15737s; TotalTimePerSample = 0.24588ms; SamplesPerSecond = 4066 + Epoch[ 1 of 3]-Minibatch[ 91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.50167236; EvalErr[0]PerSample = 
0.79843750; TotalTime = 0.15904s; TotalTimePerSample = 0.24849ms; SamplesPerSecond = 4024 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times. - Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861624; EvalErr[0]PerSample = 0.80000001; TotalTime = 0.02166s; TotalTimePerSample = 0.03385ms; SamplesPerSecond = 29546 - Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32616878; EvalErr[0]PerSample = 0.79062498; TotalTime = 0.02063s; TotalTimePerSample = 0.03224ms; SamplesPerSecond = 31018 - Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16897583; EvalErr[0]PerSample = 0.77968752; TotalTime = 0.01950s; TotalTimePerSample = 0.03048ms; SamplesPerSecond = 32813 - Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08891916; EvalErr[0]PerSample = 0.77656251; TotalTime = 0.01961s; TotalTimePerSample = 0.03063ms; SamplesPerSecond = 32644 - Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06004953; EvalErr[0]PerSample = 0.72968751; TotalTime = 0.01950s; TotalTimePerSample = 0.03046ms; SamplesPerSecond = 32825 - Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128540; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.01965s; TotalTimePerSample = 0.03070ms; SamplesPerSecond = 32571 - Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90172124; EvalErr[0]PerSample = 0.72968751; TotalTime = 0.01828s; TotalTimePerSample = 0.02857ms; SamplesPerSecond = 35003 - Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73261714; EvalErr[0]PerSample = 0.65312499; TotalTime = 0.01799s; TotalTimePerSample = 0.02811ms; SamplesPerSecond = 35569 - Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515493; EvalErr[0]PerSample = 
0.68437499; TotalTime = 0.01789s; TotalTimePerSample = 0.02796ms; SamplesPerSecond = 35766 - Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67383432; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.01792s; TotalTimePerSample = 0.02800ms; SamplesPerSecond = 35708 - Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869272; EvalErr[0]PerSample = 0.63593751; TotalTime = 0.01805s; TotalTimePerSample = 0.02821ms; SamplesPerSecond = 35451 - Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60032344; EvalErr[0]PerSample = 0.66718751; TotalTime = 0.01696s; TotalTimePerSample = 0.02650ms; SamplesPerSecond = 37738 - Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134038; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.01658s; TotalTimePerSample = 0.02591ms; SamplesPerSecond = 38598 - Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362544; EvalErr[0]PerSample = 0.63749999; TotalTime = 0.01663s; TotalTimePerSample = 0.02598ms; SamplesPerSecond = 38491 - Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640615; EvalErr[0]PerSample = 0.61562502; TotalTime = 0.01670s; TotalTimePerSample = 0.02610ms; SamplesPerSecond = 38321 - Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745474; EvalErr[0]PerSample = 0.62812501; TotalTime = 0.01672s; TotalTimePerSample = 0.02612ms; SamplesPerSecond = 38279 - Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16415405; EvalErr[0]PerSample = 0.56718749; TotalTime = 0.01621s; TotalTimePerSample = 0.02533ms; SamplesPerSecond = 39481 - Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30347300; EvalErr[0]PerSample = 0.63593751; TotalTime = 0.01583s; TotalTimePerSample = 0.02474ms; SamplesPerSecond = 40427 - Epoch[ 
1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398804; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.01579s; TotalTimePerSample = 0.02467ms; SamplesPerSecond = 40542 - Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322256; EvalErr[0]PerSample = 0.57968748; TotalTime = 0.01582s; TotalTimePerSample = 0.02472ms; SamplesPerSecond = 40447 - Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664429; EvalErr[0]PerSample = 0.59531248; TotalTime = 0.01570s; TotalTimePerSample = 0.02453ms; SamplesPerSecond = 40761 - Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246572; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.01556s; TotalTimePerSample = 0.02431ms; SamplesPerSecond = 41139 -Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836918; Ave LearnRatePerSample = 0.015625; EpochTime=0.657568 -Starting Epoch 2: learning rate per sample = 0.001953 momentum = 0.656119 -minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480) with 1 datapasses - Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151960; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.03143s; TotalTimePerSample = 0.01228ms; SamplesPerSecond = 81456 - Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395634; EvalErr[0]PerSample = 0.54257810; TotalTime = 0.02295s; TotalTimePerSample = 0.00896ms; SamplesPerSecond = 111561 - Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575521; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.02287s; TotalTimePerSample = 0.00893ms; SamplesPerSecond = 111951 - Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90484965; EvalErr[0]PerSample = 0.53164065; TotalTime = 0.02284s; TotalTimePerSample = 0.00892ms; SamplesPerSecond 
= 112069 - Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324130; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.02277s; TotalTimePerSample = 0.00889ms; SamplesPerSecond = 112448 - Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109266; EvalErr[0]PerSample = 0.53359377; TotalTime = 0.02287s; TotalTimePerSample = 0.00894ms; SamplesPerSecond = 111917 - Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496076; EvalErr[0]PerSample = 0.52890623; TotalTime = 0.02279s; TotalTimePerSample = 0.00890ms; SamplesPerSecond = 112325 - Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944366; EvalErr[0]PerSample = 0.52265626; TotalTime = 0.02265s; TotalTimePerSample = 0.00885ms; SamplesPerSecond = 113044 -Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603518; Ave LearnRatePerSample = 0.001953125; EpochTime=0.192318 -Starting Epoch 3: learning rate per sample = 0.000098 momentum = 0.656119 -minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses - Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752820; EvalErr[0]PerSample = 0.52177733; TotalTime = 0.08080s; TotalTimePerSample = 0.00789ms; SamplesPerSecond = 126735 - Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358737; EvalErr[0]PerSample = 0.51542968; TotalTime = 0.05544s; TotalTimePerSample = 0.00541ms; SamplesPerSecond = 184694 -Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705578; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=0.139063 + Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.22861633; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.16485s; TotalTimePerSample = 0.25757ms; SamplesPerSecond = 3882 + Epoch[ 1 of 
3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.32616882; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.15116s; TotalTimePerSample = 0.23618ms; SamplesPerSecond = 4234 + Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.16897583; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.15167s; TotalTimePerSample = 0.23699ms; SamplesPerSecond = 4219 + Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.08891907; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.16170s; TotalTimePerSample = 0.25265ms; SamplesPerSecond = 3958 + Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.06005249; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.15522s; TotalTimePerSample = 0.24254ms; SamplesPerSecond = 4123 + Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.91128540; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.15756s; TotalTimePerSample = 0.24618ms; SamplesPerSecond = 4062 + Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.90172119; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.15992s; TotalTimePerSample = 0.24987ms; SamplesPerSecond = 4002 + Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.73261719; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.16060s; TotalTimePerSample = 0.25093ms; SamplesPerSecond = 3985 + Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.66515503; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.15478s; TotalTimePerSample = 0.24184ms; SamplesPerSecond = 4134 + Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.67383423; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.14563s; TotalTimePerSample = 0.22755ms; SamplesPerSecond = 4394 + Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.52869263; EvalErr[0]PerSample = 
0.63593750; TotalTime = 0.15331s; TotalTimePerSample = 0.23955ms; SamplesPerSecond = 4174 + Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.60032349; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.15816s; TotalTimePerSample = 0.24713ms; SamplesPerSecond = 4046 + Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.51134033; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.16587s; TotalTimePerSample = 0.25917ms; SamplesPerSecond = 3858 + Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.45362549; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.15854s; TotalTimePerSample = 0.24772ms; SamplesPerSecond = 4036 + Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.41640015; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.15948s; TotalTimePerSample = 0.24919ms; SamplesPerSecond = 4013 + Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.39745483; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.16179s; TotalTimePerSample = 0.25280ms; SamplesPerSecond = 3955 + Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.16415405; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.16235s; TotalTimePerSample = 0.25367ms; SamplesPerSecond = 3942 + Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.30347290; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.15271s; TotalTimePerSample = 0.23861ms; SamplesPerSecond = 4190 + Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.24398804; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.16522s; TotalTimePerSample = 0.25815ms; SamplesPerSecond = 3873 + Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.15322266; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.15988s; TotalTimePerSample = 0.24982ms; SamplesPerSecond = 4002 + Epoch[ 1 of 
3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.21664429; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.14906s; TotalTimePerSample = 0.23290ms; SamplesPerSecond = 4293 + Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.25246582; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.14161s; TotalTimePerSample = 0.22126ms; SamplesPerSecond = 4519 +Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836918; Ave LearnRatePerSample = 0.015625; EpochTime=5.105428 +Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 +minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 1, with 1 datapasses + +Starting minibatch loop. + Epoch[ 2 of 3]-Minibatch[ 1- 10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.08151951; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.35121s; TotalTimePerSample = 0.13719ms; SamplesPerSecond = 7289 + Epoch[ 2 of 3]-Minibatch[ 11- 20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98395710; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.31404s; TotalTimePerSample = 0.12267ms; SamplesPerSecond = 8151 + Epoch[ 2 of 3]-Minibatch[ 21- 30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.98575516; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.27053s; TotalTimePerSample = 0.10567ms; SamplesPerSecond = 9463 + Epoch[ 2 of 3]-Minibatch[ 31- 40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.90485039; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.24565s; TotalTimePerSample = 0.09596ms; SamplesPerSecond = 10421 + Epoch[ 2 of 3]-Minibatch[ 41- 50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.88324280; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.22956s; TotalTimePerSample = 0.08967ms; SamplesPerSecond = 11151 + Epoch[ 2 of 3]-Minibatch[ 51- 60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89109344; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.22156s; 
TotalTimePerSample = 0.08655ms; SamplesPerSecond = 11554 + Epoch[ 2 of 3]-Minibatch[ 61- 70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.89496155; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.21987s; TotalTimePerSample = 0.08589ms; SamplesPerSecond = 11643 + Epoch[ 2 of 3]-Minibatch[ 71- 80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.85944366; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.19881s; TotalTimePerSample = 0.07766ms; SamplesPerSecond = 12876 +Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.935603; EvalErrPerSample = 0.53603518; Ave LearnRatePerSample = 0.001953125; EpochTime=2.098193 +Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 +minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses + +Starting minibatch loop. + Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752853; EvalErr[0]PerSample = 0.52177734; TotalTime = 0.71783s; TotalTimePerSample = 0.07010ms; SamplesPerSecond = 14265 + Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358818; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.60551s; TotalTimePerSample = 0.05913ms; SamplesPerSecond = 16911 +Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.428405 +CNTKCommandTrainEnd: speechTrain COMPLETED === Deleting last epoch data ==== Re-running from checkpoint +=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E DeviceId=0 
------------------------------------------------------------------- Build info: - Built time: Aug 11 2015 16:18:17 - Last modified date: Tue Aug 11 16:16:08 2015 - Built by svcphil on dphaim-26-new - Build Path: C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\MachineLearning\CNTK\ + Built time: Oct 24 2015 13:33:25 + Last modified date: Thu Oct 22 16:00:27 2015 + Built by amitaga on Amitaga-Win-DT3 + Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\ CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 - Build Branch: master - Build SHA1: 397cc7cc16c00b1c12864d331c0729fde7a1bde3 ------------------------------------------------------------------- -running on dphaim-26-new at 2015/08/11 17:47:34 -command line options: -configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto +running on Amitaga-Win-DT3 at 2015/10/24 22:08:20 +command line: +E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E DeviceId=0 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> precision=float @@ -389,6 +598,31 @@ speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, 
tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? + ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -427,21 +661,22 @@ speechTrain=[ ] ] ] -RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu -DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data -DeviceId=Auto +RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu +DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E +DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> precision=float command=speechTrain -deviceId=Auto +deviceId=0 parallelTrain=false speechTrain=[ action=train - modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn - deviceId=Auto + modelPath=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn + deviceId=0 traceLevel=1 SimpleNetworkBuilder=[ layerSizes=363:512:512:132 @@ -453,6 +688,31 @@ speechTrain=[ 
uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? 
+ ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -484,30 +744,32 @@ speechTrain=[ scpFile=glob_0000.scp ] labels=[ - mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf - labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list + mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf + labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list labelDim=132 labelType=Category ] ] ] -RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu -DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data -DeviceId=Auto +RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu +DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E +DeviceId=0 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> configparameters: cntk.config:command=speechTrain -configparameters: cntk.config:DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data -configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E +configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data +configparameters: cntk.config:deviceId=0 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu +configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu configparameters: 
cntk.config:speechTrain=[ action=train - modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn - deviceId=Auto + modelPath=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn + deviceId=0 traceLevel=1 SimpleNetworkBuilder=[ layerSizes=363:512:512:132 @@ -519,6 +781,31 @@ configparameters: cntk.config:speechTrain=[ uniformInit=true needPrior=true ] + ExperimentalNetworkBuilder=[ // the same as above but with BS + layerSizes=363:512:512:132 + trainingCriterion='CE' + evalCriterion='Err' + applyMeanVarNorm=true + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? 
+ ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] SGD=[ epochSize=20480 minibatchSize=64:256:1024: @@ -550,8 +837,8 @@ configparameters: cntk.config:speechTrain=[ scpFile=glob_0000.scp ] labels=[ - mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf - labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list + mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf + labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list labelDim=132 labelType=Category ] @@ -561,25 +848,24 @@ configparameters: cntk.config:speechTrain=[ <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< command: speechTrain precision = float -LockDevice: Capture device 1 and lock it for exclusive use -LockDevice: Capture device 2 and lock it for exclusive use -LockDevice: Capture device 3 and lock it for exclusive use -LockDevice: Capture device 0 and lock it for exclusive use -LockDevice: Capture device 1 and lock it for exclusive use -SimpleNetworkBuilder Using GPU 1 +CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn +CNTKCommandTrainInfo: speechTrain : 3 +CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +CNTKCommandTrainBegin: speechTrain +SimpleNetworkBuilder Using GPU 0 reading script file glob_0000.scp ... 948 entries trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion -total 132 state names in state list C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list -htkmlfreader: reading MLF file C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf ... 
total 948 entries +total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list +htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances label set 0: 129 classes minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames -Starting from checkpoint. Load Network From File C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn.2. +Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn.2. Printing Gradient Computation Node Order ... -CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 256], HLast[0, 0]) +CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 0], HLast[0, 0]) HLast[0, 0] = Plus(W2*H1[0, 0], B2[132, 1]) B2[132, 1] = LearnableParameter W2*H1[0, 0] = Times(W2[132, 512], H2[0, 0]) @@ -591,148 +877,474 @@ H1[0, 0] = Sigmoid(W0*features+B0[0, 0]) W0*features+B0[0, 0] = Plus(W0*features[0, 0], B0[512, 1]) B0[512, 1] = LearnableParameter W0*features[0, 0] = Times(W0[512, 363], MVNormalizedFeatures[0, 0]) -MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -InvStdOfFeatures[363, 1] = InvStdDev(features[363, 256]) -MeanOfFeatures[363, 1] = Mean(features[363, 256]) -features[363, 256] = InputValue +MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) +InvStdOfFeatures[363, 1] = InvStdDev(features[363, 0]) +MeanOfFeatures[363, 1] = Mean(features[363, 0]) +features[363, 0] = InputValue W0[512, 363] = LearnableParameter W1[512, 
512] = LearnableParameter W2[132, 512] = LearnableParameter -labels[132, 256] = InputValue +labels[132, 0] = InputValue -Validating node CrossEntropyWithSoftmax +Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. -Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 256]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 256]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 256]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 256]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1]) -Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256]) +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] 
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. -Validating node ScaledLogLikelihood +Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1. 
-Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 256]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 256]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 256]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 256]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1]) -Validating --> labels = InputValue -Validating --> Prior = Mean(labels[132, 256]) -Validating --> LogOfPrior = Log(Prior[132, 1]) -Validating --> ScaledLogLikelihood = Minus(HLast[132, 256], LogOfPrior[132, 1]) +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] 
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax. 9 nodes to process in pass 2. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], 
B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node CrossEntropyWithSoftmax, final verification. + +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> CrossEntropyWithSoftmax = 
CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. -Validating node EvalErrorPrediction +Validating for node ScaledLogLikelihood. 22 nodes to process in pass 1. -Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 256]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 256]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 256]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 256]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1]) -Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 256], HLast[132, 256]) +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = 
Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +Validating for node ScaledLogLikelihood. 11 nodes to process in pass 2. 
+ +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +Validating for node ScaledLogLikelihood, final verification. 
+ +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +10 out of 22 nodes do not share the minibatch layout with the input data. + + + +Validating for node ScaledLogLikelihood. 22 nodes to process in pass 1. 
+ +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +Validating for node ScaledLogLikelihood. 9 nodes to process in pass 2. 
+ +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +Validating for node ScaledLogLikelihood, final verification. 
+ +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1] +Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1] +Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0] + +10 out of 22 nodes do not share the minibatch layout with the input data. + + + +Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node EvalErrorPrediction. 9 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node EvalErrorPrediction, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. + + + +Validating for node EvalErrorPrediction. 20 nodes to process in pass 1. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node EvalErrorPrediction. 9 nodes to process in pass 2. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +Validating for node EvalErrorPrediction, final verification. 
+ +Validating --> labels = InputValue -> [132, MBSize 0] +Validating --> W2 = LearnableParameter -> [132, 512] +Validating --> W1 = LearnableParameter -> [512, 512] +Validating --> W0 = LearnableParameter -> [512, 363] +Validating --> features = InputValue -> [363, MBSize 0] +Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1] +Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1] +Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0] +Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0] +Validating --> B0 = LearnableParameter -> [512, 1] +Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0] +Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> B1 = LearnableParameter -> [512, 1] +Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0] +Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0] +Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0] +Validating --> B2 = LearnableParameter -> [132, 1] +Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0] +Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1] + +9 out of 20 nodes do not share the minibatch layout with the input data. GetTrainCriterionNodes ... GetEvalCriterionNodes ... 
- - -Validating node CrossEntropyWithSoftmax - -Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 256]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 256]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 256]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 256]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1]) -Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256]) - No PreCompute nodes found, skipping PreCompute step Set Max Temp Mem Size For Convolution Nodes to 0 samples. 
-Starting Epoch 3: learning rate per sample = 0.000098 momentum = 0.656119 -minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses +Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 +minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms - -Validating node EvalErrorPrediction - -Validating --> labels = InputValue -Validating --> W2 = LearnableParameter -Validating --> W1 = LearnableParameter -Validating --> W0 = LearnableParameter -Validating --> features = InputValue -Validating --> MeanOfFeatures = Mean(features[363, 1024]) -Validating --> InvStdOfFeatures = InvStdDev(features[363, 1024]) -Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 1024], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 1024]) -Validating --> B0 = LearnableParameter -Validating --> W0*features+B0 = Plus(W0*features[512, 1024], B0[512, 1]) -Validating --> H1 = Sigmoid(W0*features+B0[512, 1024]) -Validating --> W1*H1 = Times(W1[512, 512], H1[512, 1024]) -Validating --> B1 = LearnableParameter -Validating --> W1*H1+B1 = Plus(W1*H1[512, 1024], B1[512, 1]) -Validating --> H2 = Sigmoid(W1*H1+B1[512, 1024]) -Validating --> W2*H1 = Times(W2[132, 512], H2[512, 1024]) -Validating --> B2 = LearnableParameter -Validating --> HLast = Plus(W2*H1[132, 1024], B2[132, 1]) -Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 1024], HLast[132, 1024]) - - Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752820; EvalErr[0]PerSample = 0.52177733; TotalTime = 0.42093s; TotalTimePerSample = 0.04111ms; SamplesPerSecond = 24327 - Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358737; 
EvalErr[0]PerSample = 0.51542968; TotalTime = 0.05521s; TotalTimePerSample = 0.00539ms; SamplesPerSecond = 185480 -Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705578; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=0.690137 +Starting minibatch loop. + Epoch[ 3 of 3]-Minibatch[ 1- 10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.86752853; EvalErr[0]PerSample = 0.52177734; TotalTime = 1.50756s; TotalTimePerSample = 0.14722ms; SamplesPerSecond = 6792 + Epoch[ 3 of 3]-Minibatch[ 11- 20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.87358818; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.86938s; TotalTimePerSample = 0.08490ms; SamplesPerSecond = 11778 +Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=6.283729 +CNTKCommandTrainEnd: speechTrain COMPLETED