Mirror of https://github.com/mozilla/kaldi.git
trunk: modifying recipes for neural net training with the new online preconditioning; adding/modifying example scripts for RM and Fisher; various cosmetic code changes.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4088 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Parent: f67874f274
Commit: 99873c6171

@@ -1,11 +1,13 @@
#!/bin/bash

# this (local/nnet2/run_6c_gpu.sh) trains a p-norm neural network on top of
# the SAT system in 5a.
# It uses the _fast.sh version of the script, which is faster than the old
# one, and also the --first-component-power 0.5 option, which we believe
# improves results (we're waiting for the numbers though).

dir=nnet6c_gpu
dir=nnet6c5_gpu
train_stage=-10

. ./cmd.sh
@@ -28,11 +30,13 @@ parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely

if [ ! -f exp/$dir/final.mdl ]; then

steps/nnet2/train_pnorm.sh --stage $train_stage --num-epochs 10 --get-egs-stage 3 --stage -3 \
steps/nnet2/train_pnorm_fast.sh --stage $train_stage --num-epochs 8 \
--first-component-power 0.5 \
--egs-dir exp/nnet6c3_gpu/egs \
--num-epochs-extra 4 \
--samples-per-iter 400000 \
--io-opts "-tc 10" \
--num-epochs-extra 5 \
--num-jobs-nnet 8 --num-threads 1 --max-change 40.0 \
--num-jobs-nnet 8 --num-threads 1 \
--minibatch-size 512 --parallel-opts "$parallel_opts" \
--mix-up 15000 \
--initial-learning-rate 0.08 --final-learning-rate 0.008 \
@@ -43,7 +47,7 @@ parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely
data/train data/lang exp/tri5a exp/$dir || exit 1;
fi

steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 30 \
steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 25 \
--config conf/decode.config --transform-dir exp/tri5a/decode_dev \
exp/tri5a/graph data/dev exp/$dir/decode_dev &

@@ -105,14 +105,17 @@ exit 0
%WER 1.72 [ 216 / 12533, 25 ins, 38 del, 153 sub ] exp/nnet4b_gpu/decode/wer_4
%WER 8.34 [ 1045 / 12533, 94 ins, 146 del, 805 sub ] exp/nnet4b_gpu/decode_ug/wer_10

%WER 1.75 [ 219 / 12533, 23 ins, 54 del, 142 sub ] exp/nnet4c/decode/wer_4
%WER 8.83 [ 1107 / 12533, 70 ins, 206 del, 831 sub ] exp/nnet4c/decode_ug/wer_10
%WER 1.72 [ 216 / 12533, 24 ins, 51 del, 141 sub ] exp/nnet4c/decode/wer_4
%WER 9.04 [ 1133 / 12533, 110 ins, 170 del, 853 sub ] exp/nnet4c/decode_ug/wer_7

%WER 1.76 [ 220 / 12533, 16 ins, 60 del, 144 sub ] exp/nnet4c_gpu/decode/wer_6
%WER 8.82 [ 1106 / 12533, 90 ins, 173 del, 843 sub ] exp/nnet4c_gpu/decode_ug/wer_11
%WER 1.83 [ 229 / 12533, 28 ins, 57 del, 144 sub ] exp/nnet4c_gpu/decode/wer_4
%WER 9.08 [ 1138 / 12533, 111 ins, 166 del, 861 sub ] exp/nnet4c_gpu/decode_ug/wer_7

%WER 1.61 [ 202 / 12533, 29 ins, 34 del, 139 sub ] exp/nnet4d_gpu/decode/wer_2
%WER 8.48 [ 1063 / 12533, 76 ins, 176 del, 811 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.60 [ 201 / 12533, 28 ins, 43 del, 130 sub ] exp/nnet4d/decode/wer_4
%WER 8.24 [ 1033 / 12533, 83 ins, 166 del, 784 sub ] exp/nnet4d/decode_ug/wer_11

%WER 1.69 [ 212 / 12533, 23 ins, 54 del, 135 sub ] exp/nnet4d_gpu/decode/wer_5
%WER 8.39 [ 1052 / 12533, 86 ins, 174 del, 792 sub ] exp/nnet4d_gpu/decode_ug/wer_11

%WER 1.37 [ 172 / 12533, 14 ins, 36 del, 122 sub ] exp/nnet4e_gpu/decode/wer_3
%WER 8.03 [ 1006 / 12533, 61 ins, 179 del, 766 sub ] exp/nnet4e_gpu/decode_ug/wer_8

@@ -1,27 +1,56 @@
#!/bin/bash

# This is neural net training on top of adapted 40-dimensional features.
#
# The same script works for GPUs, and for CPU only (with --use-gpu false).

train_stage=-10
use_gpu=true

. cmd.sh
. ./path.sh
. utils/parse_options.sh
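# (This script is invoked from ../run_nnet2.sh as e.g.
#  local/nnet2/run_4c.sh --use-gpu true   or   --use-gpu false.)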

( steps/nnet2/train_tanh.sh --num-epochs 20 \
--num-epochs-extra 10 --add-layers-period 1 \

if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
dir=exp/nnet4c_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
dir=exp/nnet4c
fi

if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_tanh_fast.sh --stage $train_stage \
--num-epochs 20 \
--add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--hidden-layer-dim 375 \
data/train data/lang exp/tri3b_ali exp/nnet4c
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/nnet4c/decode

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test exp/nnet4c/decode_ug

)
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode &

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug

wait

@@ -1,42 +0,0 @@
#!/bin/bash

# This is neural net training on top of adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.
#
# Since we're using one quarter the number of jobs (num-jobs-nnet) as the
# run_4c.sh script, we halve the learning rate (generally speaking, splitting
# the difference like this is probably a good idea.)

parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it.

. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF

( steps/nnet2/train_tanh.sh --num-epochs 20 \
--num-jobs-nnet 4 --num-threads 1 --parallel-opts "$parallel_opts" \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.01 --final-learning-rate 0.002 \
--cmd "$decode_cmd" \
--hidden-layer-dim 375 \
data/train data/lang exp/tri3b_ali exp/nnet4c_gpu

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/nnet4c_gpu/decode

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test exp/nnet4c_gpu/decode_ug

)

@@ -1,21 +1,42 @@
#!/bin/bash

# This is pnorm neural net training on top of adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.

# local/nnet2/run_4d.sh is the new, faster version of the p-norm training script.
# The same script works for GPUs, and for CPU only (with --use-gpu false).

# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"

train_stage=-10
use_gpu=true

. cmd.sh
. ./path.sh
. utils/parse_options.sh

dir=exp/nnet4d
( steps/nnet2/train_pnorm.sh --num-epochs 20 \
--num-jobs-nnet 4 --num-threads $num_threads --parallel-opts "$parallel_opts" \

if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
dir=exp/nnet4d_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
dir=exp/nnet4d
fi

if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-threads $num_threads --parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
@@ -23,15 +44,16 @@ dir=exp/nnet4d
--cmd "$decode_cmd" \
--pnorm-input-dim 1000 \
--pnorm-output-dim 200 \
--combine-regularizer 1.0e-12 \
data/train data/lang exp/tri3b_ali $dir
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode &

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug

wait

)

@@ -1,41 +0,0 @@
#!/bin/bash

# This is pnorm neural net training on top of adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.

parallel_opts="-l gpu=1"

. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF

dir=exp/nnet4d_gpu
( steps/nnet2/train_pnorm.sh --num-epochs 20 \
--num-jobs-nnet 4 --num-threads 1 --parallel-opts "$parallel_opts" \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--pnorm-input-dim 750 \
--pnorm-output-dim 150 \
--combine-regularizer 1.0e-12 \
data/train data/lang exp/tri3b_ali $dir

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug

)

@@ -7,7 +7,7 @@
use_gpu=true

if $use_gpu; then
# This example runs on top of "raw-fMLLR" features:
# This example runs on top of "raw-fMLLR" features.
# We don't have a GPU version of this script.
#local/nnet2/run_4a_gpu.sh

@@ -15,11 +15,11 @@ if $use_gpu; then
local/nnet2/run_4b_gpu.sh

# This one is on top of 40-dim + fMLLR features
local/nnet2/run_4c_gpu.sh
local/nnet2/run_4c.sh --use-gpu true

# This one is for training pnorm nnets on top of 40-dim + fMLLR features
# **THIS IS THE PRIMARY RECIPE**
local/nnet2/run_4d_gpu.sh
local/nnet2/run_4d.sh --use-gpu true

# This is discriminative training on top of 4c.
local/nnet2/run_5c_gpu.sh
@@ -34,11 +34,12 @@ else
# This one is on top of filter-bank features, with only CMN.
local/nnet2/run_4b.sh

# This one is on top of 40-dim + fMLLR features
local/nnet2/run_4c.sh
# This one is on top of 40-dim + fMLLR features, it's a fairly
# normal tanh system.
local/nnet2/run_4c.sh --use-gpu false

# **THIS IS THE PRIMARY RECIPE**
local/nnet2/run_4d.sh
# **THIS IS THE PRIMARY RECIPE (40-dim + fMLLR + p-norm neural net)**
local/nnet2/run_4d.sh --use-gpu false

# This is discriminative training on top of 4c.
local/nnet2/run_5c.sh

@@ -1,15 +1,19 @@
#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
# 2013 Xiaohui Zhang
# 2013 Guoguo Chen
# Apache 2.0.

# This script trains neural network with pnorm nonlinearities.
# The difference with train_tanh.sh is that, instead of setting
# hidden_layer_size, you should set pnorm_input_dim and pnorm_output_dim.
# Also the P value (the order of the p-norm) should be set.
# train_pnorm_fast.sh is a new, improved version of train_pnorm.sh, which uses
# the 'online' preconditioning method. For GPUs it's about two times faster
# than before (although that's partly due to optimizations that will also help
# the old recipe), and for CPUs it gives better performance than the old method
# (I believe); also, the difference in optimization performance between CPU and
# GPU is almost gone. The old train_pnorm.sh script is now deprecated.
# We made this a separate script because not all of the options that the
# old script accepted, are still accepted.

# Begin configuration section.
cmd=run.pl
@@ -22,18 +26,15 @@ num_iters_final=20 # Maximum number of final iterations to give to the
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
softmax_learning_rate_factor=1.0 # In the default setting keep the same learning rate.

softmax_learning_rate_factor=1.0 # In the default setting keep the same learning
# rate for the final layer.
pnorm_input_dim=3000
pnorm_output_dim=300
first_component_power=1.0 # could set this to 0.5, sometimes seems to improve results.
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update. Note: it also
# interacts with the "preconditioned" update which generally
# works better with larger minibatch size, so it's not
# completely cost free.
# be a problem with multi-threaded update.

samples_per_iter=200000 # each iteration of training, see this many samples
# per job. This option is passed to get_egs.sh
@@ -59,14 +60,18 @@ io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one t
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to preconditioning: says how often we update the subspace.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.1
max_change_per_sample=0.075
precondition_rank_in=20 # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
@@ -75,8 +80,6 @@ egs_opts=
transform_dir= # If supplied, overrides alidir
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
precondition_rank_in=20
precondition_rank_out=80
# End configuration section.

@@ -262,9 +265,10 @@ echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]

if [ $num_threads -eq 1 ]; then
@@ -296,6 +300,7 @@ while [ $x -lt $num_iters ]; do
fi

echo "Training neural net (pass $x)"

if [ $x -gt 0 ] && \
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
[ $[($x-1) % $add_layers_period] -eq 0 ]; then
@@ -303,18 +308,29 @@ while [ $x -lt $num_iters ]; do
else
mdl=$dir/$x.mdl
fi

$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
# on iteration zero or when we just added a layer, use a smaller minibatch
# size and just one job: the model-averaging doesn't seem to be helpful
# when the model is changing too fast (i.e. it worsens the objective
# function), and the smaller minibatch size will help to keep
# the update stable.
this_minibatch_size=$[$minibatch_size/2];
this_num_jobs_nnet=1
else
this_minibatch_size=$minibatch_size
this_num_jobs_nnet=$num_jobs_nnet
fi

$cmd $parallel_opts JOB=1:$this_num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$train_suffix \
--minibatch-size=$minibatch_size --srand=$x "$mdl" \
nnet-train$train_suffix \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;

nnets_list=
for n in `seq 1 $num_jobs_nnet`; do
for n in `seq 1 $this_num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done

@@ -331,7 +347,7 @@ while [ $x -lt $num_iters ]; do
else lr=$learning_rate; fi
lr_string="$lr_string:$lr"
done

$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;
@@ -372,10 +388,9 @@ if [ $stage -le $num_iters ]; then
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
# if there are many models it can give out-of-memory error; set num-threads to 8
# to speed it up (this isn't ideal...)
this_num_threads=$num_threads
[ $this_num_threads -lt 8 ] && this_num_threads=8
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
mb=$[($num_egs+$this_num_threads-1)/$this_num_threads]
combine_num_threads=8
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
# Setting --initial-model to a large value makes it initialize the combination
# with the average of all the models. It's important not to start with a
@@ -384,15 +399,15 @@ if [ $stage -le $num_iters ]; then
# nnet-combine-fast uses for scaling, which after flooring and inversion, has
# the effect that the initial model chosen gets much higher learning rates
# than the others. This prevents the optimization from working well.
$cmd $parallel_opts $dir/log/combine.log \
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
--num-threads=$this_num_threads \
--num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
$dir/final.mdl || exit 1;

# Normalize stddev for affine or block affine layers that are followed by a
# pnorm layer and then a normalize layer.
$cmd $parallel_opts $dir/log/normalize.log \
$cmd $dir/log/normalize.log \
nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

# Compute the probability of the final, combined model with

@@ -0,0 +1,451 @@
#!/bin/bash

# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.

# This script trains a fairly vanilla network with tanh nonlinearities.

# train_tanh_fast.sh is a new, improved version of train_tanh.sh, which uses
# the 'online' preconditioning method. For GPUs it's about two times faster
# than before (although that's partly due to optimizations that will also help
# the old recipe), and for CPUs it gives better performance than the old method
# (I believe); also, the difference in optimization performance between CPU and
# GPU is almost gone. The old train_tanh.sh script is now deprecated.
# We made this a separate script because not all of the options that the
# old script accepted, are still accepted.

# Begin configuration section.
cmd=run.pl
num_epochs=15 # Number of epochs during which we reduce
# the learning rate; number of iteration is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
# the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
# optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
shrink_interval=5 # shrink every $shrink_interval iters except while we are
# still adding layers, when we do it every iter.
shrink=true
num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if
# given.
final_learning_rate_factor=0.5 # Train the two last layers of parameters half as
# fast as the other layers, by default.

hidden_layer_dim=300 # You may want this larger, e.g. 1024 or 2048.

minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update.

samples_per_iter=200000 # each iteration of training, see this many samples
# per job. This option is passed to get_egs.sh.
num_jobs_nnet=16 # Number of neural net jobs to run in parallel. This option
# is passed to get_egs.sh.
get_egs_stage=0
spk_vecs_dir=

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of
# the samples on each iter. You could set it to 0 or to a large
# value for complete randomization, but this would both consume
# memory and cause spikes in disk I/O. Smaller is easier on
# disk and memory but less random. It's not a huge deal though,
# as samples are anyway randomized right at the start.

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3 # This is an important configuration value that you might
# want to tune.
stage=-5

io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
# we make the [input, output] ranks less different for the tanh setup than for
# the pnorm setup, as we don't have the difference in dimensions to deal with.
precondition_rank_in=30 # relates to online preconditioning
precondition_rank_out=60 # relates to online preconditioning
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
egs_opts=
transform_dir=
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
# End configuration section.

echo "$0 $@" # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of main training"
echo " # while reducing learning rate (determines #iterations, together"
echo " # with --samples-per-iter and --num-jobs-nnet)"
echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training"
echo " # after learning rate fully reduced"
echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
echo " # data, 0.01 for large data"
echo " --final-learning-rate <final-learning-rate|0.004> # Learning rate at end of training, e.g. 0.004 for small"
echo " # data, 0.001 for large data"
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
echo " --initial-num-hidden-layers <#hidden-layers|1> # Number of hidden layers to start with."
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
echo " # per context-dependent state. Try a number several times #states."
echo " --num-jobs-nnet <num-jobs|8> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate."
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --samples-per-iter <#samples|200000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --num-iters-final <#iters|10> # Number of final iterations to give to nnet-combine-fast to "
echo " # interpolate parameters (the weights are learned with a validation set)"
echo " --num-utts-subset <#utts|300> # Number of utterances in subsets used for validation and diagnostics"
echo " # (the validation subset is held out from training)"
echo " --num-frames-diagnostic <#frames|4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames|10000> # Number of frames used in getting combination weights at the"
echo " # very end."
echo " --stage <stage|-9> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."

exit 1;
fi

data=$1
lang=$2
alidir=$3
dir=$4

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Set some variables.
num_leaves=`am-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;

nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/cmvn_opts $dir 2>/dev/null
cp $alidir/tree $dir

[ -z "$transform_dir" ] && transform_dir=$alidir

if [ $stage -le -4 ]; then
echo "$0: calling get_lda.sh"
steps/nnet2/get_lda.sh $lda_opts --transform-dir $transform_dir --splice-width $splice_width --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# these files will have been written by get_lda.sh
feat_dim=`cat $dir/feat_dim` || exit 1;
lda_dim=`cat $dir/lda_dim` || exit 1;

if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
echo "$0: calling get_egs.sh"
[ ! -z $spk_vecs_dir ] && spk_vecs_opt="--spk-vecs-dir $spk_vecs_dir";
steps/nnet2/get_egs.sh $spk_vecs_opt --transform-dir $transform_dir --samples-per-iter $samples_per_iter \
--num-jobs-nnet $num_jobs_nnet --splice-width $splice_width --stage $get_egs_stage \
--cmd "$cmd" $egs_opts --io-opts "$io_opts" \
$data $lang $alidir $dir || exit 1;
fi

if [ -z $egs_dir ]; then
egs_dir=$dir/egs
fi

iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1;
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;

if ! [ $num_hidden_layers -ge 1 ]; then
echo "Invalid num-hidden-layers $num_hidden_layers"
exit 1
fi

if [ $stage -le -2 ]; then
echo "$0: initializing neural net";

# Get spk-vec dim (in case we're using them).
if [ ! -z "$spk_vecs_dir" ]; then
spk_vec_dim=$[$(copy-vector --print-args=false "ark:cat $spk_vecs_dir/vecs.1|" ark,t:- | head -n 1 | wc -w) - 3];
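# (assumes the text-form vector prints as "<utt-id> [ v1 ... vn ]", so 'wc -w'
# also counts the key and the two brackets; hence the '- 3'.)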
! [ $spk_vec_dim -gt 0 ] && echo "Error getting spk-vec dim" && exit 1;
ext_lda_dim=$[$lda_dim + $spk_vec_dim]
extend-transform-dim --new-dimension=$ext_lda_dim $dir/lda.mat $dir/lda_ext.mat || exit 1;
lda_mat=$dir/lda_ext.mat
ext_feat_dim=$[$feat_dim + $spk_vec_dim]
else
spk_vec_dim=0
lda_mat=$dir/lda.mat
ext_lda_dim=$lda_dim
ext_feat_dim=$feat_dim
fi

online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
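# (these options configure the online preconditioner of the
# AffineComponentPreconditionedOnline layers created in nnet.config and
# hidden.config below.)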

stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"`
cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$ext_feat_dim left-context=$splice_width right-context=$splice_width const-component-dim=$spk_vec_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$ext_lda_dim output-dim=$hidden_layer_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
AffineComponentPreconditionedOnline input-dim=$hidden_layer_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

# to hidden.config it will write the part of the config corresponding to a
# single hidden layer; we need this to add new layers.
cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$hidden_layer_dim output-dim=$hidden_layer_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
EOF
$cmd $dir/log/nnet_init.log \
nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
$dir/0.mdl || exit 1;
fi

if [ $stage -le -1 ]; then
echo "Training transition probabilities and setting priors"
$cmd $dir/log/train_trans.log \
nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
|| exit 1;
fi

num_iters_reduce=$[$num_epochs * $iters_per_epoch];
num_iters_extra=$[$num_epochs_extra * $iters_per_epoch];
num_iters=$[$num_iters_reduce+$num_iters_extra]

echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
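# (worked example under the defaults above: num_hidden_layers=3 and
# add_layers_period=2 give finish_add_layers_iter=6, so with a hypothetical
# num_iters=60 the mix-up would happen around iteration (60+6)/2 = 33.)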

if [ $num_threads -eq 1 ]; then
train_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
train_suffix="-parallel --num-threads=$num_threads"
fi

x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
# Set off jobs doing some diagnostics, in the background.
$cmd $dir/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
$cmd $dir/log/progress.$x.log \
nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
ark:$egs_dir/train_diagnostic.egs '&&' \
nnet-am-info $dir/$x.mdl &
fi

echo "Training neural net (pass $x)"
if [ $x -gt 0 ] && \
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
[ $[($x-1) % $add_layers_period] -eq 0 ]; then
mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
else
mdl=$dir/$x.mdl
fi

if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
# on iteration zero or when we just added a layer, use a smaller minibatch
# size and just one job: the model-averaging doesn't seem to be helpful
# when the model is changing too fast (i.e. it worsens the objective
# function), and the smaller minibatch size will help to keep
# the update stable.
this_minibatch_size=$[$minibatch_size/2];
this_num_jobs_nnet=1
else
this_minibatch_size=$minibatch_size
this_num_jobs_nnet=$num_jobs_nnet
fi

$cmd $parallel_opts JOB=1:$this_num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$train_suffix \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;

nnets_list=
for n in `seq 1 $this_num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done

learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
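# (the perl expression above decays the learning rate geometrically from
# $initial_learning_rate to $final_learning_rate over the first
# $num_iters_reduce iterations, then holds it at $final_learning_rate.)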
last_layer_learning_rate=`perl -e "print $learning_rate * $final_learning_rate_factor;"`;
nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo 2>/dev/null || exit 1
nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l`
# na is number of last updatable AffineComponent layer [one-based, counting only
# updatable components.]
# The last two layers will get this (usually lower) learning rate.
lr_string="$learning_rate"
for n in `seq 2 $nu`; do
if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$last_layer_learning_rate;
else lr=$learning_rate; fi
lr_string="$lr_string:$lr"
done

$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;

if $shrink && [ $[$x % $shrink_interval] -eq 0 ]; then
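# ('shrinking' here, as we understand it, rescales the parameters of each
# updatable component: nnet-combine-fast with a single input model optimizes
# per-component scales on the small subset selected below.)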
mb=$[($num_frames_shrink+$num_threads-1)/$num_threads]
$cmd $parallel_opts $dir/log/shrink.$x.log \
nnet-subset-egs --n=$num_frames_shrink --randomize-order=true --srand=$x \
ark:$egs_dir/train_diagnostic.egs ark:- \| \
nnet-combine-fast --use-gpu=no --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
$dir/$[$x+1].mdl ark:- $dir/$[$x+1].mdl || exit 1;
else
# On other iters, do nnet-am-fix which is much faster and has roughly
# the same effect.
nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log
fi

if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
# mix up.
echo Mixing up from $num_leaves to $mix_up components
$cmd $dir/log/mix_up.$x.log \
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
fi
rm $nnets_list
fi
x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
nnets_list=()
if [ $num_iters_final -gt $num_iters_extra ]; then
echo "Setting num_iters_final=$num_iters_extra"
fi
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
idx=$[$x-$start]
if [ $x -gt $mix_up_iter ]; then
nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
fi
done

if [ $stage -le $num_iters ]; then
echo "Doing final combination to produce final.mdl"
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as if
# there are many models it can give out-of-memory error on the GPU; set
# num-threads to 8 to speed it up (this isn't ideal...)
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
combine_num_threads=8
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
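# (mb is a ceiling division that splits the combine examples evenly across the
# 8 combine threads, capped at 512 per minibatch.)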
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --use-gpu=no --num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
$dir/final.mdl || exit 1;

# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
$cmd $dir/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi

if [ $stage -le $[$num_iters+1] ]; then
echo "Getting average posterior for purposes of adjusting the priors."
# Note: this just uses CPUs, using a smallish subset of data.
rm $dir/post.*.vec 2>/dev/null
$cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;

sleep 3; # make sure there is time for $dir/post.*.vec to appear.

$cmd $dir/log/vector_sum.log \
vector-sum $dir/post.*.vec $dir/post.vec || exit 1;

rm $dir/post.*.vec;

echo "Re-adjusting priors based on computed posteriors"
$cmd $dir/log/adjust_priors.log \
nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
fi

sleep 2

echo Done

if $cleanup; then
echo Cleaning up data
if [ $egs_dir == "$dir/egs" ]; then
echo Removing training examples
rm $dir/egs/egs*
fi
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if [ $[$x%100] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then
# delete all but every 100th model; don't delete the ones which combine to form the final model.
rm $dir/$x.mdl
fi
done
fi

@@ -207,8 +207,23 @@ if (! $sync) { # We're not submitting with -sync y, so we
}
}
# We will need the sge_job_id, to check that job still exists
$sge_job_id=`grep "Your job" $queue_logfile | awk '{ print \$3 }' | sed 's|\\\..*||'`;
chomp($sge_job_id);
{ # Get the SGE job-id from the log file in q/
open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile";
undef $sge_job_id;
while (<L>) {
if (m/Your job\S* (\d+)[. ].+ has been submitted/) {
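# (the qsub log typically contains a line such as
#  'Your job 123456 ("wrapper.sh") has been submitted', or for array jobs
#  'Your job-array 123456.1-10:1 ...'; the numbers here are illustrative —
#  we only pull out the numeric job-id.)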
if (defined $sge_job_id) {
die "Error: your job was submitted more than once (see $queue_logfile)";
} else {
$sge_job_id = $1;
}
}
}
close(L);
if (!defined $sge_job_id) {
die "Error: log file $queue_logfile does not specify the SGE job-id.";
}
}
$check_sge_job_ctr=1;
#
$wait = 0.1;

@@ -58,6 +58,9 @@ namespace kaldi {
back to where you started from. We don't do this because
in some contexts, the transform is made symmetric by multiplying
by sqrt(N) in both passes. The user can do this by themselves.

See also SplitRadixComplexFft, declared in srfft.h, which is more efficient
but only works if the length of the input is a power of 2.
*/
template<typename Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);

@@ -43,7 +43,7 @@ SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
logm_ ++;
}
ComputeTables();
temp_buffer = NULL;
temp_buffer_ = NULL;
}

template<typename Real>
@@ -55,21 +55,21 @@ void SplitRadixComplexFft<Real>::ComputeTables() {

lg2 = logm_ >> 1;
if (logm_ & 1) lg2++;
brseed = new MatrixIndexT[1 << lg2];
brseed[0] = 0;
brseed[1] = 1;
brseed_ = new MatrixIndexT[1 << lg2];
brseed_[0] = 0;
brseed_[1] = 1;
for (j = 2; j <= lg2; j++) {
imax = 1 << (j - 1);
for (i = 0; i < imax; i++) {
brseed[i] <<= 1;
brseed[i + imax] = brseed[i] + 1;
brseed_[i] <<= 1;
brseed_[i + imax] = brseed_[i] + 1;
}
}

if (logm_ < 4) {
tab = NULL;
tab_ = NULL;
} else {
tab = new Real* [logm_-3];
tab_ = new Real* [logm_-3];
for (i = logm_; i>=4 ; i--) {
/* Compute a few constants */
m = 1 << i; m2 = m / 2; m4 = m2 / 2; m8 = m4 /2;
@@ -77,10 +77,10 @@ void SplitRadixComplexFft<Real>::ComputeTables() {
/* Allocate memory for tables */
nel = m4 - 2;

tab[i-4] = new Real[6*nel];
tab_[i-4] = new Real[6*nel];

/* Initialize pointers */
cn = tab[i-4]; spcn = cn + nel; smcn = spcn + nel;
cn = tab_[i-4]; spcn = cn + nel; smcn = spcn + nel;
c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel;

/* Compute tables */
@@ -99,14 +99,14 @@ void SplitRadixComplexFft<Real>::ComputeTables() {

template<typename Real>
SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
delete [] brseed;
if (tab != NULL) {
delete [] brseed_;
if (tab_ != NULL) {
for (MatrixIndexT i = 0; i < logm_-3; i++)
delete [] tab[i];
delete [] tab;
delete [] tab_[i];
delete [] tab_;
}
if (temp_buffer != NULL)
delete [] temp_buffer;
// "delete" only does something if it's a non-NULL pointer.
delete [] temp_buffer_;
}

template<typename Real>
@@ -125,29 +125,29 @@ void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const

template<typename Real>
void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
if (temp_buffer == NULL)
temp_buffer = new Real[N_];
if (temp_buffer_== NULL)
temp_buffer_ = new Real[N_];
for (MatrixIndexT i = 0; i < N_; i++) {
x[i] = x[i*2]; // put the real part in the first half of x.
temp_buffer[i] = x[i*2 + 1]; // put the imaginary part in temp_buffer.
temp_buffer_[i] = x[i*2 + 1]; // put the imaginary part in temp_buffer.
}
// copy the imaginary part back to the second half of x.
memcpy(static_cast<void*>(x+N_),
static_cast<void*>(temp_buffer),
memcpy(static_cast<void*>(x + N_),
static_cast<void*>(temp_buffer_),
sizeof(Real) * N_);

Compute(x, x+N_, forward);
Compute(x, x + N_, forward);
// Now change the format back to interleaved.
memcpy(static_cast<void*>(temp_buffer),
static_cast<void*>(x+N_),
memcpy(static_cast<void*>(temp_buffer_),
static_cast<void*>(x + N_),
sizeof(Real) * N_);
for (MatrixIndexT i = N_-1; i > 0; i--) { // don't include 0,
// in case MatrixIndexT is unsigned, the loop would not terminate.
// Treat it as a special case.
x[i*2] = x[i];
x[i*2 + 1] = temp_buffer[i];
x[i*2 + 1] = temp_buffer_[i];
}
x[1] = temp_buffer[0]; // special case of i = 0.
x[1] = temp_buffer_[0]; // special case of i = 0.
}

template<typename Real>
@@ -162,11 +162,11 @@ void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logm) c

/* Unshuffling loop */
for (off = 1; off < n; off++) {
fj = n * brseed[off]; i = off; j = fj;
fj = n * brseed_[off]; i = off; j = fj;
tmp = x[i]; x[i] = x[j]; x[j] = tmp;
xp = &x[i];
brp = &(brseed[1]);
for (gno = 1; gno < brseed[off]; gno++) {
brp = &(brseed_[1]);
for (gno = 1; gno < brseed_[off]; gno++) {
xp += n;
j = fj + *brp++;
xq = x + j;
@@ -281,7 +281,7 @@ void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixInde
xi1 = xi + m2; xi2 = xi1 + m4;
if (logm >= 4) {
nel = m4 - 2;
cn = tab[logm-4]; spcn = cn + nel; smcn = spcn + nel;
cn = tab_[logm-4]; spcn = cn + nel; smcn = spcn + nel;
c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel;
}
xr1++; xr2++; xi1++; xi2++;

@@ -40,11 +40,14 @@ namespace kaldi {
// permission, optimized by Go Vivace Inc., and converted into C++ by
// Microsoft Corporation
// This is a more efficient way of doing the complex FFT than ComplexFft
// above, but it only works for powers of 2.
// (declared in matrix-functions.h), but it only works for powers of 2.
// Note: in multi-threaded code, you would need to have one of these objects per
// thread, because multiple calls to Compute in parallel would not work.
template<typename Real>
class SplitRadixComplexFft {
public:
typedef MatrixIndexT Integer;

// N is the number of complex points (must be a power of two, or this
// will crash). Note that the constructor does some work so it's best to
// initialize the object once and do the computation many times.
@@ -73,12 +76,12 @@ class SplitRadixComplexFft {
Integer logm_; // log(N) [a slight mismatch in notation which we have not
// bothered to fix].

Integer *brseed;
Integer *brseed_;
// brseed is Evans' seed table, ref: (Ref: D. M. W.
// Evans, "An improved digit-reversal permutation algorithm ...",
// IEEE Trans. ASSP, Aug. 1987, pp. 1120-1125).
Real **tab; // Tables of butterfly coefficients.
Real *temp_buffer; // Allocated only if someone calls Compute with only
Real **tab_; // Tables of butterfly coefficients.
Real *temp_buffer_; // Allocated only if someone calls Compute with only
// one argument and we need a temporary buffer while creating interleaved
// data.
};

@@ -241,12 +241,19 @@ void OnlinePreconditioner::PreconditionDirectionsInternal(
locked = false;
}
}

if (!locked) {
// We're not updating the parameters, either because another thread is
// working on updating them, or because another thread already did so from
// the same or later starting point (making our update stale), or because
// update_period_ > 1. We just apply the preconditioning and return.

// note: we don't bother with any locks before incrementing
// num_updates_skipped_ below, because the worst that could happen is that,
// on very rare occasions, we could skip one or two more updates than we
// intended.
num_updates_skipped_++;

BaseFloat tr_Rt_RtT = TraceMatMat(*R_t, *R_t, kTrans);
// P_t = R_t - H_t W_t
R_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);
@@ -258,11 +265,6 @@ void OnlinePreconditioner::PreconditionDirectionsInternal(
BaseFloat gamma_t = (tr_Pt_PtT == 0.0 ? 1.0 :
sqrt(tr_Rt_RtT / tr_Pt_PtT));
*scale = gamma_t;
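// gamma_t is chosen so that scaling P_t by it restores the Frobenius norm of
// the original R_t (tr_Rt_RtT and tr_Pt_PtT are the squared norms before and
// after preconditioning).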
// note: we don't bother with any locks before incrementing
// num_updates_skipped_ below, because the worst that could happen is that,
// on very rare occasions, we could skip one or two more updates than we
// intended.
num_updates_skipped_++;
return;
}
J_t.AddMatMat(1.0, H_t, kTrans, *R_t, kNoTrans, 0.0); // J_t = H_t^T R_t
@@ -295,6 +297,7 @@ void OnlinePreconditioner::PreconditionDirectionsInternal(
BaseFloat beta_t = rho_t * (1.0 + alpha_) + alpha_ * d_t.Sum() / D;
Vector<BaseFloat> e_t(R), sqrt_e_t(R), inv_sqrt_e_t(R);
ComputeEt(d_t, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);
KALDI_VLOG(5) << "e_t = " << e_t;

SpMatrix<BaseFloat> Z_t(R);
ComputeZt(N, rho_t, d_t, inv_sqrt_e_t, K_t_cpu, L_t_cpu, &Z_t);
@@ -494,7 +497,6 @@ void OnlinePreconditioner::ComputeEt(const VectorBase<BaseFloat> &d_t,
BaseFloat *e = e_t->Data();
for (int32 i = 0; i < D; i++)
e[i] = 1.0 / (beta_t / d[i] + 1);
KALDI_VLOG(5) << "e_t = " << *e_t;
sqrt_e_t->CopyFromVec(*e_t);
sqrt_e_t->ApplyPow(0.5);
inv_sqrt_e_t->CopyFromVec(*sqrt_e_t);

@@ -67,10 +67,10 @@ void NnetUpdater::Propagate() {
(c>0 && nnet_.GetComponent(c-1).BackpropNeedsOutput()) ||
component.BackpropNeedsInput();
if (g_kaldi_verbose_level >= 3 && num_times_printed < 100) {
KALDI_LOG << "Stddev of data for component " << c
<< " for this minibatch is "
<< (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
(forward_data_[c].NumRows() * forward_data_[c].NumCols()));
KALDI_VLOG(3) << "Stddev of data for component " << c
<< " for this minibatch is "
<< (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
(forward_data_[c].NumRows() * forward_data_[c].NumCols()));
num_times_printed++;
}
if (!need_last_output)

@@ -26,7 +26,8 @@ namespace nnet2 {
NnetSimpleTrainer::NnetSimpleTrainer(
const NnetSimpleTrainerConfig &config,
Nnet *nnet):
config_(config), nnet_(nnet) {
config_(config), nnet_(nnet), logprob_this_phase_(0.0),
weight_this_phase_(0.0), logprob_total_(0.0), weight_total_(0.0) {
num_phases_ = 0;
bool first_time = true;
BeginNewPhase(first_time);
@@ -44,7 +45,7 @@ void NnetSimpleTrainer::TrainOneMinibatch() {
logprob_this_phase_ += DoBackprop(*nnet_,
buffer_,
nnet_);
count_this_phase_ += buffer_.size();
weight_this_phase_ += TotalNnetTrainingWeight(buffer_);
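// (weight_this_phase_ now accumulates the total frame weight rather than a raw
// example count, so the per-frame averages logged below are weight-normalized.)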
buffer_.clear();
minibatches_seen_this_phase_++;
if (minibatches_seen_this_phase_ == config_.minibatches_per_phase) {
@@ -56,10 +57,12 @@ void NnetSimpleTrainer::TrainOneMinibatch() {
void NnetSimpleTrainer::BeginNewPhase(bool first_time) {
if (!first_time)
KALDI_LOG << "Training objective function (this phase) is "
<< (logprob_this_phase_/count_this_phase_) << " over "
<< count_this_phase_ << " frames.";
<< (logprob_this_phase_/weight_this_phase_) << " over "
<< weight_this_phase_ << " frames.";
logprob_total_ += logprob_this_phase_;
weight_total_ += weight_this_phase_;
logprob_this_phase_ = 0.0;
count_this_phase_ = 0.0;
weight_this_phase_ = 0.0;
minibatches_seen_this_phase_ = 0;
num_phases_++;
}
@@ -75,6 +78,13 @@ NnetSimpleTrainer::~NnetSimpleTrainer() {
BeginNewPhase(first_time);
}
}
if (weight_total_ == 0.0) {
KALDI_WARN << "No data seen.";
} else {
KALDI_LOG << "Did backprop on " << weight_total_
<< " examples, average log-prob per frame is "
<< (logprob_total_ / weight_total_);
}
}

@@ -81,7 +81,10 @@ class NnetSimpleTrainer {
std::vector<NnetExample> buffer_;

double logprob_this_phase_; // Needed for accumulating train log-prob on each phase.
double count_this_phase_; // count corresponding to the above.
double weight_this_phase_; // count corresponding to the above.

double logprob_total_;
double weight_total_;
};