trunk: modifying recipes for neural net training with the new online preconditioning; adding/modifying example scripts for RM and Fisher; various cosmetic code changes.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4088 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2014-06-28 05:11:57 +00:00
Parent f67874f274
Commit 99873c6171
17 changed files with 695 additions and 217 deletions

View file

@@ -1,11 +1,13 @@
#!/bin/bash
# this (local/nnet2/run_6c_gpu.sh) trains a p-norm neural network on top of
# the SAT system in 5a.
# It uses the _fast.sh version of the script, which is faster than the old
# one, and also the --first-component-power 0.5 option, which we believe
# improves results (we're waiting for the numbers though).
dir=nnet6c_gpu
dir=nnet6c5_gpu
train_stage=-10
. ./cmd.sh
@@ -28,11 +30,13 @@ parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely
if [ ! -f exp/$dir/final.mdl ]; then
steps/nnet2/train_pnorm.sh --stage $train_stage --num-epochs 10 --get-egs-stage 3 --stage -3 \
steps/nnet2/train_pnorm_fast.sh --stage $train_stage --num-epochs 8 \
--first-component-power 0.5 \
--egs-dir exp/nnet6c3_gpu/egs \
--num-epochs-extra 4 \
--samples-per-iter 400000 \
--io-opts "-tc 10" \
--num-epochs-extra 5 \
--num-jobs-nnet 8 --num-threads 1 --max-change 40.0 \
--num-jobs-nnet 8 --num-threads 1 \
--minibatch-size 512 --parallel-opts "$parallel_opts" \
--mix-up 15000 \
--initial-learning-rate 0.08 --final-learning-rate 0.008 \
@@ -43,7 +47,7 @@ parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely
data/train data/lang exp/tri5a exp/$dir || exit 1;
fi
steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 30 \
steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 25 \
--config conf/decode.config --transform-dir exp/tri5a/decode_dev \
exp/tri5a/graph data/dev exp/$dir/decode_dev &

View file

@@ -105,14 +105,17 @@ exit 0
%WER 1.72 [ 216 / 12533, 25 ins, 38 del, 153 sub ] exp/nnet4b_gpu/decode/wer_4
%WER 8.34 [ 1045 / 12533, 94 ins, 146 del, 805 sub ] exp/nnet4b_gpu/decode_ug/wer_10
%WER 1.75 [ 219 / 12533, 23 ins, 54 del, 142 sub ] exp/nnet4c/decode/wer_4
%WER 8.83 [ 1107 / 12533, 70 ins, 206 del, 831 sub ] exp/nnet4c/decode_ug/wer_10
%WER 1.72 [ 216 / 12533, 24 ins, 51 del, 141 sub ] exp/nnet4c/decode/wer_4
%WER 9.04 [ 1133 / 12533, 110 ins, 170 del, 853 sub ] exp/nnet4c/decode_ug/wer_7
%WER 1.76 [ 220 / 12533, 16 ins, 60 del, 144 sub ] exp/nnet4c_gpu/decode/wer_6
%WER 8.82 [ 1106 / 12533, 90 ins, 173 del, 843 sub ] exp/nnet4c_gpu/decode_ug/wer_11
%WER 1.83 [ 229 / 12533, 28 ins, 57 del, 144 sub ] exp/nnet4c_gpu/decode/wer_4
%WER 9.08 [ 1138 / 12533, 111 ins, 166 del, 861 sub ] exp/nnet4c_gpu/decode_ug/wer_7
%WER 1.61 [ 202 / 12533, 29 ins, 34 del, 139 sub ] exp/nnet4d_gpu/decode/wer_2
%WER 8.48 [ 1063 / 12533, 76 ins, 176 del, 811 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.60 [ 201 / 12533, 28 ins, 43 del, 130 sub ] exp/nnet4d/decode/wer_4
%WER 8.24 [ 1033 / 12533, 83 ins, 166 del, 784 sub ] exp/nnet4d/decode_ug/wer_11
%WER 1.69 [ 212 / 12533, 23 ins, 54 del, 135 sub ] exp/nnet4d_gpu/decode/wer_5
%WER 8.39 [ 1052 / 12533, 86 ins, 174 del, 792 sub ] exp/nnet4d_gpu/decode_ug/wer_11
%WER 1.37 [ 172 / 12533, 14 ins, 36 del, 122 sub ] exp/nnet4e_gpu/decode/wer_3
%WER 8.03 [ 1006 / 12533, 61 ins, 179 del, 766 sub ] exp/nnet4e_gpu/decode_ug/wer_8
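(For reference, each line above follows the usual Kaldi scoring format: %WER <percent> [ <errors> / <total words>, <ins> ins, <del> del, <sub> sub ] <decode dir>/wer_<LM weight>. As a worked example, the nnet4e_gpu line has 14 + 36 + 122 = 172 errors out of 12533 words, i.e. 100 * 172 / 12533 ≈ 1.37% WER.)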

View file

@@ -1,27 +1,56 @@
#!/bin/bash
# This is neural net training on top of adapted 40-dimensional features.
#
# The same script works for GPUs, and for CPU only (with --use-gpu false).
train_stage=-10
use_gpu=true
. cmd.sh
. ./path.sh
. utils/parse_options.sh
( steps/nnet2/train_tanh.sh --num-epochs 20 \
--num-epochs-extra 10 --add-layers-period 1 \
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
dir=exp/nnet4c_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
dir=exp/nnet4c
fi
if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_tanh_fast.sh --stage $train_stage \
--num-epochs 20 \
--add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--hidden-layer-dim 375 \
data/train data/lang exp/tri3b_ali exp/nnet4c
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/nnet4c/decode
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test exp/nnet4c/decode_ug
)
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode &
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug
wait
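As a usage sketch (these mirror the calls in the wrapper-script hunks later in this commit; --train-stage is forwarded to train_tanh_fast.sh through the parse_options.sh block above, and the stage value here is only an illustration):

  local/nnet2/run_4c.sh --use-gpu true    # GPU queue, one thread per job, trains exp/nnet4c_gpu
  local/nnet2/run_4c.sh --use-gpu false   # CPU only, 16 threads per job, trains exp/nnet4c
  local/nnet2/run_4c.sh --use-gpu false --train-stage 10   # resume a partially-completed run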

View file

@@ -1,42 +0,0 @@
#!/bin/bash
# This is neural net training on top of adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.
#
# Since we're using one quarter the number of jobs (num-jobs-nnet) as the
# run_4c.sh script, we halve the learning rate (generally speaking, splitting
# the difference like this is probably a good idea.)
parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it.
. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
( steps/nnet2/train_tanh.sh --num-epochs 20 \
--num-jobs-nnet 4 --num-threads 1 --parallel-opts "$parallel_opts" \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.01 --final-learning-rate 0.002 \
--cmd "$decode_cmd" \
--hidden-layer-dim 375 \
data/train data/lang exp/tri3b_ali exp/nnet4c_gpu
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/nnet4c_gpu/decode
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test exp/nnet4c_gpu/decode_ug
)

View file

@@ -1,21 +1,42 @@
#!/bin/bash
# This is pnorm neural net training on top of adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.
# local/nnet2/run_4d.sh is the new, faster version of the p-norm training script.
# The same script works for GPUs, and for CPU only (with --use-gpu false).
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
train_stage=-10
use_gpu=true
. cmd.sh
. ./path.sh
. utils/parse_options.sh
dir=exp/nnet4d
( steps/nnet2/train_pnorm.sh --num-epochs 20 \
--num-jobs-nnet 4 --num-threads $num_threads --parallel-opts "$parallel_opts" \
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
dir=exp/nnet4d_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
dir=exp/nnet4d
fi
if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-threads $num_threads --parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
@@ -23,15 +44,16 @@ dir=exp/nnet4d
--cmd "$decode_cmd" \
--pnorm-input-dim 1000 \
--pnorm-output-dim 200 \
--combine-regularizer 1.0e-12 \
data/train data/lang exp/tri3b_ali $dir
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode &
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug
wait
)
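The same GPU/CPU switch applies to this primary p-norm recipe; for example (illustrative invocations only, with an arbitrary stage number):

  local/nnet2/run_4d.sh --use-gpu true                     # trains exp/nnet4d_gpu
  local/nnet2/run_4d.sh --use-gpu false --train-stage 12   # CPU run, resumed at a later train_pnorm_fast.sh stage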

View file

@@ -1,41 +0,0 @@
#!/bin/bash
# This is pnorm neural net training on top of adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.
parallel_opts="-l gpu=1"
. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
dir=exp/nnet4d_gpu
( steps/nnet2/train_pnorm.sh --num-epochs 20 \
--num-jobs-nnet 4 --num-threads 1 --parallel-opts "$parallel_opts" \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--pnorm-input-dim 750 \
--pnorm-output-dim 150 \
--combine-regularizer 1.0e-12 \
data/train data/lang exp/tri3b_ali $dir
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug
)

View file

@@ -7,7 +7,7 @@
use_gpu=true
if $use_gpu; then
# This example runs on top of "raw-fMLLR" features:
# This example runs on top of "raw-fMLLR" features.
# We don't have a GPU version of this script.
#local/nnet2/run_4a_gpu.sh
@@ -15,11 +15,11 @@ if $use_gpu; then
local/nnet2/run_4b_gpu.sh
# This one is on top of 40-dim + fMLLR features
local/nnet2/run_4c_gpu.sh
local/nnet2/run_4c.sh --use-gpu true
# This one is for training pnorm nnets on top of 40-dim + fMLLR features
# **THIS IS THE PRIMARY RECIPE**
local/nnet2/run_4d_gpu.sh
local/nnet2/run_4d.sh --use-gpu true
# This is discriminative training on top of 4c.
local/nnet2/run_5c_gpu.sh
@@ -34,11 +34,12 @@ else
# This one is on top of filter-bank features, with only CMN.
local/nnet2/run_4b.sh
# This one is on top of 40-dim + fMLLR features
local/nnet2/run_4c.sh
# This one is on top of 40-dim + fMLLR features; it's a fairly
# normal tanh system.
local/nnet2/run_4c.sh --use-gpu false
# **THIS IS THE PRIMARY RECIPE**
local/nnet2/run_4d.sh
# **THIS IS THE PRIMARY RECIPE (40-dim + fMLLR + p-norm neural net)**
local/nnet2/run_4d.sh --use-gpu false
# This is discriminative training on top of 4c.
local/nnet2/run_5c.sh

View file

@@ -1,15 +1,19 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
# 2013 Xiaohui Zhang
# 2013 Guoguo Chen
# Apache 2.0.
# This script trains neural network with pnorm nonlinearities.
# The difference with train_tanh.sh is that, instead of setting
# hidden_layer_size, you should set pnorm_input_dim and pnorm_output_dim.
# Also the P value (the order of the p-norm) should be set.
# train_pnorm_fast.sh is a new, improved version of train_pnorm.sh, which uses
# the 'online' preconditioning method. For GPUs it's about two times faster
# than before (although that's partly due to optimizations that will also help
# the old recipe), and for CPUs it gives better performance than the old method
# (I believe); also, the difference in optimization performance between CPU and
# GPU is almost gone. The old train_pnorm.sh script is now deprecated.
# We made this a separate script because not all of the options that the
# old script accepted are still accepted.
# Begin configuration section.
cmd=run.pl
@@ -22,18 +26,15 @@ num_iters_final=20 # Maximum number of final iterations to give to the
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
softmax_learning_rate_factor=1.0 # In the default setting keep the same learning rate.
softmax_learning_rate_factor=1.0 # In the default setting keep the same learning
# rate for the final layer.
pnorm_input_dim=3000
pnorm_output_dim=300
first_component_power=1.0 # could set this to 0.5, sometimes seems to improve results.
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update. Note: it also
# interacts with the "preconditioned" update which generally
# works better with larger minibatch size, so it's not
# completely cost free.
# be a problem with multi-threaded update.
samples_per_iter=200000 # each iteration of training, see this many samples
# per job. This option is passed to get_egs.sh
@@ -59,14 +60,18 @@ io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one t
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to preconditioning: says how often we update the subspace.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.1
max_change_per_sample=0.075
precondition_rank_in=20 # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
@@ -75,8 +80,6 @@ egs_opts=
transform_dir= # If supplied, overrides alidir
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
precondition_rank_in=20
precondition_rank_out=80
# End configuration section.
@@ -262,9 +265,10 @@ echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
if [ $num_threads -eq 1 ]; then
@@ -296,6 +300,7 @@ while [ $x -lt $num_iters ]; do
fi
echo "Training neural net (pass $x)"
if [ $x -gt 0 ] && \
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
[ $[($x-1) % $add_layers_period] -eq 0 ]; then
@@ -303,18 +308,29 @@ while [ $x -lt $num_iters ]; do
else
mdl=$dir/$x.mdl
fi
$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
# on iteration zero or when we just added a layer, use a smaller minibatch
# size and just one job: the model-averaging doesn't seem to be helpful
# when the model is changing too fast (i.e. it worsens the objective
# function), and the smaller minibatch size will help to keep
# the update stable.
this_minibatch_size=$[$minibatch_size/2];
this_num_jobs_nnet=1
else
this_minibatch_size=$minibatch_size
this_num_jobs_nnet=$num_jobs_nnet
fi
$cmd $parallel_opts JOB=1:$this_num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$train_suffix \
--minibatch-size=$minibatch_size --srand=$x "$mdl" \
nnet-train$train_suffix \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;
nnets_list=
for n in `seq 1 $num_jobs_nnet`; do
for n in `seq 1 $this_num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done
@@ -331,7 +347,7 @@ while [ $x -lt $num_iters ]; do
else lr=$learning_rate; fi
lr_string="$lr_string:$lr"
done
$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;
@@ -372,10 +388,9 @@ if [ $stage -le $num_iters ]; then
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
# if there are many models it can give out-of-memory error; set num-threads to 8
# to speed it up (this isn't ideal...)
this_num_threads=$num_threads
[ $this_num_threads -lt 8 ] && this_num_threads=8
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
mb=$[($num_egs+$this_num_threads-1)/$this_num_threads]
combine_num_threads=8
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
# Setting --initial-model to a large value makes it initialize the combination
# with the average of all the models. It's important not to start with a
@@ -384,15 +399,15 @@ if [ $stage -le $num_iters ]; then
# nnet-combine-fast uses for scaling, which after flooring and inversion, has
# the effect that the initial model chosen gets much higher learning rates
# than the others. This prevents the optimization from working well.
$cmd $parallel_opts $dir/log/combine.log \
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
--num-threads=$this_num_threads \
--num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
$dir/final.mdl || exit 1;
# Normalize stddev for affine or block affine layers that are followed by a
# pnorm layer and then a normalize layer.
$cmd $parallel_opts $dir/log/normalize.log \
$cmd $dir/log/normalize.log \
nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;
# Compute the probability of the final, combined model with
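To make the new combine-stage sizing concrete (the example counts are made up): with combine_num_threads=8 and, say, 3000 examples in combine.egs, mb = (3000 + 7) / 8 = 375, which is under the 512 cap, so nnet-combine-fast runs on the CPU with 8 threads and 375-frame minibatches; with 10000 examples the raw value 1250 would be clipped to 512. Similarly, on iteration 0 and on iterations where a hidden layer has just been inserted, the default minibatch_size of 128 is halved to 64 and only a single training job is run, per the this_minibatch_size / this_num_jobs_nnet logic above.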

View file

@@ -0,0 +1,451 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script trains a fairly vanilla network with tanh nonlinearities.
# train_tanh_fast.sh is a new, improved version of train_tanh.sh, which uses
# the 'online' preconditioning method. For GPUs it's about two times faster
# than before (although that's partly due to optimizations that will also help
# the old recipe), and for CPUs it gives better performance than the old method
# (I believe); also, the difference in optimization performance between CPU and
# GPU is almost gone. The old train_tanh.sh script is now deprecated.
# We made this a separate script because not all of the options that the
# old script accepted are still accepted.
# Begin configuration section.
cmd=run.pl
num_epochs=15 # Number of epochs during which we reduce
# the learning rate; number of iteration is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
# the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
# optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
shrink_interval=5 # shrink every $shrink_interval iters except while we are
# still adding layers, when we do it every iter.
shrink=true
num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if
# given.
final_learning_rate_factor=0.5 # Train the two last layers of parameters half as
# fast as the other layers, by default.
hidden_layer_dim=300 # You may want this larger, e.g. 1024 or 2048.
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update.
samples_per_iter=200000 # each iteration of training, see this many samples
# per job. This option is passed to get_egs.sh.
num_jobs_nnet=16 # Number of neural net jobs to run in parallel. This option
# is passed to get_egs.sh.
get_egs_stage=0
spk_vecs_dir=
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of
# the samples on each iter. You could set it to 0 or to a large
# value for complete randomization, but this would both consume
# memory and cause spikes in disk I/O. Smaller is easier on
# disk and memory but less random. It's not a huge deal though,
# as samples are anyway randomized right at the start.
add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3 # This is an important configuration value that you might
# want to tune.
stage=-5
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
# we make the [input, output] ranks less different for the tanh setup than for
# the pnorm setup, as we don't have the difference in dimensions to deal with.
precondition_rank_in=30 # relates to online preconditioning
precondition_rank_out=60 # relates to online preconditioning
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
egs_opts=
transform_dir=
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of main training"
echo " # while reducing learning rate (determines #iterations, together"
echo " # with --samples-per-iter and --num-jobs-nnet)"
echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training"
echo " # after learning rate fully reduced"
echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
echo " # data, 0.01 for large data"
echo " --final-learning-rate <final-learning-rate|0.004> # Learning rate at end of training, e.g. 0.004 for small"
echo " # data, 0.001 for large data"
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
echo " --initial-num-hidden-layers <#hidden-layers|1> # Number of hidden layers to start with."
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
echo " # per context-dependent state. Try a number several times #states."
echo " --num-jobs-nnet <num-jobs|8> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate."
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size)."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --samples-per-iter <#samples|200000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames)"
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --num-iters-final <#iters|10> # Number of final iterations to give to nnet-combine-fast to "
echo " # interpolate parameters (the weights are learned with a validation set)"
echo " --num-utts-subset <#utts|300> # Number of utterances in subsets used for validation and diagnostics"
echo " # (the validation subset is held out from training)"
echo " --num-frames-diagnostic <#frames|4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames|10000> # Number of frames used in getting combination weights at the"
echo " # very end."
echo " --stage <stage|-9> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
data=$1
lang=$2
alidir=$3
dir=$4
# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# Set some variables.
num_leaves=`am-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/cmvn_opts $dir 2>/dev/null
cp $alidir/tree $dir
[ -z "$transform_dir" ] && transform_dir=$alidir
if [ $stage -le -4 ]; then
echo "$0: calling get_lda.sh"
steps/nnet2/get_lda.sh $lda_opts --transform-dir $transform_dir --splice-width $splice_width --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi
# these files will have been written by get_lda.sh
feat_dim=`cat $dir/feat_dim` || exit 1;
lda_dim=`cat $dir/lda_dim` || exit 1;
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
echo "$0: calling get_egs.sh"
[ ! -z $spk_vecs_dir ] && spk_vecs_opt="--spk-vecs-dir $spk_vecs_dir";
steps/nnet2/get_egs.sh $spk_vecs_opt --transform-dir $transform_dir --samples-per-iter $samples_per_iter \
--num-jobs-nnet $num_jobs_nnet --splice-width $splice_width --stage $get_egs_stage \
--cmd "$cmd" $egs_opts --io-opts "$io_opts" \
$data $lang $alidir $dir || exit 1;
fi
if [ -z $egs_dir ]; then
egs_dir=$dir/egs
fi
iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1;
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;
if ! [ $num_hidden_layers -ge 1 ]; then
echo "Invalid num-hidden-layers $num_hidden_layers"
exit 1
fi
if [ $stage -le -2 ]; then
echo "$0: initializing neural net";
# Get spk-vec dim (in case we're using them).
if [ ! -z "$spk_vecs_dir" ]; then
spk_vec_dim=$[$(copy-vector --print-args=false "ark:cat $spk_vecs_dir/vecs.1|" ark,t:- | head -n 1 | wc -w) - 3];
! [ $spk_vec_dim -gt 0 ] && echo "Error getting spk-vec dim" && exit 1;
ext_lda_dim=$[$lda_dim + $spk_vec_dim]
extend-transform-dim --new-dimension=$ext_lda_dim $dir/lda.mat $dir/lda_ext.mat || exit 1;
lda_mat=$dir/lda_ext.mat
ext_feat_dim=$[$feat_dim + $spk_vec_dim]
else
spk_vec_dim=0
lda_mat=$dir/lda.mat
ext_lda_dim=$lda_dim
ext_feat_dim=$feat_dim
fi
online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"`
cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$ext_feat_dim left-context=$splice_width right-context=$splice_width const-component-dim=$spk_vec_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$ext_lda_dim output-dim=$hidden_layer_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
AffineComponentPreconditionedOnline input-dim=$hidden_layer_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF
# to hidden.config it will write the part of the config corresponding to a
# single hidden layer; we need this to add new layers.
cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$hidden_layer_dim output-dim=$hidden_layer_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
EOF
$cmd $dir/log/nnet_init.log \
nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
$dir/0.mdl || exit 1;
fi
if [ $stage -le -1 ]; then
echo "Training transition probabilities and setting priors"
$cmd $dir/log/train_trans.log \
nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
|| exit 1;
fi
num_iters_reduce=$[$num_epochs * $iters_per_epoch];
num_iters_extra=$[$num_epochs_extra * $iters_per_epoch];
num_iters=$[$num_iters_reduce+$num_iters_extra]
echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
if [ $num_threads -eq 1 ]; then
train_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
train_suffix="-parallel --num-threads=$num_threads"
fi
x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
# Set off jobs doing some diagnostics, in the background.
$cmd $dir/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
$cmd $dir/log/progress.$x.log \
nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
ark:$egs_dir/train_diagnostic.egs '&&' \
nnet-am-info $dir/$x.mdl &
fi
echo "Training neural net (pass $x)"
if [ $x -gt 0 ] && \
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
[ $[($x-1) % $add_layers_period] -eq 0 ]; then
mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
else
mdl=$dir/$x.mdl
fi
if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
# on iteration zero or when we just added a layer, use a smaller minibatch
# size and just one job: the model-averaging doesn't seem to be helpful
# when the model is changing too fast (i.e. it worsens the objective
# function), and the smaller minibatch size will help to keep
# the update stable.
this_minibatch_size=$[$minibatch_size/2];
this_num_jobs_nnet=1
else
this_minibatch_size=$minibatch_size
this_num_jobs_nnet=$num_jobs_nnet
fi
$cmd $parallel_opts JOB=1:$this_num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$train_suffix \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;
nnets_list=
for n in `seq 1 $this_num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done
learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
last_layer_learning_rate=`perl -e "print $learning_rate * $final_learning_rate_factor;"`;
nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo 2>/dev/null || exit 1
nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l`
# na is number of last updatable AffineComponent layer [one-based, counting only
# updatable components.]
# The last two layers will get this (usually lower) learning rate.
lr_string="$learning_rate"
for n in `seq 2 $nu`; do
if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$last_layer_learning_rate;
else lr=$learning_rate; fi
lr_string="$lr_string:$lr"
done
$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;
if $shrink && [ $[$x % $shrink_interval] -eq 0 ]; then
mb=$[($num_frames_shrink+$num_threads-1)/$num_threads]
$cmd $parallel_opts $dir/log/shrink.$x.log \
nnet-subset-egs --n=$num_frames_shrink --randomize-order=true --srand=$x \
ark:$egs_dir/train_diagnostic.egs ark:- \| \
nnet-combine-fast --use-gpu=no --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
$dir/$[$x+1].mdl ark:- $dir/$[$x+1].mdl || exit 1;
else
# On other iters, do nnet-am-fix which is much faster and has roughly
# the same effect.
nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log
fi
if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
# mix up.
echo Mixing up from $num_leaves to $mix_up components
$cmd $dir/log/mix_up.$x.log \
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
fi
rm $nnets_list
fi
x=$[$x+1]
done
# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
nnets_list=()
if [ $num_iters_final -gt $num_iters_extra ]; then
echo "Setting num_iters_final=$num_iters_extra"
fi
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
idx=$[$x-$start]
if [ $x -gt $mix_up_iter ]; then
nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
fi
done
if [ $stage -le $num_iters ]; then
echo "Doing final combination to produce final.mdl"
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as if
# there are many models it can give out-of-memory error on the GPU; set
# num-threads to 8 to speed it up (this isn't ideal...)
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
combine_num_threads=8
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --use-gpu=no --num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
$dir/final.mdl || exit 1;
# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
$cmd $dir/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi
if [ $stage -le $[$num_iters+1] ]; then
echo "Getting average posterior for purposes of adjusting the priors."
# Note: this just uses CPUs, using a smallish subset of data.
rm $dir/post.*.vec 2>/dev/null
$cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;
sleep 3; # make sure there is time for $dir/post.*.vec to appear.
$cmd $dir/log/vector_sum.log \
vector-sum $dir/post.*.vec $dir/post.vec || exit 1;
rm $dir/post.*.vec;
echo "Re-adjusting priors based on computed posteriors"
$cmd $dir/log/adjust_priors.log \
nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
fi
sleep 2
echo Done
if $cleanup; then
echo Cleaning up data
if [ $egs_dir == "$dir/egs" ]; then
echo Removing training examples
rm $dir/egs/egs*
fi
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if [ $[$x%100] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then
# delete all but every 100th model; don't delete the ones which combine to form the final model.
rm $dir/$x.mdl
fi
done
fi
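To make the schedule in this new script concrete, here is a worked example with made-up sizes (iters_per_epoch is really read from the egs directory, where get_egs.sh derives it from the amount of training data together with --samples-per-iter and --num-jobs-nnet): suppose iters_per_epoch=4 with the defaults num_epochs=15 and num_epochs_extra=5. Then num_iters_reduce=60, num_iters_extra=20 and num_iters=80; with num_hidden_layers=3 and add_layers_period=2, all hidden layers are in place by finish_add_layers_iter = 3 * 2 = 6, so (if --mix-up is nonzero) mix_up_iter = (80 + 6) / 2 = 43. The learning rate decays geometrically from 0.04 to 0.004 over the first 60 iterations and is then held at 0.004, with the last two updatable layers trained at half that rate (final_learning_rate_factor=0.5). You can evaluate the same perl expression the script uses, e.g. for iteration 30:

  perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' 30 60 0.04 0.004
  # prints about 0.0126, i.e. 0.04 * (0.004/0.04)^(30/60)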

View file

@@ -207,8 +207,23 @@ if (! $sync) { # We're not submitting with -sync y, so we
}
}
# We will need the sge_job_id, to check that job still exists
$sge_job_id=`grep "Your job" $queue_logfile | awk '{ print \$3 }' | sed 's|\\\..*||'`;
chomp($sge_job_id);
{ # Get the SGE job-id from the log file in q/
open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile";
undef $sge_job_id;
while (<L>) {
if (m/Your job\S* (\d+)[. ].+ has been submitted/) {
if (defined $sge_job_id) {
die "Error: your job was submitted more than once (see $queue_logfile)";
} else {
$sge_job_id = $1;
}
}
}
close(L);
if (!defined $sge_job_id) {
die "Error: log file $queue_logfile does not specify the SGE job-id.";
}
}
$check_sge_job_ctr=1;
#
$wait = 0.1;
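For context on the new job-id parsing (the sample lines below are typical GridEngine acknowledgements, shown only for illustration): qsub normally confirms a submission with a line such as

  Your job 2134567 ("train.1.sh") has been submitted
  Your job-array 2134568.1-16:1 ("train.sh") has been submitted

The regexp /Your job\S* (\d+)[. ].+ has been submitted/ captures 2134567 (or 2134568 for the array job) into $sge_job_id, which is the same value the old grep/awk/sed pipeline extracted, but the script now dies with a clear error if the log contains no such line, or more than one.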

View file

@@ -58,6 +58,9 @@ namespace kaldi {
back to where you started from. We don't do this because
in some contexts, the transform is made symmetric by multiplying
by sqrt(N) in both passes. The user can do this by themselves.
See also SplitRadixComplexFft, declared in srfft.h, which is more efficient
but only works if the length of the input is a power of 2.
*/
template<typename Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);

View file

@@ -43,7 +43,7 @@ SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
logm_ ++;
}
ComputeTables();
temp_buffer = NULL;
temp_buffer_ = NULL;
}
template<typename Real>
@@ -55,21 +55,21 @@ void SplitRadixComplexFft<Real>::ComputeTables() {
lg2 = logm_ >> 1;
if (logm_ & 1) lg2++;
brseed = new MatrixIndexT[1 << lg2];
brseed[0] = 0;
brseed[1] = 1;
brseed_ = new MatrixIndexT[1 << lg2];
brseed_[0] = 0;
brseed_[1] = 1;
for (j = 2; j <= lg2; j++) {
imax = 1 << (j - 1);
for (i = 0; i < imax; i++) {
brseed[i] <<= 1;
brseed[i + imax] = brseed[i] + 1;
brseed_[i] <<= 1;
brseed_[i + imax] = brseed_[i] + 1;
}
}
if (logm_ < 4) {
tab = NULL;
tab_ = NULL;
} else {
tab = new Real* [logm_-3];
tab_ = new Real* [logm_-3];
for (i = logm_; i>=4 ; i--) {
/* Compute a few constants */
m = 1 << i; m2 = m / 2; m4 = m2 / 2; m8 = m4 /2;
@@ -77,10 +77,10 @@ void SplitRadixComplexFft<Real>::ComputeTables() {
/* Allocate memory for tables */
nel = m4 - 2;
tab[i-4] = new Real[6*nel];
tab_[i-4] = new Real[6*nel];
/* Initialize pointers */
cn = tab[i-4]; spcn = cn + nel; smcn = spcn + nel;
cn = tab_[i-4]; spcn = cn + nel; smcn = spcn + nel;
c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel;
/* Compute tables */
@@ -99,14 +99,14 @@ void SplitRadixComplexFft<Real>::ComputeTables() {
template<typename Real>
SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
delete [] brseed;
if (tab != NULL) {
delete [] brseed_;
if (tab_ != NULL) {
for (MatrixIndexT i = 0; i < logm_-3; i++)
delete [] tab[i];
delete [] tab;
delete [] tab_[i];
delete [] tab_;
}
if (temp_buffer != NULL)
delete [] temp_buffer;
// "delete" only does something if it's a non-NULL pointer.
delete [] temp_buffer_;
}
template<typename Real>
@@ -125,29 +125,29 @@ void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const
template<typename Real>
void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
if (temp_buffer == NULL)
temp_buffer = new Real[N_];
if (temp_buffer_== NULL)
temp_buffer_ = new Real[N_];
for (MatrixIndexT i = 0; i < N_; i++) {
x[i] = x[i*2]; // put the real part in the first half of x.
temp_buffer[i] = x[i*2 + 1]; // put the imaginary part in temp_buffer.
temp_buffer_[i] = x[i*2 + 1]; // put the imaginary part in temp_buffer.
}
// copy the imaginary part back to the second half of x.
memcpy(static_cast<void*>(x+N_),
static_cast<void*>(temp_buffer),
memcpy(static_cast<void*>(x + N_),
static_cast<void*>(temp_buffer_),
sizeof(Real) * N_);
Compute(x, x+N_, forward);
Compute(x, x + N_, forward);
// Now change the format back to interleaved.
memcpy(static_cast<void*>(temp_buffer),
static_cast<void*>(x+N_),
memcpy(static_cast<void*>(temp_buffer_),
static_cast<void*>(x + N_),
sizeof(Real) * N_);
for (MatrixIndexT i = N_-1; i > 0; i--) { // don't include 0,
// in case MatrixIndexT is unsigned, the loop would not terminate.
// Treat it as a special case.
x[i*2] = x[i];
x[i*2 + 1] = temp_buffer[i];
x[i*2 + 1] = temp_buffer_[i];
}
x[1] = temp_buffer[0]; // special case of i = 0.
x[1] = temp_buffer_[0]; // special case of i = 0.
}
template<typename Real>
@@ -162,11 +162,11 @@ void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logm) c
/* Unshuffling loop */
for (off = 1; off < n; off++) {
fj = n * brseed[off]; i = off; j = fj;
fj = n * brseed_[off]; i = off; j = fj;
tmp = x[i]; x[i] = x[j]; x[j] = tmp;
xp = &x[i];
brp = &(brseed[1]);
for (gno = 1; gno < brseed[off]; gno++) {
brp = &(brseed_[1]);
for (gno = 1; gno < brseed_[off]; gno++) {
xp += n;
j = fj + *brp++;
xq = x + j;
@@ -281,7 +281,7 @@ void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixInde
xi1 = xi + m2; xi2 = xi1 + m4;
if (logm >= 4) {
nel = m4 - 2;
cn = tab[logm-4]; spcn = cn + nel; smcn = spcn + nel;
cn = tab_[logm-4]; spcn = cn + nel; smcn = spcn + nel;
c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel;
}
xr1++; xr2++; xi1++; xi2++;

View file

@@ -40,11 +40,14 @@ namespace kaldi {
// permission, optimized by Go Vivace Inc., and converted into C++ by
// Microsoft Corporation
// This is a more efficient way of doing the complex FFT than ComplexFft
// above, but it only works for powers of 2.
// (declared in matrix-functions.h), but it only works for powers of 2.
// Note: in multi-threaded code, you would need to have one of these objects per
// thread, because multiple calls to Compute in parallel would not work.
template<typename Real>
class SplitRadixComplexFft {
public:
typedef MatrixIndexT Integer;
// N is the number of complex points (must be a power of two, or this
// will crash). Note that the constructor does some work so it's best to
// initialize the object once and do the computation many times.
@@ -73,12 +76,12 @@ class SplitRadixComplexFft {
Integer logm_; // log(N) [a slight mismatch in notation which we have not
// bothered to fix].
Integer *brseed;
Integer *brseed_;
// brseed is Evans' seed table, ref: (Ref: D. M. W.
// Evans, "An improved digit-reversal permutation algorithm ...",
// IEEE Trans. ASSP, Aug. 1987, pp. 1120-1125).
Real **tab; // Tables of butterfly coefficients.
Real *temp_buffer; // Allocated only if someone calls Compute with only
Real **tab_; // Tables of butterfly coefficients.
Real *temp_buffer_; // Allocated only if someone calls Compute with only
// one argument and we need a temporary buffer while creating interleaved
// data.
};

View file

@@ -241,12 +241,19 @@ void OnlinePreconditioner::PreconditionDirectionsInternal(
locked = false;
}
}
if (!locked) {
// We're not updating the parameters, either because another thread is
// working on updating them, or because another thread already did so from
// the same or later starting point (making our update stale), or because
// update_period_ > 1. We just apply the preconditioning and return.
// note: we don't bother with any locks before incrementing
// num_updates_skipped_ below, because the worst that could happen is that,
// on very rare occasions, we could skip one or two more updates than we
// intended.
num_updates_skipped_++;
BaseFloat tr_Rt_RtT = TraceMatMat(*R_t, *R_t, kTrans);
// P_t = R_t - H_t W_t
R_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);
@@ -258,11 +265,6 @@ void OnlinePreconditioner::PreconditionDirectionsInternal(
BaseFloat gamma_t = (tr_Pt_PtT == 0.0 ? 1.0 :
sqrt(tr_Rt_RtT / tr_Pt_PtT));
*scale = gamma_t;
// note: we don't bother with any locks before incrementing
// num_updates_skipped_ below, because the worst that could happen is that,
// on very rare occasions, we could skip one or two more updates than we
// intended.
num_updates_skipped_++;
return;
}
J_t.AddMatMat(1.0, H_t, kTrans, *R_t, kNoTrans, 0.0); // J_t = H_t^T R_t
@@ -295,6 +297,7 @@ void OnlinePreconditioner::PreconditionDirectionsInternal(
BaseFloat beta_t = rho_t * (1.0 + alpha_) + alpha_ * d_t.Sum() / D;
Vector<BaseFloat> e_t(R), sqrt_e_t(R), inv_sqrt_e_t(R);
ComputeEt(d_t, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);
KALDI_VLOG(5) << "e_t = " << e_t;
SpMatrix<BaseFloat> Z_t(R);
ComputeZt(N, rho_t, d_t, inv_sqrt_e_t, K_t_cpu, L_t_cpu, &Z_t);
@@ -494,7 +497,6 @@ void OnlinePreconditioner::ComputeEt(const VectorBase<BaseFloat> &d_t,
BaseFloat *e = e_t->Data();
for (int32 i = 0; i < D; i++)
e[i] = 1.0 / (beta_t / d[i] + 1);
KALDI_VLOG(5) << "e_t = " << *e_t;
sqrt_e_t->CopyFromVec(*e_t);
sqrt_e_t->ApplyPow(0.5);
inv_sqrt_e_t->CopyFromVec(*sqrt_e_t);

View file

@@ -67,10 +67,10 @@ void NnetUpdater::Propagate() {
(c>0 && nnet_.GetComponent(c-1).BackpropNeedsOutput()) ||
component.BackpropNeedsInput();
if (g_kaldi_verbose_level >= 3 && num_times_printed < 100) {
KALDI_LOG << "Stddev of data for component " << c
<< " for this minibatch is "
<< (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
(forward_data_[c].NumRows() * forward_data_[c].NumCols()));
KALDI_VLOG(3) << "Stddev of data for component " << c
<< " for this minibatch is "
<< (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
(forward_data_[c].NumRows() * forward_data_[c].NumCols()));
num_times_printed++;
}
if (!need_last_output)

View file

@@ -26,7 +26,8 @@ namespace nnet2 {
NnetSimpleTrainer::NnetSimpleTrainer(
const NnetSimpleTrainerConfig &config,
Nnet *nnet):
config_(config), nnet_(nnet) {
config_(config), nnet_(nnet), logprob_this_phase_(0.0),
weight_this_phase_(0.0), logprob_total_(0.0), weight_total_(0.0) {
num_phases_ = 0;
bool first_time = true;
BeginNewPhase(first_time);
@@ -44,7 +45,7 @@ void NnetSimpleTrainer::TrainOneMinibatch() {
logprob_this_phase_ += DoBackprop(*nnet_,
buffer_,
nnet_);
count_this_phase_ += buffer_.size();
weight_this_phase_ += TotalNnetTrainingWeight(buffer_);
buffer_.clear();
minibatches_seen_this_phase_++;
if (minibatches_seen_this_phase_ == config_.minibatches_per_phase) {
@@ -56,10 +57,12 @@ void NnetSimpleTrainer::BeginNewPhase(bool first_time) {
void NnetSimpleTrainer::BeginNewPhase(bool first_time) {
if (!first_time)
KALDI_LOG << "Training objective function (this phase) is "
<< (logprob_this_phase_/count_this_phase_) << " over "
<< count_this_phase_ << " frames.";
<< (logprob_this_phase_/weight_this_phase_) << " over "
<< weight_this_phase_ << " frames.";
logprob_total_ += logprob_this_phase_;
weight_total_ += weight_this_phase_;
logprob_this_phase_ = 0.0;
count_this_phase_ = 0.0;
weight_this_phase_ = 0.0;
minibatches_seen_this_phase_ = 0;
num_phases_++;
}
@@ -75,6 +78,13 @@ NnetSimpleTrainer::~NnetSimpleTrainer() {
BeginNewPhase(first_time);
}
}
if (weight_total_ == 0.0) {
KALDI_WARN << "No data seen.";
} else {
KALDI_LOG << "Did backprop on " << weight_total_
<< " examples, average log-prob per frame is "
<< (logprob_total_ / weight_total_);
}
}

View file

@@ -81,7 +81,10 @@ class NnetSimpleTrainer {
std::vector<NnetExample> buffer_;
double logprob_this_phase_; // Needed for accumulating train log-prob on each phase.
double count_this_phase_; // count corresponding to the above.
double weight_this_phase_; // count corresponding to the above.
double logprob_total_;
double weight_total_;
};