add fisher_swbd nnet3 and chain recipe

This commit is contained in:
Xingyu Na 2016-04-05 16:17:04 +08:00
Parent c823bd87c1
Commit 02cf52a48e
7 changed files: 1052 additions and 0 deletions

View file

@@ -42,8 +42,71 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_eval2000*_fg; do grep Sum $x/
%WER 12.3 | 1831 21395 | 89.2 7.2 3.5 1.5 12.3 50.8 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_fsh_sw1_fg/score_13/eval2000.ctm.swbd.filt.sys
%WER 11.8 | 1831 21395 | 89.6 7.2 3.2 1.4 11.8 49.0 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_offline_fsh_sw1_fg/score_11/eval2000.ctm.swbd.filt.sys
# nnet3 result on eval2000
# BLSTM ran for about 760 hours, command:
# local/nnet3/run_lstm.sh --affix bidirectional --lstm-delay " [-1,1] [-2,2] [-3,3] " --label-delay 0 \
# --cell-dim 1024 --recurrent-projection-dim 128 --non-recurrent-projection-dim 128 \
# --chunk-left-context 40 --chunk-right-context 40 \
# --extra-left-context 50 --extra-right-context 50
# use tri-gram
for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 15.8 | 4459 42989 | 86.1 9.7 4.1 1.9 15.8 52.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
%WER 14.8 | 4459 42989 | 86.6 9.2 4.3 1.4 14.8 54.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 15.4 | 4459 42989 | 86.4 9.5 4.0 1.8 15.4 51.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
%WER 14.5 | 4459 42989 | 87.0 9.0 4.0 1.5 14.5 53.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys
# nnet3 result on eval2000 for swbd subset
# use tri-gram
for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 11.6 | 1831 21395 | 89.7 7.3 3.0 1.3 11.6 47.7 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
%WER 10.7 | 1831 21395 | 90.3 6.7 3.0 1.0 10.7 45.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 11.1 | 1831 21395 | 90.2 7.0 2.8 1.3 11.1 46.2 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
%WER 10.4 | 1831 21395 | 90.6 6.5 2.9 1.0 10.4 45.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
# nnet3 result on eval2000 for callhm subset
# use tri-gram
for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done
%WER 19.9 | 2628 21594 | 82.6 12.1 5.3 2.6 19.9 56.0 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys
%WER 18.8 | 2628 21594 | 83.1 11.7 5.2 1.9 18.8 60.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done
%WER 19.7 | 2628 21594 | 82.7 12.1 5.2 2.4 19.7 55.3 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys
%WER 18.6 | 2628 21594 | 83.3 11.5 5.2 1.9 18.6 59.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
# chain result on eval2000
# BLSTM ran for about 380 hours
# use tri-gram
for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 13.6 | 4459 42989 | 88.2 7.9 3.9 1.8 13.6 51.0 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys
%WER 12.1 | 4459 42989 | 89.7 6.8 3.5 1.8 12.1 50.2 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 13.3 | 4459 42989 | 88.4 7.8 3.8 1.8 13.3 50.1 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys
%WER 12.0 | 4459 42989 | 89.6 6.5 3.8 1.7 12.0 49.3 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_8_0.5/eval2000_hires.ctm.filt.sys
# chain result on eval2000 for swbd subset
# use tri-gram
for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 9.4 | 1831 21395 | 91.7 5.4 2.9 1.2 9.4 43.9 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_tg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys
%WER 8.8 | 1831 21395 | 92.5 5.3 2.2 1.4 8.8 46.9 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_1.0/eval2000_hires.ctm.swbd.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 9.2 | 1831 21395 | 92.1 5.6 2.3 1.3 9.2 42.4 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_fg/score_9_0.0/eval2000_hires.ctm.swbd.filt.sys
%WER 8.5 | 1831 21395 | 92.6 4.9 2.4 1.2 8.5 44.1 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_9_1.0/eval2000_hires.ctm.swbd.filt.sys
# chain result on eval2000 for callhm subset
# use tri-gram
for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done
%WER 17.4 | 2628 21594 | 84.7 9.8 5.5 2.1 17.4 55.3 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
%WER 15.3 | 2628 21594 | 86.9 8.3 4.8 2.2 15.3 52.4 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done
%WER 17.3 | 2628 21594 | 84.9 9.7 5.5 2.1 17.3 55.0 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
%WER 15.3 | 2628 21594 | 87.0 8.6 4.4 2.4 15.3 52.1 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_6_0.5/eval2000_hires.ctm.callhm.filt.sys
# GMM and SGMM numbers reported on rt03
for x in exp/*/decode_rt03*; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
@@ -89,3 +152,63 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_rt03*_fg; do grep Sum $x/scor
%WER 20.2 | 3970 36721 | 88.3 8.1 3.6 8.5 20.2 74.3 | exp/nnet2_online/nnet_ms_a_online/decode_rt03_utt_fsh_sw1_fg/score_11/rt03.ctm.swbd.filt.sys
%WER 19.1 | 3970 36721 | 88.8 7.8 3.4 7.9 19.1 72.2 | exp/nnet2_online/nnet_ms_a_online/decode_rt03_utt_offline_fsh_sw1_fg/score_11/rt03.ctm.swbd.filt.sys
# nnet3 result on rt03
# use tri-gram
for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 14.7 | 8420 76157 | 86.8 8.9 4.3 1.5 14.7 45.9 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.0/rt03_hires.ctm.filt.sys
%WER 14.2 | 8420 76157 | 87.0 8.7 4.3 1.2 14.2 46.9 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 14.4 | 8420 76157 | 87.1 8.8 4.2 1.5 14.4 45.2 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_11_0.0/rt03_hires.ctm.filt.sys
%WER 13.9 | 8420 76157 | 87.2 8.4 4.3 1.2 13.9 46.0 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_9_0.0/rt03_hires.ctm.filt.sys
# nnet3 result on rt03 for swbd subset
# use tri-gram
for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 17.4 | 4450 39436 | 84.3 10.6 5.1 1.8 17.4 48.9 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.5/rt03_hires.ctm.swbd.filt.sys
%WER 16.6 | 4450 39436 | 84.7 10.0 5.3 1.3 16.6 49.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_10_0.5/rt03_hires.ctm.swbd.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 17.1 | 4450 39436 | 84.6 10.3 5.1 1.8 17.1 48.2 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_12_0.0/rt03_hires.ctm.swbd.filt.sys
%WER 16.3 | 4450 39436 | 85.0 9.8 5.1 1.3 16.3 49.0 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.swbd.filt.sys
# nnet3 result on rt03 for fsh subset
# use tri-gram
for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done
%WER 11.8 | 3970 36721 | 89.4 7.2 3.5 1.2 11.8 42.5 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.0/rt03_hires.ctm.fsh.filt.sys
%WER 11.6 | 3970 36721 | 89.4 7.1 3.5 1.0 11.6 43.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done
%WER 11.4 | 3970 36721 | 89.7 6.9 3.4 1.1 11.4 41.5 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_11_0.0/rt03_hires.ctm.fsh.filt.sys
%WER 11.4 | 3970 36721 | 89.5 6.7 3.8 1.0 11.4 42.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.fsh.filt.sys
# chain result on rt03
# BLSTM ran for about 380 hours
# use tri-gram
for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 12.7 | 8420 76157 | 88.5 7.2 4.2 1.3 12.7 43.2 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.filt.sys
%WER 11.7 | 8420 76157 | 89.8 6.6 3.6 1.5 11.7 43.7 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 12.4 | 8420 76157 | 88.9 7.0 4.1 1.3 12.4 42.7 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_fg/score_9_0.0/rt03_hires.ctm.filt.sys
%WER 11.4 | 8420 76157 | 89.9 6.1 3.9 1.3 11.4 43.4 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys
# chain result on rt03 for swbd subset
# use tri-gram
for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 15.0 | 4450 39436 | 86.4 8.6 5.0 1.4 15.0 45.8 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys
%WER 13.3 | 4450 39436 | 88.3 7.5 4.2 1.6 13.3 45.2 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 14.8 | 4450 39436 | 86.5 8.0 5.5 1.3 14.8 45.5 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.swbd.filt.sys
%WER 13.0 | 4450 39436 | 88.5 7.3 4.2 1.6 13.0 44.8 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys
# chain result on rt03 for fsh subset
# use tri-gram
for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done
%WER 10.2 | 3970 36721 | 91.1 6.0 3.0 1.2 10.2 40.2 | exp/chain/tdnn_7b_relu_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys
%WER 9.8 | 3970 36721 | 91.4 5.3 3.3 1.2 9.8 42.0 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done
%WER 9.8 | 3970 36721 | 91.4 5.8 2.8 1.2 9.8 39.6 | exp/chain/tdnn_7b_relu_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys
%WER 9.6 | 3970 36721 | 91.6 5.2 3.3 1.2 9.6 41.4 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys

View file

@@ -0,0 +1,181 @@
#!/bin/bash
# based on run_tdnn_6h.sh
set -e
# configs for 'chain'
stage=12
train_stage=-10
get_egs_stage=-10
dir=exp/chain/blstm_6h
decode_iter=
decode_dir_affix=
# training options
num_epochs=4
remove_egs=false
common_egs_dir=
affix=
chunk_width=150
chunk_left_context=40
chunk_right_context=40
# End configuration section.
echo "$0 $@" # Print the command line for logging
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
dir=$dir${affix:+_$affix}
train_set=train_nodup_sp
ali_dir=exp/tri5a_ali_nodup
treedir=exp/chain/tri6_tree_11000
lang=data/lang_chain
# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
# run those things.
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb true \
--generate-alignments false || exit 1;
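# For example, once run_ivector_common.sh has finished (features, iVectors and
# alignments dumped), a later run can jump straight to the chain-specific
# stages. A hypothetical invocation, assuming this script is saved as
# local/chain/run_blstm_6h.sh:
#   local/chain/run_blstm_6h.sh --stage 12 --train-stage -10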
if [ $stage -le 9 ]; then
# Get the alignments as lattices (gives the chain training more freedom).
# use the same num-jobs as the alignments
nj=$(cat $ali_dir/num_jobs) || exit 1;
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
data/lang exp/tri5a exp/tri5a_lats_nodup_sp
rm exp/tri5a_lats_nodup_sp/fsts.*.gz # save space
fi
if [ $stage -le 10 ]; then
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
rm -rf $lang
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on we may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi
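# For reference, the resulting topology gives each phone roughly the following
# two-emitting-state structure (illustrative sketch only; the exact entries and
# transition probabilities come from gen_topo.py):
#   <State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>  # traversed exactly once
#   <State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>  # zero or more repeats
#   <State> 2 </State>                                                     # final, non-emitting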
if [ $stage -le 11 ]; then
# Build a tree using our new topology.
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--leftmost-questions-truncate -1 \
--cmd "$train_cmd" 11000 data/$train_set $lang $ali_dir $treedir
fi
if [ $stage -le 12 ]; then
echo "$0: creating neural net configs";
steps/nnet3/lstm/make_configs.py \
--feat-dir data/${train_set}_hires \
--ivector-dir exp/nnet3/ivectors_${train_set} \
--tree-dir $treedir \
--splice-indexes="-2,-1,0,1,2 0 0" \
--lstm-delay=" [-3,3] [-3,3] [-3,3] " \
--xent-regularize 0.1 \
--include-log-softmax false \
--num-lstm-layers 3 \
--cell-dim 1024 \
--hidden-dim 1024 \
--recurrent-projection-dim 256 \
--non-recurrent-projection-dim 256 \
--label-delay 0 \
--self-repair-scale 0.00001 \
$dir/configs || exit 1;
fi
if [ $stage -le 13 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
fi
touch $dir/egs/.nodelete # keep egs around when that run dies.
steps/nnet3/chain/train.py --stage $train_stage \
--cmd "$decode_cmd" \
--feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
--chain.xent-regularize 0.1 \
--chain.leaky-hmm-coefficient 0.1 \
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.frames-per-iter 1200000 \
--trainer.max-param-change 1.414 \
--trainer.num-epochs $num_epochs \
--trainer.optimization.shrink-value 0.99 \
--trainer.optimization.num-jobs-initial 3 \
--trainer.optimization.num-jobs-final 16 \
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
--egs.chunk-left-context $chunk_left_context \
--egs.chunk-right-context $chunk_right_context \
--egs.dir "$common_egs_dir" \
--cleanup.remove-egs $remove_egs \
--feat-dir data/${train_set}_hires \
--tree-dir $treedir \
--lat-dir exp/tri5a_lats_nodup_sp \
--dir $dir || exit 1;
fi
if [ $stage -le 14 ]; then
# Note: it might appear that this $lang directory is mismatched, and it is as
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
# the lang directory.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
fi
decode_suff=fsh_sw1_tg
graph_dir=$dir/graph_fsh_sw1_tg
if [ $stage -le 15 ]; then
iter_opts=
if [ ! -z $decode_iter ]; then
iter_opts=" --iter $decode_iter "
fi
# decoding options
extra_left_context=$[$chunk_left_context+10]
extra_right_context=$[$chunk_right_context+10]
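# With the defaults above (chunk_left_context=40, chunk_right_context=40) this
# works out to extra_left_context=50 and extra_right_context=50.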
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--nj $num_jobs --cmd "$decode_cmd" $iter_opts \
--extra-left-context $extra_left_context \
--extra-right-context $extra_right_context \
--frames-per-chunk $chunk_width \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1;
) &
done
fi
wait;
exit 0;

View file

@@ -0,0 +1,162 @@
#!/bin/bash
set -e
# based on run_tdnn_7b.sh in the swbd recipe
# configs for 'chain'
affix=
stage=12
train_stage=-10
get_egs_stage=-10
dir=exp/chain/tdnn_7b
decode_iter=
# training options
num_epochs=4
remove_egs=false
common_egs_dir=
# End configuration section.
echo "$0 $@" # Print the command line for logging
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
dir=${dir}${affix:+_$affix}
train_set=train_nodup_sp
ali_dir=exp/tri5a_ali_nodup
treedir=exp/chain/tri6_tree_11000
lang=data/lang_chain
# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
# run those things.
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb true \
--generate-alignments false || exit 1;
if [ $stage -le 9 ]; then
# Get the alignments as lattices (gives the chain training more freedom).
# use the same num-jobs as the alignments
nj=$(cat $ali_dir/num_jobs) || exit 1;
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
data/lang exp/tri5a exp/tri5a_lats_nodup_sp
rm exp/tri5a_lats_nodup_sp/fsts.*.gz # save space
fi
if [ $stage -le 10 ]; then
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
rm -rf $lang
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on we may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi
if [ $stage -le 11 ]; then
# Build a tree using our new topology.
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--leftmost-questions-truncate -1 \
--cmd "$train_cmd" 11000 data/$train_set $lang $ali_dir $treedir
fi
if [ $stage -le 12 ]; then
echo "$0: creating neural net configs";
# create the config files for nnet initialization
steps/nnet3/tdnn/make_configs.py \
--self-repair-scale 0.00001 \
--feat-dir data/${train_set}_hires \
--ivector-dir exp/nnet3/ivectors_${train_set} \
--tree-dir $treedir \
--relu-dim 725 \
--splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \
--use-presoftmax-prior-scale false \
--xent-regularize 0.1 \
--xent-separate-forward-affine true \
--include-log-softmax false \
--final-layer-normalize-target 0.5 \
$dir/configs || exit 1;
fi
if [ $stage -le 13 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
fi
touch $dir/egs/.nodelete # keep egs around when that run dies.
steps/nnet3/chain/train.py --stage $train_stage \
--egs.dir "$common_egs_dir" \
--cmd "$decode_cmd" \
--feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
--chain.xent-regularize 0.1 \
--chain.leaky-hmm-coefficient 0.1 \
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width 150 \
--trainer.num-chunk-per-minibatch 128 \
--trainer.frames-per-iter 1500000 \
--trainer.num-epochs $num_epochs \
--trainer.optimization.num-jobs-initial 3 \
--trainer.optimization.num-jobs-final 16 \
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.max-param-change 2.0 \
--cleanup.remove-egs $remove_egs \
--feat-dir data/${train_set}_hires \
--tree-dir $treedir \
--lat-dir exp/tri5a_lats_nodup_sp \
--dir $dir || exit 1;
fi
if [ $stage -le 14 ]; then
# Note: it might appear that this $lang directory is mismatched, and it is as
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
# the lang directory.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
fi
decode_suff=fsh_sw1_tg
graph_dir=$dir/graph_fsh_sw1_tg
if [ $stage -le 15 ]; then
iter_opts=
if [ ! -z $decode_iter ]; then
iter_opts=" --iter $decode_iter "
fi
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--nj $num_jobs --cmd "$decode_cmd" $iter_opts \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}${decode_iter:+_$decode_iter}_fsh_sw1_{tg,fg} || exit 1;
) &
done
fi
wait;
exit 0;

View file

@@ -0,0 +1,141 @@
#!/bin/bash
. ./cmd.sh
set -e
stage=1
train_stage=-10
generate_alignments=true # false if doing chain training
speed_perturb=true
. ./path.sh
. ./utils/parse_options.sh
# perturbed data preparation
train_set=train_nodup
if [ "$speed_perturb" == "true" ]; then
if [ $stage -le 1 ]; then
# Although the nnet will be trained on high-resolution data, we still have to perturb the normal-resolution data to get the alignments
# _sp stands for speed-perturbed
for datadir in train_nodup; do
utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1
utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2
utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2
utils/validate_data_dir.sh --no-feats data/${datadir}_tmp
rm -r data/temp1 data/temp2
mfccdir=mfcc_perturbed
steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \
data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
utils/fix_data_dir.sh data/${datadir}_tmp
utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0
utils/fix_data_dir.sh data/${datadir}_sp
rm -r data/temp0 data/${datadir}_tmp
done
fi
if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then
#obtain the alignment of the perturbed data
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
data/train_nodup_sp data/lang_nosp exp/tri5a exp/tri5a_ali_nodup_sp || exit 1
fi
train_set=train_nodup_sp
fi
if [ $stage -le 3 ]; then
mfccdir=mfcc_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_swbd-$date/s5b/$mfccdir/storage $mfccdir/storage
fi
# the 100k_nodup directory is copied separately, as we want to use
# exp/tri1b_ali_100k_nodup for LDA+MLLT training; the main train directory
# might be speed-perturbed.
for dataset in $train_set train_100k_nodup; do
utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
# scale the waveforms, this is useful as we don't use CMVN
data_dir=data/${dataset}_hires
cat $data_dir/wav.scp | python -c "
import sys, os, subprocess, re, random
scale_low = 1.0/8
scale_high = 2.0
for line in sys.stdin.readlines():
  if len(line.strip()) == 0:
    continue
  print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high))
"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1;
mv $data_dir/wav.scp_scaled $data_dir/wav.scp
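# Illustrative effect on a wav.scp entry (hypothetical utterance id and path;
# the volume factor is drawn uniformly from [0.125, 2.0]):
#   before: sw02001-A sph2pipe -f wav -p -c 1 /data/swb1/sw02001.sph |
#   after:  sw02001-A sph2pipe -f wav -p -c 1 /data/swb1/sw02001.sph | sox --vol 1.37 -t wav - -t wav - |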
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir;
# Remove the small number of utterances that couldn't be extracted for some
# reason (e.g. too short; no such file).
utils/fix_data_dir.sh data/${dataset}_hires;
done
for dataset in eval2000 rt03; do
# Create MFCCs for the eval set
utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \
data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems
done
# Take the first 30k utterances (about 1/8th of the data); this will be used
# for the diag-UBM training.
utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires
local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr
fi
# ivector extractor training
if [ $stage -le 5 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
# the transform (12th iter is the last), any further training is pointless.
# this decision is based on fisher_english
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--splice-opts "--left-context=3 --right-context=3" \
5500 90000 data/train_100k_nodup_hires \
data/lang_nosp exp/tri1b_ali exp/nnet3/tri2b
fi
if [ $stage -le 6 ]; then
# To train a diagonal UBM we don't need very much data, so use the smallest subset.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \
data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri2b exp/nnet3/diag_ubm
fi
if [ $stage -le 7 ]; then
# iVector extractors can be sensitive to the amount of data, but this one has a
# fairly small dim (defaults to 100) so we don't use all of it, we use just the
# 100k subset (just under half the data).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
fi
if [ $stage -le 8 ]; then
# We extract iVectors on all the train_nodup data, which will be what we
# train the system on.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1;
for data_set in eval2000 rt03; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1;
done
fi
exit 0;

View file

@@ -0,0 +1,158 @@
#!/bin/bash
# Copyright 2015 Johns Hopkins University (Author: Daniel Povey).
# 2015 Vijayaditya Peddinti
# 2015 Xingyu Na
# 2015 Pegah Ghahrmani
# Apache 2.0.
# this is a basic lstm script
# LSTM script runs for more epochs than the TDNN script
# and each epoch takes twice the time
# At this script level we don't support not running on GPU, as it would be painfully slow.
# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false
stage=0
train_stage=-10
affix=
common_egs_dir=
reporting_email=
# LSTM options
splice_indexes="-2,-1,0,1,2 0 0"
lstm_delay=" -1 -2 -3 "
label_delay=5
num_lstm_layers=3
cell_dim=1024
hidden_dim=1024
recurrent_projection_dim=256
non_recurrent_projection_dim=256
chunk_width=20
chunk_left_context=40
chunk_right_context=0
# training options
num_epochs=8
initial_effective_lrate=0.0003
final_effective_lrate=0.00003
num_jobs_initial=3
num_jobs_final=15
momentum=0.5
num_chunk_per_minibatch=100
samples_per_iter=20000
remove_egs=true
#decode options
extra_left_context=
extra_right_context=
frames_per_chunk=
# End configuration section.
echo "$0 $@" # Print the command line for logging
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
dir=exp/nnet3/lstm
dir=$dir${affix:+_$affix}
if [ $label_delay -gt 0 ]; then dir=${dir}_ld$label_delay; fi
train_set=train_nodup_sp
ali_dir=exp/tri5a_ali_nodup_sp
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb true || exit 1;
if [ $stage -le 9 ]; then
echo "$0: creating neural net configs";
config_extra_opts=()
[ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay")
steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \
--feat-dir data/${train_set}_hires \
--ivector-dir exp/nnet3/ivectors_${train_set} \
--ali-dir $ali_dir \
--num-lstm-layers $num_lstm_layers \
--splice-indexes "$splice_indexes " \
--cell-dim $cell_dim \
--hidden-dim $hidden_dim \
--recurrent-projection-dim $recurrent_projection_dim \
--non-recurrent-projection-dim $non_recurrent_projection_dim \
--label-delay $label_delay \
--self-repair-scale 0.00001 \
$dir/configs || exit 1;
fi
if [ $stage -le 10 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
steps/nnet3/train_rnn.py --stage=$train_stage \
--cmd="$decode_cmd" \
--feat.online-ivector-dir=exp/nnet3/ivectors_${train_set} \
--feat.cmvn-opts="--norm-means=false --norm-vars=false" \
--trainer.num-epochs=$num_epochs \
--trainer.samples-per-iter=$samples_per_iter \
--trainer.optimization.num-jobs-initial=$num_jobs_initial \
--trainer.optimization.num-jobs-final=$num_jobs_final \
--trainer.optimization.initial-effective-lrate=$initial_effective_lrate \
--trainer.optimization.final-effective-lrate=$final_effective_lrate \
--trainer.optimization.shrink-value 0.99 \
--trainer.rnn.num-chunk-per-minibatch=$num_chunk_per_minibatch \
--trainer.optimization.momentum=$momentum \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.dir="$common_egs_dir" \
--cleanup.remove-egs=$remove_egs \
--cleanup.preserve-model-interval=500 \
--use-gpu=true \
--feat-dir=data/${train_set}_hires \
--ali-dir=$ali_dir \
--lang=data/lang \
--reporting.email="$reporting_email" \
--dir=$dir || exit 1;
fi
graph_dir=exp/tri5a/graph_sw1_tg
if [ $stage -le 11 ]; then
if [ -z $extra_left_context ]; then
extra_left_context=$chunk_left_context
fi
if [ -z $extra_right_context ]; then
extra_right_context=$chunk_right_context
fi
if [ -z $frames_per_chunk ]; then
frames_per_chunk=$chunk_width
fi
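# With the defaults above this leaves extra_left_context=40, extra_right_context=0
# and frames_per_chunk=20, matching the training-time chunk configuration.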
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
steps/nnet3/lstm/decode.sh --nj $num_jobs --cmd "$decode_cmd" \
--extra-left-context $extra_left_context \
--extra-right-context $extra_right_context \
--frames-per-chunk "$frames_per_chunk" \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_fsh_sw1_tg || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}_fsh_sw1_{tg,fg} || exit 1;
) &
done
fi
wait;
exit 0;

View file

@@ -0,0 +1,99 @@
#!/bin/bash
# this is the standard "tdnn" system, built in nnet3; it's what we used to
# call multi-splice.
. ./cmd.sh
# At this script level we don't support not running on GPU, as it would be painfully slow.
# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false,
# --num-threads 16 and --minibatch-size 128.
stage=0
affix=
train_stage=-10
common_egs_dir=
reporting_email=
remove_egs=true
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
dir=exp/nnet3/tdnn
dir=$dir${affix:+_$affix}
train_set=train_nodup_sp
ali_dir=exp/tri5a_ali_nodup_sp
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb true || exit 1;
if [ $stage -le 9 ]; then
echo "$0: creating neural net configs";
# create the config files for nnet initialization
python steps/nnet3/tdnn/make_configs.py \
--feat-dir data/${train_set}_hires \
--ivector-dir exp/nnet3/ivectors_${train_set} \
--ali-dir $ali_dir \
--relu-dim 1024 \
--splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -3,3 -7,2 0" \
--use-presoftmax-prior-scale true \
$dir/configs || exit 1;
fi
if [ $stage -le 10 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
steps/nnet3/train_dnn.py --stage=$train_stage \
--cmd="$decode_cmd" \
--feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
--feat.cmvn-opts="--norm-means=false --norm-vars=false" \
--trainer.num-epochs 4 \
--trainer.optimization.num-jobs-initial 3 \
--trainer.optimization.num-jobs-final 16 \
--trainer.optimization.initial-effective-lrate 0.0017 \
--trainer.optimization.final-effective-lrate 0.00017 \
--egs.dir "$common_egs_dir" \
--cleanup.remove-egs $remove_egs \
--cleanup.preserve-model-interval 500 \
--use-gpu true \
--feat-dir=data/${train_set}_hires \
--ali-dir $ali_dir \
--lang data/lang \
--reporting.email="$reporting_email" \
--dir=$dir || exit 1;
fi
graph_dir=exp/tri5a/graph_fsh_sw1_tg
if [ $stage -le 11 ]; then
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_fsh_sw1_tg || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}_fsh_sw1_{tg,fg} || exit 1;
) &
done
fi
wait;
exit 0;

View file

@@ -0,0 +1,188 @@
#!/bin/bash
set -o pipefail
# this is run_tdnn_discriminative.sh
# This script does discriminative training on top of CE nnet3 system.
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
# since the lattice generation runs in about real time, so it takes on the order of
# 1000 hours of CPU time.
#
. ./cmd.sh
stage=0
train_stage=-10 # can be used to start training in the middle.
get_egs_stage=-10
use_gpu=true # for training
cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
# alignments and degs).
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
srcdir=exp/nnet3/tdnn
train_data_dir=data/train_nodup_sp_hires
online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp
degs_dir= # If provided, will skip the degs directory creation
lats_dir= # If provided, will skip denlats creation
## Objective options
criterion=smbr
one_silence_class=true
dir=${srcdir}_${criterion}
## Egs options
frames_per_eg=150
frames_overlap_per_eg=30
truncate_deriv_weights=10
## Nnet training options
effective_learning_rate=0.0000125
max_param_change=1
num_jobs_nnet=4
num_epochs=4
regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options
minibatch_size=64
adjust_priors=true # May need to be set to false
# because it does not help in some setups
modify_learning_rates=true
last_layer_factor=0.1
## Decode options
decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
num_threads=1
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
fi
if [ ! -f ${srcdir}/final.mdl ]; then
echo "$0: expected ${srcdir}/final.mdl to exist; first run run_tdnn.sh or run_lstm.sh"
exit 1;
fi
if [ $stage -le 1 ]; then
# hardcode no-GPU for alignment, although you could use GPU [you wouldn't
# get excellent GPU utilization though.]
nj=100 # have a high number of jobs because this could take a while, and we might
# have some stragglers.
steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \
--online-ivector-dir $online_ivector_dir \
--nj $nj $train_data_dir data/lang $srcdir ${srcdir}_ali ;
fi
if [ -z "$lats_dir" ]; then
lats_dir=${srcdir}_denlats
if [ $stage -le 2 ]; then
nj=100
# this doesn't really affect anything strongly, except the num-jobs for one of
# the phases of get_egs_discriminative.sh below.
num_threads_denlats=6
subsplit=40 # number of jobs that run per job (but 2 run at a time, so the total number of jobs is 80,
# giving total slots = 80 * 6 = 480).
steps/nnet3/make_denlats.sh --cmd "$decode_cmd" --determinize true \
--online-ivector-dir $online_ivector_dir \
--nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
$train_data_dir data/lang $srcdir ${lats_dir} ;
fi
fi
model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'`
model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'`
extra_left_context=0  # not set elsewhere in this script; assumed default
extra_right_context=0
left_context=$[model_left_context + extra_left_context]
right_context=$[model_right_context + extra_right_context]
valid_left_context=$[left_context + frames_per_eg]
valid_right_context=$[right_context + frames_per_eg]
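# For example (hypothetical numbers): a model with left-context 16 and
# right-context 12 would give left_context=16, right_context=12,
# valid_left_context=166 and valid_right_context=162 with frames_per_eg=150.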
frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)"
fi
cmvn_opts=`cat $srcdir/cmvn_opts`
if [ -z "$degs_dir" ]; then
degs_dir=${srcdir}_degs
if [ $stage -le 3 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then
utils/create_split_dir.pl \
/export/b0{1,2,12,13}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage
fi
# allow a higher maximum num-jobs if the degs are spread across distributed storage
if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi
degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true"
steps/nnet3/get_egs_discriminative.sh \
--cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \
--adjust-priors $adjust_priors \
--online-ivector-dir $online_ivector_dir \
--left-context $left_context --right-context $right_context \
--valid-left-context $valid_left_context --valid-right-context $valid_right_context \
--priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \
--frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \
$train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ;
fi
fi
if [ $stage -le 4 ]; then
steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \
--stage $train_stage \
--effective-lrate $effective_learning_rate --max-param-change $max_param_change \
--criterion $criterion --drop-frames true \
--num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \
--num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \
--regularization-opts "$regularization_opts" \
--truncate-deriv-weights $truncate_deriv_weights --adjust-priors $adjust_priors \
--modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \
${degs_dir} $dir
fi
graph_dir=exp/tri5a/graph_fsh_sw1_tg
if [ $stage -le 5 ]; then
for x in `seq $decode_start_epoch $num_epochs`; do
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
iter=epoch$x.adj
steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_fsh_sw1_tg_$iter ;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}_fsh_sw1_{tg,fg}_$iter ;
) &
done
done
fi
wait;
if [ $stage -le 6 ] && $cleanup; then
# if you run with "--cleanup true --stage 6" you can clean up.
rm ${lats_dir}/lat.*.gz || true
rm ${srcdir}_ali/ali.*.gz || true
steps/nnet2/remove_egs.sh ${srcdir}_degs || true
fi
exit 0;