trunk: updates to librispeech recipe, with better 'multi-splice' version of pnorm online-nnet2 recipe.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4765 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2015-01-10 22:02:55 +00:00
Parent 2bbbea9efe
Commit b7c7005024
8 changed files with 667 additions and 84 deletions

View File

@@ -140,6 +140,8 @@
### online-nnet2 results with a model trained on all(960h) of the training data
### Note: these results are now superseded by the multi-splice (_ms_) results below.
### Be careful when comparing, as the _ms_ results don't yet have the _fglarge tests.
%WER 4.90 [ 2663 / 54402, 388 ins, 273 del, 2002 sub ] exp/nnet2_online/nnet_a_online/decode_dev_clean_fglarge/wer_13
%WER 5.19 [ 2822 / 54402, 406 ins, 311 del, 2105 sub ] exp/nnet2_online/nnet_a_online/decode_dev_clean_tglarge/wer_13
%WER 6.60 [ 3593 / 54402, 457 ins, 426 del, 2710 sub ] exp/nnet2_online/nnet_a_online/decode_dev_clean_tgmed/wer_11
@@ -192,3 +194,65 @@
%WER 24.18 [ 12317 / 50948, 1284 ins, 1732 del, 9301 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.5/wer_15
%WER 24.19 [ 12323 / 50948, 1327 ins, 1686 del, 9310 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.75/wer_15
## Multi-splice version of online recipe.
# for x in exp/nnet2_online/nnet_ms_i2/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done
%WER 4.72 [ 2568 / 54402, 390 ins, 258 del, 1920 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tglarge/wer_12
%WER 5.90 [ 3212 / 54402, 345 ins, 441 del, 2426 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tgmed/wer_14
%WER 6.64 [ 3612 / 54402, 401 ins, 479 del, 2732 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tgsmall/wer_12
%WER 13.11 [ 6680 / 50948, 797 ins, 866 del, 5017 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_other_tglarge/wer_15
%WER 15.56 [ 7925 / 50948, 727 ins, 1261 del, 5937 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_other_tgmed/wer_15
%WER 17.10 [ 8714 / 50948, 733 ins, 1510 del, 6471 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_other_tgsmall/wer_15
# for x in exp/nnet2_online/nnet_ms_a_online/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done
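# Key to the decode-directory suffixes (added for clarity; based on the stages in
# local/online/run_nnet2_ms.sh): no suffix = online decoding that carries iVector
# state forward across a speaker's utterances; _utt = each utterance decoded
# separately (--per-utt true); _utt_offline = per-utterance, but with the iVector
# estimated from the whole utterance (--online false).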
%WER 4.83 [ 2629 / 54402, 393 ins, 264 del, 1972 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge/wer_13
%WER 5.01 [ 2726 / 54402, 402 ins, 270 del, 2054 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt/wer_13
%WER 4.87 [ 2647 / 54402, 386 ins, 290 del, 1971 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt_offline/wer_14
%WER 6.05 [ 3294 / 54402, 409 ins, 392 del, 2493 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed/wer_12
%WER 6.30 [ 3428 / 54402, 389 ins, 434 del, 2605 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt/wer_13
%WER 6.09 [ 3311 / 54402, 393 ins, 417 del, 2501 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt_offline/wer_13
%WER 6.87 [ 3740 / 54402, 390 ins, 547 del, 2803 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall/wer_13
%WER 7.21 [ 3921 / 54402, 440 ins, 535 del, 2946 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt/wer_12
%WER 6.95 [ 3783 / 54402, 415 ins, 543 del, 2825 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt_offline/wer_13
%WER 13.21 [ 6732 / 50948, 812 ins, 852 del, 5068 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge/wer_14
%WER 14.24 [ 7254 / 50948, 884 ins, 959 del, 5411 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt/wer_15
%WER 13.63 [ 6945 / 50948, 890 ins, 856 del, 5199 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt_offline/wer_14
%WER 15.69 [ 7996 / 50948, 800 ins, 1189 del, 6007 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed/wer_14
%WER 16.63 [ 8473 / 50948, 809 ins, 1317 del, 6347 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt/wer_15
%WER 16.09 [ 8197 / 50948, 872 ins, 1130 del, 6195 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt_offline/wer_13
%WER 17.15 [ 8736 / 50948, 756 ins, 1424 del, 6556 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall/wer_14
%WER 18.23 [ 9288 / 50948, 782 ins, 1585 del, 6921 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt/wer_15
%WER 17.54 [ 8936 / 50948, 813 ins, 1425 del, 6698 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt_offline/wer_14
## Note: this learning rate is the effective learning rate; it gets multiplied by the num-jobs.
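# (Worked example of that note, added for clarity: with --num-jobs-nnet 6 as in
# local/online/run_nnet2_ms_disc.sh, the effective rate 0.000005 used below
# corresponds to a per-job learning rate of about 6 * 0.000005 = 0.00003.)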
# for x in exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch*{clean,other}*; do grep WER $x/wer_* | utils/best_wer.sh ; done
%WER 5.92 [ 3221 / 54402, 352 ins, 439 del, 2430 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_clean_tgmed/wer_14
%WER 6.63 [ 3605 / 54402, 399 ins, 481 del, 2725 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_clean_tgsmall/wer_12
%WER 4.44 [ 2416 / 54402, 385 ins, 204 del, 1827 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_clean_tglarge/wer_14
%WER 5.52 [ 3001 / 54402, 360 ins, 340 del, 2301 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_clean_tgmed/wer_15
%WER 6.22 [ 3384 / 54402, 388 ins, 411 del, 2585 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_clean_tgsmall/wer_14
%WER 4.39 [ 2386 / 54402, 368 ins, 208 del, 1810 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_clean_tglarge/wer_15 **
%WER 5.41 [ 2945 / 54402, 338 ins, 339 del, 2268 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_clean_tgmed/wer_16
%WER 6.13 [ 3333 / 54402, 371 ins, 410 del, 2552 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_clean_tgsmall/wer_15
%WER 4.39 [ 2387 / 54402, 377 ins, 199 del, 1811 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tglarge/wer_14
%WER 5.36 [ 2918 / 54402, 328 ins, 338 del, 2252 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgmed/wer_17
%WER 6.08 [ 3305 / 54402, 369 ins, 396 del, 2540 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgsmall/wer_15
%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14
%WER 5.35 [ 2909 / 54402, 328 ins, 339 del, 2242 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgmed/wer_17
%WER 6.05 [ 3291 / 54402, 384 ins, 381 del, 2526 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgsmall/wer_14
%WER 13.45 [ 6850 / 50948, 808 ins, 876 del, 5166 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tglarge/wer_15
%WER 15.65 [ 7975 / 50948, 714 ins, 1311 del, 5950 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tgmed/wer_16
%WER 17.12 [ 8722 / 50948, 739 ins, 1489 del, 6494 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tgsmall/wer_15
%WER 12.84 [ 6544 / 50948, 877 ins, 703 del, 4964 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_other_tglarge/wer_16
%WER 14.87 [ 7578 / 50948, 742 ins, 1102 del, 5734 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_other_tgmed/wer_18
%WER 16.25 [ 8277 / 50948, 823 ins, 1171 del, 6283 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_other_tgsmall/wer_15
%WER 12.80 [ 6522 / 50948, 869 ins, 698 del, 4955 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_other_tglarge/wer_17 **
%WER 14.80 [ 7542 / 50948, 774 ins, 1034 del, 5734 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_other_tgmed/wer_17
%WER 16.14 [ 8225 / 50948, 763 ins, 1242 del, 6220 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_other_tgsmall/wer_17
%WER 12.82 [ 6531 / 50948, 871 ins, 710 del, 4950 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_other_tglarge/wer_18
%WER 14.82 [ 7549 / 50948, 818 ins, 958 del, 5773 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_other_tgmed/wer_16
%WER 16.10 [ 8204 / 50948, 795 ins, 1165 del, 6244 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_other_tgsmall/wer_16
%WER 12.85 [ 6549 / 50948, 902 ins, 672 del, 4975 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_other_tglarge/wer_17
%WER 14.80 [ 7540 / 50948, 800 ins, 1025 del, 5715 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_other_tgmed/wer_18
%WER 16.10 [ 8201 / 50948, 789 ins, 1240 del, 6172 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_other_tgsmall/wer_18

View File

@@ -0,0 +1,13 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_k20=true
option allow_k20=true
option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'
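# Usage sketch (ours, not part of the original file; "exp/foo" and "some-kaldi-program"
# are placeholders): queue.pl maps generic switches onto the qsub flags defined above, e.g.
#   queue.pl --config conf/queue_no_k20.conf --gpu 1 --allow-k20 false \
#     exp/foo/log/train.1.log some-kaldi-program ...
# adds roughly "-l gpu=1 -q g.q -l 'hostname=!g01*&!g02*&!b06*'" to the base
# "qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*" command line.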

View File

@@ -3,6 +3,8 @@
# example script for online-nnet2 system training and decoding,
# based on the one for fisher-English.
# note: run_nnet2_ms.sh gives better results, but it's slower to train.
. cmd.sh
@@ -37,85 +39,7 @@ else
parallel_opts="-pe smp $num_threads"
fi
if [ $stage -le 1 ]; then
# Create high-resolution MFCC features (with 40 cepstra instead of 13).
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=mfcc
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
fi
for datadir in train_960 dev_clean dev_other; do
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
done
# now create some data subsets.
# mixed is the clean+other data.
# 30k is 1/10 of the data (around 100 hours), 60k is 1/5th of it (around 200 hours).
utils/subset_data_dir.sh data/train_960_hires 30000 data/train_mixed_hires_30k
utils/subset_data_dir.sh data/train_960_hires 60000 data/train_mixed_hires_60k
fi
if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We align a subset of training data for
# this purpose.
utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/train_mixed_hires_30k/utt2spk) \
data/train_960 data/train_960_30k
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_960_30k data/lang exp/tri6b exp/nnet2_online/tri6b_ali_30k
fi
if [ $stage -le 3 ]; then
# Train a small system just for its LDA+MLLT transform. We use --num-iters 13
# because after we get the transform (12th iter is the last), any further
# training is pointless.
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--realign-iters "" \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_mixed_hires_30k data/lang \
exp/nnet2_online/tri6b_ali_30k exp/nnet2_online/tri7b
fi
if [ $stage -le 4 ]; then
mkdir -p exp/nnet2_online
# To train a diagonal UBM we don't need very much data, so use a small subset
# (actually, it's not that small: still around 100 hours).
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \
data/train_mixed_hires_30k 512 exp/nnet2_online/tri7b exp/nnet2_online/diag_ubm
fi
if [ $stage -le 5 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100) so we don't use all of it,
# we use just the 60k subset (about one fifth of the data, or 200 hours).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_mixed_hires_60k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 6 ]; then
ivectordir=exp/nnet2_online/ivectors_train_960_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we train the
# system on. With --utts-per-spk-max 2, the script pairs the utterances
# into twos, and treats each of these pairs as one speaker. Note that these
# are extracted 'online'.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 data/train_960_hires exp/nnet2_online/extractor $ivectordir || exit 1;
fi
local/online/run_nnet2_common.sh --stage $stage
if [ $stage -le 7 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
@@ -126,9 +50,8 @@ if [ $stage -le 7 ]; then
# The size of the system is kept rather smaller than the run_7a_960.sh system:
# this is because we want it to be small enough that we could plausibly run it
# in real-time.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--samples-per-iter 400000 \
--num-epochs 6 --num-epochs-extra 2 \
steps/nnet2/train_pnorm_simple2.sh --stage $train_stage \
--num-epochs 8 --num-jobs-nnet 6 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--cmvn-opts "--norm-means=false --norm-vars=false" \
@@ -136,7 +59,6 @@ if [ $stage -le 7 ]; then
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--io-opts "-tc 12" \
--num-jobs-nnet 6 \
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
--cmd "$decode_cmd" \
--pnorm-input-dim 3500 \

View File

@@ -0,0 +1,94 @@
#!/bin/bash
# this script contains some common (shared) parts of the run_nnet*.sh scripts.
. cmd.sh
stage=0
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if [ $stage -le 1 ]; then
# Create high-resolution MFCC features (with 40 cepstra instead of 13).
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=mfcc
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
fi
for datadir in train_960 dev_clean dev_other; do
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
done
# now create some data subsets.
# mixed is the clean+other data.
# 30k is 1/10 of the data (around 100 hours), 60k is 1/5th of it (around 200 hours).
utils/subset_data_dir.sh data/train_960_hires 30000 data/train_mixed_hires_30k
utils/subset_data_dir.sh data/train_960_hires 60000 data/train_mixed_hires_60k
fi
if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We align a subset of training data for
# this purpose.
utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/train_mixed_hires_30k/utt2spk) \
data/train_960 data/train_960_30k
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_960_30k data/lang exp/tri6b exp/nnet2_online/tri6b_ali_30k
fi
if [ $stage -le 3 ]; then
# Train a small system just for its LDA+MLLT transform. We use --num-iters 13
# because after we get the transform (12th iter is the last), any further
# training is pointless.
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--realign-iters "" \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_mixed_hires_30k data/lang \
exp/nnet2_online/tri6b_ali_30k exp/nnet2_online/tri7b
fi
if [ $stage -le 4 ]; then
mkdir -p exp/nnet2_online
# To train a diagonal UBM we don't need very much data, so use a small subset
# (actually, it's not that small: still around 100 hours).
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \
data/train_mixed_hires_30k 512 exp/nnet2_online/tri7b exp/nnet2_online/diag_ubm
fi
if [ $stage -le 5 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100) so we don't use all of it,
# we use just the 60k subset (about one fifth of the data, or 200 hours).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_mixed_hires_60k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 6 ]; then
ivectordir=exp/nnet2_online/ivectors_train_960_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we train the
# system on. With --utts-per-spk-max 2, the script pairs the utterances
# into twos, and treats each of these pairs as one speaker. Note that these
# are extracted 'online'.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 data/train_960_hires exp/nnet2_online/extractor $ivectordir || exit 1;
fi
exit 0;

View File

@@ -0,0 +1,159 @@
#!/bin/bash
# This script does discriminative training on top of the online
# system trained in run_nnet2.sh.
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
# since the lattice generation runs at about real time and so takes on the order of
# 1000 hours of CPU time.
#
# Note: rather than using any features we have dumped on disk, this script
# regenerates them from the wav data three times-- when we do lattice
# generation, numerator alignment and discriminative training. This made the
# script easier to write and more generic, because we don't have to know where
# the features and the iVectors are, but of course it's a little inefficient.
# The time taken is dominated by the lattice generation anyway, so this isn't
# a huge deal.
. cmd.sh
stage=0
train_stage=-10
use_gpu=true
srcdir=exp/nnet2_online/nnet_a
criterion=smbr
drop_frames=false # only relevant for MMI actually.
learning_rate=0.0001
train_stage=-10 # can be used to start training in the middle.
decode_start_epoch=0 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
num_epochs=4
cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
# alignments and degs).
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
fi
if [ ! -f ${srcdir}_online/final.mdl ]; then
echo "$0: expected ${srcdir}_online/final.mdl to exist; first run run_nnet2_ms.sh."
exit 1;
fi
if [ $stage -le 1 ]; then
nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of
# the phases of get_egs_discriminative2.sh below.
num_threads_denlats=6
subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
# total slots = 80 * 6 = 480).
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
data/train_960_hires data/lang $srcdir ${srcdir}_denlats || exit 1;
# the command below is a more generic, but slower, way to do it.
#steps/online/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
# --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
# data/train_960 data/lang ${srcdir}_online ${srcdir}_denlats || exit 1;
fi
if [ $stage -le 2 ]; then
# hardcode no-GPU for alignment, although you could use GPU [you wouldn't
# get excellent GPU utilization though.]
nj=350 # have a high number of jobs because this could take a while, and we might
# have some stragglers.
use_gpu=no
gpu_opts=
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--nj $nj data/train_960_hires data/lang $srcdir ${srcdir}_ali || exit 1;
# the command below is a more generic, but slower, way to do it.
# steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
# --nj $nj data/train_960 data/lang ${srcdir}_online ${srcdir}_ali || exit 1;
fi
if [ $stage -le 3 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then
utils/create_split_dir.pl \
/export/b0{1,2,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage
fi
# have a higher maximum num-jobs if the degs are spread across multiple filesystems (i.e. if ${srcdir}_degs/storage exists).
if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi
steps/nnet2/get_egs_discriminative2.sh \
--cmd "$decode_cmd -tc $max_jobs" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--criterion $criterion --drop-frames $drop_frames \
data/train_960_hires data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1;
# the command below is a more generic, but slower, way to do it.
#steps/online/nnet2/get_egs_discriminative2.sh \
# --cmd "$decode_cmd -tc $max_jobs" \
# --criterion $criterion --drop-frames $drop_frames \
# data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1;
fi
if [ $stage -le 4 ]; then
steps/nnet2/train_discriminative2.sh --cmd "$decode_cmd $parallel_opts" \
--stage $train_stage \
--learning-rate $learning_rate \
--criterion $criterion --drop-frames $drop_frames \
--num-epochs $num_epochs \
--num-jobs-nnet 2 --num-threads $num_threads \
${srcdir}_degs ${srcdir}_${criterion}_${learning_rate} || exit 1;
fi
if [ $stage -le 5 ]; then
dir=${srcdir}_${criterion}_${learning_rate}
ln -sf $(readlink -f ${srcdir}_online/conf) $dir/conf # so it acts like an online-decoding directory
for epoch in $(seq $decode_start_epoch $num_epochs); do
for test in dev_clean dev_other; do
(
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 50 \
--iter epoch$epoch exp/tri6b/graph_tgsmall data/${test} $dir/decode_epoch${epoch}_${test}_tgsmall || exit 1
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/${test} $dir/decode_epoch${epoch}_${test}_{tgsmall,tgmed} || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test $dir/decode_epoch${epoch}_${test}_{tgsmall,tglarge} || exit 1;
) &
done
done
wait
for dir in $dir/decode*; do grep WER $dir/wer_* | utils/best_wer.sh; done
fi
if [ $stage -le 6 ] && $cleanup; then
# if you run with "--cleanup true --stage 6" you can clean up.
rm ${srcdir}_denlats/lat.*.gz || true
rm ${srcdir}_ali/ali.*.gz || true
steps/nnet2/remove_egs.sh ${srcdir}_degs || true
fi
exit 0;

View File

@@ -0,0 +1,159 @@
#!/bin/bash
# This is the "multi-splice" version of the online-nnet2 training script.
# It's currently the best recipe.
# You'll notice that we splice over successively larger windows as we go deeper
# into the network.
. cmd.sh
stage=7
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_ms_a
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
parallel_opts="$parallel_opts --config conf/queue_no_k20.conf --allow-k20 false"
# That config is like the default config embedded in queue.pl, but with the following lines added:
# default allow_k20=true
# option allow_k20=true
# option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'
# It's a workaround for an NVidia CUDA library bug in our currently installed version
# of the CUDA toolkit, which only shows up on K20s.
fi
# the _a is in case I want to change the parameters.
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
fi
# do the common parts of the script.
local/online/run_nnet2_common.sh --stage $stage
if [ $stage -le 7 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
# The size of the system is kept rather smaller than the run_7a_960.sh system:
# this is because we want it to be small enough that we could plausibly run it
# in real-time.
steps/nnet2/train_multisplice_accel2.sh --stage $train_stage \
--num-epochs 8 --num-jobs-initial 3 --num-jobs-final 18 \
--num-hidden-layers 6 --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer3/-3:3 layer4/-7:2" \
--feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--io-opts "--max-jobs-run 12" \
--initial-effective-lrate 0.0015 --final-effective-lrate 0.00015 \
--cmd "$decode_cmd" \
--pnorm-input-dim 3500 \
--pnorm-output-dim 350 \
--mix-up 12000 \
data/train_960_hires data/lang exp/tri6b $dir || exit 1;
fi
if [ $stage -le 8 ]; then
# dump iVectors for the testing data.
for test in dev_clean dev_other; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
data/${test}_hires exp/nnet2_online/extractor exp/nnet2_online/ivectors_$test || exit 1;
done
fi
if [ $stage -le 9 ]; then
# this does offline decoding that should give about the same results as the
# real online decoding (the one with --per-utt true)
for test in dev_clean dev_other; do
steps/nnet2/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
--online-ivector-dir exp/nnet2_online/ivectors_${test} \
exp/tri6b/graph_tgsmall data/${test}_hires $dir/decode_${test}_tgsmall || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test $dir/decode_${test}_{tgsmall,tglarge} || exit 1;
done
fi
if [ $stage -le 10 ]; then
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
fi
if [ $stage -le 11 ]; then
# do the actual online decoding with iVectors, carrying info forward from
# previous utterances of the same speaker.
for test in dev_clean dev_other; do
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
exp/tri6b/graph_tgsmall data/$test ${dir}_online/decode_${test}_tgsmall || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tgmed} || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tglarge} || exit 1;
done
fi
if [ $stage -le 12 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information.
for test in dev_clean dev_other; do
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
--per-utt true exp/tri6b/graph_tgsmall data/$test ${dir}_online/decode_${test}_tgsmall_utt || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tgmed}_utt || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tglarge}_utt || exit 1;
done
fi
if [ $stage -le 13 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information, but looks to the end
# of the utterance while computing the iVector (--online false)
for test in dev_clean dev_other; do
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
--per-utt true --online false exp/tri6b/graph_tgsmall data/$test \
${dir}_online/decode_${test}_tgsmall_utt_offline || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tgmed}_utt_offline || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tglarge}_utt_offline || exit 1;
done
fi
exit 0;

View File

@@ -0,0 +1,160 @@
#!/bin/bash
# This script does discriminative training on top of the online, multi-splice
# system trained in run_nnet2_ms.sh.
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
# since the lattice generation runs at about real time and so takes on the order of
# 1000 hours of CPU time.
#
# Note: rather than using any features we have dumped on disk, this script
# regenerates them from the wav data three times-- when we do lattice
# generation, numerator alignment and discriminative training. This made the
# script easier to write and more generic, because we don't have to know where
# the features and the iVectors are, but of course it's a little inefficient.
# The time taken is dominated by the lattice generation anyway, so this isn't
# a huge deal.
. cmd.sh
stage=0
train_stage=-10
use_gpu=true
srcdir=exp/nnet2_online/nnet_ms_a
criterion=smbr
drop_frames=false # only matters for MMI anyway.
effective_lrate=0.000005
num_jobs_nnet=6
train_stage=-10 # can be used to start training in the middle.
decode_start_epoch=0 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
num_epochs=4
cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
# alignments and degs).
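# Example invocation (ours, for illustration; utils/parse_options.sh below turns the
# variables above into command-line flags):
#   local/online/run_nnet2_ms_disc.sh --criterion smbr --effective-lrate 0.000005 \
#     --num-epochs 4 --stage 0
# With those (default) values the output goes to exp/nnet2_online/nnet_ms_a_smbr_0.000005,
# which is the directory the sMBR numbers in the RESULTS file refer to.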
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
fi
if [ ! -f ${srcdir}_online/final.mdl ]; then
echo "$0: expected ${srcdir}_online/final.mdl to exist; first run run_nnet2_ms.sh."
exit 1;
fi
if [ $stage -le 1 ]; then
nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of
# the phases of get_egs_discriminative2.sh below.
num_threads_denlats=6
subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
# max total slots = 80 * 6 = 480).
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
data/train_960_hires data/lang $srcdir ${srcdir}_denlats || exit 1;
# the command below is a more generic, but slower, way to do it.
#steps/online/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
# --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
# data/train_960 data/lang ${srcdir}_online ${srcdir}_denlats || exit 1;
fi
if [ $stage -le 2 ]; then
# hardcode no-GPU for alignment, although you could use GPU [you wouldn't
# get excellent GPU utilization though.]
nj=350 # have a high number of jobs because this could take a while, and we might
# have some stragglers.
use_gpu=no
gpu_opts=
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--nj $nj data/train_960_hires data/lang $srcdir ${srcdir}_ali || exit 1;
# the command below is a more generic, but slower, way to do it.
# steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
# --nj $nj data/train_960 data/lang ${srcdir}_online ${srcdir}_ali || exit 1;
fi
if [ $stage -le 3 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then
utils/create_split_dir.pl \
/export/b0{1,2,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage
fi
# have a higher maximum num-jobs if the degs are spread across multiple filesystems (i.e. if ${srcdir}_degs/storage exists).
if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi
steps/nnet2/get_egs_discriminative2.sh \
--cmd "$decode_cmd -tc $max_jobs" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--criterion $criterion --drop-frames $drop_frames \
data/train_960_hires data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1;
# the command below is a more generic, but slower, way to do it.
#steps/online/nnet2/get_egs_discriminative2.sh \
# --cmd "$decode_cmd -tc $max_jobs" \
# --criterion $criterion --drop-frames $drop_frames \
# data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1;
fi
if [ $stage -le 4 ]; then
steps/nnet2/train_discriminative2.sh --cmd "$decode_cmd $parallel_opts" \
--stage $train_stage \
--effective-lrate $effective_lrate \
--criterion $criterion --drop-frames $drop_frames \
--num-epochs $num_epochs \
--num-jobs-nnet 6 --num-threads $num_threads \
${srcdir}_degs ${srcdir}_${criterion}_${effective_lrate} || exit 1;
fi
if [ $stage -le 5 ]; then
dir=${srcdir}_${criterion}_${effective_lrate}
ln -sf $(readlink -f ${srcdir}_online/conf) $dir/conf # so it acts like an online-decoding directory
for epoch in $(seq $decode_start_epoch $num_epochs); do
for test in dev_clean dev_other; do
(
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 50 \
--iter epoch$epoch exp/tri6b/graph_tgsmall data/${test} $dir/decode_epoch${epoch}_${test}_tgsmall || exit 1
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/${test} $dir/decode_epoch${epoch}_${test}_{tgsmall,tgmed} || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test $dir/decode_epoch${epoch}_${test}_{tgsmall,tglarge} || exit 1;
) &
done
done
wait
for dir in $dir/decode*; do grep WER $dir/wer_* | utils/best_wer.sh; done
fi
if [ $stage -le 6 ] && $cleanup; then
# if you run with "--cleanup true --stage 6" you can clean up.
rm ${srcdir}_denlats/lat.*.gz || true
rm ${srcdir}_ali/ali.*.gz || true
steps/nnet2/remove_egs.sh ${srcdir}_degs || true
fi
exit 0;

View File

@@ -260,6 +260,9 @@ steps/train_quick.sh --cmd "$train_cmd" \
done
)&
# steps/cleanup/debug_lexicon.sh --remove-stress true --nj 200 --cmd "$train_cmd" data/train_clean_100 \
# data/lang exp/tri6b data/local/dict/lexicon.txt exp/debug_lexicon_100h
# Perform RNNLM rescoring of tri6b
# Attention: with default settings requires 4 GB of memory per rescoring job, so commenting this out by default
# local/run_rnnlm.sh $data data/local/lm
@@ -271,4 +274,13 @@ local/nnet2/run_7a_960.sh || exit 1
## we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh
#local/run_data_cleaning.sh
# local/online/run_nnet2.sh
# # The following is the current online-nnet2 recipe, with "multi-splice".
# local/online/run_nnet2_ms.sh
# # The following is the discriminative-training continuation of the above.
# local/online/run_nnet2_ms_disc.sh
# ## The following is an older version of the online-nnet2 recipe, without "multi-splice". It's faster
# ## to train but slightly worse.
# # local/online/run_nnet2.sh