Mirror of https://github.com/mozilla/kaldi.git
trunk: updates to librispeech recipe, with better 'multi-splice' version of pnorm online-nnet2 recipe.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4765 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Parent: 2bbbea9efe
Commit: b7c7005024
@@ -140,6 +140,8 @@
### online-nnet2 results with a model trained on all(960h) of the training data
### Note: these results are now superseded by the multi-splice (_ms_) results below.
### Be careful when comparing, as the _ms_ results don't yet have the _fglarge tests.
%WER 4.90 [ 2663 / 54402, 388 ins, 273 del, 2002 sub ] exp/nnet2_online/nnet_a_online/decode_dev_clean_fglarge/wer_13
%WER 5.19 [ 2822 / 54402, 406 ins, 311 del, 2105 sub ] exp/nnet2_online/nnet_a_online/decode_dev_clean_tglarge/wer_13
%WER 6.60 [ 3593 / 54402, 457 ins, 426 del, 2710 sub ] exp/nnet2_online/nnet_a_online/decode_dev_clean_tgmed/wer_11
@@ -192,3 +194,65 @@
|
|||
%WER 24.18 [ 12317 / 50948, 1284 ins, 1732 del, 9301 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.5/wer_15
|
||||
%WER 24.19 [ 12323 / 50948, 1327 ins, 1686 del, 9310 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.75/wer_15
|
||||
|
||||
## Multi-splice version of online recipe.
|
||||
# for x in exp/nnet2_online/nnet_ms_a/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done
|
||||
%WER 4.72 [ 2568 / 54402, 390 ins, 258 del, 1920 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tglarge/wer_12
|
||||
%WER 5.90 [ 3212 / 54402, 345 ins, 441 del, 2426 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tgmed/wer_14
|
||||
%WER 6.64 [ 3612 / 54402, 401 ins, 479 del, 2732 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tgsmall/wer_12
|
||||
%WER 13.11 [ 6680 / 50948, 797 ins, 866 del, 5017 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_other_tglarge/wer_15
|
||||
%WER 15.56 [ 7925 / 50948, 727 ins, 1261 del, 5937 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_other_tgmed/wer_15
|
||||
%WER 17.10 [ 8714 / 50948, 733 ins, 1510 del, 6471 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_other_tgsmall/wer_15
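# (Sketch for orientation only; this is not the actual utils/best_wer.sh.)  In each
# "%WER p [ e / N, i ins, d del, s sub ]" line the counts satisfy e = i + d + s and
# p = 100 * e / N; e.g. in the dev_clean tglarge line above, 390 + 258 + 1920 = 2568 and
# 100 * 2568 / 54402 = 4.72.  A rough way to pick the best wer_* file by hand:
# grep WER exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tglarge/wer_* | \
#   awk '{w=$2+0; if (best=="" || w<best) {best=w; line=$0}} END{print line}'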
|
||||
|
||||
# for x in exp/nnet2_online/nnet_ms_a_online/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done
|
||||
%WER 4.83 [ 2629 / 54402, 393 ins, 264 del, 1972 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge/wer_13
|
||||
%WER 5.01 [ 2726 / 54402, 402 ins, 270 del, 2054 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt/wer_13
|
||||
%WER 4.87 [ 2647 / 54402, 386 ins, 290 del, 1971 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt_offline/wer_14
|
||||
%WER 6.05 [ 3294 / 54402, 409 ins, 392 del, 2493 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed/wer_12
|
||||
%WER 6.30 [ 3428 / 54402, 389 ins, 434 del, 2605 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt/wer_13
|
||||
%WER 6.09 [ 3311 / 54402, 393 ins, 417 del, 2501 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt_offline/wer_13
|
||||
%WER 6.87 [ 3740 / 54402, 390 ins, 547 del, 2803 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall/wer_13
|
||||
%WER 7.21 [ 3921 / 54402, 440 ins, 535 del, 2946 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt/wer_12
|
||||
%WER 6.95 [ 3783 / 54402, 415 ins, 543 del, 2825 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt_offline/wer_13
|
||||
%WER 13.21 [ 6732 / 50948, 812 ins, 852 del, 5068 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge/wer_14
|
||||
%WER 14.24 [ 7254 / 50948, 884 ins, 959 del, 5411 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt/wer_15
|
||||
%WER 13.63 [ 6945 / 50948, 890 ins, 856 del, 5199 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt_offline/wer_14
|
||||
%WER 15.69 [ 7996 / 50948, 800 ins, 1189 del, 6007 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed/wer_14
|
||||
%WER 16.63 [ 8473 / 50948, 809 ins, 1317 del, 6347 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt/wer_15
|
||||
%WER 16.09 [ 8197 / 50948, 872 ins, 1130 del, 6195 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt_offline/wer_13
|
||||
%WER 17.15 [ 8736 / 50948, 756 ins, 1424 del, 6556 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall/wer_14
|
||||
%WER 18.23 [ 9288 / 50948, 782 ins, 1585 del, 6921 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt/wer_15
|
||||
%WER 17.54 [ 8936 / 50948, 813 ins, 1425 del, 6698 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt_offline/wer_14
|
||||
|
||||
|
||||
## Note: this learning rate is the effective learning rate; it gets multiplied by the num-jobs.
|
||||
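## (If I read that note correctly: with --num-jobs-nnet 6 and an effective rate of 0.000005,
## the actual per-job learning rate works out to 6 * 0.000005 = 0.00003.)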
# for x in exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch*{clean,other}*; do grep WER $x/wer_* | utils/best_wer.sh ; done
|
||||
%WER 5.92 [ 3221 / 54402, 352 ins, 439 del, 2430 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_clean_tgmed/wer_14
|
||||
%WER 6.63 [ 3605 / 54402, 399 ins, 481 del, 2725 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_clean_tgsmall/wer_12
|
||||
%WER 4.44 [ 2416 / 54402, 385 ins, 204 del, 1827 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_clean_tglarge/wer_14
|
||||
%WER 5.52 [ 3001 / 54402, 360 ins, 340 del, 2301 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_clean_tgmed/wer_15
|
||||
%WER 6.22 [ 3384 / 54402, 388 ins, 411 del, 2585 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_clean_tgsmall/wer_14
|
||||
%WER 4.39 [ 2386 / 54402, 368 ins, 208 del, 1810 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_clean_tglarge/wer_15 **
|
||||
%WER 5.41 [ 2945 / 54402, 338 ins, 339 del, 2268 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_clean_tgmed/wer_16
|
||||
%WER 6.13 [ 3333 / 54402, 371 ins, 410 del, 2552 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_clean_tgsmall/wer_15
|
||||
%WER 4.39 [ 2387 / 54402, 377 ins, 199 del, 1811 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tglarge/wer_14
|
||||
%WER 5.36 [ 2918 / 54402, 328 ins, 338 del, 2252 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgmed/wer_17
|
||||
%WER 6.08 [ 3305 / 54402, 369 ins, 396 del, 2540 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgsmall/wer_15
|
||||
%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14
|
||||
%WER 5.35 [ 2909 / 54402, 328 ins, 339 del, 2242 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgmed/wer_17
|
||||
%WER 6.05 [ 3291 / 54402, 384 ins, 381 del, 2526 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgsmall/wer_14
|
||||
%WER 13.45 [ 6850 / 50948, 808 ins, 876 del, 5166 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tglarge/wer_15
|
||||
%WER 15.65 [ 7975 / 50948, 714 ins, 1311 del, 5950 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tgmed/wer_16
|
||||
%WER 17.12 [ 8722 / 50948, 739 ins, 1489 del, 6494 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tgsmall/wer_15
|
||||
%WER 12.84 [ 6544 / 50948, 877 ins, 703 del, 4964 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_other_tglarge/wer_16
|
||||
%WER 14.87 [ 7578 / 50948, 742 ins, 1102 del, 5734 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_other_tgmed/wer_18
|
||||
%WER 16.25 [ 8277 / 50948, 823 ins, 1171 del, 6283 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_other_tgsmall/wer_15
|
||||
%WER 12.80 [ 6522 / 50948, 869 ins, 698 del, 4955 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_other_tglarge/wer_17 **
|
||||
%WER 14.80 [ 7542 / 50948, 774 ins, 1034 del, 5734 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_other_tgmed/wer_17
|
||||
%WER 16.14 [ 8225 / 50948, 763 ins, 1242 del, 6220 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_other_tgsmall/wer_17
|
||||
%WER 12.82 [ 6531 / 50948, 871 ins, 710 del, 4950 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_other_tglarge/wer_18
|
||||
%WER 14.82 [ 7549 / 50948, 818 ins, 958 del, 5773 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_other_tgmed/wer_16
|
||||
%WER 16.10 [ 8204 / 50948, 795 ins, 1165 del, 6244 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_other_tgsmall/wer_16
|
||||
%WER 12.85 [ 6549 / 50948, 902 ins, 672 del, 4975 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_other_tglarge/wer_17
|
||||
%WER 14.80 [ 7540 / 50948, 800 ins, 1025 del, 5715 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_other_tgmed/wer_18
|
||||
%WER 16.10 [ 8201 / 50948, 789 ins, 1240 del, 6172 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_other_tgsmall/wer_18
|
||||
|
||||
|
|
|
@@ -0,0 +1,13 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_k20=true
option allow_k20=true
option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'
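# Illustration (not part of the config): as far as I read the option lines above, a call
# along the lines of
#   queue.pl --config conf/queue_no_k20.conf --mem 2G --num-threads 4 --gpu 1 --allow-k20 false \
#     JOB=1:10 exp/foo/log/train.JOB.log my-program ...   # job range and log path are made up
# should turn into qsub options roughly like
#   qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* \
#     -l mem_free=2G,ram_free=2G -pe smp 4 -l gpu=1 -q g.q -l 'hostname=!g01*&!g02*&!b06*'
# i.e. each "option name=* ..." line substitutes the supplied value for $0, while the
# "option name=value ..." lines cover specific values (e.g. gpu=0 maps to plain "-q all.q").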
@@ -3,6 +3,8 @@
|
|||
# example script for online-nnet2 system training and decoding,
|
||||
# based on the one for fisher-English.
|
||||
|
||||
# note: run_nnet2_ms.sh gives better results, but it's slower to train.
|
||||
|
||||
. cmd.sh
|
||||
|
||||
|
||||
|
@@ -37,85 +39,7 @@ else
|
|||
parallel_opts="-pe smp $num_threads"
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
# Create high-resolution MFCC features (with 40 cepstra instead of 13).
|
||||
# this shows how you can split across multiple file-systems. we'll split the
|
||||
# MFCC dir across multiple locations. You might want to be careful here, if you
|
||||
# have multiple copies of Kaldi checked out and run the same recipe, not to let
|
||||
# them overwrite each other.
|
||||
mfccdir=mfcc
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
|
||||
fi
|
||||
|
||||
for datadir in train_960 dev_clean dev_other; do
|
||||
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
|
||||
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
|
||||
--cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
|
||||
steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
|
||||
done
|
||||
|
||||
# now create some data subsets.
|
||||
# mixed is the clean+other data.
|
||||
# 30k is 1/10 of the data (around 100 hours), 60k is 1/5th of it (around 200 hours).
|
||||
utils/subset_data_dir.sh data/train_960_hires 30000 data/train_mixed_hires_30k
|
||||
utils/subset_data_dir.sh data/train_960_hires 60000 data/train_mixed_hires_60k
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
# We need to build a small system just because we need the LDA+MLLT transform
|
||||
# to train the diag-UBM on top of. We align a subset of training data for
|
||||
# this purpose.
|
||||
utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/train_mixed_hires_30k/utt2spk) \
|
||||
data/train_960 data/train_960_30k
|
||||
|
||||
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
|
||||
data/train_960_30k data/lang exp/tri6b exp/nnet2_online/tri6b_ali_30k
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
# Train a small system just for its LDA+MLLT transform. We use --num-iters 13
|
||||
# because after we get the transform (12th iter is the last), any further
|
||||
# training is pointless.
|
||||
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
|
||||
--realign-iters "" \
|
||||
--splice-opts "--left-context=3 --right-context=3" \
|
||||
5000 10000 data/train_mixed_hires_30k data/lang \
|
||||
exp/nnet2_online/tri6b_ali_30k exp/nnet2_online/tri7b
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
mkdir -p exp/nnet2_online
|
||||
# To train a diagonal UBM we don't need very much data, so use a small subset
|
||||
# (actually, it's not that small: still around 100 hours).
|
||||
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \
|
||||
data/train_mixed_hires_30k 512 exp/nnet2_online/tri7b exp/nnet2_online/diag_ubm
|
||||
fi
|
||||
|
||||
if [ $stage -le 5 ]; then
|
||||
# iVector extractors can in general be sensitive to the amount of data, but
|
||||
# this one has a fairly small dim (defaults to 100) so we don't use all of it,
|
||||
# we use just the 60k subset (about one fifth of the data, or 200 hours).
|
||||
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
|
||||
data/train_mixed_hires_60k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 6 ]; then
|
||||
ivectordir=exp/nnet2_online/ivectors_train_960_hires
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
|
||||
fi
|
||||
# We extract iVectors on all the train data, which will be what we train the
|
||||
# system on. With --utts-per-spk-max 2, the script. pairs the utterances
|
||||
# into twos, and treats each of these pairs as one speaker. Note that these
|
||||
# are extracted 'online'.
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
|
||||
--utts-per-spk-max 2 data/train_960_hires exp/nnet2_online/extractor $ivectordir || exit 1;
|
||||
fi
|
||||
|
||||
local/online/run_nnet2_common.sh --stage $stage
|
||||
|
||||
if [ $stage -le 7 ]; then
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
|
||||
|
@@ -126,9 +50,8 @@ if [ $stage -le 7 ]; then
|
|||
# The size of the system is kept rather smaller than the run_7a_960.sh system:
|
||||
# this is because we want it to be small enough that we could plausibly run it
|
||||
# in real-time.
|
||||
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
|
||||
--samples-per-iter 400000 \
|
||||
--num-epochs 6 --num-epochs-extra 2 \
|
||||
steps/nnet2/train_pnorm_simple2.sh --stage $train_stage \
|
||||
--num-epochs 8 --num-jobs-nnet 6 \
|
||||
--splice-width 7 --feat-type raw \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
|
||||
--cmvn-opts "--norm-means=false --norm-vars=false" \
|
||||
|
@@ -136,7 +59,6 @@ if [ $stage -le 7 ]; then
|
|||
--minibatch-size "$minibatch_size" \
|
||||
--parallel-opts "$parallel_opts" \
|
||||
--io-opts "-tc 12" \
|
||||
--num-jobs-nnet 6 \
|
||||
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
|
||||
--cmd "$decode_cmd" \
|
||||
--pnorm-input-dim 3500 \
|
||||
|
|
|
@@ -0,0 +1,94 @@
|
|||
#!/bin/bash
|
||||
|
||||
# this script contains some common (shared) parts of the run_nnet*.sh scripts.
|
||||
|
||||
. cmd.sh
|
||||
|
||||
|
||||
stage=0
|
||||
|
||||
set -e
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
# Create high-resolution MFCC features (with 40 cepstra instead of 13).
|
||||
# this shows how you can split across multiple file-systems. we'll split the
|
||||
# MFCC dir across multiple locations. You might want to be careful here, if you
|
||||
# have multiple copies of Kaldi checked out and run the same recipe, not to let
|
||||
# them overwrite each other.
|
||||
mfccdir=mfcc
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
|
||||
fi
|
||||
|
||||
for datadir in train_960 dev_clean dev_other; do
|
||||
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
|
||||
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
|
||||
--cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
|
||||
steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
|
||||
done
|
||||
|
||||
# now create some data subsets.
|
||||
# mixed is the clean+other data.
|
||||
# 30k is 1/10 of the data (around 100 hours), 60k is 1/5th of it (around 200 hours).
|
||||
utils/subset_data_dir.sh data/train_960_hires 30000 data/train_mixed_hires_30k
|
||||
utils/subset_data_dir.sh data/train_960_hires 60000 data/train_mixed_hires_60k
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
# We need to build a small system just because we need the LDA+MLLT transform
|
||||
# to train the diag-UBM on top of. We align a subset of training data for
|
||||
# this purpose.
|
||||
utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/train_mixed_hires_30k/utt2spk) \
|
||||
data/train_960 data/train_960_30k
|
||||
|
||||
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
|
||||
data/train_960_30k data/lang exp/tri6b exp/nnet2_online/tri6b_ali_30k
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
# Train a small system just for its LDA+MLLT transform. We use --num-iters 13
|
||||
# because after we get the transform (12th iter is the last), any further
|
||||
# training is pointless.
|
||||
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
|
||||
--realign-iters "" \
|
||||
--splice-opts "--left-context=3 --right-context=3" \
|
||||
5000 10000 data/train_mixed_hires_30k data/lang \
|
||||
exp/nnet2_online/tri6b_ali_30k exp/nnet2_online/tri7b
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
mkdir -p exp/nnet2_online
|
||||
# To train a diagonal UBM we don't need very much data, so use a small subset
|
||||
# (actually, it's not that small: still around 100 hours).
|
||||
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \
|
||||
data/train_mixed_hires_30k 512 exp/nnet2_online/tri7b exp/nnet2_online/diag_ubm
|
||||
fi
|
||||
|
||||
if [ $stage -le 5 ]; then
|
||||
# iVector extractors can in general be sensitive to the amount of data, but
|
||||
# this one has a fairly small dim (defaults to 100) so we don't use all of it,
|
||||
# we use just the 60k subset (about one fifth of the data, or 200 hours).
|
||||
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
|
||||
data/train_mixed_hires_60k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 6 ]; then
|
||||
ivectordir=exp/nnet2_online/ivectors_train_960_hires
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
|
||||
fi
|
||||
# We extract iVectors on all the train data, which will be what we train the
# system on. With --utts-per-spk-max 2, the script pairs the utterances
# into twos, and treats each of these pairs as one speaker. Note that these
# are extracted 'online'. (A rough sketch of the pairing idea follows after this block.)
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
|
||||
--utts-per-spk-max 2 data/train_960_hires exp/nnet2_online/extractor $ivectordir || exit 1;
|
||||
fi
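# (Illustration only; this is not how the script implements it.)  The pairing idea
# amounts to relabelling utt2spk so that every two consecutive utterances of a speaker
# become one pseudo-speaker, roughly:
#   awk '{spk=$2; n[spk]++; print $1, spk "-" int((n[spk]-1)/2)}' \
#     data/train_960_hires/utt2spk | head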
|
||||
|
||||
|
||||
exit 0;
|
|
@@ -0,0 +1,159 @@
|
|||
#!/bin/bash
|
||||
|
||||
|
||||
# This script does discriminative training on top of the online
# system trained in run_nnet2.sh.
|
||||
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
|
||||
# since the lattice generation runs in about real-time, so takes of the order of
|
||||
# 1000 hours of CPU time.
|
||||
#
|
||||
# Note: rather than using any features we have dumped on disk, this script
|
||||
# regenerates them from the wav data three times-- when we do lattice
|
||||
# generation, numerator alignment and discriminative training. This made the
|
||||
# script easier to write and more generic, because we don't have to know where
|
||||
# the features and the iVectors are, but of course it's a little inefficient.
|
||||
# The time taken is dominated by the lattice generation anyway, so this isn't
|
||||
# a huge deal.
|
||||
|
||||
. cmd.sh
|
||||
|
||||
|
||||
stage=0
|
||||
train_stage=-10
|
||||
use_gpu=true
|
||||
srcdir=exp/nnet2_online/nnet_a
|
||||
criterion=smbr
|
||||
drop_frames=false # only relevant for MMI actually.
|
||||
learning_rate=0.0001
|
||||
train_stage=-10 # can be used to start training in the middle.
|
||||
decode_start_epoch=0 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
|
||||
num_epochs=4
|
||||
cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
|
||||
# alignments and degs).
|
||||
|
||||
set -e
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
|
||||
if $use_gpu; then
|
||||
if ! cuda-compiled; then
|
||||
cat <<EOF && exit 1
|
||||
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
|
||||
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
|
||||
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
|
||||
EOF
|
||||
fi
|
||||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
else
|
||||
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
|
||||
# almost the same, but this may be a little bit slow.
|
||||
num_threads=16
|
||||
parallel_opts="-pe smp $num_threads"
|
||||
fi
|
||||
|
||||
if [ ! -f ${srcdir}_online/final.mdl ]; then
|
||||
echo "$0: expected ${srcdir}_online/final.mdl to exist; first run run_nnet2_ms.sh."
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of
|
||||
# the phases of get_egs_discriminative2.sh below.
|
||||
num_threads_denlats=6
|
||||
subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
|
||||
# total slots = 80 * 6 = 480).
|
||||
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
|
||||
--nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
|
||||
data/train_960_hires data/lang $srcdir ${srcdir}_denlats || exit 1;
|
||||
|
||||
# the command below is a more generic, but slower, way to do it.
|
||||
#steps/online/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
|
||||
# --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
|
||||
# data/train_960 data/lang ${srcdir}_online ${srcdir}_denlats || exit 1;
|
||||
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
# hardcode no-GPU for alignment, although you could use GPU [you wouldn't
|
||||
# get excellent GPU utilization though.]
|
||||
nj=350 # have a high number of jobs because this could take a while, and we might
|
||||
# have some stragglers.
|
||||
use_gpu=no
|
||||
gpu_opts=
|
||||
|
||||
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
|
||||
--nj $nj data/train_960_hires data/lang $srcdir ${srcdir}_ali || exit 1;
|
||||
|
||||
# the command below is a more generic, but slower, way to do it.
|
||||
# steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
|
||||
# --nj $nj data/train_960 data/lang ${srcdir}_online ${srcdir}_ali || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then
|
||||
utils/create_split_dir.pl \
|
||||
/export/b0{1,2,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage
|
||||
fi
|
||||
# have a higher maximum num-jobs if the degs are spread across multiple filesystems.
|
||||
if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi
|
||||
|
||||
steps/nnet2/get_egs_discriminative2.sh \
|
||||
--cmd "$decode_cmd -tc $max_jobs" \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
|
||||
--criterion $criterion --drop-frames $drop_frames \
|
||||
data/train_960_hires data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1;
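# To spell out the brace expansion above: ${srcdir}{_ali,_denlats,/final.mdl,_degs}
# expands to ${srcdir}_ali ${srcdir}_denlats ${srcdir}/final.mdl ${srcdir}_degs,
# i.e. the alignments, the denominator lattices, the source model and the output degs dir.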
|
||||
|
||||
# the command below is a more generic, but slower, way to do it.
|
||||
#steps/online/nnet2/get_egs_discriminative2.sh \
|
||||
# --cmd "$decode_cmd -tc $max_jobs" \
|
||||
# --criterion $criterion --drop-frames $drop_frames \
|
||||
# data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
steps/nnet2/train_discriminative2.sh --cmd "$decode_cmd $parallel_opts" \
|
||||
--stage $train_stage \
|
||||
--learning-rate $learning_rate \
|
||||
--criterion $criterion --drop-frames $drop_frames \
|
||||
--num-epochs $num_epochs \
|
||||
--num-jobs-nnet 2 --num-threads $num_threads \
|
||||
${srcdir}_degs ${srcdir}_${criterion}_${learning_rate} || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 5 ]; then
|
||||
dir=${srcdir}_${criterion}_${learning_rate}
|
||||
ln -sf $(readlink -f ${srcdir}_online/conf) $dir/conf # so it acts like an online-decoding directory
|
||||
|
||||
for epoch in $(seq $decode_start_epoch $num_epochs); do
|
||||
for test in dev_clean dev_other; do
|
||||
(
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 50 \
|
||||
--iter epoch$epoch exp/tri6b/graph_tgsmall data/${test} $dir/decode_epoch${epoch}_${test}_tgsmall || exit 1
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
|
||||
data/${test} $dir/decode_epoch${epoch}_${test}_{tgsmall,tgmed} || exit 1;
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
|
||||
data/$test $dir/decode_epoch${epoch}_${test}_{tgsmall,tglarge} || exit 1;
|
||||
) &
|
||||
done
|
||||
done
|
||||
wait
|
||||
for dir in $dir/decode*; do grep WER $dir/wer_* | utils/best_wer.sh; done
|
||||
fi
|
||||
|
||||
if [ $stage -le 6 ] && $cleanup; then
|
||||
# if you run with "--cleanup true --stage 6" you can clean up.
|
||||
rm ${srcdir}_denlats/lat.*.gz || true
|
||||
rm ${srcdir}_ali/ali.*.gz || true
|
||||
steps/nnet2/remove_egs.sh ${srcdir}_degs || true
|
||||
fi
|
||||
|
||||
|
||||
exit 0;
|
|
@@ -0,0 +1,159 @@
|
|||
#!/bin/bash
|
||||
|
||||
# This is the "multi-splice" version of the online-nnet2 training script.
|
||||
# It's currently the best recipe.
|
||||
# You'll notice that we splice over successively larger windows as we go deeper
|
||||
# into the network.
|
||||
|
||||
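# (Side note, not from the original script.)  If I am reading the --splice-indexes string
# used below ("layer0/-2:-1:0:1:2 layer1/-1:2 layer3/-3:3 layer4/-7:2") correctly, the
# per-layer offsets add up to 2+1+3+7 = 13 frames of left context and 2+2+3+2 = 9 frames
# of right context, i.e. each output frame sees input frames t-13 .. t+9.  To recompute:
#   echo "layer0/-2:-1:0:1:2 layer1/-1:2 layer3/-3:3 layer4/-7:2" | tr ' /' '\n ' | \
#     awk '{n=split($2,o,":"); L+=-o[1]; R+=o[n]} END{print "left="L, "right="R}'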
. cmd.sh
|
||||
|
||||
|
||||
stage=7
|
||||
train_stage=-10
|
||||
use_gpu=true
|
||||
dir=exp/nnet2_online/nnet_ms_a
|
||||
|
||||
set -e
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
|
||||
if $use_gpu; then
|
||||
if ! cuda-compiled; then
|
||||
cat <<EOF && exit 1
|
||||
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
|
||||
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
|
||||
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
|
||||
EOF
|
||||
fi
|
||||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
minibatch_size=512
|
||||
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
|
||||
parallel_opts="$parallel_opts --config conf/queue_no_k20.conf --allow-k20 false"
|
||||
# that config is like the default config in the text of queue.pl, but adding the following lines.
|
||||
# default allow_k20=true
|
||||
# option allow_k20=true
|
||||
# option allow_k20=false -l 'hostname=!g01&!g02&!b06'
|
||||
# It's a workaround for an NVidia CUDA library bug for our currently installed version
|
||||
# of the CUDA toolkit, that only shows up on k20's
|
||||
fi
|
||||
# the _a is in case I want to change the parameters.
|
||||
else
|
||||
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
|
||||
# almost the same, but this may be a little bit slow.
|
||||
num_threads=16
|
||||
minibatch_size=128
|
||||
parallel_opts="-pe smp $num_threads"
|
||||
fi
|
||||
|
||||
# do the common parts of the script.
|
||||
local/online/run_nnet2_common.sh --stage $stage
|
||||
|
||||
|
||||
if [ $stage -le 7 ]; then
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
|
||||
utils/create_split_dir.pl \
|
||||
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
|
||||
fi
|
||||
|
||||
# The size of the system is kept rather smaller than the run_7a_960.sh system:
|
||||
# this is because we want it to be small enough that we could plausibly run it
|
||||
# in real-time.
|
||||
steps/nnet2/train_multisplice_accel2.sh --stage $train_stage \
|
||||
--num-epochs 8 --num-jobs-initial 3 --num-jobs-final 18 \
|
||||
--num-hidden-layers 6 --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer3/-3:3 layer4/-7:2" \
|
||||
--feat-type raw \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
|
||||
--cmvn-opts "--norm-means=false --norm-vars=false" \
|
||||
--num-threads "$num_threads" \
|
||||
--minibatch-size "$minibatch_size" \
|
||||
--parallel-opts "$parallel_opts" \
|
||||
--io-opts "--max-jobs-run 12" \
|
||||
--initial-effective-lrate 0.0015 --final-effective-lrate 0.00015 \
|
||||
--cmd "$decode_cmd" \
|
||||
--pnorm-input-dim 3500 \
|
||||
--pnorm-output-dim 350 \
|
||||
--mix-up 12000 \
|
||||
data/train_960_hires data/lang exp/tri6b $dir || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 8 ]; then
|
||||
# dump iVectors for the testing data.
|
||||
for test in dev_clean dev_other; do
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
|
||||
data/${test}_hires exp/nnet2_online/extractor exp/nnet2_online/ivectors_$test || exit 1;
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 9 ]; then
|
||||
# this does offline decoding that should give about the same results as the
|
||||
# real online decoding (the one with --per-utt true)
|
||||
for test in dev_clean dev_other; do
|
||||
steps/nnet2/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_${test} \
|
||||
exp/tri6b/graph_tgsmall data/${test}_hires $dir/decode_${test}_tgsmall || exit 1;
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
|
||||
data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1;
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
|
||||
data/$test $dir/decode_${test}_{tgsmall,tglarge} || exit 1;
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 10 ]; then
|
||||
# If this setup used PLP features, we'd have to give the option --feature-type plp
|
||||
# to the script below.
|
||||
steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
|
||||
data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 11 ]; then
|
||||
# do the actual online decoding with iVectors, carrying info forward from
|
||||
# previous utterances of the same speaker.
|
||||
for test in dev_clean dev_other; do
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
|
||||
exp/tri6b/graph_tgsmall data/$test ${dir}_online/decode_${test}_tgsmall || exit 1;
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
|
||||
data/$test ${dir}_online/decode_${test}_{tgsmall,tgmed} || exit 1;
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
|
||||
data/$test ${dir}_online/decode_${test}_{tgsmall,tglarge} || exit 1;
|
||||
done
|
||||
fi
|
||||
|
||||
if [ $stage -le 12 ]; then
|
||||
# this version of the decoding treats each utterance separately
|
||||
# without carrying forward speaker information.
|
||||
for test in dev_clean dev_other; do
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
|
||||
--per-utt true exp/tri6b/graph_tgsmall data/$test ${dir}_online/decode_${test}_tgsmall_utt || exit 1;
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
|
||||
data/$test ${dir}_online/decode_${test}_{tgsmall,tgmed}_utt || exit 1;
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
|
||||
data/$test ${dir}_online/decode_${test}_{tgsmall,tglarge}_utt || exit 1;
|
||||
done
|
||||
fi
|
||||
|
||||
if [ $stage -le 13 ]; then
|
||||
# this version of the decoding treats each utterance separately
|
||||
# without carrying forward speaker information, but looks to the end
|
||||
# of the utterance while computing the iVector (--online false)
|
||||
for test in dev_clean dev_other; do
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
|
||||
--per-utt true --online false exp/tri6b/graph_tgsmall data/$test \
|
||||
${dir}_online/decode_${test}_tgsmall_utt_offline || exit 1;
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
|
||||
data/$test ${dir}_online/decode_${test}_{tgsmall,tgmed}_utt_offline || exit 1;
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
|
||||
data/$test ${dir}_online/decode_${test}_{tgsmall,tglarge}_utt_offline || exit 1;
|
||||
done
|
||||
fi
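# Once the decoding stages above have finished, the numbers can be collected in the same
# way as elsewhere in this setup, e.g.:
# for x in $dir/decode_* ${dir}_online/decode_*; do grep WER $x/wer_* | utils/best_wer.sh; done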
|
||||
|
||||
exit 0;
|
|
@@ -0,0 +1,160 @@
|
|||
#!/bin/bash
|
||||
|
||||
|
||||
# This script does discriminative training on top of the online, multi-splice
|
||||
# system trained in run_nnet2_ms.sh.
|
||||
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
|
||||
# since the lattice generation runs in about real-time, so takes of the order of
|
||||
# 1000 hours of CPU time.
|
||||
#
|
||||
# Note: rather than using any features we have dumped on disk, this script
|
||||
# regenerates them from the wav data three times-- when we do lattice
|
||||
# generation, numerator alignment and discriminative training. This made the
|
||||
# script easier to write and more generic, because we don't have to know where
|
||||
# the features and the iVectors are, but of course it's a little inefficient.
|
||||
# The time taken is dominated by the lattice generation anyway, so this isn't
|
||||
# a huge deal.
|
||||
|
||||
. cmd.sh
|
||||
|
||||
|
||||
stage=0
|
||||
train_stage=-10
|
||||
use_gpu=true
|
||||
srcdir=exp/nnet2_online/nnet_ms_a
|
||||
criterion=smbr
|
||||
drop_frames=false # only matters for MMI anyway.
|
||||
effective_lrate=0.000005
|
||||
num_jobs_nnet=6
|
||||
train_stage=-10 # can be used to start training in the middle.
|
||||
decode_start_epoch=0 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
|
||||
num_epochs=4
|
||||
cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
|
||||
# alignments and degs).
|
||||
|
||||
set -e
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
|
||||
if $use_gpu; then
|
||||
if ! cuda-compiled; then
|
||||
cat <<EOF && exit 1
|
||||
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
|
||||
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
|
||||
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
|
||||
EOF
|
||||
fi
|
||||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
else
|
||||
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
|
||||
# almost the same, but this may be a little bit slow.
|
||||
num_threads=16
|
||||
parallel_opts="-pe smp $num_threads"
|
||||
fi
|
||||
|
||||
if [ ! -f ${srcdir}_online/final.mdl ]; then
|
||||
echo "$0: expected ${srcdir}_online/final.mdl to exist; first run run_nnet2_ms.sh."
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of
|
||||
# the phases of get_egs_discriminative2.sh below.
|
||||
num_threads_denlats=6
|
||||
subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
|
||||
# max total slots = 80 * 6 = 480).
|
||||
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
|
||||
--nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
|
||||
data/train_960_hires data/lang $srcdir ${srcdir}_denlats || exit 1;
|
||||
|
||||
# the command below is a more generic, but slower, way to do it.
|
||||
#steps/online/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
|
||||
# --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
|
||||
# data/train_960 data/lang ${srcdir}_online ${srcdir}_denlats || exit 1;
|
||||
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
# hardcode no-GPU for alignment, although you could use GPU [you wouldn't
|
||||
# get excellent GPU utilization though.]
|
||||
nj=350 # have a high number of jobs because this could take a while, and we might
|
||||
# have some stragglers.
|
||||
use_gpu=no
|
||||
gpu_opts=
|
||||
|
||||
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
|
||||
--nj $nj data/train_960_hires data/lang $srcdir ${srcdir}_ali || exit 1;
|
||||
|
||||
# the command below is a more generic, but slower, way to do it.
|
||||
# steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
|
||||
# --nj $nj data/train_960 data/lang ${srcdir}_online ${srcdir}_ali || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then
|
||||
utils/create_split_dir.pl \
|
||||
/export/b0{1,2,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage
|
||||
fi
|
||||
# have a higher maximum num-jobs if the degs are spread across multiple filesystems.
|
||||
if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi
|
||||
|
||||
steps/nnet2/get_egs_discriminative2.sh \
|
||||
--cmd "$decode_cmd -tc $max_jobs" \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
|
||||
--criterion $criterion --drop-frames $drop_frames \
|
||||
data/train_960_hires data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1;
|
||||
|
||||
# the command below is a more generic, but slower, way to do it.
|
||||
#steps/online/nnet2/get_egs_discriminative2.sh \
|
||||
# --cmd "$decode_cmd -tc $max_jobs" \
|
||||
# --criterion $criterion --drop-frames $drop_frames \
|
||||
# data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
steps/nnet2/train_discriminative2.sh --cmd "$decode_cmd $parallel_opts" \
|
||||
--stage $train_stage \
|
||||
--effective-lrate $effective_lrate \
|
||||
--criterion $criterion --drop-frames $drop_frames \
|
||||
--num-epochs $num_epochs \
|
||||
--num-jobs-nnet 6 --num-threads $num_threads \
|
||||
${srcdir}_degs ${srcdir}_${criterion}_${effective_lrate} || exit 1;
|
||||
fi
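# With the defaults above (srcdir=exp/nnet2_online/nnet_ms_a, criterion=smbr,
# effective_lrate=0.000005), the output directory becomes
# exp/nnet2_online/nnet_ms_a_smbr_0.000005, which is the directory the
# discriminative-training numbers earlier in this commit's RESULTS changes refer to.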
|
||||
|
||||
if [ $stage -le 5 ]; then
|
||||
dir=${srcdir}_${criterion}_${effective_lrate}
|
||||
ln -sf $(readlink -f ${srcdir}_online/conf) $dir/conf # so it acts like an online-decoding directory
|
||||
|
||||
for epoch in $(seq $decode_start_epoch $num_epochs); do
|
||||
for test in dev_clean dev_other; do
|
||||
(
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 50 \
|
||||
--iter epoch$epoch exp/tri6b/graph_tgsmall data/${test} $dir/decode_epoch${epoch}_${test}_tgsmall || exit 1
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
|
||||
data/${test} $dir/decode_epoch${epoch}_${test}_{tgsmall,tgmed} || exit 1;
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
|
||||
data/$test $dir/decode_epoch${epoch}_${test}_{tgsmall,tglarge} || exit 1;
|
||||
) &
|
||||
done
|
||||
done
|
||||
wait
|
||||
for dir in $dir/decode*; do grep WER $dir/wer_* | utils/best_wer.sh; done
|
||||
fi
|
||||
|
||||
if [ $stage -le 6 ] && $cleanup; then
|
||||
# if you run with "--cleanup true --stage 6" you can clean up.
|
||||
rm ${srcdir}_denlats/lat.*.gz || true
|
||||
rm ${srcdir}_ali/ali.*.gz || true
|
||||
steps/nnet2/remove_egs.sh ${srcdir}_degs || true
|
||||
fi
|
||||
|
||||
|
||||
exit 0;
|
|
@@ -260,6 +260,9 @@ steps/train_quick.sh --cmd "$train_cmd" \
|
|||
done
|
||||
)&
|
||||
|
||||
# steps/cleanup/debug_lexicon.sh --remove-stress true --nj 200 --cmd "$train_cmd" data/train_clean_100 \
|
||||
# data/lang exp/tri6b data/local/dict/lexicon.txt exp/debug_lexicon_100h
|
||||
|
||||
# Perform RNNLM rescoring of tri6b
|
||||
# Attention: with default settings this requires 4 GB of memory per rescoring job, so it is commented out by default
|
||||
# local/run_rnnlm.sh $data data/local/lm
|
||||
|
@@ -271,4 +274,13 @@ local/nnet2/run_7a_960.sh || exit 1
|
|||
## we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh
|
||||
#local/run_data_cleaning.sh
|
||||
|
||||
# local/online/run_nnet2.sh
|
||||
|
||||
# # The following is the current online-nnet2 recipe, with "multi-splice".
|
||||
# local/online/run_nnet2_ms.sh
|
||||
|
||||
# # The following is the discriminative-training continuation of the above.
|
||||
# local/online/run_nnet2_ms_disc.sh
|
||||
|
||||
# ## The following is an older version of the online-nnet2 recipe, without "multi-splice". It's faster
|
||||
# ## to train but slightly worse.
|
||||
# # local/online/run_nnet2.sh
|
||||
|