some new tuning experiments on chain+swbd setup; add --xent-separate-forward-affine option to make_jesus_configs.py; some cleanup in librispeech/s5/run.sh

Daniel Povey 2016-02-22 16:47:19 -05:00
Parent 71b30095e2
Commit 3e73f67d29
9 changed files with 622 additions and 108 deletions

View file

@@ -10,8 +10,8 @@ data=/export/a15/vpanayotov/data
data_url=www.openslr.org/resources/12
lm_url=www.openslr.org/resources/11
. cmd.sh
. path.sh
. ./cmd.sh
. ./path.sh
# you might not want to do this for interactive shells.
set -e
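
The cleanup in this file drops the per-command "|| exit 1" suffixes below, since the script already runs under "set -e" and any failing command aborts it anyway. A minimal plain-bash sketch of the behavior being relied on (some_failing_step is a hypothetical stand-in for a failing stage, not part of the recipe):

set -e
some_failing_step() { return 1; }   # hypothetical stand-in for a failing stage
echo "before the failing stage"
some_failing_step                   # the script exits here under "set -e",
                                    # exactly as "some_failing_step || exit 1" would
echo "never reached"

Commands whose status is explicitly tested (inside an "if", or with "||" / "&&") are exempt from "set -e", so constructs like "command && exit 1" still behave as written.
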
@@ -24,12 +24,12 @@ for part in dev-clean test-clean dev-other test-other train-clean-100; do
done
# download the LM resources
local/download_lm.sh $lm_url data/local/lm || exit 1
local/download_lm.sh $lm_url data/local/lm
# format the data as Kaldi data directories
for part in dev-clean test-clean dev-other test-other train-clean-100; do
# use underscore-separated names in data directories.
local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) || exit 1
local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g)
done
## Optional text corpus normalization and LM training
@@ -39,7 +39,7 @@ done
## well as some intermediate data (e.g. the normalized text used for LM training),
## are available for download at http://www.openslr.org/11/
#local/lm/train_lm.sh $LM_CORPUS_ROOT \
# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm || exit 1
# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm
## Optional G2P training scripts.
## As the LM training scripts above, this script is intended primarily to
@@ -49,24 +49,24 @@ done
# when "--stage 3" option is used below we skip the G2P steps, and use the
# lexicon we have already downloaded from openslr.org/11/
local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \
data/local/lm data/local/lm data/local/dict_nosp || exit 1
data/local/lm data/local/lm data/local/dict_nosp
utils/prepare_lang.sh data/local/dict_nosp \
"<SPOKEN_NOISE>" data/local/lang_tmp_nosp data/lang_nosp || exit 1;
"<SPOKEN_NOISE>" data/local/lang_tmp_nosp data/lang_nosp
local/format_lms.sh --src-dir data/lang_nosp data/local/lm || exit 1
local/format_lms.sh --src-dir data/lang_nosp data/local/lm
# Create ConstArpaLm format language model for full 3-gram and 4-gram LMs
utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \
data/lang_nosp data/lang_nosp_test_tglarge || exit 1;
data/lang_nosp data/lang_nosp_test_tglarge
utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \
data/lang_nosp data/lang_nosp_test_fglarge || exit 1;
data/lang_nosp data/lang_nosp_test_fglarge
mfccdir=mfcc
# spread the mfccs over various machines, as this data-set is quite large.
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
mfcc=$(basename $mfccdir) # in case it was an absolute pathname (unlikely), get the basename.
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
$mfccdir/storage
fi
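
The create_split_dir.pl call above is what lets the MFCC archives be spread over several filesystems: roughly speaking, it fills mfcc/storage with numbered symlinks pointing into the listed directories, and steps/make_mfcc.sh then writes its .ark files through those links. Outside the CLSP grid you would point it at your own scratch disks; a hypothetical example (the /mnt/scratch* paths are placeholders):

utils/create_split_dir.pl \
  /mnt/scratch{1,2,3}/$USER/kaldi-data/egs/librispeech/s5/mfcc/storage \
  mfcc/storage
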
@@ -87,15 +87,15 @@ utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k
# train a monophone system
steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
data/train_2kshort data/lang_nosp exp/mono || exit 1;
data/train_2kshort data/lang_nosp exp/mono
# decode using the monophone model
(
utils/mkgraph.sh --mono data/lang_nosp_test_tgsmall \
exp/mono exp/mono/graph_nosp_tgsmall || exit 1
exp/mono exp/mono/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \
data/$test exp/mono/decode_nosp_tgsmall_$test || exit 1
data/$test exp/mono/decode_nosp_tgsmall_$test
done
)&
@@ -104,97 +104,97 @@ steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
# train a first delta + delta-delta triphone system on a subset of 5000 utterances
steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 || exit 1;
2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1
# decode using the tri1 model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri1 exp/tri1/graph_nosp_tgsmall || exit 1;
exp/tri1 exp/tri1/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \
data/$test exp/tri1/decode_nosp_tgsmall_$test || exit 1;
data/$test exp/tri1/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test
done
)&
steps/align_si.sh --nj 10 --cmd "$train_cmd" \
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k || exit 1;
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k
# train an LDA+MLLT system.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" 2500 15000 \
data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b || exit 1;
data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b
# decode using the LDA+MLLT model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri2b exp/tri2b/graph_nosp_tgsmall || exit 1;
exp/tri2b exp/tri2b/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \
data/$test exp/tri2b/decode_nosp_tgsmall_$test || exit 1;
data/$test exp/tri2b/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test
done
)&
# Align a 10k utts subset using the tri2b model
steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k || exit 1;
data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k
# Train tri3b, which is LDA+MLLT+SAT on 10k utts
steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b || exit 1;
data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b
# decode using the tri3b model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri3b exp/tri3b/graph_nosp_tgsmall || exit 1;
exp/tri3b exp/tri3b/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri3b/graph_nosp_tgsmall data/$test \
exp/tri3b/decode_nosp_tgsmall_$test || exit 1;
exp/tri3b/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test
done
)&
# align the entire train_clean_100 subset using the tri3b model
steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
data/train_clean_100 data/lang_nosp \
exp/tri3b exp/tri3b_ali_clean_100 || exit 1;
exp/tri3b exp/tri3b_ali_clean_100
# train another LDA+MLLT+SAT system on the entire 100 hour subset
steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
data/train_clean_100 data/lang_nosp \
exp/tri3b_ali_clean_100 exp/tri4b || exit 1;
exp/tri3b_ali_clean_100 exp/tri4b
# decode using the tri4b model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri4b exp/tri4b/graph_nosp_tgsmall || exit 1;
exp/tri4b exp/tri4b/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri4b/graph_nosp_tgsmall data/$test \
exp/tri4b/decode_nosp_tgsmall_$test || exit 1;
exp/tri4b/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test || exit 1;
data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test
done
)&
@@ -205,125 +205,125 @@ steps/get_prons.sh --cmd "$train_cmd" \
utils/dict_dir_add_pronprobs.sh --max-normalize true \
data/local/dict_nosp \
exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \
exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict || exit 1
exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict
utils/prepare_lang.sh data/local/dict \
"<SPOKEN_NOISE>" data/local/lang_tmp data/lang
local/format_lms.sh --src-dir data/lang data/local/lm
utils/build_const_arpa_lm.sh \
data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge || exit 1;
data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
utils/build_const_arpa_lm.sh \
data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge || exit 1;
data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge
# decode using the tri4b model with pronunciation and silence probabilities
(
utils/mkgraph.sh \
data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall || exit 1;
data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri4b/graph_tgsmall data/$test \
exp/tri4b/decode_tgsmall_$test || exit 1;
exp/tri4b/decode_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test || exit 1;
data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test
done
)&
# align train_clean_100 using the tri4b model
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 || exit 1;
data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100
# if you want at this point you can train and test NN model(s) on the 100 hour
# subset
local/nnet2/run_5a_clean_100.sh || exit 1
local/nnet2/run_5a_clean_100.sh
local/download_and_untar.sh $data $data_url train-clean-360 || exit 1;
local/download_and_untar.sh $data $data_url train-clean-360
# now add the "clean-360" subset to the mix ...
local/data_prep.sh \
$data/LibriSpeech/train-clean-360 data/train_clean_360 || exit 1
$data/LibriSpeech/train-clean-360 data/train_clean_360
steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \
exp/make_mfcc/train_clean_360 $mfccdir || exit 1
exp/make_mfcc/train_clean_360 $mfccdir
steps/compute_cmvn_stats.sh \
data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir || exit 1
data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir
# ... and then combine the two sets into a 460 hour one
utils/combine_data.sh \
data/train_clean_460 data/train_clean_100 data/train_clean_360 || exit 1
data/train_clean_460 data/train_clean_100 data/train_clean_360
# align the new, combined set, using the tri4b model
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 || exit 1;
data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460
# create a larger SAT model, trained on the 460 hours of data.
steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b || exit 1;
data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b
# decode using the tri5b model
(
utils/mkgraph.sh data/lang_test_tgsmall \
exp/tri5b exp/tri5b/graph_tgsmall || exit 1;
exp/tri5b exp/tri5b/graph_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri5b/graph_tgsmall data/$test \
exp/tri5b/decode_tgsmall_$test || exit 1;
exp/tri5b/decode_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test || exit 1;
data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test
done
)&
# train a NN model on the 460 hour set
local/nnet2/run_6a_clean_460.sh || exit 1
local/nnet2/run_6a_clean_460.sh
local/download_and_untar.sh $data $data_url train-other-500 || exit 1;
local/download_and_untar.sh $data $data_url train-other-500
# prepare the 500 hour subset.
local/data_prep.sh \
$data/LibriSpeech/train-other-500 data/train_other_500 || exit 1
$data/LibriSpeech/train-other-500 data/train_other_500
steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \
exp/make_mfcc/train_other_500 $mfccdir || exit 1
exp/make_mfcc/train_other_500 $mfccdir
steps/compute_cmvn_stats.sh \
data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir || exit 1
data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir
# combine all the data
utils/combine_data.sh \
data/train_960 data/train_clean_460 data/train_other_500 || exit 1
data/train_960 data/train_clean_460 data/train_other_500
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 || exit 1;
data/train_960 data/lang exp/tri5b exp/tri5b_ali_960
# train a SAT model on the 960 hour mixed data. Use the train_quick.sh script
# as it is faster.
steps/train_quick.sh --cmd "$train_cmd" \
7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b || exit 1;
7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b
# decode using the tri6b model
(
utils/mkgraph.sh data/lang_test_tgsmall \
exp/tri6b exp/tri6b/graph_tgsmall || exit 1;
exp/tri6b exp/tri6b/graph_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test || exit 1;
exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test || exit 1;
data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test
done
)&
@@ -349,7 +349,7 @@ steps/train_quick.sh --cmd "$train_cmd" \
# train NN models on the entire dataset
local/nnet2/run_7a_960.sh || exit 1
local/nnet2/run_7a_960.sh
# # train models on cleaned-up data
# # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh
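
One interaction between "set -e" and the "( ... )&" decode blocks above is worth spelling out: a failure inside a backgrounded subshell only terminates that subshell, so the main pipeline (the next alignment or training step) keeps running either way, with or without "|| exit 1". A minimal plain-bash sketch:

set -e
(
  false                        # a failing decode step: only this subshell exits
  echo "never printed"
) &
echo "main script continues"   # e.g. the next steps/align_*.sh / steps/train_*.sh call
wait                           # "wait" with no arguments returns 0, so "set -e" is
                               # not tripped even though the background job failed
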

View file

@@ -17,5 +17,6 @@ ones to look at right now:
5v is what I am currently using as a baseline; it has an even smaller
--jesus-hidden-dim than 5t (hence faster to train), but gives the same
performance.
6g is a setup with a 'thinner' jesus-layer (with only one repeated-affine component)
and slightly more parameters, which is quicker to train than 5v but gives
about the same results. I'm hoping to use this setup, going forward.

View file

@@ -1,6 +1,6 @@
#!/bin/bash
# _5w is as _5x but decreasing the context of the averaging layer from +-0.99
# _5x is as _5w but decreasing the context of the averaging layer from +-0.99
# seconds to +-0.66 seconds. I would not have expected this to work a priori,
# but the change from 5k -> 5l, which made the context wider, made WERs slightly
# worse, so I'd like to see what happens when we decrease the context.

View file

@@ -1,15 +1,17 @@
#!/bin/bash
# _5z is as _5v, but adding skip-splicing (a new configuration option)
# It seems definitely not helpful. I'll remove the option soon.
#local/chain/compare_wer.sh 5v 5z
#System 5v 5z
#WER on train_dev(tg) 15.38 15.60
#WER on train_dev(fg) 14.39 14.50
#WER on eval2000(tg) 17.4 17.6
#WER on eval2000(fg) 15.7 15.9
#Final train prob -0.11156 -0.113823
#Final valid prob -0.131797 -0.131356
# It seems not helpful. I'll remove the option soon.
# note: 5v2 is a rerun of 5v.
# local/chain/compare_wer.sh 5v 5v2 5z
# System 5v 5v2 5z
# WER on train_dev(tg) 15.38 15.74 15.60
# WER on train_dev(fg) 14.39 14.50 14.50
# WER on eval2000(tg) 17.4 17.5 17.6
# WER on eval2000(fg) 15.7 15.9 15.9
# Final train prob -0.11156 -0.112155 -0.113823
# Final valid prob -0.131797 -0.129516 -0.131356
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.

View file

@@ -3,15 +3,15 @@
# _6c is as _5v but adding "--thick-jesus-layer true" (new option): extra hidden
# layer inside jesus layer.
# Doesn't seem to be helpful.
#local/chain/compare_wer.sh 5v 6c
#System 5v 6c
#WER on train_dev(tg) 15.38 15.54
#WER on train_dev(fg) 14.39 14.55
#WER on eval2000(tg) 17.4 17.5
#WER on eval2000(fg) 15.7 15.8
#Final train prob -0.11156 -0.114084
#Final valid prob -0.131797 -0.129589
# Note: 5v2 is a rerun of 5v.
#local/chain/compare_wer.sh 5v 5v2 6c
#System 5v 5v2 6c
#WER on train_dev(tg) 15.38 15.74 15.54
#WER on train_dev(fg) 14.39 14.50 14.55
#WER on eval2000(tg) 17.4 17.5 17.5
#WER on eval2000(fg) 15.7 15.9 15.8
#Final train prob -0.11156 -0.112155 -0.114084
#Final valid prob -0.131797 -0.129516 -0.129589
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.

View file

@@ -4,15 +4,16 @@
# this means (after rounding) that we have 6, not 5, as
# --jesus-forward-input-dim / --num-jesus-blocks.
#a bit worse.
#a03:s5c: local/chain/compare_wer.sh 5v 6d
#System 5v 6d
#WER on train_dev(tg) 15.38 15.66
#WER on train_dev(fg) 14.39 14.54
#WER on eval2000(tg) 17.4 17.5
#WER on eval2000(fg) 15.7 15.8
#Final train prob -0.11156 -0.112034
#Final valid prob -0.131797 -0.131714
# no clear difference.
#[note, 5v2 is a rerun of 5v].
# local/chain/compare_wer.sh 5v 5v2 6d
# System 5v 5v2 6d
# WER on train_dev(tg) 15.38 15.74 15.66
# WER on train_dev(fg) 14.39 14.50 14.54
# WER on eval2000(tg) 17.4 17.5 17.5
# WER on eval2000(fg) 15.7 15.9 15.8
# Final train prob -0.11156 -0.112155 -0.112034
# Final valid prob -0.131797 -0.129516 -0.131714
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.

View file

@@ -3,6 +3,17 @@
# _6g is as _6f but increasing the parameters (increasing
# jesus-forward-input-dim from 500 to 600).
# seems better than 6f, and about the same as (5v,5v2). encouraging.
# note, 5v2 is rerun of 5v.
#local/chain/compare_wer.sh 5v 5v2 6f 6g
#System 5v 5v2 6f 6g
#WER on train_dev(tg) 15.38 15.74 15.71 15.50
#WER on train_dev(fg) 14.39 14.50 14.50 14.31
#WER on eval2000(tg) 17.4 17.5 17.5 17.5
#WER on eval2000(fg) 15.7 15.9 15.9 15.8
#Final train prob -0.11156 -0.112155 -0.111305 -0.105853
#Final valid prob -0.131797 -0.129516 -0.131487 -0.129997
# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change
# means there is no hidden part in the jesus layer (it's just repeated affine and relu).

View file

@@ -0,0 +1,483 @@
#!/bin/bash
# _6h is as _6g but adding --xent-separate-forward-affine=true, which
# gives a separate last-but-one weight matrix to the xent output.
# _6g is as _6f but increasing the parameters (increasing
# jesus-forward-input-dim from 500 to 600).
# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change
# means there is no hidden part in the jesus layer (it's just repeated affine and relu).
# slightly worse, but encouragingly small difference.
#local/chain/compare_wer.sh 5v 6f
#System 5v 6f
#WER on train_dev(tg) 15.38 15.71
#WER on train_dev(fg) 14.39 14.50
#WER on eval2000(tg) 17.4 17.5
#WER on eval2000(fg) 15.7 15.9
#Final train prob -0.11156 -0.111305
#Final valid prob -0.131797 -0.131487
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse.
#
#local/chain/compare_wer.sh 5e 5s 5t 5v
#System 5e 5s 5t 5v
#WER on train_dev(tg) 15.43 15.47 15.43 15.38
#WER on train_dev(fg) 14.32 14.31 14.34 14.39
#WER on eval2000(tg) 17.3 17.4 17.4 17.4
#WER on eval2000(fg) 15.5 15.6 15.6 15.7
#Final train prob -0.110056 -0.110928 -0.110752 -0.11156
#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797
# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it
# up), from 5000 to 3500.
# about 5s: comparing with 5e which is the most recent baseline we actually
# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
# jesus-hidden-dim reduced 7500 to 5000, and the new option
# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even
# smaller jesus-hidden-dims.
# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate
# value of 1700 (between 1500 and 1800), and also a fix for a bug in the
# self-repair code which was doubling the thresholds so there was, in effect,
# no upper threshold. I stopped the p,q,r runs after I found this, but in
# configuring this run I'm bearing in mind the train and valid probs from the
# p,q,r runs.
# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000.
# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try
# to compensate for the fact that more of the output dimensions are now being
# usefully used.
# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair
# ReLUs that are over or under-saturated.
# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
# in the train and valid probs.
#System 5b 5e
#WER on train_dev(tg) 15.51 15.43
#WER on train_dev(fg) 14.39 14.32
#WER on eval2000(tg) 17.3 17.3
#WER on eval2000(fg) 15.6 15.5
#Final train prob -0.112013 -0.110056
#Final valid prob -0.130879 -0.129184
# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
#./compare_wer.sh 5a 5b
#System 5a 5b
#WER on train_dev(tg) 15.86 15.51
#WER on train_dev(fg) 14.74 14.39
#WER on eval2000(tg) 17.4 17.3
#WER on eval2000(fg) 15.6 15.6
#Final train prob -0.0998359 -0.112013
#Final valid prob -0.115884 -0.130879
# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization
# will mean that the increased parameters are now helpful.
# quite helpful:
#local/chain/compare_wer.sh 4w 5a
#System 4w 5a
#WER on train_dev(tg) 16.05 15.86
#WER on train_dev(fg) 14.92 14.74
#WER on eval2000(tg) 18.0 17.4
#WER on eval2000(fg) 16.2 15.6
#Final train prob -0.108816 -0.0998359
#Final valid prob -0.118254 -0.115884
# _4w is as _4v, but doubling --xent-regularize to 0.2
# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change
# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
# layer, and this limits the rate of change of the other layers.
# _4r is as _4f, but one more hidden layer, and reducing context of existing
# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly
# from 1500 to 1400.
# This is better than 4f by almost all metrics.
# ./compare_wer.sh 4f 4r
# System 4f 4r
# WER on train_dev(tg) 16.83 16.50
# WER on train_dev(fg) 15.73 15.45
# WER on eval2000(tg) 18.4 18.3
# WER on eval2000(fg) 16.6 16.7
# Final train prob -0.105832 -0.103652
# Final valid prob -0.123021 -0.121105
# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
# It's even better than 4e, by about 0.3% abs.
# 4c 4e 4f
# Final valid prob: -0.1241 -0.1267 -0.1230
# Final train prob: -0.08820 -0.1149 -0.1058
# ./show_wer.sh 4f
# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
# a03:s5c: ./show_wer.sh 4e
# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
# _4e is as _4c, but adding the option --l2-regularize 0.0001.
# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
# _4a is as _3s, but using narrower splice-indexes in the first layer.
# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
# This of course reduces overtraining. Results are a bit better than 3p but still
# not as good as 2y
# ./show_wer.sh 3s
# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
# a03:s5c: ./show_wer.sh 3p
# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
# a03:s5c: ./show_wer.sh 2y
# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
# _3r is as _3p but reducing the number of parameters as it seemed to be
# overtraining (despite already being quite a small model): [600,1800 ->
# 500,1500]. Also in the interim there was a script change to
# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
# with the halving of the minibatch size.]
# _3p is the same as 3o, but after a code and script change so we can use
# natural gradient for the RepeatedAffineComponent.
# [natural gradient was helpful, based on logs;
# also made a change to use positive bias for the jesus-component affine parts.]
# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
# recurrence, with improvements to the learning of the jesus layers.
# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
# to be worse.
# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
# is helpful.]
#./show_wer.sh 3g
#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
#a03:s5c: ./show_wer.sh 2y
#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
#a03:s5c: ./show_wer.sh 3d
#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
# Therefore it's
# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra
# context, and this isn't really ideal - I want to see if this seems promising first.
# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
# to 200 in order to reduce computation in the Jesus layer.
# _3d is as _2y, and re-using the egs, but using --jesus-opts and
# configs from make_jesus_configs.py.
# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
# 800k to 1.2 million. The aim is to avoid some of the per-job overhead
# (model-averaging, etc.), since each iteration takes only a minute or so.
# I added the results to the table below. It seems the same on average-
# which is good. We'll probably keep this configuration.
# _2o is as _2m, but going back to our original 2-state topology, which it turns
# out that I never tested to WER.
# hm--- it's about the same, or maybe slightly better!
# caution: accidentally overwrote most of this dir, but kept the key stuff.
# note: when I compare with the rerun of 2o (not shown), this run is actually
# better.
# WER on 2m 2o 2y [ now comparing 2o->2y:]
# train_dev,tg 17.22 17.24 16.99 0.2% better
# train_dev,fg 15.87 15.93 15.86 0.1% better
# eval2000,tg 18.7 18.7 18.9 0.2% worse
# eval2000,fg 17.0 16.9 17.0 0.1% worse
# train-prob,final -0.0803 -0.0835
# valid-prob,final -0.0116 -0.0122
# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
# that mechanism.
# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
# the log-like change when deciding which states to back off. The code is not the same
# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
# is quite similar to 2d, except new/more-exact code is used.
# _2d is as _2c but with different LM options:
# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
# provided from the tree-building, and effectively puts the leftmost context position as a single
# set.
# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
# _2c is as _2a but after a code change in which we start using transition-scale
# and self-loop-scale of 1 instead of zero in training; we change the options to
# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
# results at all; it's mainly for convenience in pushing weights in graphs,
# and checking that graphs are stochastic.
# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
# _z is as _x but setting --lm-opts "--num-extra-states=2000".
# (see also y, which has --num-extra-states=500).
# _x is as _s but setting --lm-opts "--num-extra-states=0".
# this is a kind of repeat of the u->v experiment, where it seemed to make things
# worse, but there were other factors involved in that so I want to be sure.
# _s is as _q but setting pdf-boundary-penalty to 0.0
# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
# and 18.07 -> 16.96 on train_dev, after fg rescoring.
# _q is as _p except making the same change as from n->o, which
# reduces the parameters to try to reduce over-training. We reduce
# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
# and modify the splicing setup.
# note: I don't rerun the tree-building, I just use the '5o' treedir.
# _p is as _m except with a code change in which we switch to a different, more
# exact mechanism to deal with the edges of the egs, and correspondingly
# different script options... we now dump weights with the egs, and apply the
# weights to the derivative w.r.t. the output instead of using the
# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
# to 30 also. This will give 10 frames on each side with zero derivs, then
# ramping up to a weight of 1.0 over 10 frames.
# _m is as _k but after a code change that makes the denominator FST more
# compact. I am rerunning in order to verify that the WER is not changed (since
# it's possible in principle that due to edge effects related to weight-pushing,
# the results could be a bit different).
# The results are inconsistently different but broadly the same. On all of eval2000,
# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
# option and setting max-param-change to 1.0. Using the same egs.
# _i is as _h but longer egs: 150 frames instead of 75, and
# 128 elements per minibatch instead of 256.
# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
# _g is as _f but more splicing at last layer.
# _f is as _e but with 30 as the number of left phone classes instead
# of 10.
# _e is as _d but making it more similar in configuration to _b.
# (turns out b was better than a after all-- the egs' likelihoods had to
# be corrected before comparing them).
# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
# _d is as _c but with a modified topology (with 4 distinct states per phone
# instead of 2), and a slightly larger num-states (8000) to compensate for the
# different topology, which has more states.
# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
# as the default) as it's not clear that it was helpful; using the old learning-rates;
# and modifying the target-num-states to 7000.
# _b is as _a except for configuration changes: using 12k num-leaves instead of
# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
# which will make the final layer learn less fast compared with other layers.
set -e
# configs for 'chain'
stage=12
train_stage=-10
get_egs_stage=-10
speed_perturb=true
dir=exp/chain/tdnn_6h # Note: _sp will get added to this if $speed_perturb == true.
# training options
num_epochs=4
initial_effective_lrate=0.001
final_effective_lrate=0.0001
leftmost_questions_truncate=-1
max_param_change=2.0
final_layer_normalize_target=0.5
num_jobs_initial=3
num_jobs_final=16
minibatch_size=128
frames_per_eg=150
remove_egs=false
# End configuration section.
echo "$0 $@" # Print the command line for logging
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
# run those things.
suffix=
if [ "$speed_perturb" == "true" ]; then
suffix=_sp
fi
dir=${dir}$suffix
train_set=train_nodup$suffix
ali_dir=exp/tri4_ali_nodup$suffix
treedir=exp/chain/tri5_2y_tree$suffix
lang=data/lang_chain_2y
# if we are using the speed-perturbed data we need to generate
# alignments for it.
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb $speed_perturb \
--generate-alignments $speed_perturb || exit 1;
if [ $stage -le 9 ]; then
# Get the alignments as lattices (gives the CTC training more freedom).
# use the same num-jobs as the alignments
nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
data/lang exp/tri4 exp/tri4_lats_nodup$suffix
rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
fi
if [ $stage -le 10 ]; then
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
rm -rf $lang
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on we may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi
if [ $stage -le 11 ]; then
# Build a tree using our new topology.
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--leftmost-questions-truncate $leftmost_questions_truncate \
--cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
fi
if [ $stage -le 12 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
fi
touch $dir/egs/.nodelete # keep egs around when that run dies.
steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
--xent-regularize 0.1 \
--leaky-hmm-coefficient 0.1 \
--l2-regularize 0.00005 \
--egs-dir exp/chain/tdnn_2y_sp/egs \
--jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \
--splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
--apply-deriv-weights false \
--frames-per-iter 1200000 \
--lm-opts "--num-extra-lm-states=2000" \
--get-egs-stage $get_egs_stage \
--minibatch-size $minibatch_size \
--egs-opts "--frames-overlap-per-eg 0" \
--frames-per-eg $frames_per_eg \
--num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
--feat-type raw \
--online-ivector-dir exp/nnet3/ivectors_${train_set} \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
--max-param-change $max_param_change \
--cmd "$decode_cmd" \
--remove-egs $remove_egs \
data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
fi
if [ $stage -le 13 ]; then
# Note: it might appear that this $lang directory is mismatched, and it is as
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
# the lang directory.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
fi
decode_suff=sw1_tg
graph_dir=$dir/graph_sw1_tg
if [ $stage -le 14 ]; then
for decode_set in train_dev eval2000; do
(
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context 20 \
--nj 50 --cmd "$decode_cmd" \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
if $has_fisher; then
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
$dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
fi
) &
done
fi
wait;
exit 0;
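
Since the script reads its configuration variables through utils/parse_options.sh, partial reruns are controlled with --stage and --train-stage; the stage numbers are the ones defined above (12 = chain training, 13 = graph building, 14 = decoding). Hypothetical invocations, assuming the file is saved as local/chain/run_tdnn_6h.sh:

# rerun only the chain training, resuming from a (hypothetical) saved iteration 100
local/chain/run_tdnn_6h.sh --stage 12 --train-stage 100

# skip training and just rebuild the graph and decode
local/chain/run_tdnn_6h.sh --stage 13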

View file

@@ -32,6 +32,9 @@ parser.add_argument("--xent-regularize", type=float,
help="For chain models, if nonzero, add a separate output for cross-entropy "
"regularization (with learning-rate-factor equal to the inverse of this)",
default=0.0)
parser.add_argument("--xent-separate-forward-affine", type=str,
help="if using --xent-regularize, gives it separate last-but-one weight matrix",
default="false", choices = ["false", "true"])
parser.add_argument("--use-repeated-affine", type=str,
help="if true use RepeatedAffineComponent, else BlockAffineComponent (i.e. no sharing)",
default="true", choices = ["false", "true"])
@@ -462,6 +465,19 @@ for l in range(1, num_hidden_layers + 1):
print('output-node name=output input=final-affine', file=f)
if args.xent_regularize != 0.0:
xent_input = 'final-relu'
if l == num_hidden_layers and args.xent_separate_forward_affine == "true":
print('component name=forward-affine{0}-xent type=NaturalGradientAffineComponent '
'input-dim={1} output-dim={2} bias-stddev=0'.
format(l, args.jesus_forward_output_dim, args.final_hidden_dim), file=f)
print('component-node name=jesus{0}-forward-output-affine-xent component=forward-affine{0}-xent input=post-jesus{0}'.format(
l), file=f)
print('component name=final-relu-xent type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format(
args.final_hidden_dim, args.self_repair_scale), file=f)
print('component-node name=final-relu-xent component=final-relu-xent '
'input=jesus{0}-forward-output-affine-xent'.format(l), file=f)
xent_input = 'final-relu-xent'
# This block prints the configs for a separate output that will be
# trained with a cross-entropy objective in the 'chain' models... this
# has the effect of regularizing the hidden parts of the model. we use
@@ -473,8 +489,8 @@ for l in range(1, num_hidden_layers + 1):
print('component name=final-affine-xent type=NaturalGradientAffineComponent '
'input-dim={0} output-dim={1} param-stddev=0.0 bias-stddev=0 learning-rate-factor={2}'.format(
cur_affine_output_dim, args.num_targets, 0.5 / args.xent_regularize), file=f)
print('component-node name=final-affine-xent component=final-affine-xent input=final-relu',
file=f)
print('component-node name=final-affine-xent component=final-affine-xent input={0}'.format(
xent_input), file=f)
print('component name=final-log-softmax-xent type=LogSoftmaxComponent dim={0}'.format(
args.num_targets), file=f)
print('component-node name=final-log-softmax-xent component=final-log-softmax-xent '
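
For reference, when --xent-separate-forward-affine=true is given, the branch added above emits config lines of roughly the following shape for the final hidden layer. This is a sketch reconstructed from the print statements: the layer index "6" and the input-dim of 1700 are illustrative values from the 6h setup, and <final-hidden-dim> stands in for args.final_hidden_dim.

component name=forward-affine6-xent type=NaturalGradientAffineComponent input-dim=1700 output-dim=<final-hidden-dim> bias-stddev=0
component-node name=jesus6-forward-output-affine-xent component=forward-affine6-xent input=post-jesus6
component name=final-relu-xent type=RectifiedLinearComponent dim=<final-hidden-dim> self-repair-scale=1e-05
component-node name=final-relu-xent component=final-relu-xent input=jesus6-forward-output-affine-xent
component-node name=final-affine-xent component=final-affine-xent input=final-relu-xent

The change in the last hunk then points final-affine-xent at final-relu-xent instead of final-relu, so the cross-entropy output gets its own last-but-one weight matrix rather than sharing it with the 'chain' output.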