Mirror of https://github.com/mozilla/kaldi.git

some new tuning experiments on chain+swbd setup; add --xent-separate-forward-affine option to make_jesus_configs.py; some cleanup in librispeech/s5/run.sh

Parent: 71b30095e2
Commit: 3e73f67d29
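
For reference, the new --xent-separate-forward-affine option is not invoked directly in this diff; it is threaded through the chain training wrapper's --jesus-opts string, which forwards it to make_jesus_configs.py. A minimal sketch of that usage, with the dimensions and paths copied from the new tdnn_6h script later in this diff (they are illustrative, not required values, and assume the swbd s5c setup):

    # Sketch only: mirrors the --jesus-opts of the tdnn_6h script below.
    steps/nnet3/chain/train_tdnn.sh \
      --xent-regularize 0.1 \
      --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 \
        --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --self-repair-scale 0.00001 \
        --xent-separate-forward-affine=true" \
      data/train_nodup_sp_hires exp/chain/tri5_2y_tree_sp exp/tri4_lats_nodup_sp exp/chain/tdnn_6h_sp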
@@ -10,8 +10,8 @@ data=/export/a15/vpanayotov/data
data_url=www.openslr.org/resources/12
|
||||
lm_url=www.openslr.org/resources/11
|
||||
|
||||
. cmd.sh
|
||||
. path.sh
|
||||
. ./cmd.sh
|
||||
. ./path.sh
|
||||
|
||||
# you might not want to do this for interactive shells.
|
||||
set -e
|
||||
@@ -24,12 +24,12 @@ for part in dev-clean test-clean dev-other test-other train-clean-100; do
done
|
||||
|
||||
# download the LM resources
|
||||
local/download_lm.sh $lm_url data/local/lm || exit 1
|
||||
local/download_lm.sh $lm_url data/local/lm
|
||||
|
||||
# format the data as Kaldi data directories
|
||||
for part in dev-clean test-clean dev-other test-other train-clean-100; do
|
||||
# use underscore-separated names in data directories.
|
||||
local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) || exit 1
|
||||
local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g)
|
||||
done
|
||||
|
||||
## Optional text corpus normalization and LM training
|
||||
@@ -39,7 +39,7 @@ done
## well as some intermediate data (e.g. the normalized text used for LM training),
|
||||
## are available for download at http://www.openslr.org/11/
|
||||
#local/lm/train_lm.sh $LM_CORPUS_ROOT \
|
||||
# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm || exit 1
|
||||
# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm
|
||||
|
||||
## Optional G2P training scripts.
|
||||
## As the LM training scripts above, this script is intended primarily to
|
||||
@@ -49,24 +49,24 @@ done
# when "--stage 3" option is used below we skip the G2P steps, and use the
|
||||
# lexicon we have already downloaded from openslr.org/11/
|
||||
local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \
|
||||
data/local/lm data/local/lm data/local/dict_nosp || exit 1
|
||||
data/local/lm data/local/lm data/local/dict_nosp
|
||||
|
||||
utils/prepare_lang.sh data/local/dict_nosp \
|
||||
"<SPOKEN_NOISE>" data/local/lang_tmp_nosp data/lang_nosp || exit 1;
|
||||
"<SPOKEN_NOISE>" data/local/lang_tmp_nosp data/lang_nosp
|
||||
|
||||
local/format_lms.sh --src-dir data/lang_nosp data/local/lm || exit 1
|
||||
local/format_lms.sh --src-dir data/lang_nosp data/local/lm
|
||||
|
||||
# Create ConstArpaLm format language model for full 3-gram and 4-gram LMs
|
||||
utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \
|
||||
data/lang_nosp data/lang_nosp_test_tglarge || exit 1;
|
||||
data/lang_nosp data/lang_nosp_test_tglarge
|
||||
utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \
|
||||
data/lang_nosp data/lang_nosp_test_fglarge || exit 1;
|
||||
data/lang_nosp data/lang_nosp_test_fglarge
|
||||
|
||||
mfccdir=mfcc
|
||||
# spread the mfccs over various machines, as this data-set is quite large.
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
|
||||
mfcc=$(basename $mfccdir) # in case it was an absolute pathname (unlikely), get the basename.
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
|
||||
utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
|
||||
$mfccdir/storage
|
||||
fi
|
||||
|
||||
@@ -87,15 +87,15 @@ utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k
|
||||
# train a monophone system
|
||||
steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
|
||||
data/train_2kshort data/lang_nosp exp/mono || exit 1;
|
||||
data/train_2kshort data/lang_nosp exp/mono
|
||||
|
||||
# decode using the monophone model
|
||||
(
|
||||
utils/mkgraph.sh --mono data/lang_nosp_test_tgsmall \
|
||||
exp/mono exp/mono/graph_nosp_tgsmall || exit 1
|
||||
exp/mono exp/mono/graph_nosp_tgsmall
|
||||
for test in test_clean test_other dev_clean dev_other; do
|
||||
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \
|
||||
data/$test exp/mono/decode_nosp_tgsmall_$test || exit 1
|
||||
data/$test exp/mono/decode_nosp_tgsmall_$test
|
||||
done
|
||||
)&
|
||||
|
||||
@@ -104,97 +104,97 @@ steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
|
||||
# train a first delta + delta-delta triphone system on a subset of 5000 utterances
|
||||
steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
|
||||
2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 || exit 1;
|
||||
2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1
|
||||
|
||||
# decode using the tri1 model
|
||||
(
|
||||
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
|
||||
exp/tri1 exp/tri1/graph_nosp_tgsmall || exit 1;
|
||||
exp/tri1 exp/tri1/graph_nosp_tgsmall
|
||||
for test in test_clean test_other dev_clean dev_other; do
|
||||
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \
|
||||
data/$test exp/tri1/decode_nosp_tgsmall_$test || exit 1;
|
||||
data/$test exp/tri1/decode_nosp_tgsmall_$test
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
|
||||
data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
|
||||
data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
|
||||
data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
|
||||
data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test
|
||||
done
|
||||
)&
|
||||
|
||||
steps/align_si.sh --nj 10 --cmd "$train_cmd" \
|
||||
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k || exit 1;
|
||||
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k
|
||||
|
||||
|
||||
# train an LDA+MLLT system.
|
||||
steps/train_lda_mllt.sh --cmd "$train_cmd" \
|
||||
--splice-opts "--left-context=3 --right-context=3" 2500 15000 \
|
||||
data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b || exit 1;
|
||||
data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b
|
||||
|
||||
# decode using the LDA+MLLT model
|
||||
(
|
||||
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
|
||||
exp/tri2b exp/tri2b/graph_nosp_tgsmall || exit 1;
|
||||
exp/tri2b exp/tri2b/graph_nosp_tgsmall
|
||||
for test in test_clean test_other dev_clean dev_other; do
|
||||
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \
|
||||
data/$test exp/tri2b/decode_nosp_tgsmall_$test || exit 1;
|
||||
data/$test exp/tri2b/decode_nosp_tgsmall_$test
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
|
||||
data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
|
||||
data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
|
||||
data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
|
||||
data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test
|
||||
done
|
||||
)&
|
||||
|
||||
# Align a 10k utts subset using the tri2b model
|
||||
steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
|
||||
data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k || exit 1;
|
||||
data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k
|
||||
|
||||
# Train tri3b, which is LDA+MLLT+SAT on 10k utts
|
||||
steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
|
||||
data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b || exit 1;
|
||||
data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b
|
||||
|
||||
# decode using the tri3b model
|
||||
(
|
||||
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
|
||||
exp/tri3b exp/tri3b/graph_nosp_tgsmall || exit 1;
|
||||
exp/tri3b exp/tri3b/graph_nosp_tgsmall
|
||||
for test in test_clean test_other dev_clean dev_other; do
|
||||
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
|
||||
exp/tri3b/graph_nosp_tgsmall data/$test \
|
||||
exp/tri3b/decode_nosp_tgsmall_$test || exit 1;
|
||||
exp/tri3b/decode_nosp_tgsmall_$test
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
|
||||
data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
|
||||
data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
|
||||
data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
|
||||
data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test
|
||||
done
|
||||
)&
|
||||
|
||||
# align the entire train_clean_100 subset using the tri3b model
|
||||
steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
|
||||
data/train_clean_100 data/lang_nosp \
|
||||
exp/tri3b exp/tri3b_ali_clean_100 || exit 1;
|
||||
exp/tri3b exp/tri3b_ali_clean_100
|
||||
|
||||
# train another LDA+MLLT+SAT system on the entire 100 hour subset
|
||||
steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
|
||||
data/train_clean_100 data/lang_nosp \
|
||||
exp/tri3b_ali_clean_100 exp/tri4b || exit 1;
|
||||
exp/tri3b_ali_clean_100 exp/tri4b
|
||||
|
||||
# decode using the tri4b model
|
||||
(
|
||||
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
|
||||
exp/tri4b exp/tri4b/graph_nosp_tgsmall || exit 1;
|
||||
exp/tri4b exp/tri4b/graph_nosp_tgsmall
|
||||
for test in test_clean test_other dev_clean dev_other; do
|
||||
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
|
||||
exp/tri4b/graph_nosp_tgsmall data/$test \
|
||||
exp/tri4b/decode_nosp_tgsmall_$test || exit 1;
|
||||
exp/tri4b/decode_nosp_tgsmall_$test
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
|
||||
data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
|
||||
data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
|
||||
data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
|
||||
data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
|
||||
data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test || exit 1;
|
||||
data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test
|
||||
done
|
||||
)&
|
||||
|
||||
@@ -205,125 +205,125 @@ steps/get_prons.sh --cmd "$train_cmd" \
utils/dict_dir_add_pronprobs.sh --max-normalize true \
|
||||
data/local/dict_nosp \
|
||||
exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \
|
||||
exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict || exit 1
|
||||
exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict
|
||||
|
||||
utils/prepare_lang.sh data/local/dict \
|
||||
"<SPOKEN_NOISE>" data/local/lang_tmp data/lang
|
||||
local/format_lms.sh --src-dir data/lang data/local/lm
|
||||
|
||||
utils/build_const_arpa_lm.sh \
|
||||
data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge || exit 1;
|
||||
data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
|
||||
utils/build_const_arpa_lm.sh \
|
||||
data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge || exit 1;
|
||||
data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge
|
||||
|
||||
# decode using the tri4b model with pronunciation and silence probabilities
|
||||
(
|
||||
utils/mkgraph.sh \
|
||||
data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall || exit 1;
|
||||
data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall
|
||||
for test in test_clean test_other dev_clean dev_other; do
|
||||
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
|
||||
exp/tri4b/graph_tgsmall data/$test \
|
||||
exp/tri4b/decode_tgsmall_$test || exit 1;
|
||||
exp/tri4b/decode_tgsmall_$test
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
|
||||
data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test || exit 1;
|
||||
data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
|
||||
data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test || exit 1;
|
||||
data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
|
||||
data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test || exit 1;
|
||||
data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test
|
||||
done
|
||||
)&
|
||||
|
||||
# align train_clean_100 using the tri4b model
|
||||
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
||||
data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 || exit 1;
|
||||
data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100
|
||||
|
||||
# if you want at this point you can train and test NN model(s) on the 100 hour
|
||||
# subset
|
||||
local/nnet2/run_5a_clean_100.sh || exit 1
|
||||
local/nnet2/run_5a_clean_100.sh
|
||||
|
||||
local/download_and_untar.sh $data $data_url train-clean-360 || exit 1;
|
||||
local/download_and_untar.sh $data $data_url train-clean-360
|
||||
|
||||
# now add the "clean-360" subset to the mix ...
|
||||
local/data_prep.sh \
|
||||
$data/LibriSpeech/train-clean-360 data/train_clean_360 || exit 1
|
||||
$data/LibriSpeech/train-clean-360 data/train_clean_360
|
||||
steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \
|
||||
exp/make_mfcc/train_clean_360 $mfccdir || exit 1
|
||||
exp/make_mfcc/train_clean_360 $mfccdir
|
||||
steps/compute_cmvn_stats.sh \
|
||||
data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir || exit 1
|
||||
data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir
|
||||
|
||||
# ... and then combine the two sets into a 460 hour one
|
||||
utils/combine_data.sh \
|
||||
data/train_clean_460 data/train_clean_100 data/train_clean_360 || exit 1
|
||||
data/train_clean_460 data/train_clean_100 data/train_clean_360
|
||||
|
||||
# align the new, combined set, using the tri4b model
|
||||
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
|
||||
data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 || exit 1;
|
||||
data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460
|
||||
|
||||
# create a larger SAT model, trained on the 460 hours of data.
|
||||
steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
|
||||
data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b || exit 1;
|
||||
data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b
|
||||
|
||||
# decode using the tri5b model
|
||||
(
|
||||
utils/mkgraph.sh data/lang_test_tgsmall \
|
||||
exp/tri5b exp/tri5b/graph_tgsmall || exit 1;
|
||||
exp/tri5b exp/tri5b/graph_tgsmall
|
||||
for test in test_clean test_other dev_clean dev_other; do
|
||||
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
|
||||
exp/tri5b/graph_tgsmall data/$test \
|
||||
exp/tri5b/decode_tgsmall_$test || exit 1;
|
||||
exp/tri5b/decode_tgsmall_$test
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
|
||||
data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test || exit 1;
|
||||
data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
|
||||
data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test || exit 1;
|
||||
data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
|
||||
data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test || exit 1;
|
||||
data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test
|
||||
done
|
||||
)&
|
||||
|
||||
# train a NN model on the 460 hour set
|
||||
local/nnet2/run_6a_clean_460.sh || exit 1
|
||||
local/nnet2/run_6a_clean_460.sh
|
||||
|
||||
local/download_and_untar.sh $data $data_url train-other-500 || exit 1;
|
||||
local/download_and_untar.sh $data $data_url train-other-500
|
||||
|
||||
# prepare the 500 hour subset.
|
||||
local/data_prep.sh \
|
||||
$data/LibriSpeech/train-other-500 data/train_other_500 || exit 1
|
||||
$data/LibriSpeech/train-other-500 data/train_other_500
|
||||
steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \
|
||||
exp/make_mfcc/train_other_500 $mfccdir || exit 1
|
||||
exp/make_mfcc/train_other_500 $mfccdir
|
||||
steps/compute_cmvn_stats.sh \
|
||||
data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir || exit 1
|
||||
data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir
|
||||
|
||||
# combine all the data
|
||||
utils/combine_data.sh \
|
||||
data/train_960 data/train_clean_460 data/train_other_500 || exit 1
|
||||
data/train_960 data/train_clean_460 data/train_other_500
|
||||
|
||||
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
|
||||
data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 || exit 1;
|
||||
data/train_960 data/lang exp/tri5b exp/tri5b_ali_960
|
||||
|
||||
# train a SAT model on the 960 hour mixed data. Use the train_quick.sh script
|
||||
# as it is faster.
|
||||
steps/train_quick.sh --cmd "$train_cmd" \
|
||||
7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b || exit 1;
|
||||
7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b
|
||||
|
||||
# decode using the tri6b model
|
||||
(
|
||||
utils/mkgraph.sh data/lang_test_tgsmall \
|
||||
exp/tri6b exp/tri6b/graph_tgsmall || exit 1;
|
||||
exp/tri6b exp/tri6b/graph_tgsmall
|
||||
for test in test_clean test_other dev_clean dev_other; do
|
||||
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
|
||||
exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test || exit 1;
|
||||
exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test
|
||||
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
|
||||
data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test || exit 1;
|
||||
data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
|
||||
data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test || exit 1;
|
||||
data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test
|
||||
steps/lmrescore_const_arpa.sh \
|
||||
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
|
||||
data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test || exit 1;
|
||||
data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test
|
||||
done
|
||||
)&
|
||||
|
||||
@@ -349,7 +349,7 @@ steps/train_quick.sh --cmd "$train_cmd" \
|
||||
|
||||
# train NN models on the entire dataset
|
||||
local/nnet2/run_7a_960.sh || exit 1
|
||||
local/nnet2/run_7a_960.sh
|
||||
|
||||
# # train models on cleaned-up data
|
||||
# # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh
@@ -17,5 +17,6 @@ ones to look at right now:
5v is what I am currently using as a baseline- it has an even smaller
--jesus-hidden-dim as 5t (hence faster to train), but gives the same
performance.

6g is a setup with a 'thinner' jesus-layer (with only one repeated-affine component)
and slightly more parameters, which is quicker to train than 5v but gives
about the same results. I'm hoping to use this setup, going forward.
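
As a concrete reference for the 6g note above, here is a sketch of the 'thin' jesus-layer options as they appear in the --jesus-opts of the new tdnn_6h script later in this diff (6h is 6g plus the new xent option); the exact dimensions are an example, not a recommendation:

    # --jesus-hidden-dim 0 removes the hidden part of the jesus layer, leaving only
    # the repeated-affine component plus ReLU (see the 6f/6g notes further down).
    --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 \
      --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --self-repair-scale 0.00001"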
@@ -1,6 +1,6 @@
#!/bin/bash
|
||||
|
||||
# _5w is as _5x but decreasing the context of the averaging layer from +-0.99
|
||||
# _5x is as _5w but decreasing the context of the averaging layer from +-0.99
|
||||
# seconds to +-0.66 seconds. I would not have expected this to work a priori,
|
||||
# but the change from 5k -> 5l, which made the context wider, made WERs slightly
|
||||
# worse, so I'd like to see what happens when we decrease the context.
|
||||
@@ -1,15 +1,17 @@
#!/bin/bash
|
||||
|
||||
# _5z is as _5v, but adding skip-splicing (a new configuration option)
|
||||
# It seems definitely not helpful. I'll remove the option soon.
|
||||
#local/chain/compare_wer.sh 5v 5z
|
||||
#System 5v 5z
|
||||
#WER on train_dev(tg) 15.38 15.60
|
||||
#WER on train_dev(fg) 14.39 14.50
|
||||
#WER on eval2000(tg) 17.4 17.6
|
||||
#WER on eval2000(fg) 15.7 15.9
|
||||
#Final train prob -0.11156 -0.113823
|
||||
#Final valid prob -0.131797 -0.131356
|
||||
# It seems not helpful. I'll remove the option soon.
|
||||
# note: 5v2 is a rerun of 5v.
|
||||
|
||||
# local/chain/compare_wer.sh 5v 5v2 5z
|
||||
# System 5v 5v2 5z
|
||||
# WER on train_dev(tg) 15.38 15.74 15.60
|
||||
# WER on train_dev(fg) 14.39 14.50 14.50
|
||||
# WER on eval2000(tg) 17.4 17.5 17.6
|
||||
# WER on eval2000(fg) 15.7 15.9 15.9
|
||||
# Final train prob -0.11156 -0.112155 -0.113823
|
||||
# Final valid prob -0.131797 -0.129516 -0.131356
|
||||
|
||||
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
|
||||
|
||||
@@ -3,15 +3,15 @@
# _6c is as _5v but adding "--thick-jesus-layer true" (new option): extra hidden
|
||||
# layer inside jesus layer.
|
||||
|
||||
# Doesn't seem to be helpful.
|
||||
#local/chain/compare_wer.sh 5v 6c
|
||||
#System 5v 6c
|
||||
#WER on train_dev(tg) 15.38 15.54
|
||||
#WER on train_dev(fg) 14.39 14.55
|
||||
#WER on eval2000(tg) 17.4 17.5
|
||||
#WER on eval2000(fg) 15.7 15.8
|
||||
#Final train prob -0.11156 -0.114084
|
||||
#Final valid prob -0.131797 -0.129589
|
||||
# Note: 5v2 is a rerun of 5v.
|
||||
#local/chain/compare_wer.sh 5v 5v2 6c
|
||||
#System 5v 5v2 6c
|
||||
#WER on train_dev(tg) 15.38 15.74 15.54
|
||||
#WER on train_dev(fg) 14.39 14.50 14.55
|
||||
#WER on eval2000(tg) 17.4 17.5 17.5
|
||||
#WER on eval2000(fg) 15.7 15.9 15.8
|
||||
#Final train prob -0.11156 -0.112155 -0.114084
|
||||
#Final valid prob -0.131797 -0.129516 -0.129589
|
||||
|
||||
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
|
||||
|
||||
@@ -4,15 +4,16 @@
# this means (after rounding) that we have 6, not 5, as
|
||||
# --jesus-forward-input-dim / --num-jesus-blocks.
|
||||
|
||||
#a bit worse.
|
||||
#a03:s5c: local/chain/compare_wer.sh 5v 6d
|
||||
#System 5v 6d
|
||||
#WER on train_dev(tg) 15.38 15.66
|
||||
#WER on train_dev(fg) 14.39 14.54
|
||||
#WER on eval2000(tg) 17.4 17.5
|
||||
#WER on eval2000(fg) 15.7 15.8
|
||||
#Final train prob -0.11156 -0.112034
|
||||
#Final valid prob -0.131797 -0.131714
|
||||
# no clear difference.
|
||||
#[note, 5v2 is a rerun of 5v].
|
||||
# local/chain/compare_wer.sh 5v 5v2 6d
|
||||
# System 5v 5v2 6d
|
||||
# WER on train_dev(tg) 15.38 15.74 15.66
|
||||
# WER on train_dev(fg) 14.39 14.50 14.54
|
||||
# WER on eval2000(tg) 17.4 17.5 17.5
|
||||
# WER on eval2000(fg) 15.7 15.9 15.8
|
||||
# Final train prob -0.11156 -0.112155 -0.112034
|
||||
# Final valid prob -0.131797 -0.129516 -0.131714
|
||||
|
||||
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
|
||||
|
||||
@@ -3,6 +3,17 @@
# _6g is as _6f but increasing the parameters (increasing
|
||||
# jesus-forward-input-dim from 500 to 600).
|
||||
|
||||
# seems better than 6f, and about the same as (5v,5v2). encouraging.
|
||||
# note, 5v2 is rerun of 5v.
|
||||
#local/chain/compare_wer.sh 5v 5v2 6f 6g
|
||||
#System 5v 5v2 6f 6g
|
||||
#WER on train_dev(tg) 15.38 15.74 15.71 15.50
|
||||
#WER on train_dev(fg) 14.39 14.50 14.50 14.31
|
||||
#WER on eval2000(tg) 17.4 17.5 17.5 17.5
|
||||
#WER on eval2000(fg) 15.7 15.9 15.9 15.8
|
||||
#Final train prob -0.11156 -0.112155 -0.111305 -0.105853
|
||||
#Final valid prob -0.131797 -0.129516 -0.131487 -0.129997
|
||||
|
||||
# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change
|
||||
# means there is no hidden part in the jesus layer (it's just repeated affine and relu).
|
||||
|
||||
@@ -0,0 +1,483 @@
#!/bin/bash
|
||||
|
||||
# _6h is as _6g but adding --xent-separate-forward-affine=true, which
|
||||
# gives a separate last-but-one weight matrix to the xent output.
|
||||
|
||||
# _6g is as _6f but increasing the parameters (increasing
|
||||
# jesus-forward-input-dim from 500 to 600).
|
||||
|
||||
# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change
|
||||
# means there is no hidden part in the jesus layer (it's just repeated affine and relu).
|
||||
|
||||
# slightly worse, but encouragingly small difference.
|
||||
#local/chain/compare_wer.sh 5v 6f
|
||||
#System 5v 6f
|
||||
#WER on train_dev(tg) 15.38 15.71
|
||||
#WER on train_dev(fg) 14.39 14.50
|
||||
#WER on eval2000(tg) 17.4 17.5
|
||||
#WER on eval2000(fg) 15.7 15.9
|
||||
#Final train prob -0.11156 -0.111305
|
||||
#Final valid prob -0.131797 -0.131487
|
||||
|
||||
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
|
||||
|
||||
# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse.
|
||||
#
|
||||
#local/chain/compare_wer.sh 5e 5s 5t 5v
|
||||
#System 5e 5s 5t 5v
|
||||
#WER on train_dev(tg) 15.43 15.47 15.43 15.38
|
||||
#WER on train_dev(fg) 14.32 14.31 14.34 14.39
|
||||
#WER on eval2000(tg) 17.3 17.4 17.4 17.4
|
||||
#WER on eval2000(fg) 15.5 15.6 15.6 15.7
|
||||
#Final train prob -0.110056 -0.110928 -0.110752 -0.11156
|
||||
#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797
|
||||
|
||||
# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it
|
||||
# up), from 5000 to 3500.
|
||||
|
||||
# about 5s: comparing with 5e which is the most recent baseline we actually
|
||||
# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
|
||||
# jesus-hidden-dim reduced 7500 to 5000, and the new option
|
||||
# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even
|
||||
# smaller jesus-hidden-dims.
|
||||
|
||||
# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate
|
||||
# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair
|
||||
# code to a bug which was doubling the thresholds so there was, in effect,
|
||||
# no upper threshold. I stopped the p,q,r runs after I found this, but in
|
||||
# configuring this run I'm bearing in mind the train and valid probs from the
|
||||
# p,q,r runs.
|
||||
|
||||
# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000.
|
||||
|
||||
# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try
|
||||
# to compensate for the fact that more of the output dimensions are now being
|
||||
# usefully used.
|
||||
|
||||
# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair
|
||||
# ReLUs that are over or under-saturated.
|
||||
|
||||
# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
|
||||
# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
|
||||
|
||||
# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
|
||||
# in the train and valid probs.
|
||||
#System 5b 5e
|
||||
#WER on train_dev(tg) 15.51 15.43
|
||||
#WER on train_dev(fg) 14.39 14.32
|
||||
#WER on eval2000(tg) 17.3 17.3
|
||||
#WER on eval2000(fg) 15.6 15.5
|
||||
#Final train prob -0.112013 -0.110056
|
||||
#Final valid prob -0.130879 -0.129184
|
||||
|
||||
# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
|
||||
|
||||
# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
|
||||
#./compare_wer.sh 5a 5b
|
||||
#System 5a 5b
|
||||
#WER on train_dev(tg) 15.86 15.51
|
||||
#WER on train_dev(fg) 14.74 14.39
|
||||
#WER on eval2000(tg) 17.4 17.3
|
||||
#WER on eval2000(fg) 15.6 15.6
|
||||
#Final train prob -0.0998359 -0.112013
|
||||
#Final valid prob -0.115884 -0.130879
|
||||
|
||||
# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
|
||||
# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization
|
||||
# will mean that the increased parameters are now helpful.
|
||||
# quite helpful:
|
||||
#local/chain/compare_wer.sh 4w 5a
|
||||
#System 4w 5a
|
||||
#WER on train_dev(tg) 16.05 15.86
|
||||
#WER on train_dev(fg) 14.92 14.74
|
||||
#WER on eval2000(tg) 18.0 17.4
|
||||
#WER on eval2000(fg) 16.2 15.6
|
||||
#Final train prob -0.108816 -0.0998359
|
||||
#Final valid prob -0.118254 -0.115884
|
||||
|
||||
# _4w is as _4v, but doubling --xent-regularize to 0.2
|
||||
|
||||
# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change
|
||||
# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
|
||||
# layer, and this limits the rate of change of the other layers.
|
||||
|
||||
# _4r is as _4f, but one more hidden layer, and reducing context of existing
|
||||
# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly
|
||||
# from 1500 to 1400.
|
||||
|
||||
# This is better than 4f by almost all metrics.
|
||||
# ./compare_wer.sh 4f 4r
|
||||
# System 4f 4r
|
||||
# WER on train_dev(tg) 16.83 16.50
|
||||
# WER on train_dev(fg) 15.73 15.45
|
||||
# WER on eval2000(tg) 18.4 18.3
|
||||
# WER on eval2000(fg) 16.6 16.7
|
||||
# Final train prob -0.105832 -0.103652
|
||||
# Final valid prob -0.123021 -0.121105
|
||||
|
||||
# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
|
||||
|
||||
# It's even better than 4e, by about 0.3% abs.
|
||||
# 4c 4e 4f
|
||||
# Final valid prob: -0.1241 -0.1267 -0.1230
|
||||
# Final train prob: -0.08820 -0.1149 -0.1058
|
||||
|
||||
# ./show_wer.sh 4f
|
||||
# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
|
||||
# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
|
||||
# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
|
||||
# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
|
||||
# a03:s5c: ./show_wer.sh 4e
|
||||
# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
|
||||
# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
|
||||
# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
|
||||
# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
|
||||
|
||||
|
||||
# _4e is as _4c, but adding the option --l2-regularize 0.0001.
|
||||
|
||||
# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
|
||||
|
||||
# _4a is as _3s, but using narrower splice-indexes in the first layer.
|
||||
|
||||
# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
|
||||
# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
|
||||
# This of course reduces overtraining. Results are a bit better than 3p but still
|
||||
# not as good as 2y
|
||||
|
||||
# ./show_wer.sh 3s
|
||||
# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
|
||||
# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
|
||||
# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
|
||||
# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
|
||||
# a03:s5c: ./show_wer.sh 3p
|
||||
# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
|
||||
# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
|
||||
# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
|
||||
# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
|
||||
# a03:s5c: ./show_wer.sh 2y
|
||||
# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
|
||||
# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
|
||||
# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
|
||||
# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
|
||||
|
||||
|
||||
# _3r is as _3p but reducing the number of parameters as it seemed to be
|
||||
# overtraining (despite already being quite a small model): [600,1800 ->
|
||||
# 500,1500]. Also in the interim there was a script change to
|
||||
# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
|
||||
# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
|
||||
# with the halving of the minibatch size.]
|
||||
|
||||
|
||||
# _3p is the same as 3o, but after a code and script change so we can use
|
||||
# natural gradient for the RepeatedAffineComponent.
|
||||
# [natural gradient was helpful, based on logs;
|
||||
# also made a change to use positive bias for the jesus-component affine parts.]
|
||||
|
||||
# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
|
||||
|
||||
# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
|
||||
# recurrence, with improvements to the learning of the jesus layers.
|
||||
|
||||
# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
|
||||
# to be worse.
|
||||
# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
|
||||
# is helpful.]
|
||||
#./show_wer.sh 3g
|
||||
#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
|
||||
#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
|
||||
#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
|
||||
#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
|
||||
#a03:s5c: ./show_wer.sh 2y
|
||||
#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
|
||||
#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
|
||||
#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
|
||||
#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
|
||||
|
||||
#a03:s5c: ./show_wer.sh 3d
|
||||
#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
|
||||
#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
|
||||
#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
|
||||
#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
|
||||
|
||||
# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
|
||||
# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
|
||||
# Therefore it's
|
||||
# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra
|
||||
# context, and this isn't really ideal - I want to see if this seems promising first.
|
||||
|
||||
# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
|
||||
# to 200 in order to reduce computation in the Jesus layer.
|
||||
|
||||
# _3d is as _2y, and re-using the egs, but using --jesus-opts and
|
||||
# configs from make_jesus_configs.py.
|
||||
# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
|
||||
# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
|
||||
|
||||
# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
|
||||
# 800k to 1.2 million. The aim is to avoid some of the per-job overhead
|
||||
# (model-averaging, etc.), since each iteration takes only a minute or so.
|
||||
# I added the results to the table below. It seems the same on average-
|
||||
# which is good. We'll probably keep this configuration.
|
||||
|
||||
# _2o is as _2m, but going back to our original 2-state topology, which it turns
|
||||
# out that I never tested to WER.
|
||||
# hm--- it's about the same, or maybe slightly better!
|
||||
# caution: accidentally overwrote most of this dir, but kept the key stuff.
|
||||
|
||||
# note: when I compare with the rerun of 2o (not shown), this run is actually
|
||||
# better.
|
||||
# WER on 2m 2o 2y [ now comparing 2o->2y:]
|
||||
# train_dev,tg 17.22 17.24 16.99 0.2% better
|
||||
# train_dev,fg 15.87 15.93 15.86 0.1% better
|
||||
# eval2000,tg 18.7 18.7 18.9 0.2% worse
|
||||
# eval2000,fg 17.0 16.9 17.0 0.1% worse
|
||||
|
||||
# train-prob,final -0.0803 -0.0835
|
||||
# valid-prob,final -0.0116 -0.0122
|
||||
|
||||
# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
|
||||
# that mechanism.
|
||||
|
||||
# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
|
||||
# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
|
||||
|
||||
# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
|
||||
# the log-like change when deciding which states to back off. The code is not the same
|
||||
# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
|
||||
# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
|
||||
# is quite similar to 2d, except new/more-exact code is used.
|
||||
|
||||
# _2d is as _2c but with different LM options:
|
||||
# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
|
||||
# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
|
||||
# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
|
||||
# provided from the tree-building, and effectively puts the leftmost context position as a single
|
||||
# set.
|
||||
# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
|
||||
# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
|
||||
|
||||
# _2c is as _2a but after a code change in which we start using transition-scale
|
||||
# and self-loop-scale of 1 instead of zero in training; we change the options to
|
||||
# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
|
||||
# results at all; it's is mainly for convenience in pushing weights in graphs,
|
||||
# and checking that graphs are stochastic.
|
||||
|
||||
# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
|
||||
|
||||
# _z is as _x but setting --lm-opts "--num-extra-states=2000".
|
||||
# (see also y, which has --num-extra-states=500).
|
||||
|
||||
# _x is as _s but setting --lm-opts "--num-extra-states=0".
|
||||
# this is a kind of repeat of the u->v experiment, where it seemed to make things
|
||||
# worse, but there were other factors involved in that so I want to be sure.
|
||||
|
||||
# _s is as _q but setting pdf-boundary-penalty to 0.0
|
||||
# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
|
||||
# and 18.07 -> 16.96 on train_dev, after fg rescoring.
|
||||
|
||||
# _q is as _p except making the same change as from n->o, which
|
||||
# reduces the parameters to try to reduce over-training. We reduce
|
||||
# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
|
||||
# and modify the splicing setup.
|
||||
# note: I don't rerun the tree-building, I just use the '5o' treedir.
|
||||
|
||||
# _p is as _m except with a code change in which we switch to a different, more
|
||||
# exact mechanism to deal with the edges of the egs, and correspondingly
|
||||
# different script options... we now dump weights with the egs, and apply the
|
||||
# weights to the derivative w.r.t. the output instead of using the
|
||||
# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
|
||||
# to 30 also. This will give 10 frames on each side with zero derivs, then
|
||||
# ramping up to a weight of 1.0 over 10 frames.
|
||||
|
||||
# _m is as _k but after a code change that makes the denominator FST more
|
||||
# compact. I am rerunning in order to verify that the WER is not changed (since
|
||||
# it's possible in principle that due to edge effects related to weight-pushing,
|
||||
# the results could be a bit different).
|
||||
# The results are inconsistently different but broadly the same. On all of eval2000,
|
||||
# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
|
||||
# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
|
||||
|
||||
|
||||
# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
|
||||
# option and setting max-param-change to 1.0. Using the same egs.
|
||||
|
||||
# _i is as _h but longer egs: 150 frames instead of 75, and
|
||||
# 128 elements per minibatch instead of 256.
|
||||
|
||||
# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
|
||||
|
||||
# _g is as _f but more splicing at last layer.
|
||||
|
||||
# _f is as _e but with 30 as the number of left phone classes instead
|
||||
# of 10.
|
||||
|
||||
# _e is as _d but making it more similar in configuration to _b.
|
||||
# (turns out b was better than a after all-- the egs' likelihoods had to
|
||||
# be corrected before comparing them).
|
||||
# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
|
||||
# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
|
||||
|
||||
# _d is as _c but with a modified topology (with 4 distinct states per phone
|
||||
# instead of 2), and a slightly larger num-states (8000) to compensate for the
|
||||
# different topology, which has more states.
|
||||
|
||||
# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
|
||||
# as the default) as it's not clear that it was helpful; using the old learning-rates;
|
||||
# and modifying the target-num-states to 7000.
|
||||
|
||||
# _b is as _a except for configuration changes: using 12k num-leaves instead of
|
||||
# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
|
||||
# which will make the final layer learn less fast compared with other layers.
|
||||
|
||||
set -e
|
||||
|
||||
# configs for 'chain'
|
||||
stage=12
|
||||
train_stage=-10
|
||||
get_egs_stage=-10
|
||||
speed_perturb=true
|
||||
dir=exp/chain/tdnn_6h # Note: _sp will get added to this if $speed_perturb == true.
|
||||
|
||||
# training options
|
||||
num_epochs=4
|
||||
initial_effective_lrate=0.001
|
||||
final_effective_lrate=0.0001
|
||||
leftmost_questions_truncate=-1
|
||||
max_param_change=2.0
|
||||
final_layer_normalize_target=0.5
|
||||
num_jobs_initial=3
|
||||
num_jobs_final=16
|
||||
minibatch_size=128
|
||||
frames_per_eg=150
|
||||
remove_egs=false
|
||||
|
||||
# End configuration section.
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
if ! cuda-compiled; then
|
||||
cat <<EOF && exit 1
|
||||
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
|
||||
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
|
||||
where "nvcc" is installed.
|
||||
EOF
|
||||
fi
|
||||
|
||||
# The iVector-extraction and feature-dumping parts are the same as the standard
|
||||
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
|
||||
# run those things.
|
||||
|
||||
suffix=
|
||||
if [ "$speed_perturb" == "true" ]; then
|
||||
suffix=_sp
|
||||
fi
|
||||
|
||||
dir=${dir}$suffix
|
||||
train_set=train_nodup$suffix
|
||||
ali_dir=exp/tri4_ali_nodup$suffix
|
||||
treedir=exp/chain/tri5_2y_tree$suffix
|
||||
lang=data/lang_chain_2y
|
||||
|
||||
|
||||
# if we are using the speed-perturbed data we need to generate
|
||||
# alignments for it.
|
||||
local/nnet3/run_ivector_common.sh --stage $stage \
|
||||
--speed-perturb $speed_perturb \
|
||||
--generate-alignments $speed_perturb || exit 1;
|
||||
|
||||
|
||||
if [ $stage -le 9 ]; then
|
||||
# Get the alignments as lattices (gives the CTC training more freedom).
|
||||
# use the same num-jobs as the alignments
|
||||
nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
|
||||
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
|
||||
data/lang exp/tri4 exp/tri4_lats_nodup$suffix
|
||||
rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 10 ]; then
|
||||
# Create a version of the lang/ directory that has one state per phone in the
|
||||
# topo file. [note, it really has two states.. the first one is only repeated
|
||||
# once, the second one has zero or more repeats.]
|
||||
rm -rf $lang
|
||||
cp -r data/lang $lang
|
||||
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
|
||||
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
|
||||
# Use our special topology... note that later on may have to tune this
|
||||
# topology.
|
||||
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
|
||||
fi
|
||||
|
||||
if [ $stage -le 11 ]; then
|
||||
# Build a tree using our new topology.
|
||||
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
|
||||
--leftmost-questions-truncate $leftmost_questions_truncate \
|
||||
--cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
|
||||
fi
|
||||
|
||||
if [ $stage -le 12 ]; then
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
|
||||
utils/create_split_dir.pl \
|
||||
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
|
||||
fi
|
||||
|
||||
touch $dir/egs/.nodelete # keep egs around when that run dies.
|
||||
|
||||
steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
|
||||
--xent-regularize 0.1 \
|
||||
--leaky-hmm-coefficient 0.1 \
|
||||
--l2-regularize 0.00005 \
|
||||
--egs-dir exp/chain/tdnn_2y_sp/egs \
|
||||
--jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \
|
||||
--splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
|
||||
--apply-deriv-weights false \
|
||||
--frames-per-iter 1200000 \
|
||||
--lm-opts "--num-extra-lm-states=2000" \
|
||||
--get-egs-stage $get_egs_stage \
|
||||
--minibatch-size $minibatch_size \
|
||||
--egs-opts "--frames-overlap-per-eg 0" \
|
||||
--frames-per-eg $frames_per_eg \
|
||||
--num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
|
||||
--feat-type raw \
|
||||
--online-ivector-dir exp/nnet3/ivectors_${train_set} \
|
||||
--cmvn-opts "--norm-means=false --norm-vars=false" \
|
||||
--initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
|
||||
--max-param-change $max_param_change \
|
||||
--cmd "$decode_cmd" \
|
||||
--remove-egs $remove_egs \
|
||||
data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 13 ]; then
|
||||
# Note: it might appear that this $lang directory is mismatched, and it is as
|
||||
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
|
||||
# the lang directory.
|
||||
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
|
||||
fi
|
||||
|
||||
decode_suff=sw1_tg
|
||||
graph_dir=$dir/graph_sw1_tg
|
||||
if [ $stage -le 14 ]; then
|
||||
for decode_set in train_dev eval2000; do
|
||||
(
|
||||
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
|
||||
--extra-left-context 20 \
|
||||
--nj 50 --cmd "$decode_cmd" \
|
||||
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
|
||||
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
|
||||
if $has_fisher; then
|
||||
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
|
||||
data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
|
||||
$dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
|
||||
fi
|
||||
) &
|
||||
done
|
||||
fi
|
||||
wait;
|
||||
exit 0;
@@ -32,6 +32,9 @@ parser.add_argument("--xent-regularize", type=float,
help="For chain models, if nonzero, add a separate output for cross-entropy "
"regularization (with learning-rate-factor equal to the inverse of this)",
default=0.0)
parser.add_argument("--xent-separate-forward-affine", type=str,
help="if using --xent-regularize, gives it separate last-but-one weight matrix",
default="false", choices = ["false", "true"])
parser.add_argument("--use-repeated-affine", type=str,
help="if true use RepeatedAffineComponent, else BlockAffineComponent (i.e. no sharing)",
default="true", choices = ["false", "true"])

@@ -462,6 +465,19 @@ for l in range(1, num_hidden_layers + 1):
print('output-node name=output input=final-affine', file=f)

if args.xent_regularize != 0.0:
xent_input = 'final-relu'
if l == num_hidden_layers and args.xent_separate_forward_affine == "true":
print('component name=forward-affine{0}-xent type=NaturalGradientAffineComponent '
'input-dim={1} output-dim={2} bias-stddev=0'.
format(l, args.jesus_forward_output_dim, args.final_hidden_dim), file=f)
print('component-node name=jesus{0}-forward-output-affine-xent component=forward-affine{0}-xent input=post-jesus{0}'.format(
l), file=f)
print('component name=final-relu-xent type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format(
args.final_hidden_dim, args.self_repair_scale), file=f)
print('component-node name=final-relu-xent component=final-relu-xent '
'input=jesus{0}-forward-output-affine-xent'.format(l), file=f)
xent_input = 'final-relu-xent'

# This block prints the configs for a separate output that will be
# trained with a cross-entropy objective in the 'chain' models... this
# has the effect of regularizing the hidden parts of the model. we use

@@ -473,8 +489,8 @@ for l in range(1, num_hidden_layers + 1):
print('component name=final-affine-xent type=NaturalGradientAffineComponent '
'input-dim={0} output-dim={1} param-stddev=0.0 bias-stddev=0 learning-rate-factor={2}'.format(
cur_affine_output_dim, args.num_targets, 0.5 / args.xent_regularize), file=f)
print('component-node name=final-affine-xent component=final-affine-xent input=final-relu',
file=f)
print('component-node name=final-affine-xent component=final-affine-xent input={0}'.format(
xent_input), file=f)
print('component name=final-log-softmax-xent type=LogSoftmaxComponent dim={0}'.format(
args.num_targets), file=f)
print('component-node name=final-log-softmax-xent component=final-log-softmax-xent '