Mirror of https://github.com/mozilla/kaldi.git
Several nnet2-online changes: make it easier to get the feature-extraction options right in cross-system training; add the train_pnorm_simple.sh script (simplified learning-rate schedule and improved model combination at the end; supersedes train_pnorm_fast.sh); modify the big-data online-nnet2 recipes to use 40-dimensional rather than 13-dimensional MFCCs as input (results to be added soon, but they are improved). Also modified filter_scp.pl to use a one-based, rather than zero-based, field index.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4493 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Parent
011808dcae
Commit
6f598676cc
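For reference, a minimal sketch of how the filter_scp.pl change affects callers (the file names here are hypothetical; the hunks below show the real call sites). The field index is now one-based, so filtering on the first column needs no -f option at all, and filtering on the second column uses -f 2:

    # before this commit (zero-based field index):
    utils/filter_scp.pl -f 0 keep_list some.scp > filtered.scp   # match on column 1
    utils/filter_scp.pl -f 1 keep_list some.scp > filtered.scp   # match on column 2
    # after this commit (one-based field index):
    utils/filter_scp.pl      keep_list some.scp > filtered.scp   # match on column 1 (default)
    utils/filter_scp.pl -f 2 keep_list some.scp > filtered.scp   # match on column 2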
|
@ -96,7 +96,7 @@ while (( "$#" )); do
|
|||
$cmd LMWT=$min_lmwt:$max_lmwt $targetdir/$kws/kws_filter.LMWT.log \
|
||||
set -e';' set -o pipefail';' \
|
||||
mkdir -p $targetdir/${kws}_LMWT';'\
|
||||
cat $resultdir/${kws}_LMWT/'result.*' \| utils/filter_scp.pl -f 1 $filter \> $targetdir/${kws}_LMWT/result || exit -1
|
||||
cat $resultdir/${kws}_LMWT/'result.*' \| utils/filter_scp.pl -f 2 $filter \> $targetdir/${kws}_LMWT/result || exit -1
|
||||
|
||||
|
||||
echo -e "\tWrite normalized..."
|
||||
|
|
|
@ -1,10 +0,0 @@
|
|||
--window-type=hamming # disable Dan's window, use the standard
|
||||
--use-energy=false # only fbank outputs
|
||||
--sample-frequency=8000 # Cantonese is sampled at 8kHz
|
||||
|
||||
--low-freq=64 # typical setup from Frantisek Grezl
|
||||
--high-freq=3800
|
||||
--dither=1
|
||||
|
||||
--num-mel-bins=15 # 8kHz so we use 15 bins
|
||||
--htk-compat=true # try to make it compatible with HTK
|
|
@ -0,0 +1,10 @@
|
|||
# config for high-resolution MFCC features, intended for neural network training.
|
||||
# Note: we keep all cepstra, so it has the same info as filterbank features,
|
||||
# but MFCC is more easily compressible (because less correlated) which is why
|
||||
# we prefer this method.
|
||||
--use-energy=false # use average of log energy, not energy.
|
||||
--sample-frequency=8000 # Switchboard is sampled at 8kHz
|
||||
--num-mel-bins=40 # similar to Google's setup.
|
||||
--num-ceps=40 # there is no dimensionality reduction.
|
||||
--low-freq=40 # low cutoff frequency for mel bins
|
||||
--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
|
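As a hedged illustration of how a config like the one above gets used (the recipes below call steps/make_mfcc.sh, which wraps this binary; the paths here are hypothetical):

    # extract 40-dimensional "hires" MFCCs with the config added above
    compute-mfcc-feats --config=conf/mfcc_hires.conf \
      scp:data/train_hires/wav.scp \
      ark,scp:mfcc/raw_mfcc_train_hires.ark,mfcc/raw_mfcc_train_hires.scp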
|
@ -1,6 +1,5 @@
|
|||
#!/bin/bash
|
||||
|
||||
|
||||
. cmd.sh
|
||||
|
||||
|
||||
|
@ -12,74 +11,99 @@ set -e
|
|||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
if $use_gpu; then
|
||||
if ! cuda-compiled; then
|
||||
cat <<EOF && exit 1
|
||||
|
||||
# assume use_gpu=true since it would be way too slow otherwise.
|
||||
|
||||
if ! cuda-compiled; then
|
||||
cat <<EOF && exit 1
|
||||
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
|
||||
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
|
||||
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
|
||||
where "nvcc" is installed.
|
||||
EOF
|
||||
fi
|
||||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
minibatch_size=512
|
||||
# the _a is in case I want to change the parameters.
|
||||
dir=exp/nnet2_online/nnet_a_gpu
|
||||
else
|
||||
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
|
||||
# almost the same, but this may be a little bit slow.
|
||||
num_threads=16
|
||||
minibatch_size=128
|
||||
parallel_opts="-pe smp $num_threads"
|
||||
dir=exp/nnet2_online/nnet_a
|
||||
fi
|
||||
|
||||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
minibatch_size=512
|
||||
dir=exp/nnet2_online/nnet_a_gpu
|
||||
mkdir -p exp/nnet2_online
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
mkdir -p exp/nnet2_online
|
||||
# To train a diagonal UBM we don't need very much data, so use the smallest subset.
|
||||
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
|
||||
data/train_30k 512 exp/tri5a exp/nnet2_online/diag_ubm
|
||||
# this shows how you can split across multiple file-systems. we'll split the
|
||||
# MFCC dir across multiple locations. You might want to be careful here, if you
|
||||
# have multiple copies of Kaldi checked out and run the same recipe, not to let
|
||||
# them overwrite each other.
|
||||
mfccdir=mfcc
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
|
||||
date=$(date +'%m_%d_%H_%M')
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5/$mfccdir/storage $mfccdir/storage
|
||||
fi
|
||||
utils/copy_data_dir.sh data/train data/train_hires
|
||||
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
|
||||
--cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir || exit 1;
|
||||
steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train $mfccdir || exit 1;
|
||||
|
||||
utils/subset_data_dir.sh data/train_hires 30000 data/train_hires_30k
|
||||
# want the 100k subset to exactly match train_100k, since we'll use its alignments.
|
||||
awk '{print $1}' data/train_100k/utt2spk > uttlist
|
||||
utils/subset_data_dir.sh --utt-list uttlist data/train_hires data/train_hires_100k
|
||||
rm uttlist
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
# We need to build a small system just because we need the LDA+MLLT transform
|
||||
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
|
||||
# the transform (12th iter is the last), any further training is pointless.
|
||||
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
|
||||
--splice-opts "--left-context=3 --right-context=3" \
|
||||
5000 10000 data/train_hires_100k data/lang exp/tri4a exp/nnet2_online/tri5a
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
# To train a diagonal UBM we don't need very much data, so use the smallest
|
||||
# subset. The input directory exp/nnet2_online/tri5a is only needed for
|
||||
# the splice-opts and the LDA transform.
|
||||
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
|
||||
data/train_hires_30k 512 exp/nnet2_online/tri5a exp/nnet2_online/diag_ubm
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
# iVector extractors can in general be sensitive to the amount of data, but
|
||||
# this one has a fairly small dim (defaults to 100), so we don't use all of it;
|
||||
# we use just the 100k subset (about one sixteenth of the data).
|
||||
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
|
||||
data/train_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
|
||||
data/train_hires_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
if [ $stage -le 5 ]; then
|
||||
ivectordir=exp/nnet2_online/ivectors_train
|
||||
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems.
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english/s5/$ivectordir/storage $ivectordir/storage
|
||||
fi
|
||||
|
||||
# We extract iVectors on all the train data, which will be what we
|
||||
# train the system on. This version of the iVector-extraction script
|
||||
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
|
||||
# and treats each of these pairs as one speaker.
|
||||
# Note that these are extracted 'online'.
|
||||
# having a larger number of speakers is helpful for generalization, and to
|
||||
# handle per-utterance decoding well (iVector starts at zero).
|
||||
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires data/train_hires_max2
|
||||
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
|
||||
--utts-per-spk-max 2 \
|
||||
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
|
||||
data/train_hires_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
if [ $stage -le 6 ]; then
|
||||
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$dir/egs $dir/egs/storage
|
||||
fi
|
||||
|
||||
# Because we have a lot of data here and we don't want the training to take
|
||||
# too long, we reduce the number of epochs from the defaults (15 + 5) to (1 +
|
||||
# too long, we reduce the number of epochs from the defaults (15 + 5) to (3 +
|
||||
# 1). The option "--io-opts '-tc 12'" is to have more than the default number
|
||||
# (5) of jobs dumping the egs to disk; this is OK since we're splitting our
|
||||
# data across four filesystems for speed.
|
||||
|
||||
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
|
||||
--num-epochs 3 --num-epochs-extra 1 \
|
||||
--num-epochs 4 --num-epochs-extra 1 \
|
||||
--samples-per-iter 400000 \
|
||||
--splice-width 7 --feat-type raw \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train \
|
||||
--cmvn-opts "--norm-means=false --norm-vars=false" \
|
||||
|
@ -94,30 +118,12 @@ if [ $stage -le 4 ]; then
|
|||
--cmd "$decode_cmd" \
|
||||
--pnorm-input-dim 3500 \
|
||||
--pnorm-output-dim 350 \
|
||||
data/train data/lang exp/tri5a $dir || exit 1;
|
||||
data/train_hires data/lang exp/tri5a $dir || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 5 ]; then
|
||||
# dump iVectors for the testing data.
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
|
||||
data/dev exp/nnet2_online/extractor exp/nnet2_online/ivectors_dev || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 6 ]; then
|
||||
# this does offline decoding that should give about the same results as the
|
||||
# real online decoding (the one with --per-utt true)
|
||||
steps/nnet2/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_dev \
|
||||
exp/tri5a/graph data/dev $dir/decode_dev || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 7 ]; then
|
||||
# If this setup used PLP features, we'd have to give the option --feature-type plp
|
||||
# to the script below.
|
||||
steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
|
||||
"$dir" ${dir}_online || exit 1;
|
||||
steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
|
||||
data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 8 ]; then
|
||||
|
@ -146,30 +152,3 @@ fi
|
|||
|
||||
exit 0;
|
||||
|
||||
|
||||
#Baseline: GMM+SAT system.
|
||||
#%WER 31.07 [ 12163 / 39141, 1869 ins, 2705 del, 7589 sub ] exp/tri5a/decode_dev/wer_13
|
||||
|
||||
# Baseline: p-norm system on top of fMLLR features.
|
||||
#%WER 23.66 [ 9259 / 39141, 1495 ins, 2432 del, 5332 sub ] exp/nnet6c4_gpu/decode_dev/wer_11
|
||||
|
||||
# Our experiment, carrying forward the adaptation state between
|
||||
# utterances of each speaker.
|
||||
#%WER 23.79 [ 9311 / 39141, 1499 ins, 2277 del, 5535 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_11
|
||||
|
||||
|
||||
# Our experiment, with per-utterance decoding:
|
||||
%WER 24.84 [ 9721 / 39141, 1445 ins, 2410 del, 5866 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt/wer_11
|
||||
|
||||
|
||||
# below, with --max-chunks-at-once 3. The WER is slightly worse but I expect in general it will
|
||||
# be slightly better, due to more iVector right context; this is likely just noise. The average
|
||||
# latency was reduced vs the baseline,
|
||||
#%WER 24.92 [ 9753 / 39141, 1423 ins, 2429 del, 5901 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt_mc3/wer_11
|
||||
|
||||
|
||||
# The following results (obtained after ./run_nnet2_discriminative.sh was run), show
|
||||
# the effect of discriminative training. After 2 epochs, we reduce the WER from 23.58 to 22.07.
|
||||
%WER 23.58 [ 9229 / 39141, 1382 ins, 2400 del, 5447 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_12
|
||||
%WER 22.16 [ 8675 / 39141, 1522 ins, 1886 del, 5267 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_smbr_epoch1/wer_13
|
||||
%WER 22.07 [ 8637 / 39141, 1540 ins, 1873 del, 5224 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_smbr_epoch2/wer_13
|
||||
|
|
|
@ -60,14 +60,12 @@ if [ $stage -le 3 ]; then
|
|||
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
|
||||
fi
|
||||
|
||||
# We extract iVectors on all the train data, which will be what we
|
||||
# train the system on. This version of the iVector-extraction script
|
||||
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
|
||||
# and treats each of these pairs as one speaker.
|
||||
# Note that these are extracted 'online'.
|
||||
# having a larger number of speakers is helpful for generalization, and to
|
||||
# handle per-utterance decoding well (iVector starts at zero).
|
||||
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
|
||||
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
|
||||
--utts-per-spk-max 2 \
|
||||
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
|
||||
data/train_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
|
@ -83,7 +81,8 @@ if [ $stage -le 4 ]; then
|
|||
# data across four filesystems for speed.
|
||||
|
||||
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
|
||||
--num-epochs 3 --num-epochs-extra 1 \
|
||||
--num-epochs 4 --num-epochs-extra 1 \
|
||||
--samples-per-iter 400000 \
|
||||
--splice-width 7 --feat-type raw \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train \
|
||||
--cmvn-opts "--norm-means=false --norm-vars=false" \
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
#!/bin/bash
|
||||
|
||||
# This is to be run after run_nnet2.sh
|
||||
# THIS IS NOT TESTED YET.
|
||||
|
||||
|
||||
. cmd.sh
|
||||
|
||||
|
@ -43,7 +41,6 @@ set -e
|
|||
nj=40
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
|
||||
# the make_denlats job is always done on CPU not GPU, since in any case
|
||||
# the graph search and lattice determinization takes quite a bit of CPU.
|
||||
# note: it's the sub-split option that determines how many jobs actually
|
||||
|
@ -51,7 +48,7 @@ if [ $stage -le 1 ]; then
|
|||
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G" \
|
||||
--nj $nj --sub-split 40 --num-threads 6 --parallel-opts "-pe smp 6" \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train \
|
||||
data/train data/lang $srcdir ${srcdir}_denlats
|
||||
data/train_hires data/lang $srcdir ${srcdir}_denlats
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
|
@ -59,7 +56,7 @@ if [ $stage -le 2 ]; then
|
|||
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train \
|
||||
--use-gpu $use_gpu_opt \
|
||||
--nj $nj data/train data/lang ${srcdir} ${srcdir}_ali
|
||||
--nj $nj data/train_hires data/lang ${srcdir} ${srcdir}_ali
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
|
@ -72,22 +69,22 @@ if [ $stage -le 3 ]; then
|
|||
# since we're using 4 disks.
|
||||
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" --learning-rate 0.00001 \
|
||||
--io-opts "-pe smp 10" \
|
||||
--num-epochs 2 \
|
||||
--num-epochs 4 \
|
||||
--use-preconditioning $use_preconditioning \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train \
|
||||
--num-jobs-nnet 4 --num-threads $num_threads --parallel-opts "$gpu_opts" \
|
||||
data/train data/lang \
|
||||
data/train_hires data/lang \
|
||||
${srcdir}_ali ${srcdir}_denlats ${srcdir}/final.mdl ${srcdir}_smbr
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
# we'll do the decoding as 'online' decoding by using the existing
|
||||
# _online directory but with extra models copied to it.
|
||||
for epoch in 1 2; do
|
||||
for epoch in 1 2 3 4; do
|
||||
cp ${srcdir}_smbr/epoch${epoch}.mdl ${srcdir}_online/smbr_epoch${epoch}.mdl
|
||||
done
|
||||
|
||||
for epoch in 1 2; do
|
||||
for epoch in 1 2 3 4; do
|
||||
# do the actual online decoding with iVectors, carrying info forward from
|
||||
# previous utterances of the same speaker.
|
||||
steps/online/nnet2/decode.sh --cmd "$decode_cmd" --nj 30 --iter smbr_epoch${epoch} \
|
||||
|
@ -95,5 +92,6 @@ if [ $stage -le 4 ]; then
|
|||
done
|
||||
fi
|
||||
|
||||
wait
|
||||
|
||||
# for results, see the end of run_nnet2.sh
|
||||
|
|
|
@ -45,7 +45,7 @@ if [ $stage -le 2 ]; then
|
|||
local/vad_split_utts_fix_data.pl $in_dir $dir;
|
||||
fi
|
||||
|
||||
utils/filter_scp.pl -f 0 \
|
||||
utils/filter_scp.pl \
|
||||
<(echo "`awk < "$dir/segments" '{ print $2 }'`") $in_dir/wav.scp \
|
||||
> $dir/wav.scp
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ classes="ark:lid/remove_dialect.pl data/train/utt2lang \
|
|||
# Create priors to rebalance the model. The following script rebalances
|
||||
# the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
|
||||
lid/balance_priors_to_test.pl \
|
||||
<(lid/remove_dialect.pl <(utils/filter_scp.pl -f 0 \
|
||||
<(lid/remove_dialect.pl <(utils/filter_scp.pl \
|
||||
exp/ivectors_train/ivector.scp data/train/utt2lang)) \
|
||||
<(lid/remove_dialect.pl data/lre07/utt2lang) \
|
||||
exp/ivectors_train/languages.txt \
|
||||
|
|
|
@ -6,6 +6,9 @@
|
|||
stage=1
|
||||
train_stage=-10
|
||||
use_gpu=true
|
||||
dir=exp/nnet2_online/nnet_a
|
||||
|
||||
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
@ -21,7 +24,6 @@ EOF
|
|||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
minibatch_size=512
|
||||
dir=exp/nnet2_online/nnet_gpu
|
||||
else
|
||||
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
|
||||
# almost the same, but this may be a little bit slow.
|
||||
|
@ -47,14 +49,17 @@ if [ $stage -le 2 ]; then
|
|||
fi
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
# having a larger number of speakers is helpful for generalization, and to
|
||||
# handle per-utterance decoding well (iVector starts at zero).
|
||||
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
|
||||
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
|
||||
--utts-per-spk-max 2 \
|
||||
data/train exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
|
||||
data/train_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
|
||||
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
|
||||
--splice-width 7 \
|
||||
--feat-type raw \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors \
|
||||
|
@ -63,7 +68,8 @@ if [ $stage -le 4 ]; then
|
|||
--minibatch-size "$minibatch_size" \
|
||||
--parallel-opts "$parallel_opts" \
|
||||
--num-jobs-nnet 4 \
|
||||
--num-epochs-extra 10 --add-layers-period 1 \
|
||||
--num-epochs 25 \
|
||||
--add-layers-period 1 \
|
||||
--num-hidden-layers 2 \
|
||||
--mix-up 4000 \
|
||||
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/bin/bash
|
||||
|
||||
|
||||
# this is a baseline for run_online_decoding_nnet2.sh, without
|
||||
# this is a baseline for ./run_nnet2.sh, without
|
||||
# the iVectors, to see whether they make a difference.
|
||||
|
||||
. cmd.sh
|
||||
|
@ -10,10 +10,14 @@
|
|||
stage=1
|
||||
train_stage=-10
|
||||
use_gpu=true
|
||||
dir=exp/nnet2_online/nnet_a_baseline
|
||||
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
|
||||
|
||||
if $use_gpu; then
|
||||
if ! cuda-compiled; then
|
||||
cat <<EOF && exit 1
|
||||
|
@ -25,19 +29,17 @@ EOF
|
|||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
minibatch_size=512
|
||||
dir=exp/nnet2_online/nnet_gpu_baseline
|
||||
else
|
||||
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
|
||||
# almost the same, but this may be a little bit slow.
|
||||
num_threads=16
|
||||
minibatch_size=128
|
||||
parallel_opts="-pe smp $num_threads"
|
||||
dir=exp/nnet2_online/nnet_baseline
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
|
||||
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
|
||||
--splice-width 7 \
|
||||
--feat-type raw \
|
||||
--cmvn-opts "--norm-means=false --norm-vars=false" \
|
||||
|
@ -45,7 +47,8 @@ if [ $stage -le 1 ]; then
|
|||
--minibatch-size "$minibatch_size" \
|
||||
--parallel-opts "$parallel_opts" \
|
||||
--num-jobs-nnet 4 \
|
||||
--num-epochs-extra 10 --add-layers-period 1 \
|
||||
--num-epochs 25 \
|
||||
--add-layers-period 1 \
|
||||
--num-hidden-layers 2 \
|
||||
--mix-up 4000 \
|
||||
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
|
||||
|
@ -82,4 +85,4 @@ if [ $stage -le 4 ]; then
|
|||
wait
|
||||
fi
|
||||
|
||||
# for results, see the end of ./run_online_decoding_nnet2.sh
|
||||
# for results, see the end of ./run_nnet2.sh
|
||||
|
|
|
@ -77,11 +77,13 @@ if [ $stage -le 5 ]; then
|
|||
fi
|
||||
# Below, setting --utts-per-spk-max to a noninteger helps to randomize the division
|
||||
# of speakers into "fake-speakers" with about 2 utterances each, by randomly making
|
||||
# some have 2 and some 3 utterances... this randomnes will be different in different
|
||||
# some have 2 and some 3 utterances... this randomness will be different in different
|
||||
# copies of the data.
|
||||
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2.5 data/train_perturbed_mfcc \
|
||||
data/train_perturbed_mfcc_max2.5
|
||||
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
|
||||
--utts-per-spk-max 2.5 \
|
||||
data/train_perturbed_mfcc exp/nnet2_online/extractor $ivectordir || exit 1;
|
||||
data/train_perturbed_mfcc_max2.5 exp/nnet2_online/extractor $ivectordir || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
# the optional part local/online/run_online_decoding_nnet2.sh. It builds a
|
||||
# neural net for online decoding on top of the network we previously trained on
|
||||
# WSJ, by keeping everything but the last layer of that network and then
|
||||
# training just the last layer on our data.
|
||||
# training just the last layer on our data. We then train the whole thing.
|
||||
|
||||
stage=0
|
||||
set -e
|
||||
|
@ -26,35 +26,40 @@ EOF
|
|||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
minibatch_size=512
|
||||
dir=exp/nnet2_online_wsj/nnet_gpu
|
||||
trainfeats=exp/nnet2_online_wsj/wsj_activations_train_gpu
|
||||
dir=exp/nnet2_online_wsj/nnet_a
|
||||
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
|
||||
# later we'll change the script to download the trained model from kaldi-asr.org.
|
||||
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_gpu_online
|
||||
# the following things are needed while training the combined model.
|
||||
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a_gpu
|
||||
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
|
||||
else
|
||||
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
|
||||
# almost the same, but this may be a little bit slow.
|
||||
num_threads=16
|
||||
minibatch_size=128
|
||||
parallel_opts="-pe smp $num_threads"
|
||||
dir=exp/nnet2_online_wsj/nnet
|
||||
dir=exp/nnet2_online_wsj/nnet_a
|
||||
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
|
||||
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_online
|
||||
# the following things are needed while training the combined model.
|
||||
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a
|
||||
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 0 ]; then
|
||||
echo "$0: dumping activations from WSJ model"
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $trainfeats/feats/storage ]; then
|
||||
# this shows how you can split the data across multiple file-systems; it's optional.
|
||||
date=$(date +'%m_%d_%H_%M')
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$date/s5/$trainfeats/feats/storage \
|
||||
$trainfeats/feats/storage
|
||||
fi
|
||||
steps/online/nnet2/dump_nnet_activations.sh --cmd "$train_cmd" --nj 30 \
|
||||
data/train $srcdir $trainfeats
|
||||
fi
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
echo "$0: training 0-hidden-layer model on top of WSJ activations"
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
|
||||
fi
|
||||
|
||||
steps/nnet2/retrain_fast.sh --stage $train_stage \
|
||||
--num-threads "$num_threads" \
|
||||
--minibatch-size "$minibatch_size" \
|
||||
|
@ -71,9 +76,6 @@ if [ $stage -le 2 ]; then
|
|||
steps/online/nnet2/prepare_online_decoding_retrain.sh $srcdir $dir ${dir}_online
|
||||
fi
|
||||
|
||||
# Note: at this point it might be possible to further train the combined model
|
||||
# by doing backprop through all of it. We haven't implemented this yet.
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
# do online decoding with the combined model.
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
|
@ -98,7 +100,7 @@ fi
|
|||
## the model on this dataset. First we need to create a combined version of the
|
||||
## model.
|
||||
if [ $stage -le 5 ]; then
|
||||
steps/nnet2/create_appended_model.sh $srcdir_orig $dir ${dir}_combined_init
|
||||
steps/nnet2/create_appended_model.sh $srcdir $dir ${dir}_combined_init
|
||||
|
||||
# Set the learning rate in this initial value to our guess of a suitable value.
|
||||
# note: we initially tried 0.005, and this gave us WERs of (1.40, 1.48, 7.24, 7.70) vs.
|
||||
|
@ -107,31 +109,20 @@ if [ $stage -le 5 ]; then
|
|||
nnet-am-copy --learning-rate=$initial_learning_rate ${dir}_combined_init/final.mdl ${dir}_combined_init/final.mdl
|
||||
fi
|
||||
|
||||
# In order to train the combined model, we'll need to dump iVectors.
|
||||
if [ $stage -le 6 ]; then
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
|
||||
--utts-per-spk-max 2 \
|
||||
data/train $ivector_src exp/nnet2_online_wsj/ivectors || exit 1;
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
|
||||
utils/create_split_dir.pl \
|
||||
/export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/${dir}_combined/egs/storage \
|
||||
${dir}_combined/egs/storage
|
||||
fi
|
||||
|
||||
# This version of the get_egs.sh script does the feature extraction and iVector
|
||||
# extraction in a single binary, reading the config, as part of the script.
|
||||
steps/online/nnet2/get_egs.sh --cmd "$train_cmd" --num-jobs-nnet 4 \
|
||||
data/train exp/tri3b_ali ${dir}_online ${dir}_combined
|
||||
fi
|
||||
|
||||
if [ $stage -le 7 ]; then
|
||||
# assume left and right context of model are identical.
|
||||
splice_width=$(nnet-am-info exp/nnet2_online_wsj/nnet_gpu_combined_init/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1;
|
||||
|
||||
# Note: in general the get_egs.sh script would get things like the LDA matrix
|
||||
# from exp/tri3b_ali, which would be the wrong thing to do as we want to get
|
||||
# them from the original model dir. In this case we're using raw MFCC
|
||||
# features so it's not an issue. But in general we'd probably have to create
|
||||
# a temporary dir and copy or link both the alignments and feature-related
|
||||
# things to it.
|
||||
steps/nnet2/get_egs.sh --cmd "$train_cmd" \
|
||||
--feat-type raw --cmvn-opts "--norm-means=false --norm-vars=false" \
|
||||
--online-ivector-dir exp/nnet2_online_wsj/ivectors \
|
||||
--num-jobs-nnet 4 --splice-width $splice_width \
|
||||
data/train data/lang exp/tri3b_ali ${dir}_combined
|
||||
fi
|
||||
|
||||
if [ $stage -le 8 ]; then
|
||||
steps/nnet2/train_more.sh --learning-rate-factor 0.1 --cmd "$train_cmd" \
|
||||
--num-threads "$num_threads" \
|
||||
--minibatch-size "$minibatch_size" \
|
||||
|
@ -139,15 +130,15 @@ if [ $stage -le 8 ]; then
|
|||
${dir}_combined_init/final.mdl ${dir}_combined/egs ${dir}_combined
|
||||
fi
|
||||
|
||||
if [ $stage -le 9 ]; then
|
||||
if [ $stage -le 8 ]; then
|
||||
# Create an online-decoding dir corresponding to what we just trained above.
|
||||
# If this setup used PLP features, we'd have to give the option --feature-type plp
|
||||
# to the script below.
|
||||
steps/online/nnet2/prepare_online_decoding.sh data/lang $ivector_src \
|
||||
steps/online/nnet2/prepare_online_decoding.sh data/lang $srcdir/ivector_extractor \
|
||||
${dir}_combined ${dir}_combined_online || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 10 ]; then
|
||||
if [ $stage -le 9 ]; then
|
||||
# do the online decoding on top of the retrained _combined_online model, and
|
||||
# also the per-utterance version of the online decoding.
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
|
@ -166,25 +157,27 @@ fi
|
|||
exit 0;
|
||||
|
||||
# Here are the results when we just retrain the last layer:
|
||||
# grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
|
||||
#%WER 1.61 [ 202 / 12533, 22 ins, 46 del, 134 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_3
|
||||
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
|
||||
#%WER 7.99 [ 1002 / 12533, 74 ins, 153 del, 775 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_6
|
||||
# grep WER exp/nnet2_online_wsj/nnet_a_online/decode/wer_* | utils/best_wer.sh
|
||||
#%WER 1.60 [ 201 / 12533, 22 ins, 46 del, 133 sub ] exp/nnet2_online_wsj/nnet_a_online/decode/wer_3
|
||||
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_a_online/decode_ug/wer_* | utils/best_wer.sh
|
||||
#%WER 8.02 [ 1005 / 12533, 74 ins, 155 del, 776 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_ug/wer_6
|
||||
|
||||
# and with per-utterance decoding:
|
||||
# %WER 1.72 [ 216 / 12533, 26 ins, 45 del, 145 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_utt/wer_3
|
||||
# %WER 8.40 [ 1053 / 12533, 85 ins, 158 del, 810 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug_utt/wer_6
|
||||
# %WER 8.47 [ 1061 / 12533, 88 ins, 157 del, 816 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_ug_utt/wer_6
|
||||
# %WER 1.70 [ 213 / 12533, 24 ins, 46 del, 143 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_utt/wer_3
|
||||
|
||||
|
||||
|
||||
#, here when we retrain the whole thing:
|
||||
# %WER 1.32 [ 165 / 12533, 14 ins, 34 del, 117 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode/wer_3
|
||||
# %WER 7.20 [ 902 / 12533, 78 ins, 127 del, 697 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug/wer_6
|
||||
#%WER 1.42 [ 178 / 12533, 16 ins, 44 del, 118 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode/wer_4
|
||||
#%WER 7.08 [ 887 / 12533, 74 ins, 133 del, 680 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_ug/wer_6
|
||||
|
||||
# and with per-utterance decoding:
|
||||
# %WER 1.38 [ 173 / 12533, 19 ins, 32 del, 122 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_per_utt/wer_3
|
||||
# %WER 7.44 [ 932 / 12533, 57 ins, 163 del, 712 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug_per_utt/wer_8
|
||||
# and the same with per-utterance decoding:
|
||||
# %WER 1.56 [ 196 / 12533, 31 ins, 26 del, 139 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_per_utt/wer_2
|
||||
# %WER 7.86 [ 985 / 12533, 59 ins, 171 del, 755 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_ug_per_utt/wer_8
|
||||
|
||||
# And this is a suitable baseline: a system trained on RM only.
|
||||
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
|
||||
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
|
||||
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
|
||||
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11
|
||||
#a11:s5: grep WER exp/nnet2_online/nnet_a_online/decode/wer_* | utils/best_wer.sh
|
||||
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_a_online/decode/wer_8
|
||||
#a11:s5: grep WER exp/nnet2_online/nnet_a_online/decode_ug/wer_* | utils/best_wer.sh
|
||||
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_a_online/decode_ug/wer_11
|
||||
|
|
|
@ -50,12 +50,14 @@ fi
|
|||
|
||||
if [ $stage -le 3 ]; then
|
||||
# We extract iVectors on all the train_nodup data, which will be what we
|
||||
# train the system on. This version of the iVector-extraction script
|
||||
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
|
||||
# and treats each as one speaker.
|
||||
# train the system on.
|
||||
|
||||
# having a larger number of speakers is helpful for generalization, and to
|
||||
# handle per-utterance decoding well (iVector starts at zero).
|
||||
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_nodup data/train_nodup_max2
|
||||
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
|
||||
--utts-per-spk-max 2 \
|
||||
data/train_nodup exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_nodup2 || exit 1;
|
||||
data/train_nodup_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_nodup2 || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,194 @@
|
|||
#!/bin/bash
|
||||
|
||||
|
||||
# This script trains a Switchboard system starting from a neural net trained for
|
||||
# Fisher English. It builds a
|
||||
# neural net for online decoding on top of the network we previously trained on
|
||||
# WSJ, by keeping everything but the last layer of that network and then
|
||||
# training just the last layer on our data.
|
||||
|
||||
stage=0
|
||||
set -e
|
||||
|
||||
train_stage=-10
|
||||
use_gpu=true
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
if $use_gpu; then
|
||||
if ! cuda-compiled; then
|
||||
cat <<EOF && exit 1
|
||||
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
|
||||
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
|
||||
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
|
||||
EOF
|
||||
fi
|
||||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
minibatch_size=512
|
||||
dir=exp/nnet2_online_wsj/nnet_gpu
|
||||
trainfeats=exp/nnet2_online_wsj/wsj_activations_train_gpu
|
||||
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_gpu_online
|
||||
# the following things are needed while training the combined model.
|
||||
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a_gpu
|
||||
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
|
||||
else
|
||||
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
|
||||
# almost the same, but this may be a little bit slow.
|
||||
num_threads=16
|
||||
minibatch_size=128
|
||||
parallel_opts="-pe smp $num_threads"
|
||||
dir=exp/nnet2_online_wsj/nnet
|
||||
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
|
||||
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_online
|
||||
# the following things are needed while training the combined model.
|
||||
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a
|
||||
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le 0 ]; then
|
||||
echo "$0: dumping activations from WSJ model"
|
||||
steps/online/nnet2/dump_nnet_activations.sh --cmd "$train_cmd" --nj 30 \
|
||||
data/train $srcdir $trainfeats
|
||||
fi
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
echo "$0: training 0-hidden-layer model on top of WSJ activations"
|
||||
steps/nnet2/retrain_fast.sh --stage $train_stage \
|
||||
--num-threads "$num_threads" \
|
||||
--minibatch-size "$minibatch_size" \
|
||||
--parallel-opts "$parallel_opts" \
|
||||
--cmd "$decode_cmd" \
|
||||
--num-jobs-nnet 4 \
|
||||
--mix-up 4000 \
|
||||
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
|
||||
$trainfeats/data data/lang exp/tri3b_ali $dir
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
echo "$0: formatting combined model for online decoding."
|
||||
steps/online/nnet2/prepare_online_decoding_retrain.sh $srcdir $dir ${dir}_online
|
||||
fi
|
||||
|
||||
# Note: at this point it might be possible to further train the combined model
|
||||
# by doing backprop through all of it. We haven't implemented this yet.
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
# do online decoding with the combined model.
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
exp/tri3b/graph data/test ${dir}_online/decode &
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
exp/tri3b/graph_ug data/test ${dir}_online/decode_ug || exit 1;
|
||||
wait
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
# do online per-utterance decoding with the combined model.
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
--per-utt true \
|
||||
exp/tri3b/graph data/test ${dir}_online/decode_utt &
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
--per-utt true \
|
||||
exp/tri3b/graph_ug data/test ${dir}_online/decode_ug_utt || exit 1;
|
||||
wait
|
||||
fi
|
||||
|
||||
## From this point on we try something else: we try training all the layers of
|
||||
## the model on this dataset. First we need to create a combined version of the
|
||||
## model.
|
||||
if [ $stage -le 5 ]; then
|
||||
steps/nnet2/create_appended_model.sh $srcdir_orig $dir ${dir}_combined_init
|
||||
|
||||
# Set the learning rate in this initial value to our guess of a suitable value.
|
||||
# note: we initially tried 0.005, and this gave us WERs of (1.40, 1.48, 7.24, 7.70) vs.
|
||||
# (1.32, 1.38, 7.20, 7.44) with a learning rate of 0.01.
|
||||
initial_learning_rate=0.01
|
||||
nnet-am-copy --learning-rate=$initial_learning_rate ${dir}_combined_init/final.mdl ${dir}_combined_init/final.mdl
|
||||
fi
|
||||
|
||||
# In order to train the combined model, we'll need to dump iVectors.
|
||||
if [ $stage -le 6 ]; then
|
||||
# having a larger number of speakers is helpful for generalization, and to
|
||||
# handle per-utterance decoding well (iVector starts at zero).
|
||||
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
|
||||
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
|
||||
data/train_max2 $ivector_src exp/nnet2_online_wsj/ivectors || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 7 ]; then
|
||||
# assume left and right context of model are identical.
|
||||
splice_width=$(nnet-am-info exp/nnet2_online_wsj/nnet_gpu_combined_init/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1;
|
||||
|
||||
# Note: in general the get_egs.sh script would get things like the LDA matrix
|
||||
# from exp/tri3b_ali, which would be the wrong thing to do as we want to get
|
||||
# them from the original model dir. In this case we're using raw MFCC
|
||||
# features so it's not an issue. But in general we'd probably have to create
|
||||
# a temporary dir and copy or link both the alignments and feature-related
|
||||
# things to it.
|
||||
steps/nnet2/get_egs.sh --cmd "$train_cmd" \
|
||||
--feat-type raw --cmvn-opts "--norm-means=false --norm-vars=false" \
|
||||
--online-ivector-dir exp/nnet2_online_wsj/ivectors \
|
||||
--num-jobs-nnet 4 --splice-width $splice_width \
|
||||
data/train data/lang exp/tri3b_ali ${dir}_combined
|
||||
fi
|
||||
|
||||
if [ $stage -le 8 ]; then
|
||||
steps/nnet2/train_more.sh --learning-rate-factor 0.1 --cmd "$train_cmd" \
|
||||
--num-threads "$num_threads" \
|
||||
--minibatch-size "$minibatch_size" \
|
||||
--parallel-opts "$parallel_opts" \
|
||||
${dir}_combined_init/final.mdl ${dir}_combined/egs ${dir}_combined
|
||||
fi
|
||||
|
||||
if [ $stage -le 9 ]; then
|
||||
# Create an online-decoding dir corresponding to what we just trained above.
|
||||
# If this setup used PLP features, we'd have to give the option --feature-type plp
|
||||
# to the script below.
|
||||
steps/online/nnet2/prepare_online_decoding.sh data/lang $ivector_src \
|
||||
${dir}_combined ${dir}_combined_online || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 10 ]; then
|
||||
# do the online decoding on top of the retrained _combined_online model, and
|
||||
# also the per-utterance version of the online decoding.
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
exp/tri3b/graph data/test ${dir}_combined_online/decode &
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
exp/tri3b/graph_ug data/test ${dir}_combined_online/decode_ug &
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
--per-utt true exp/tri3b/graph data/test ${dir}_combined_online/decode_per_utt &
|
||||
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
|
||||
--per-utt true exp/tri3b/graph_ug data/test ${dir}_combined_online/decode_ug_per_utt || exit 1;
|
||||
wait
|
||||
fi
|
||||
|
||||
|
||||
|
||||
exit 0;
|
||||
|
||||
# Here are the results when we just retrain the last layer:
|
||||
# grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
|
||||
#%WER 1.61 [ 202 / 12533, 22 ins, 46 del, 134 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_3
|
||||
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
|
||||
#%WER 7.99 [ 1002 / 12533, 74 ins, 153 del, 775 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_6
|
||||
|
||||
# and with per-utterance decoding:
|
||||
# %WER 1.72 [ 216 / 12533, 26 ins, 45 del, 145 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_utt/wer_3
|
||||
# %WER 8.40 [ 1053 / 12533, 85 ins, 158 del, 810 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug_utt/wer_6
|
||||
|
||||
#, here when we retrain the whole thing:
|
||||
# %WER 1.32 [ 165 / 12533, 14 ins, 34 del, 117 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode/wer_3
|
||||
# %WER 7.20 [ 902 / 12533, 78 ins, 127 del, 697 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug/wer_6
|
||||
|
||||
# and with per-utterance decoding:
|
||||
# %WER 1.38 [ 173 / 12533, 19 ins, 32 del, 122 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_per_utt/wer_3
|
||||
# %WER 7.44 [ 932 / 12533, 57 ins, 163 del, 712 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug_per_utt/wer_8
|
||||
|
||||
# And this is a suitable baseline: a system trained on RM only.
|
||||
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
|
||||
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
|
||||
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
|
||||
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11
|
|
@ -55,9 +55,13 @@ fi
|
|||
if [ $stage -le 3 ]; then
|
||||
# We extract iVectors on all the train_si284 data, which will be what we
|
||||
# train the system on.
|
||||
|
||||
# having a larger number of speakers is helpful for generalization, and to
|
||||
# handle per-utterance decoding well (iVector starts at zero).
|
||||
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_si284 data/train_si284_max2
|
||||
|
||||
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
|
||||
--utts-per-spk-max 2
|
||||
data/train_si284 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_si284 || exit 1;
|
||||
data/train_si284_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_si284 || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
|
@ -78,8 +82,8 @@ if [ $stage -le 4 ]; then
|
|||
# wouldn't be able to decode in real-time using a CPU.
|
||||
#
|
||||
# I copied the learning rates from ../nnet2/run_5d.sh
|
||||
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
|
||||
--num-epochs 8 --num-epochs-extra 4 \
|
||||
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
|
||||
--num-epochs 12 \
|
||||
--splice-width 7 --feat-type raw \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_train_si284 \
|
||||
--cmvn-opts "--norm-means=false --norm-vars=false" \
|
||||
|
|
|
@ -32,7 +32,7 @@ src1=$1
|
|||
src2=$2
|
||||
dir=$3
|
||||
|
||||
for f in $src1/final.mdl $src1/cmvn_opts $src2/tree $src2/final.mdl; do
|
||||
for f in $src1/final.mdl $src2/tree $src2/final.mdl; do
|
||||
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
|
||||
done
|
||||
|
||||
|
|
|
@ -60,10 +60,12 @@ if [ $# != 4 ]; then
|
|||
fi
|
||||
|
||||
data=$1
|
||||
lang=$2
|
||||
lang=$2 # kept for historical reasons, but never used.
|
||||
alidir=$3
|
||||
dir=$4
|
||||
|
||||
|
||||
|
||||
# Check some files.
|
||||
[ ! -z "$online_ivector_dir" ] && \
|
||||
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
|
||||
|
@ -73,13 +75,8 @@ for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/
|
|||
done
|
||||
|
||||
|
||||
# Set some variables.
|
||||
oov=`cat $lang/oov.int`
|
||||
num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;
|
||||
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
|
||||
|
||||
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
|
||||
# in this dir we'll have just one job.
|
||||
|
||||
sdata=$data/split$nj
|
||||
utils/split_data.sh $data $nj
|
||||
|
||||
|
@ -189,14 +186,22 @@ mkdir -p $dir/egs
|
|||
if [ $stage -le 2 ]; then
|
||||
echo "Getting validation and training subset examples."
|
||||
rm $dir/.error 2>/dev/null
|
||||
echo "$0: extracting validation and training-subset alignments."
|
||||
set -o pipefail;
|
||||
for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
|
||||
copy-int-vector ark:- ark,t:- | \
|
||||
utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) | \
|
||||
gzip -c >$dir/ali_special.gz || exit 1;
|
||||
set +o pipefail; # unset the pipefail option.
|
||||
|
||||
all_ids=$(seq -s, $nj) # e.g. 1,2,...39,40
|
||||
$cmd $dir/log/create_valid_subset.log \
|
||||
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
|
||||
"ark,s,cs:gunzip -c $alidir/ali.{$all_ids}.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
|
||||
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
|
||||
"ark:$dir/egs/valid_all.egs" || touch $dir/.error &
|
||||
$cmd $dir/log/create_train_subset.log \
|
||||
nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
|
||||
"ark,s,cs:gunzip -c $alidir/ali.{$all_ids}.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
|
||||
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
|
||||
"ark:$dir/egs/train_subset_all.egs" || touch $dir/.error &
|
||||
wait;
|
||||
[ -f $dir/.error ] && exit 1;
|
||||
|
@ -220,12 +225,10 @@ if [ $stage -le 2 ]; then
|
|||
for f in $dir/egs/{combine,train_diagnostic,valid_diagnostic}.egs; do
|
||||
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
|
||||
done
|
||||
rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs
|
||||
rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs $dir/ali_special.gz
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
mkdir -p $dir/temp
|
||||
|
||||
# Other scripts might need to know the following info:
|
||||
echo $num_jobs_nnet >$dir/egs/num_jobs_nnet
|
||||
echo $iters_per_epoch >$dir/egs/iters_per_epoch
|
||||
|
@ -279,9 +282,6 @@ if [ $stage -le 5 ]; then
|
|||
echo "Shuffling the order of training examples"
|
||||
echo "(in order to avoid stressing the disk, these won't all run at once)."
|
||||
|
||||
|
||||
# note, the "|| true" below is a workaround for NFS bugs
|
||||
# we encountered running this script with Debian-7, NFS-v4.
|
||||
for n in `seq 0 $[$iters_per_epoch-1]`; do
|
||||
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
|
||||
nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \
|
||||
|
|
|
@ -392,7 +392,6 @@ echo Done
|
|||
if $cleanup; then
|
||||
echo Cleaning up data
|
||||
if [ $egs_dir == "$dir/egs" ]; then
|
||||
echo Removing training examples
|
||||
rm $dir/egs/egs*
|
||||
steps/nnet2/remove_egs.sh $dir/egs
|
||||
fi
|
||||
fi
|
||||
|
|
|
@ -121,8 +121,6 @@ if [ $# != 4 ]; then
|
|||
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
|
||||
echo " --num-iters-final <#iters|20> # Number of final iterations to give to nnet-combine-fast to "
|
||||
echo " # interpolate parameters (the weights are learned with a validation set)"
|
||||
echo " --first-component-power <power|1.0> # Power applied to output of first p-norm layer... setting this to"
|
||||
echo " # 0.5 seems to help under some circumstances."
|
||||
echo " --egs-opts <opts> # Extra options to pass to get_egs.sh"
|
||||
echo " --lda-opts <opts> # Extra options to pass to get_lda.sh"
|
||||
echo " --stage <stage|-9> # Used to run a partially-completed training process from somewhere in"
|
||||
|
|
|
@ -0,0 +1,478 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
|
||||
# 2013 Xiaohui Zhang
|
||||
# 2013 Guoguo Chen
|
||||
# Apache 2.0.
|
||||
|
||||
|
||||
# train_pnorm_simple.sh is a modified version of train_pnorm_fast.sh. Like
|
||||
# train_pnorm_fast.sh, it uses the `online' preconditioning, which is faster
|
||||
# (especially on GPUs). The difference is that the learning-rate schedule is
|
||||
# simpler, with the learning rate exponentially decreasing during training,
|
||||
# and no phase where the learning rate is constant.
|
||||
#
|
||||
# Also, the final model-combination is done a bit differently: we combine models
|
||||
# over typically a whole epoch, and because that would be too many iterations to
|
||||
# easily be able to combine over, we arrange the iterations into groups (20
|
||||
# groups by default) and average over each group.
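# (A hedged sketch of what this schedule amounts to, not taken verbatim from the code
# below: if there are num_iters iterations in total, iteration x gets a geometrically
# interpolated learning rate
#   lr(x) = initial_learning_rate * (final_learning_rate/initial_learning_rate)^(x/num_iters)
# which, in the perl-one-liner style used elsewhere in this script, could be computed as
#   this_lr=$(perl -e "print $initial_learning_rate*exp($x*log($final_learning_rate/$initial_learning_rate)/$num_iters);")
# where $x is a hypothetical iteration counter.)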
|
||||
|
||||
# Begin configuration section.
|
||||
cmd=run.pl
|
||||
num_epochs=15 # Number of epochs of training;
|
||||
# the number of iterations is worked out from this.
|
||||
initial_learning_rate=0.04
|
||||
final_learning_rate=0.004
|
||||
bias_stddev=0.5
|
||||
pnorm_input_dim=3000
|
||||
pnorm_output_dim=300
|
||||
p=2
|
||||
minibatch_size=128 # by default use a smallish minibatch size for neural net
|
||||
# training; this controls instability which would otherwise
|
||||
# be a problem with multi-threaded update.
|
||||
|
||||
samples_per_iter=400000 # each iteration of training, see this many samples
|
||||
# per job. This option is passed to get_egs.sh
|
||||
num_jobs_nnet=16 # Number of neural net jobs to run in parallel. This option
|
||||
# is passed to get_egs.sh.
|
||||
get_egs_stage=0
|
||||
online_ivector_dir=
|
||||
|
||||
|
||||
max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
|
||||
# to the final 'combine' stage, but these models will themselves be averages of
|
||||
# iteration-number ranges.
|
||||
|
||||
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
|
||||
# on each iter. You could set it to 0 or to a large value for complete
|
||||
# randomization, but this would both consume memory and cause spikes in
|
||||
# disk I/O. Smaller is easier on disk and memory but less random. It's
|
||||
# not a huge deal though, as samples are anyway randomized right at the start.
|
||||
# (the point of this is to get data in different minibatches on different iterations,
|
||||
# since in the preconditioning method, 2 samples in the same minibatch can
|
||||
# affect each other's gradients.)
|
||||
|
||||
add_layers_period=2 # by default, add new layers every 2 iterations.
|
||||
num_hidden_layers=3
|
||||
stage=-4
|
||||
|
||||
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't
|
||||
splice_width=4 # meaning +- 4 frames on each side for second LDA
|
||||
randprune=4.0 # speeds up LDA.
|
||||
alpha=4.0 # relates to preconditioning.
|
||||
update_period=4 # relates to online preconditioning: says how often we update the subspace.
|
||||
num_samples_history=2000 # relates to online preconditioning
|
||||
max_change_per_sample=0.075
|
||||
precondition_rank_in=20 # relates to online preconditioning
|
||||
precondition_rank_out=80 # relates to online preconditioning
|
||||
|
||||
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
|
||||
# specified.)
|
||||
num_threads=16
|
||||
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G"
|
||||
# by default we use 16 threads; this lets the queue know.
|
||||
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
|
||||
combine_num_threads=8
|
||||
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
|
||||
cleanup=true
|
||||
egs_dir=
|
||||
lda_opts=
|
||||
lda_dim=
|
||||
egs_opts=
|
||||
transform_dir= # If supplied, overrides alidir
|
||||
cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied.
|
||||
# only relevant for "raw" features, not lda.
|
||||
feat_type= # Can be used to force "raw" features.
|
||||
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
|
||||
# more than enough.
|
||||
# End configuration section.
|
||||
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
if [ -f path.sh ]; then . ./path.sh; fi
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
if [ $# != 4 ]; then
|
||||
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
|
||||
echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
|
||||
echo ""
|
||||
echo "Main options (for others, see top of script file)"
|
||||
echo " --config <config-file> # config file containing options"
|
||||
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
|
||||
echo " --num-epochs <#epochs|15> # Number of epochs of training"
|
||||
echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
|
||||
echo " # data, 0.01 for large data"
|
||||
echo " --final-learning-rate <final-learning-rate|0.004> # Learning rate at end of training, e.g. 0.004 for small"
|
||||
echo " # data, 0.001 for large data"
|
||||
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
|
||||
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
|
||||
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
|
||||
echo " # per context-dependent state. Try a number several times #states."
|
||||
echo " --num-jobs-nnet <num-jobs|8> # Number of parallel jobs to use for main neural net"
|
||||
echo " # training (will affect results as well as speed; try 8, 16)"
|
||||
echo " # Note: if you increase this, you may want to also increase"
|
||||
echo " # the learning rate."
|
||||
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
|
||||
echo " # as well as speed; may interact with batch size; if you increase"
|
||||
echo " # this, you may want to decrease the batch size."
|
||||
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
|
||||
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
|
||||
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
|
||||
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
|
||||
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
|
||||
echo " # should not get too large, e.g. >2k)."
|
||||
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
|
||||
echo " # process."
|
||||
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
|
||||
echo " # (note: we splice processed, typically 40-dimensional frames"
|
||||
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
|
||||
echo " --stage <stage|-4> # Used to run a partially-completed training process from somewhere in"
|
||||
echo " # the middle."
|
||||
|
||||
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
data=$1
|
||||
lang=$2
|
||||
alidir=$3
|
||||
dir=$4
|
||||
|
||||
# Check some files.
|
||||
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
|
||||
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
|
||||
done
|
||||
|
||||
|
||||
# Set some variables.
|
||||
num_leaves=`tree-info $alidir/tree 2>/dev/null | awk '{print $2}'` || exit 1
|
||||
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
|
||||
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1
|
||||
|
||||
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
|
||||
# in this dir we'll have just one job.
|
||||
sdata=$data/split$nj
|
||||
utils/split_data.sh $data $nj
|
||||
|
||||
mkdir -p $dir/log
|
||||
echo $nj > $dir/num_jobs
|
||||
cp $alidir/tree $dir
|
||||
|
||||
extra_opts=()
|
||||
[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
|
||||
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
|
||||
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
|
||||
[ -z "$transform_dir" ] && transform_dir=$alidir
|
||||
extra_opts+=(--transform-dir $transform_dir)
|
||||
extra_opts+=(--splice-width $splice_width)
|
||||
|
||||
if [ $stage -le -4 ]; then
|
||||
echo "$0: calling get_lda.sh"
|
||||
steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
|
||||
fi
|
||||
|
||||
# these files will have been written by get_lda.sh
|
||||
feat_dim=$(cat $dir/feat_dim) || exit 1;
|
||||
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
|
||||
lda_dim=$(cat $dir/lda_dim) || exit 1;
|
||||
|
||||
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
|
||||
echo "$0: calling get_egs.sh"
|
||||
steps/nnet2/get_egs.sh $egs_opts "${extra_opts[@]}" \
|
||||
--samples-per-iter $samples_per_iter \
|
||||
--num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
|
||||
--cmd "$cmd" $egs_opts --io-opts "$io_opts" \
|
||||
$data $lang $alidir $dir || exit 1;
|
||||
fi
|
||||
|
||||
if [ -z $egs_dir ]; then
|
||||
egs_dir=$dir/egs
|
||||
fi
|
||||
|
||||
iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1;
|
||||
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
|
||||
echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
|
||||
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;
|
||||
|
||||
|
||||
if ! [ $num_hidden_layers -ge 1 ]; then
|
||||
echo "Invalid num-hidden-layers $num_hidden_layers"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $stage -le -2 ]; then
|
||||
echo "$0: initializing neural net";
|
||||
lda_mat=$dir/lda.mat
|
||||
tot_input_dim=$[$feat_dim+$ivector_dim]
|
||||
|
||||
online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
|
||||
|
||||
stddev=`perl -e "print 1.0/sqrt($pnorm_input_dim);"`
|
||||
cat >$dir/nnet.config <<EOF
|
||||
SpliceComponent input-dim=$tot_input_dim left-context=$splice_width right-context=$splice_width const-component-dim=$ivector_dim
|
||||
FixedAffineComponent matrix=$lda_mat
|
||||
AffineComponentPreconditionedOnline input-dim=$lda_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
|
||||
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
|
||||
NormalizeComponent dim=$pnorm_output_dim
|
||||
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
|
||||
SoftmaxComponent dim=$num_leaves
|
||||
EOF
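# For intuition only -- a hedged walk-through of the config above with made-up values
# (feat_dim=40, ivector_dim=100, splice_width=4, lda_dim=250, pnorm_input_dim=3000,
# pnorm_output_dim=300, num_leaves=5000): the SpliceComponent sees 40+100=140 dims per
# frame, splices 2*4+1=9 frames of the 40-dim part and appends the 100-dim constant
# (iVector) part once, giving 9*40+100=460 dims; the fixed affine (lda.mat) maps
# 460->250; then affine 250->3000, p-norm 3000->300 (groups of 10), normalize,
# affine 300->5000, and a softmax over the 5000 leaves.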
|
||||
|
||||
# hidden.config will contain the part of the config corresponding to a
|
||||
# single hidden layer; we need this to add new layers.
|
||||
cat >$dir/hidden.config <<EOF
|
||||
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
|
||||
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
|
||||
NormalizeComponent dim=$pnorm_output_dim
|
||||
EOF
|
||||
$cmd $dir/log/nnet_init.log \
|
||||
nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
|
||||
$dir/0.mdl || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le -1 ]; then
|
||||
echo "Training transition probabilities and setting priors"
|
||||
$cmd $dir/log/train_trans.log \
|
||||
nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
|
||||
|| exit 1;
|
||||
fi
|
||||
|
||||
num_iters=$[$num_epochs * $iters_per_epoch];
|
||||
|
||||
echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
|
||||
|
||||
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
|
||||
# This is when we decide to mix up from: halfway between when we've finished
|
||||
# adding the hidden layers and the end of training.
|
||||
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
|
||||
|
||||
if [ $num_threads -eq 1 ]; then
|
||||
parallel_suffix="-simple" # this enables us to use GPU code if
|
||||
# we have just one thread.
|
||||
parallel_train_opts=
|
||||
if ! cuda-compiled; then
|
||||
echo "$0: WARNING: you are running with one thread but you have not compiled"
|
||||
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
|
||||
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
|
||||
fi
|
||||
else
|
||||
parallel_suffix="-parallel"
|
||||
parallel_train_opts="--num-threads=$num_threads"
|
||||
fi
|
||||
|
||||
# First work out how many models we want to combine over in the final
|
||||
# nnet-combine-fast invocation. This equals
|
||||
# min(max(max_models_combine, iters_per_epoch),
|
||||
# 2/3 * iters_after_mixup)
|
||||
num_models_combine=$max_models_combine
|
||||
if [ $num_models_combine -lt $iters_per_epoch ]; then
|
||||
num_models_combine=$iters_per_epoch
|
||||
fi
|
||||
iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3]
|
||||
if [ $num_models_combine -gt $iters_after_mixup_23 ]; then
|
||||
num_models_combine=$iters_after_mixup_23
|
||||
fi
|
||||
first_model_combine=$[$num_iters-$num_models_combine+1]
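# A worked example with hypothetical values (not the script defaults, just an
# illustration): if max_models_combine=20, iters_per_epoch=30, num_iters=200 and
# mix_up_iter=110, then num_models_combine is first raised to 30 (since 20 < 30),
# iters_after_mixup_23 = ((200-110-1)*2)/3 = 59 so 30 is kept, and
# first_model_combine = 200 - 30 + 1 = 171.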
|
||||
|
||||
x=0
|
||||
|
||||
while [ $x -lt $num_iters ]; do
|
||||
if [ $x -ge 0 ] && [ $stage -le $x ]; then
|
||||
# Set off jobs doing some diagnostics, in the background.
|
||||
$cmd $dir/log/compute_prob_valid.$x.log \
|
||||
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
|
||||
$cmd $dir/log/compute_prob_train.$x.log \
|
||||
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
|
||||
if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
|
||||
$cmd $dir/log/progress.$x.log \
|
||||
nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
|
||||
ark:$egs_dir/train_diagnostic.egs '&&' \
|
||||
nnet-am-info $dir/$x.mdl &
|
||||
fi
|
||||
|
||||
echo "Training neural net (pass $x)"
|
||||
|
||||
if [ $x -gt 0 ] && \
|
||||
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
|
||||
[ $[($x-1) % $add_layers_period] -eq 0 ]; then
|
||||
mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
|
||||
else
|
||||
mdl=$dir/$x.mdl
|
||||
fi
|
||||
if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
|
||||
# on iteration zero or when we just added a layer, use a smaller minibatch
|
||||
# size and just one job: the model-averaging doesn't seem to be helpful
|
||||
# when the model is changing too fast (i.e. it worsens the objective
|
||||
# function), and the smaller minibatch size will help to keep
|
||||
# the update stable.
|
||||
this_minibatch_size=$[$minibatch_size/2];
|
||||
do_average=false
|
||||
else
|
||||
this_minibatch_size=$minibatch_size
|
||||
do_average=true
|
||||
fi
|
||||
|
||||
$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
|
||||
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
|
||||
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
|
||||
nnet-train$parallel_suffix $parallel_train_opts \
|
||||
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
|
||||
ark:- $dir/$[$x+1].JOB.mdl \
|
||||
|| exit 1;
|
||||
|
||||
nnets_list=
|
||||
for n in `seq 1 $num_jobs_nnet`; do
|
||||
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
|
||||
done
|
||||
|
||||
learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;
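# The perl one-liner above is a geometric interpolation between the initial and final
# learning rates.  As a rough illustration (hypothetical num_iters=200, with the
# example rates 0.02 and 0.004 from the usage message): at iteration x+1=100 it gives
# 0.02 * exp(100 * log(0.004/0.02) / 200) = 0.02 * sqrt(0.2) ~= 0.0089.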
|
||||
|
||||
if $do_average; then
|
||||
# average the output of the different jobs.
|
||||
$cmd $dir/log/average.$x.log \
|
||||
nnet-am-average $nnets_list - \| \
|
||||
nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
|
||||
else
|
||||
# choose the best from the different jobs.
|
||||
n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
|
||||
$fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
|
||||
undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
|
||||
close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
|
||||
$best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
|
||||
[ -z "$n" ] && echo "Error getting best model" && exit 1;
|
||||
$cmd $dir/log/select.$x.log \
|
||||
nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
|
||||
fi
|
||||
|
||||
if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
|
||||
# mix up.
|
||||
echo Mixing up from $num_leaves to $mix_up components
|
||||
$cmd $dir/log/mix_up.$x.log \
|
||||
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
|
||||
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
|
||||
fi
|
||||
rm $nnets_list
|
||||
[ ! -f $dir/$[$x+1].mdl ] && exit 1;
|
||||
if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
|
||||
[ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then
|
||||
rm $dir/$[$x-1].mdl
|
||||
fi
|
||||
fi
|
||||
x=$[$x+1]
|
||||
done
|
||||
|
||||
|
||||
if [ $stage -le $num_iters ]; then
|
||||
echo "Doing final combination to produce final.mdl"
|
||||
|
||||
# Now do combination.
|
||||
nnets_list=()
|
||||
# the if..else..fi statement below sets 'nnets_list'.
|
||||
if [ $max_models_combine -lt $num_models_combine ]; then
|
||||
# The number of models to combine is too large, e.g. > 20. In this case,
|
||||
# each argument to nnet-combine-fast will be an average of multiple models.
|
||||
cur_offset=0 # current offset from first_model_combine.
|
||||
for n in $(seq $max_models_combine); do
|
||||
next_offset=$[($n*$num_models_combine)/$max_models_combine]
|
||||
sub_list=""
|
||||
for o in $(seq $cur_offset $[$next_offset-1]); do
|
||||
iter=$[$first_model_combine+$o]
|
||||
mdl=$dir/$iter.mdl
|
||||
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
|
||||
sub_list="$sub_list $mdl"
|
||||
done
|
||||
nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
|
||||
cur_offset=$next_offset
|
||||
done
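# Illustration with made-up numbers: if num_models_combine=30 and max_models_combine=20,
# next_offset takes the values 1,3,4,6,7,... so the 20 arguments alternate between
# averaging 1 and 2 consecutive models, covering all 30 models exactly once.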
|
||||
else
|
||||
nnets_list=
|
||||
for n in $(seq 0 $[num_models_combine-1]); do
|
||||
iter=$[$first_model_combine+$n]
|
||||
mdl=$dir/$iter.mdl
|
||||
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
|
||||
nnets_list[$n]=$mdl
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
# Below, we use --use-gpu=no to prevent nnet-combine-fast from using a GPU, as
|
||||
# with many models it can run out of memory; we set num-threads to 8
|
||||
# to speed it up (this isn't ideal...)
|
||||
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
|
||||
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
|
||||
[ $mb -gt 512 ] && mb=512
|
||||
# Setting --initial-model to a large value makes it initialize the combination
|
||||
# with the average of all the models. It's important not to start with a
|
||||
# single model, or, due to the invariance to scaling that these nonlinearities
|
||||
# give us, we get zero diagonal entries in the fisher matrix that
|
||||
# nnet-combine-fast uses for scaling, which after flooring and inversion, has
|
||||
# the effect that the initial model chosen gets much higher learning rates
|
||||
# than the others. This prevents the optimization from working well.
|
||||
$cmd $combine_parallel_opts $dir/log/combine.log \
|
||||
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
|
||||
--num-threads=$combine_num_threads \
|
||||
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
|
||||
$dir/final.mdl || exit 1;
|
||||
|
||||
# Normalize stddev for affine or block affine layers that are followed by a
|
||||
# pnorm layer and then a normalize layer.
|
||||
$cmd $dir/log/normalize.log \
|
||||
nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;
|
||||
|
||||
# Compute the probability of the final, combined model with
|
||||
# the same subset we used for the previous compute_probs, as the
|
||||
# different subsets will lead to different probs.
|
||||
$cmd $dir/log/compute_prob_valid.final.log \
|
||||
nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
|
||||
$cmd $dir/log/compute_prob_train.final.log \
|
||||
nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
|
||||
fi
|
||||
|
||||
if [ $stage -le $[$num_iters+1] ]; then
|
||||
echo "Getting average posterior for purposes of adjusting the priors."
|
||||
# Note: this just uses CPUs, using a smallish subset of data.
|
||||
rm $dir/post.*.vec 2>/dev/null
|
||||
$cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
|
||||
nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
|
||||
nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
|
||||
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;
|
||||
|
||||
sleep 3; # make sure there is time for $dir/post.*.vec to appear.
|
||||
|
||||
$cmd $dir/log/vector_sum.log \
|
||||
vector-sum $dir/post.*.vec $dir/post.vec || exit 1;
|
||||
|
||||
rm $dir/post.*.vec;
|
||||
|
||||
echo "Re-adjusting priors based on computed posteriors"
|
||||
$cmd $dir/log/adjust_priors.log \
|
||||
nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ ! -f $dir/final.mdl ]; then
|
||||
echo "$0: $dir/final.mdl does not exist."
|
||||
# we don't want to clean up if the training didn't succeed.
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
sleep 2
|
||||
|
||||
echo Done
|
||||
|
||||
if $cleanup; then
|
||||
echo Cleaning up data
|
||||
if [ $egs_dir == "$dir/egs" ]; then
|
||||
steps/nnet2/remove_egs.sh $dir/egs
|
||||
fi
|
||||
|
||||
echo Removing most of the models
|
||||
for x in `seq 0 $num_iters`; do
|
||||
if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ]; then
|
||||
# delete all but every 100th model; don't delete the ones which combine to form the final model.
|
||||
rm $dir/$x.mdl
|
||||
fi
|
||||
done
|
||||
|
||||
fi
|
|
@ -0,0 +1,81 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2013-2014 Johns Hopkins University (author: Daniel Povey)
|
||||
# Apache 2.0
|
||||
|
||||
# This script is like utils/copy_data_dir.sh in that it copies a data-dir,
|
||||
# but it supports the --utts-per-spk-max option. If that is set to a value >0, it modifies
|
||||
# the utt2spk and spk2utt files by splitting each speaker into multiple
|
||||
# versions, so that each speaker has no more than --utts-per-spk-max
|
||||
# utterances.
|
||||
|
||||
|
||||
# begin configuration section
|
||||
utts_per_spk_max=-1
|
||||
# end configuration section
|
||||
|
||||
. utils/parse_options.sh
|
||||
|
||||
if [ $# != 2 ]; then
|
||||
echo "Usage: "
|
||||
echo " $0 [options] <srcdir> <destdir>"
|
||||
echo "e.g.:"
|
||||
echo " $0 --utts-per-spk-max 2 data/train data/train-max2"
|
||||
echo "Options"
|
||||
echo " --utts-per-spk-max=n # number of utterances per speaker maximum,"
|
||||
echo " # default -1 (meaning no maximum). E.g. 2."
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
|
||||
export LC_ALL=C
|
||||
|
||||
srcdir=$1
|
||||
destdir=$2
|
||||
|
||||
if [ ! -f $srcdir/utt2spk ]; then
|
||||
echo "$0: no such file $srcdir/utt2spk"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
set -e;
|
||||
set -o pipefail
|
||||
|
||||
mkdir -p $destdir
|
||||
|
||||
|
||||
if [ "$utts_per_spk_max" != -1 ]; then
|
||||
# create spk2utt file with reduced number of utterances per speaker.
|
||||
awk -v max=$utts_per_spk_max '{ n=2; count=0;
|
||||
while(n<=NF) {
|
||||
int_max=int(max)+ (rand() < (max-int(max))?1:0);
|
||||
nmax=n+int_max; count++; printf("%s-%06x", $1, count);
|
||||
for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
|
||||
<$srcdir/spk2utt >$destdir/spk2utt
|
||||
utils/spk2utt_to_utt2spk.pl <$destdir/spk2utt >$destdir/utt2spk
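# Rough illustration of what the awk fragment above does (speaker and utterance names
# are made up): with --utts-per-spk-max 2, an input spk2utt line
#   spk1 utt1 utt2 utt3 utt4 utt5
# becomes the three "fake speakers"
#   spk1-000001 utt1 utt2
#   spk1-000002 utt3 utt4
#   spk1-000003 utt5
# (the suffix is a zero-padded hex counter); a non-integer max such as 2.5 is rounded
# randomly to 2 or 3 for each fake speaker.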
|
||||
|
||||
if [ -f $srcdir/cmvn.scp ]; then
|
||||
# below, the first apply_map command outputs a cmvn.scp indexed by utt;
|
||||
# the second one outputs a cmvn.scp indexed by new speaker-id.
|
||||
utils/apply_map.pl -f 2 $srcdir/cmvn.scp <$srcdir/utt2spk | \
|
||||
utils/apply_map.pl -f 1 $destdir/utt2spk | sort | uniq > $destdir/cmvn.scp
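# Hedged illustration with made-up entries: if $srcdir/utt2spk has "utt1 spk1" and
# "utt2 spk1", $srcdir/cmvn.scp has "spk1 foo.ark:123", and $destdir/utt2spk maps
# utt1->spk1-000001 and utt2->spk1-000002, the pipeline above produces
#   spk1-000001 foo.ark:123
#   spk1-000002 foo.ark:123
# i.e. each fake speaker inherits the CMVN stats of the original speaker.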
|
||||
echo "$0: mapping cmvn.scp, but you may want to recompute it if it's needed,"
|
||||
echo " as it would probably change."
|
||||
fi
|
||||
if [ -f $srcdir/spk2gender ]; then
|
||||
utils/apply_map.pl -f 2 $srcdir/spk2gender <$srcdir/utt2spk | \
|
||||
utils/apply_map.pl -f 1 $destdir/utt2spk | sort | uniq >$destdir/spk2gender
|
||||
fi
|
||||
else
|
||||
cp $srcdir/spk2utt $srcdir/utt2spk $destdir/
|
||||
[ -f $srcdir/spk2gender ] && cp $srcdir/spk2gender $destdir/
|
||||
[ -f $srcdir/cmvn.scp ] && cp $srcdir/cmvn.scp $destdir/
|
||||
fi
|
||||
|
||||
|
||||
for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
|
||||
[ -f $srcdir/$f ] && cp $srcdir/$f $destdir/
|
||||
done
|
||||
|
||||
echo "$0: copied data from $srcdir to $destdir, with --utts-per-spk-max $utts_per_spk_max"
|
||||
utils/validate_data_dir.sh $destdir
|
|
@ -98,6 +98,9 @@ fi
|
|||
if [ $stage -le 2 ]; then
|
||||
echo "$0: dumping neural net activations"
|
||||
|
||||
# The next line is a no-op unless $dir/feats/storage/ exists; see utils/create_split_dir.pl.
|
||||
for j in $(seq $nj); do utils/create_data_link.pl $dir/feats/feats.$j.ark; done
|
||||
|
||||
if [ -f $data/segments ]; then
|
||||
wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
|
||||
else
|
||||
|
|
|
@ -14,12 +14,8 @@
|
|||
# for online decoding.
|
||||
|
||||
# Rather than treating each utterance separately, it carries forward
|
||||
# information from one utterance to the next, within the speaker. However,
|
||||
# take note of the option "utts-per-spk-max", which splits speakers up into
|
||||
# "fake speakers" with at most two utterances in them. This means that more
|
||||
# iVectors are estimated starting from an uninformative starting point, than
|
||||
# if we used the real speaker labels (which may have many utterances each);
|
||||
# it's a compromise between per-utterance and per-speaker iVector estimation.
|
||||
# information from one utterance to the next, within the speaker.
|
||||
|
||||
|
||||
# Begin configuration section.
|
||||
nj=30
|
||||
|
@ -36,13 +32,9 @@ posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
|
|||
# used when training the iVector extractor, but more important
|
||||
# that this match the value used when you do real online decoding
|
||||
# with the neural nets trained with these iVectors.
|
||||
utts_per_spk_max=-1 # Maximum utterances per "fake-speaker." With the default
|
||||
# of -1 no fake-speakers are used. Note: this does not have to
|
||||
# be an integer; if it's noninteger, it will be rounded in a
|
||||
# randomized way to one of the two integers it's close to.
|
||||
# This is useful in the "perturbed-feature" recipe to encourage
|
||||
# that different perturbed versions of the same speaker get
|
||||
# split into fake-speakers differently.
|
||||
#utts_per_spk_max=-1 # This option is no longer supported, you should use
|
||||
# steps/online/nnet2/copy_data_dir.sh with the --utts-per-spk-max
|
||||
# option to make a copy of the data dir.
|
||||
compress=true # If true, compress the iVectors stored on disk (it's lossy
|
||||
# compression, as used for feature matrices).
|
||||
|
||||
|
@ -112,7 +104,6 @@ echo "--posterior-scale=$posterior_scale" >>$ieconf
|
|||
echo "--max-remembered-frames=1000" >>$ieconf # the default
|
||||
|
||||
|
||||
|
||||
ns=$(wc -l <$data/spk2utt)
|
||||
if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
|
||||
echo "$0: you seem to have just one speaker in your database. This is probably not a good idea."
|
||||
|
@ -121,29 +112,10 @@ if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
|
|||
utts_per_spk_max=1
|
||||
fi
|
||||
|
||||
spk2utt=""
|
||||
if [ "$utts_per_spk_max" != -1 ]; then
|
||||
mkdir -p $dir/spk2utt_fake
|
||||
for job in $(seq $nj); do
|
||||
# create fake spk2utt files with reduced number of utterances per speaker,
|
||||
# so the network is well adapted to using iVectors from small amounts of
|
||||
# training data.
|
||||
# the if (rand() % 2 == 0)
|
||||
awk -v max=$utts_per_spk_max '{ n=2; count=0;
|
||||
while(n<=NF) {
|
||||
int_max=int(max)+ (rand() < (max-int(max))?1:0);
|
||||
nmax=n+int_max; count++; printf("%s-%06x", $1, count);
|
||||
for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
|
||||
<$sdata/$job/spk2utt >$dir/spk2utt_fake/spk2utt.$job
|
||||
done
|
||||
spk2utt="ark:$dir/spk2utt_fake/spk2utt.JOB"
|
||||
else
|
||||
spk2utt="ark:$sdata/JOB/spk2utt"
|
||||
fi
|
||||
|
||||
|
||||
for n in $(seq $nj); do
|
||||
# This will do nothing unless the directorys $dir/storage exists;
|
||||
# This will do nothing unless the directory $dir/storage exists;
|
||||
# it can be used to distribute the data among multiple machines.
|
||||
utils/create_data_link.pl $dir/ivector_online.$n.ark
|
||||
done
|
||||
|
@ -151,7 +123,7 @@ done
|
|||
if [ $stage -le 0 ]; then
|
||||
echo "$0: extracting iVectors"
|
||||
$cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
|
||||
ivector-extract-online2 --config=$ieconf "$spk2utt" scp:$sdata/JOB/feats.scp ark:- \| \
|
||||
ivector-extract-online2 --config=$ieconf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/feats.scp ark:- \| \
|
||||
copy-feats --compress=$compress ark:- \
|
||||
ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
|
||||
fi
|
||||
|
|
|
@ -0,0 +1,285 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
|
||||
|
||||
# This is modified from ../../nnet2/get_egs.sh.
|
||||
# This script combines the
|
||||
# nnet-example extraction with the feature extraction directly from wave files;
|
||||
# it uses the program online2-wav-dump-feature to do all parts of feature
|
||||
# extraction: MFCC/PLP/fbank, possibly plus pitch, plus iVectors. This script
|
||||
# is intended mostly for cross-system training for online decoding, where you
|
||||
# initialize the nnet from an existing, larger system.
|
||||
|
||||
|
||||
# Begin configuration section.
|
||||
cmd=run.pl
|
||||
num_utts_subset=300 # number of utterances in validation and training
|
||||
# subsets used for shrinkage and diagnostics
|
||||
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
|
||||
num_train_frames_combine=10000 # # train frames for the above.
|
||||
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
|
||||
samples_per_iter=400000 # each iteration of training, see this many samples
|
||||
# per job. This is just a guideline; it will pick a number
|
||||
# that divides the number of samples in the entire data.
|
||||
transform_dir= # If supplied, overrides alidir
|
||||
num_jobs_nnet=16 # Number of neural net jobs to run in parallel
|
||||
stage=0
|
||||
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
|
||||
random_copy=false
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
if [ -f path.sh ]; then . ./path.sh; fi
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
|
||||
if [ $# != 4 ]; then
|
||||
echo "Usage: steps/online/nnet2/get_egs.sh [opts] <data> <ali-dir> <online-nnet-dir> <exp-dir>"
|
||||
echo " e.g.: steps/online/nnet2/get_egs.sh data/train exp/tri3_ali exp/nnet2_online/nnet_a_gpu_online/ exp/tri4_nnet"
|
||||
echo "In <online-nnet-dir>, it looks for final.mdl (need to compute required left and right context),"
|
||||
echo "and a configuration file conf/online_nnet2_decoding.conf which describes the features."
|
||||
echo "Main options (for others, see top of script file)"
|
||||
echo " --config <config-file> # config file containing options"
|
||||
echo " --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
|
||||
echo " --num-jobs-nnet <num-jobs;16> # Number of parallel jobs to use for main neural net"
|
||||
echo " # training (will affect results as well as speed; try 8, 16)"
|
||||
echo " # Note: if you increase this, you may want to also increase"
|
||||
echo " # the learning rate."
|
||||
echo " --samples-per-iter <#samples;400000> # Number of samples of data to process per iteration, per"
|
||||
echo " # process."
|
||||
echo " --feat-type <lda|raw> # (by default it tries to guess). The feature type you want"
|
||||
echo " # to use as input to the neural net."
|
||||
echo " --splice-width <width;4> # Number of frames on each side to append for feature input"
|
||||
echo " # (note: we splice processed, typically 40-dimensional frames"
|
||||
echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
|
||||
echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
|
||||
echo " # very end."
|
||||
echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
|
||||
echo " # the middle."
|
||||
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
data=$1
|
||||
alidir=$2
|
||||
online_nnet_dir=$3
|
||||
dir=$4
|
||||
|
||||
|
||||
mdl=$online_nnet_dir/final.mdl # only needed for left and right context.
|
||||
feature_conf=$online_nnet_dir/conf/online_nnet2_decoding.conf
|
||||
|
||||
for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $feature_conf $mdl; do
|
||||
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
|
||||
done
|
||||
|
||||
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
|
||||
|
||||
sdata=$data/split$nj
|
||||
utils/split_data.sh $data $nj
|
||||
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/tree $dir
|
||||
grep -v '^--endpoint' $feature_conf >$dir/feature.conf || exit 1;
|
||||
|
||||
# Get list of validation utterances.
|
||||
mkdir -p $dir/valid $dir/train_subset
|
||||
|
||||
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
|
||||
> $dir/valid/uttlist || exit 1;
|
||||
|
||||
if [ -f $data/utt2uniq ]; then
|
||||
echo "File $data/utt2uniq exists, so augmenting valid/uttlist to"
|
||||
echo "include all perturbed versions of the same 'real' utterances."
|
||||
mv $dir/valid/uttlist $dir/valid/uttlist.tmp
|
||||
utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
|
||||
cat $dir/valid/uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
|
||||
sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
|
||||
awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid/uttlist
|
||||
rm $dir/uniq2utt $dir/valid/uttlist.tmp
|
||||
fi
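# For example (utterance names made up): if utt2uniq maps utt1, utt1-sp0.9 and
# utt1-sp1.1 all to utt1, and utt1-sp0.9 happened to be drawn into valid/uttlist,
# then after this block all three versions are in valid/uttlist, presumably so that
# no perturbed copy of a held-out utterance can end up in the training examples.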
|
||||
|
||||
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid/uttlist | \
|
||||
head -$num_utts_subset > $dir/train_subset/uttlist || exit 1;
|
||||
|
||||
|
||||
for subdir in valid train_subset; do
|
||||
# In order for the iVector extraction to work right, we need to process all
|
||||
# utterances of the speakers which have utterances in valid/uttlist, and the
|
||||
# same for train_subset/uttlist. We produce $dir/valid/uttlist_extended which
|
||||
# will contain all utterances of all speakers which have utterances in
|
||||
# $dir/valid/uttlist, and the same for $dir/train_subset/.
|
||||
|
||||
utils/filter_scp.pl $dir/$subdir/uttlist <$data/utt2spk | awk '{print $2}' > $dir/$subdir/spklist || exit 1;
|
||||
utils/filter_scp.pl -f 2 $dir/$subdir/spklist <$data/utt2spk >$dir/$subdir/utt2spk || exit 1;
|
||||
utils/utt2spk_to_spk2utt.pl <$dir/$subdir/utt2spk >$dir/$subdir/spk2utt || exit 1;
|
||||
awk '{print $1}' <$dir/$subdir/utt2spk >$dir/$subdir/uttlist_extended || exit 1;
|
||||
rm $dir/$subdir/spklist
|
||||
done
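# Illustration (names made up): if valid/uttlist contains only spk1-a but speaker spk1
# also has spk1-b, then valid/uttlist_extended contains both; features are dumped for
# both so the online iVector state evolves as in real decoding, while the later
# "subset-feats --include=$dir/valid/uttlist" keeps only spk1-a's examples.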
|
||||
|
||||
if [ -f $data/segments ]; then
|
||||
# note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
|
||||
# previous utterances within a speaker, we do the filtering after extracting the features.
|
||||
echo "$0 [info]: segments file exists: using that."
|
||||
feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
|
||||
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/segments | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
|
||||
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/segments | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
|
||||
else
|
||||
echo "$0 [info]: no segments file exists, using wav.scp."
|
||||
feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
|
||||
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt scp:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
|
||||
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt scp:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
|
||||
fi
|
||||
|
||||
ivector_dim=$(online2-wav-dump-features --config=$dir/feature.conf --print-ivector-dim=true) || exit 1;
|
||||
|
||||
! [ $ivector_dim -ge 0 ] && echo "$0: error getting iVector dim" && exit 1;
|
||||
|
||||
|
||||
if [ $stage -le 0 ]; then
|
||||
echo "$0: working out number of frames of training data"
|
||||
num_frames=$(steps/nnet2/get_num_frames.sh $data)
|
||||
echo $num_frames > $dir/num_frames
|
||||
else
|
||||
num_frames=`cat $dir/num_frames` || exit 1;
|
||||
fi
|
||||
|
||||
# Working out number of iterations per epoch.
|
||||
iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1;
|
||||
[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1
|
||||
samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)]
|
||||
echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations,"
|
||||
echo "$0: giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."
|
||||
|
||||
# Making soft links to storage directories. This is a no-op unless
|
||||
# the subdirectory $dir/egs/storage/ exists. See utils/create_split_dir.pl
|
||||
for x in `seq 1 $num_jobs_nnet`; do
|
||||
for y in `seq 0 $[$iters_per_epoch-1]`; do
|
||||
utils/create_data_link.pl $dir/egs/egs.$x.$y.ark
|
||||
utils/create_data_link.pl $dir/egs/egs_tmp.$x.$y.ark
|
||||
done
|
||||
for y in `seq 1 $nj`; do
|
||||
utils/create_data_link.pl $dir/egs/egs_orig.$x.$y.ark
|
||||
done
|
||||
done
|
||||
|
||||
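# Helper used below: for each argument, if it is a symlink (e.g. one created by
# utils/create_data_link.pl), also delete the file it points to, then delete the
# link/file itself.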
remove () { for x in $*; do [ -L $x ] && rm $(readlink -f $x); rm $x; done }
|
||||
|
||||
set -o pipefail
|
||||
left_context=$(nnet-am-info $mdl | grep '^left-context' | awk '{print $2}') || exit 1;
|
||||
right_context=$(nnet-am-info $mdl | grep '^right-context' | awk '{print $2}') || exit 1;
|
||||
nnet_context_opts="--left-context=$left_context --right-context=$right_context"
|
||||
set +o pipefail
|
||||
|
||||
mkdir -p $dir/egs
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
rm $dir/.error 2>/dev/null
|
||||
|
||||
echo "$0: extracting validation and training-subset alignments."
|
||||
set -o pipefail;
|
||||
for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
|
||||
copy-int-vector ark:- ark,t:- | \
|
||||
utils/filter_scp.pl <(cat $dir/valid/uttlist $dir/train_subset/uttlist) | \
|
||||
gzip -c >$dir/ali_special.gz || exit 1;
|
||||
set +o pipefail; # unset the pipefail option.
|
||||
|
||||
echo "Getting validation and training subset examples."
|
||||
$cmd $dir/log/create_valid_subset.log \
|
||||
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
|
||||
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
|
||||
"ark:$dir/egs/valid_all.egs" || touch $dir/.error &
|
||||
$cmd $dir/log/create_train_subset.log \
|
||||
nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
|
||||
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
|
||||
"ark:$dir/egs/train_subset_all.egs" || touch $dir/.error &
|
||||
wait;
|
||||
[ -f $dir/.error ] && exit 1;
|
||||
echo "Getting subsets of validation examples for diagnostics and combination."
|
||||
$cmd $dir/log/create_valid_subset_combine.log \
|
||||
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/egs/valid_all.egs \
|
||||
ark:$dir/egs/valid_combine.egs || touch $dir/.error &
|
||||
$cmd $dir/log/create_valid_subset_diagnostic.log \
|
||||
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/valid_all.egs \
|
||||
ark:$dir/egs/valid_diagnostic.egs || touch $dir/.error &
|
||||
|
||||
$cmd $dir/log/create_train_subset_combine.log \
|
||||
nnet-subset-egs --n=$num_train_frames_combine ark:$dir/egs/train_subset_all.egs \
|
||||
ark:$dir/egs/train_combine.egs || touch $dir/.error &
|
||||
$cmd $dir/log/create_train_subset_diagnostic.log \
|
||||
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/train_subset_all.egs \
|
||||
ark:$dir/egs/train_diagnostic.egs || touch $dir/.error &
|
||||
wait
|
||||
[ -f $dir/.error ] && echo "Error detected while creating egs" && exit 1;
|
||||
cat $dir/egs/valid_combine.egs $dir/egs/train_combine.egs > $dir/egs/combine.egs
|
||||
|
||||
for f in $dir/egs/{combine,train_diagnostic,valid_diagnostic}.egs; do
|
||||
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
|
||||
done
|
||||
rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs $dir/ali_special.gz
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
|
||||
# Other scripts might need to know the following info:
|
||||
echo $num_jobs_nnet >$dir/egs/num_jobs_nnet
|
||||
echo $iters_per_epoch >$dir/egs/iters_per_epoch
|
||||
echo $samples_per_iter_real >$dir/egs/samples_per_iter
|
||||
|
||||
echo "Creating training examples";
|
||||
# in $dir/egs, create $num_jobs_nnet separate files with training examples.
|
||||
# The order is not randomized at this point.
|
||||
|
||||
egs_list=
|
||||
for n in `seq 1 $num_jobs_nnet`; do
|
||||
egs_list="$egs_list ark:$dir/egs/egs_orig.$n.JOB.ark"
|
||||
done
|
||||
echo "Generating training examples on disk"
|
||||
# The examples will go round-robin to egs_list.
|
||||
$cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
|
||||
nnet-get-egs $ivectors_opt $nnet_context_opts "$feats" \
|
||||
"ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
|
||||
nnet-copy-egs ark:- $egs_list || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
echo "$0: rearranging examples into parts for different parallel jobs"
|
||||
# combine all the "egs_orig.JOB.*.scp" (over the $nj splits of the data) and
|
||||
# then split into multiple parts egs.JOB.*.scp for different parts of the
|
||||
# data, 0 .. $iters_per_epoch-1.
|
||||
|
||||
if [ $iters_per_epoch -eq 1 ]; then
|
||||
echo "$0: Since iters-per-epoch == 1, just concatenating the data."
|
||||
for n in `seq 1 $num_jobs_nnet`; do
|
||||
cat $dir/egs/egs_orig.$n.*.ark > $dir/egs/egs_tmp.$n.0.ark || exit 1;
|
||||
remove $dir/egs/egs_orig.$n.*.ark
|
||||
done
|
||||
else # We'll have to split it up using nnet-copy-egs.
|
||||
egs_list=
|
||||
for n in `seq 0 $[$iters_per_epoch-1]`; do
|
||||
egs_list="$egs_list ark:$dir/egs/egs_tmp.JOB.$n.ark"
|
||||
done
|
||||
# note, the "|| true" below is a workaround for NFS bugs
|
||||
# we encountered running this script with Debian-7, NFS-v4.
|
||||
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \
|
||||
nnet-copy-egs --random=$random_copy --srand=JOB \
|
||||
"ark:cat $dir/egs/egs_orig.JOB.*.ark|" $egs_list || exit 1;
|
||||
remove $dir/egs/egs_orig.*.*.ark 2>/dev/null
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 5 ]; then
|
||||
# Next, shuffle the order of the examples in each of those files.
|
||||
# Each one should not be too large, so we can do this in memory.
|
||||
echo "Shuffling the order of training examples"
|
||||
echo "(in order to avoid stressing the disk, these won't all run at once)."
|
||||
|
||||
for n in `seq 0 $[$iters_per_epoch-1]`; do
|
||||
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
|
||||
nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \
|
||||
ark:$dir/egs/egs_tmp.JOB.$n.ark ark:$dir/egs/egs.JOB.$n.ark
|
||||
remove $dir/egs/egs_tmp.*.$n.ark
|
||||
done
|
||||
fi
|
||||
|
||||
echo "$0: Finished preparing training examples"
|
|
@ -79,6 +79,12 @@ cp $srcdir/final.mdl $dir/ || exit 1;
|
|||
if [ ! -z "$iedir" ]; then
|
||||
mkdir -p $dir/ivector_extractor/
|
||||
cp $iedir/final.{mat,ie,dubm} $iedir/global_cmvn.stats $dir/ivector_extractor/ || exit 1;
|
||||
|
||||
# The following things won't be needed directly by the online decoding, but
|
||||
# will allow us to run prepare_online_decoding.sh again with
|
||||
# $dir/ivector_extractor/ as the input directory (useful in certain
|
||||
# cross-system training scenarios).
|
||||
cp $iedir/splice_opts $iedir/online_cmvn.conf $dir/ivector_extractor/ || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
|
|
|
@ -46,6 +46,7 @@ creates a link such as
|
|||
Usage: utils/create_data_link.pl <data-archive>
|
||||
e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark
|
||||
|
||||
See also utils/remove_data_links.sh
|
||||
EOU
|
||||
|
||||
GetOptions();
|
||||
|
|
|
@ -30,7 +30,7 @@ Allowed options:
|
|||
--suffix : Common suffix to <actual_storage_dirs> (string, default = "")
|
||||
|
||||
See also create_data_link.pl, which is intended to work with the resulting
|
||||
directory structure.
|
||||
directory structure, and remove_data_links.sh
|
||||
EOU
|
||||
|
||||
my $suffix="";
|
||||
|
|
|
@ -19,12 +19,12 @@
|
|||
# This script takes a list of utterance-ids or any file whose first field
|
||||
# of each line is an utterance-id, and filters an scp
|
||||
# file (or any file whose "n-th" field is an utterance id), printing
|
||||
# out only those lines whose "n-th" field is in id_list. The index of
|
||||
# the "n-th" field is zero, by default, but can be changed by using \
|
||||
# out only those lines whose "n-th" field is in id_list. The index of
|
||||
# the "n-th" field is 1, by default, but can be changed by using
|
||||
# the -f <n> switch
|
||||
|
||||
$exclude = 0;
|
||||
$field = 0;
|
||||
$field = 1;
|
||||
$shifted = 0;
|
||||
|
||||
do {
|
||||
|
@ -42,7 +42,13 @@ do {
|
|||
} while ($shifted);
|
||||
|
||||
if(@ARGV < 1 || @ARGV > 2) {
|
||||
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp ";
|
||||
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
|
||||
"Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
|
||||
"Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
|
||||
"only the lines that were *not* in id_list.\n" .
|
||||
"Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
|
||||
"If your older scripts (written before Oct 2014) stopped working and you used the\n" .
|
||||
"-f option, add 1 to the argument.\n";
|
||||
}
|
||||
|
||||
|
||||
|
@ -54,12 +60,27 @@ while(<F>) {
|
|||
$seen{$A[0]} = 1;
|
||||
}
|
||||
|
||||
while(<>) {
|
||||
@A = split;
|
||||
@A > 0 || die "Invalid scp file line $_";
|
||||
@A >= $field || die "Invalid scp file line $_";
|
||||
|
||||
if((!$exclude && $seen{$A[$field]}) || ($exclude && !defined $seen{$A[$field]})) {
|
||||
print $_;
|
||||
if ($field == 1) { # Treat this as special case, since it is common.
|
||||
while(<>) {
|
||||
$_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
|
||||
# $1 is what we filter on.
|
||||
if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
|
||||
print $_;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while(<>) {
|
||||
@A = split;
|
||||
@A > 0 || die "Invalid scp file line $_";
|
||||
@A >= $field || die "Invalid scp file line $_";
|
||||
if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
|
||||
print $_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# tests:
|
||||
# the following should print "foo 1"
|
||||
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
|
||||
# the following should print "bar 2".
|
||||
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
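# an extra (illustrative) check of the --exclude option; the following should print "bar 2":
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl --exclude <(echo foo)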
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
#!/bin/bash
|
||||
|
||||
# This program searches within a directory for soft links that
|
||||
# appear to be created by 'create_data_link.pl' to a 'storage/' subdirectory,
|
||||
# and it removes both the soft links and the things they point to.
|
||||
# for instance, if you have a soft link
|
||||
# foo/egs/1.1.egs -> storage/2/1.1.egs
|
||||
# it will remove both foo/egs/storage/2/1.1.egs, and foo/egs/1.1.egs.
|
||||
|
||||
ret=0
|
||||
|
||||
dry_run=false
|
||||
|
||||
if [ "$1" == "--dry-run" ]; then
|
||||
dry_run=true
|
||||
shift
|
||||
fi
|
||||
|
||||
if [ $# == 0 ]; then
|
||||
echo "Usage: $0 [--dry-run] <list-of-directories>"
|
||||
echo "e.g.: $0 exp/nnet4a/egs/"
|
||||
echo " Removes from any subdirectories of the command-line arguments, soft links that "
|
||||
echo " appear to have been created by utils/create_data_link.pl, as well as the things"
|
||||
echo " that those soft links point to. Will typically be called on a directory prior"
|
||||
echo " to 'rm -r' on that directory, to ensure that data that was distributed on other"
|
||||
echo " volumes also gets deleted."
|
||||
echo " With --dry-run, just prints what it would do."
|
||||
fi
|
||||
|
||||
for dir in $*; do
|
||||
if [ ! -d $dir ]; then
|
||||
echo "$0: not a directory: $dir"
|
||||
ret=1
|
||||
else
|
||||
for subdir in $(find $dir -type d); do
|
||||
if [ -d $subdir/storage ]; then
|
||||
for x in $(ls $subdir); do
|
||||
f=$subdir/$x
|
||||
if [ -L $f ] && [[ $(readlink $f) == storage/* ]]; then
|
||||
target=$subdir/$(readlink $f)
|
||||
if $dry_run; then
|
||||
echo rm $f $target
|
||||
else
|
||||
rm $f $target
|
||||
fi
|
||||
fi
|
||||
done
|
||||
fi
|
||||
done
|
||||
fi
|
||||
done
|
||||
|
||||
exit $ret
|
|
@ -39,7 +39,7 @@ int main(int argc, char *argv[]) {
|
|||
const char *usage =
|
||||
"Extract segments from a large audio file in WAV format.\n"
|
||||
"Usage: extract-segments [options] <wav-rspecifier> <segments-file> <wav-wspecifier>\n"
|
||||
"e.g. extract-segments wav.scp segments ark:- | <some other program>\n"
|
||||
"e.g. extract-segments scp:wav.scp segments ark:- | <some other program>\n"
|
||||
" segments-file format: segment_id wav_file_name start_time end_time [channel]\n"
|
||||
" e.g.: spkabc_seg1 spkabc_recording1 1.10 2.36 1\n"
|
||||
" If channel is not provided as last element, expects mono.\n"
|
||||
|
|
|
@ -235,6 +235,7 @@ class LmExampleDeterministicOnDemandFst: public DeterministicOnDemandFst<Arc> {
|
|||
Label bos_symbol,
|
||||
Label eos_symbol);
|
||||
|
||||
|
||||
virtual StateId Start() { return start_state_; }
|
||||
|
||||
/// We don't bother caching the final-probs, just the arcs.
|
||||
|
|
|
@ -141,6 +141,7 @@ struct OnlineNnet2FeaturePipelineInfo {
|
|||
bool use_ivectors;
|
||||
OnlineIvectorExtractionInfo ivector_extractor_info;
|
||||
|
||||
int32 IvectorDim() { return ivector_extractor_info.extractor.IvectorDim(); }
|
||||
private:
|
||||
KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineNnet2FeaturePipelineInfo);
|
||||
};
|
||||
|
|
|
@ -42,7 +42,8 @@ int main(int argc, char *argv[]) {
|
|||
"Usage: online2-wav-dump-features [options] <spk2utt-rspecifier> <wav-rspecifier> <feature-wspecifier>\n"
|
||||
"The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n"
|
||||
"you want to generate features utterance by utterance.\n"
|
||||
"See steps/online/nnet2/dump_nnet_activations.sh for an example.\n";
|
||||
"Alternate usage: online2-wav-dump-features [options] --print-ivector-dim=true\n"
|
||||
"See steps/online/nnet2/{dump_nnet_activations,get_egs.sh} for examples.\n";
|
||||
|
||||
ParseOptions po(usage);
|
||||
|
||||
|
@ -50,24 +51,34 @@ int main(int argc, char *argv[]) {
|
|||
// as well as the basic features.
|
||||
OnlineNnet2FeaturePipelineConfig feature_config;
|
||||
BaseFloat chunk_length_secs = 0.05;
|
||||
bool print_ivector_dim = false;
|
||||
|
||||
po.Register("chunk-length", &chunk_length_secs,
|
||||
"Length of chunk size in seconds, that we process.");
|
||||
po.Register("print-ivector-dim", &print_ivector_dim,
|
||||
"If true, print iVector dimension (possibly zero) and exit. This "
|
||||
"version requires no arguments.");
|
||||
|
||||
feature_config.Register(&po);
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 3) {
|
||||
if (!print_ivector_dim && po.NumArgs() != 3) {
|
||||
po.PrintUsage();
|
||||
return 1;
|
||||
}
|
||||
|
||||
OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
|
||||
|
||||
if (print_ivector_dim) {
|
||||
std::cout << feature_info.IvectorDim() << std::endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
std::string spk2utt_rspecifier = po.GetArg(1),
|
||||
wav_rspecifier = po.GetArg(2),
|
||||
feats_wspecifier = po.GetArg(3);
|
||||
|
||||
OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
|
||||
|
||||
int32 num_done = 0, num_err = 0;
|
||||
int64 num_frames_tot = 0;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
// online2bin/online2-wav-nnet2-am-compute.cc
|
||||
|
||||
// Copyright 2014 Johns Hopkins University (author: Daniel Povey)
|
||||
// David Snyder
|
||||
// 2014 David Snyder
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
|