several nnet2-online changes: make it easier to get the feature-extraction options right in cross-system training; add the train_pnorm_simple.sh script (simplified learning-rate schedule and improved model combination at the end; supersedes train_pnorm_fast.sh); modify the big-data online-nnet2 recipes to use 40-dimensional rather than 13-dimensional MFCC input (results to be added soon, but they are improved). Also modified filter_scp.pl to use a one-based, not zero-based, field index.
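For example (the result paths and the filter-file name here are illustrative; only the -f convention is the point), a call that filters on the second column of a kws result file changes from

  cat result.* | utils/filter_scp.pl -f 1 keywords.filter > result   # old, zero-based
to
  cat result.* | utils/filter_scp.pl -f 2 keywords.filter > result   # new, one-based

and calls that filter on the first column can simply drop the -f option, as in the lid/ and vad scripts touched below.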

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4493 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2014-09-30 19:18:36 +00:00
Parent 011808dcae
Commit 6f598676cc
34 changed files with 1353 additions and 263 deletions

View file

@ -96,7 +96,7 @@ while (( "$#" )); do
$cmd LMWT=$min_lmwt:$max_lmwt $targetdir/$kws/kws_filter.LMWT.log \
set -e';' set -o pipefail';' \
mkdir -p $targetdir/${kws}_LMWT';'\
cat $resultdir/${kws}_LMWT/'result.*' \| utils/filter_scp.pl -f 1 $filter \> $targetdir/${kws}_LMWT/result || exit -1
cat $resultdir/${kws}_LMWT/'result.*' \| utils/filter_scp.pl -f 2 $filter \> $targetdir/${kws}_LMWT/result || exit -1
echo -e "\tWrite normalized..."

View file

@ -1,10 +0,0 @@
--window-type=hamming # disable Dans window, use the standard
--use-energy=false # only fbank outputs
--sample-frequency=8000 # Cantonese is sampled at 8kHz
--low-freq=64 # typical setup from Frantisek Grezl
--high-freq=3800
--dither=1
--num-mel-bins=15 # 8kHz so we use 15 bins
--htk-compat=true # try to make it compatible with HTK

View file

@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
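As a usage sketch (directory names and job counts are illustrative), these features are computed in the recipes via the standard script, and the 40-dimensional output can be checked directly with the feature binaries:

  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 70 --cmd "$train_cmd" \
    data/train_hires exp/make_hires/train mfcc
  compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/train_hires/wav.scp ark:- | \
    feat-to-dim ark:- -   # should print 40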

View file

@ -1,6 +1,5 @@
#!/bin/bash
. cmd.sh
@ -12,74 +11,99 @@ set -e
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
# assume use_gpu=true since it would be way too slow otherwise.
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
# the _a is in case I want to change the parameters.
dir=exp/nnet2_online/nnet_a_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet_a
fi
mkdir -p exp/nnet2_online
if [ $stage -le 1 ]; then
mkdir -p exp/nnet2_online
# To train a diagonal UBM we don't need very much data, so use the smallest subset.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
data/train_30k 512 exp/tri5a exp/nnet2_online/diag_ubm
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=mfcc
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5/$mfccdir/storage $mfccdir/storage
fi
utils/copy_data_dir.sh data/train data/train_hires
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train $mfccdir || exit 1;
utils/subset_data_dir.sh data/train_hires 30000 data/train_hires_30k
# want the 100k subset to exactly match train_100k, since we'll use its alignments.
awk '{print $1}' data/train_100k/utt2spk > uttlist
utils/subset_data_dir.sh --utt-list uttlist data/train_hires data/train_hires_100k
rm uttlist
fi
if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
# the transform (12th iter is the last), any further training is pointless.
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_hires_100k data/lang exp/tri4a exp/nnet2_online/tri5a
fi
if [ $stage -le 3 ]; then
# To train a diagonal UBM we don't need very much data, so use the smallest
# subset. the input directory exp/nnet2_online/tri5a is only needed for
# the splice-opts and the LDA transform.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
data/train_hires_30k 512 exp/nnet2_online/tri5a exp/nnet2_online/diag_ubm
fi
if [ $stage -le 4 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100) so we don't use all of it,
# we use just the 100k subset (about one sixteenth of the data).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
data/train_hires_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 3 ]; then
if [ $stage -le 5 ]; then
ivectordir=exp/nnet2_online/ivectors_train
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
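# As an illustration only (utterance and speaker IDs below are hypothetical, and the
# exact naming of the new "fake" speakers may differ): if utt2spk contains
#   spkA-utt1 spkA
#   spkA-utt2 spkA
#   spkA-utt3 spkA
#   spkA-utt4 spkA
# then after copy_data_dir.sh --utts-per-spk-max 2 each pair of utterances gets its own
# speaker, e.g. spkA-utt1/spkA-utt2 -> spkA-1 and spkA-utt3/spkA-utt4 -> spkA-2, which
# is what gives us more "speakers" for the online iVector extraction below.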
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires data/train_hires_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
data/train_hires_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
if [ $stage -le 4 ]; then
if [ $stage -le 6 ]; then
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$dir/egs $dir/egs/storage
fi
# Because we have a lot of data here and we don't want the training to take
# too long, we reduce the number of epochs from the defaults (15 + 5) to (1 +
# too long, we reduce the number of epochs from the defaults (15 + 5) to (3 +
# 1). The option "--io-opts '-tc 12'" is to have more than the default number
# (5) of jobs dumping the egs to disk; this is OK since we're splitting our
# data across four filesystems for speed.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 3 --num-epochs-extra 1 \
--num-epochs 4 --num-epochs-extra 1 \
--samples-per-iter 400000 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--cmvn-opts "--norm-means=false --norm-vars=false" \
@ -94,30 +118,12 @@ if [ $stage -le 4 ]; then
--cmd "$decode_cmd" \
--pnorm-input-dim 3500 \
--pnorm-output-dim 350 \
data/train data/lang exp/tri5a $dir || exit 1;
data/train_hires data/lang exp/tri5a $dir || exit 1;
fi
if [ $stage -le 5 ]; then
# dump iVectors for the testing data.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
data/dev exp/nnet2_online/extractor exp/nnet2_online/ivectors_dev || exit 1;
fi
if [ $stage -le 6 ]; then
# this does offline decoding that should give about the same results as the
# real online decoding (the one with --per-utt true)
steps/nnet2/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
--online-ivector-dir exp/nnet2_online/ivectors_dev \
exp/tri5a/graph data/dev $dir/decode_dev || exit 1;
fi
if [ $stage -le 7 ]; then
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
"$dir" ${dir}_online || exit 1;
steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
fi
if [ $stage -le 8 ]; then
@ -146,30 +152,3 @@ fi
exit 0;
#Baseline: GMM+SAT system.
#%WER 31.07 [ 12163 / 39141, 1869 ins, 2705 del, 7589 sub ] exp/tri5a/decode_dev/wer_13
# Baseline: p-norm system on top of fMLLR features.
#%WER 23.66 [ 9259 / 39141, 1495 ins, 2432 del, 5332 sub ] exp/nnet6c4_gpu/decode_dev/wer_11
# Our experiment, carrying forward the adaptation state between
# utterances of each speaker.
#%WER 23.79 [ 9311 / 39141, 1499 ins, 2277 del, 5535 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_11
# Our experiment, with per-utterance decoding:
%WER 24.84 [ 9721 / 39141, 1445 ins, 2410 del, 5866 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt/wer_11
# below, with --max-chunks-at-once 3. The WER is slightly worse, but I expect in general it will
# be slightly better, due to more iVector right context; this is likely just noise. The average
# latency was reduced vs. the baseline.
#%WER 24.92 [ 9753 / 39141, 1423 ins, 2429 del, 5901 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt_mc3/wer_11
# The following results (obtained after ./run_nnet2_discriminative.sh was run), show
# the effect of discriminative training. After 2 epochs, we reduce the WER from 23.58 to 22.07.
%WER 23.58 [ 9229 / 39141, 1382 ins, 2400 del, 5447 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_12
%WER 22.16 [ 8675 / 39141, 1522 ins, 1886 del, 5267 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_smbr_epoch1/wer_13
%WER 22.07 [ 8637 / 39141, 1540 ins, 1873 del, 5224 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_smbr_epoch2/wer_13

View file

@ -60,14 +60,12 @@ if [ $stage -le 3 ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
data/train_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
@ -83,7 +81,8 @@ if [ $stage -le 4 ]; then
# data across four filesystems for speed.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 3 --num-epochs-extra 1 \
--num-epochs 4 --num-epochs-extra 1 \
--samples-per-iter 400000 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--cmvn-opts "--norm-means=false --norm-vars=false" \

View file

@ -1,8 +1,6 @@
#!/bin/bash
# This is to be run after run_nnet2.sh
# THIS IS NOT TESTED YET.
. cmd.sh
@ -43,7 +41,6 @@ set -e
nj=40
if [ $stage -le 1 ]; then
# the make_denlats job is always done on CPU not GPU, since in any case
# the graph search and lattice determinization takes quite a bit of CPU.
# note: it's the sub-split option that determines how many jobs actually
@ -51,7 +48,7 @@ if [ $stage -le 1 ]; then
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G" \
--nj $nj --sub-split 40 --num-threads 6 --parallel-opts "-pe smp 6" \
--online-ivector-dir exp/nnet2_online/ivectors_train \
data/train data/lang $srcdir ${srcdir}_denlats
data/train_hires data/lang $srcdir ${srcdir}_denlats
fi
if [ $stage -le 2 ]; then
@ -59,7 +56,7 @@ if [ $stage -le 2 ]; then
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--use-gpu $use_gpu_opt \
--nj $nj data/train data/lang ${srcdir} ${srcdir}_ali
--nj $nj data/train_hires data/lang ${srcdir} ${srcdir}_ali
fi
if [ $stage -le 3 ]; then
@ -72,22 +69,22 @@ if [ $stage -le 3 ]; then
# since we're using 4 disks.
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" --learning-rate 0.00001 \
--io-opts "-pe smp 10" \
--num-epochs 2 \
--num-epochs 4 \
--use-preconditioning $use_preconditioning \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--num-jobs-nnet 4 --num-threads $num_threads --parallel-opts "$gpu_opts" \
data/train data/lang \
data/train_hires data/lang \
${srcdir}_ali ${srcdir}_denlats ${srcdir}/final.mdl ${srcdir}_smbr
fi
if [ $stage -le 4 ]; then
# we'll do the decoding as 'online' decoding by using the existing
# _online directory but with extra models copied to it.
for epoch in 1 2; do
for epoch in 1 2 3 4; do
cp ${srcdir}_smbr/epoch${epoch}.mdl ${srcdir}_online/smbr_epoch${epoch}.mdl
done
for epoch in 1 2; do
for epoch in 1 2 3 4; do
# do the actual online decoding with iVectors, carrying info forward from
# previous utterances of the same speaker.
steps/online/nnet2/decode.sh --cmd "$decode_cmd" --nj 30 --iter smbr_epoch${epoch} \
@ -95,5 +92,6 @@ if [ $stage -le 4 ]; then
done
fi
wait
# for results, see the end of run_nnet2.sh

View file

@ -45,7 +45,7 @@ if [ $stage -le 2 ]; then
local/vad_split_utts_fix_data.pl $in_dir $dir;
fi
utils/filter_scp.pl -f 0 \
utils/filter_scp.pl \
<(echo "`awk < "$dir/segments" '{ print $2 }'`") $in_dir/wav.scp \
> $dir/wav.scp

View file

@ -31,7 +31,7 @@ classes="ark:lid/remove_dialect.pl data/train/utt2lang \
# Create priors to rebalance the model. The following script rebalances
# the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
lid/balance_priors_to_test.pl \
<(lid/remove_dialect.pl <(utils/filter_scp.pl -f 0 \
<(lid/remove_dialect.pl <(utils/filter_scp.pl \
exp/ivectors_train/ivector.scp data/train/utt2lang)) \
<(lid/remove_dialect.pl data/lre07/utt2lang) \
exp/ivectors_train/languages.txt \

View file

@ -6,6 +6,9 @@
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_a
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
@ -21,7 +24,6 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
@ -47,14 +49,17 @@ if [ $stage -le 2 ]; then
fi
if [ $stage -le 3 ]; then
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
data/train_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
fi
if [ $stage -le 4 ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
--splice-width 7 \
--feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors \
@ -63,7 +68,8 @@ if [ $stage -le 4 ]; then
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-epochs 25 \
--add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \

View file

@ -1,7 +1,7 @@
#!/bin/bash
# this is a baseline for run_online_decoding_nnet2.sh, without
# this is a baseline for ./run_nnet2.sh, without
# the iVectors, to see whether they make a difference.
. cmd.sh
@ -10,10 +10,14 @@
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_a_baseline
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
@ -25,19 +29,17 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_gpu_baseline
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet_baseline
fi
if [ $stage -le 1 ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
--splice-width 7 \
--feat-type raw \
--cmvn-opts "--norm-means=false --norm-vars=false" \
@ -45,7 +47,8 @@ if [ $stage -le 1 ]; then
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-epochs 25 \
--add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
@ -82,4 +85,4 @@ if [ $stage -le 4 ]; then
wait
fi
# for results, see the end of ./run_online_decoding_nnet2.sh
# for results, see the end of ./run_nnet2.sh

View file

@ -77,11 +77,13 @@ if [ $stage -le 5 ]; then
fi
# Below, setting --utts-per-spk-max to a noninteger helps to randomize the division
# of speakers into "fake-speakers" with about 2 utterances each, by randomly making
# some have 2 and some 3 utterances... this randomnes will be different in different
# some have 2 and some 3 utterances... this randomness will be different in different
# copies of the data.
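# For example (illustrative): a speaker with 5 utterances might be split into fake
# speakers of sizes (2, 3) in one copy of the perturbed data and (3, 2) in another.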
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2.5 data/train_perturbed_mfcc \
data/train_perturbed_mfcc_max2.5
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
--utts-per-spk-max 2.5 \
data/train_perturbed_mfcc exp/nnet2_online/extractor $ivectordir || exit 1;
data/train_perturbed_mfcc_max2.5 exp/nnet2_online/extractor $ivectordir || exit 1;
fi

View file

@ -4,7 +4,7 @@
# the optional part local/online/run_online_decoding_nnet2.sh. It builds a
# neural net for online decoding on top of the network we previously trained on
# WSJ, by keeping everything but the last layer of that network and then
# training just the last layer on our data.
# training just the last layer on our data. We then train the whole thing.
stage=0
set -e
@ -26,35 +26,40 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online_wsj/nnet_gpu
trainfeats=exp/nnet2_online_wsj/wsj_activations_train_gpu
dir=exp/nnet2_online_wsj/nnet_a
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
# later we'll change the script to download the trained model from kaldi-asr.org.
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_gpu_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a_gpu
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online_wsj/nnet
dir=exp/nnet2_online_wsj/nnet_a
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
fi
if [ $stage -le 0 ]; then
echo "$0: dumping activations from WSJ model"
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $trainfeats/feats/storage ]; then
# this shows how you can split the data across multiple file-systems; it's optional.
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$date/s5/$trainfeats/feats/storage \
$trainfeats/feats/storage
fi
steps/online/nnet2/dump_nnet_activations.sh --cmd "$train_cmd" --nj 30 \
data/train $srcdir $trainfeats
fi
if [ $stage -le 1 ]; then
echo "$0: training 0-hidden-layer model on top of WSJ activations"
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
steps/nnet2/retrain_fast.sh --stage $train_stage \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
@ -71,9 +76,6 @@ if [ $stage -le 2 ]; then
steps/online/nnet2/prepare_online_decoding_retrain.sh $srcdir $dir ${dir}_online
fi
# Note: at this point it might be possible to further train the combined model
# by doing backprop through all of it. We haven't implemented this yet.
if [ $stage -le 3 ]; then
# do online decoding with the combined model.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
@ -98,7 +100,7 @@ fi
## the model on this dataset. First we need to create a combined version of the
## model.
if [ $stage -le 5 ]; then
steps/nnet2/create_appended_model.sh $srcdir_orig $dir ${dir}_combined_init
steps/nnet2/create_appended_model.sh $srcdir $dir ${dir}_combined_init
# Set the learning rate in this initial value to our guess of a suitable value.
# note: we initially tried 0.005, and this gave us WERs of (1.40, 1.48, 7.24, 7.70) vs.
@ -107,31 +109,20 @@ if [ $stage -le 5 ]; then
nnet-am-copy --learning-rate=$initial_learning_rate ${dir}_combined_init/final.mdl ${dir}_combined_init/final.mdl
fi
# In order to train the combined model, we'll need to dump iVectors.
if [ $stage -le 6 ]; then
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
--utts-per-spk-max 2 \
data/train $ivector_src exp/nnet2_online_wsj/ivectors || exit 1;
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${dir}_combined/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/${dir}_combined/egs/storage \
${dir}_combined/egs/storage
fi
# This version of the get_egs.sh script does the feature extraction and iVector
# extraction in a single binary, reading the config, as part of the script.
steps/online/nnet2/get_egs.sh --cmd "$train_cmd" --num-jobs-nnet 4 \
data/train exp/tri3b_ali ${dir}_online ${dir}_combined
fi
if [ $stage -le 7 ]; then
# assume left and right context of model are identical.
splice_width=$(nnet-am-info exp/nnet2_online_wsj/nnet_gpu_combined_init/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1;
# Note: in general the get_egs.sh script would get things like the LDA matrix
# from exp/tri3b_ali, which would be the wrong thing to do as we want to get
# them from the original model dir. In this case we're using raw MFCC
# features so it's not an issue. But in general we'd probably have to create
# a temporary dir and copy or link both the alignments and feature-related
# things to it.
steps/nnet2/get_egs.sh --cmd "$train_cmd" \
--feat-type raw --cmvn-opts "--norm-means=false --norm-vars=false" \
--online-ivector-dir exp/nnet2_online_wsj/ivectors \
--num-jobs-nnet 4 --splice-width $splice_width \
data/train data/lang exp/tri3b_ali ${dir}_combined
fi
if [ $stage -le 8 ]; then
steps/nnet2/train_more.sh --learning-rate-factor 0.1 --cmd "$train_cmd" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
@ -139,15 +130,15 @@ if [ $stage -le 8 ]; then
${dir}_combined_init/final.mdl ${dir}_combined/egs ${dir}_combined
fi
if [ $stage -le 9 ]; then
if [ $stage -le 8 ]; then
# Create an online-decoding dir corresponding to what we just trained above.
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang $ivector_src \
steps/online/nnet2/prepare_online_decoding.sh data/lang $srcdir/ivector_extractor \
${dir}_combined ${dir}_combined_online || exit 1;
fi
if [ $stage -le 10 ]; then
if [ $stage -le 9 ]; then
# do the online decoding on top of the retrained _combined_online model, and
# also the per-utterance version of the online decoding.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
@ -166,25 +157,27 @@ fi
exit 0;
# Here are the results when we just retrain the last layer:
# grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
#%WER 1.61 [ 202 / 12533, 22 ins, 46 del, 134 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_3
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 7.99 [ 1002 / 12533, 74 ins, 153 del, 775 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_6
# grep WER exp/nnet2_online_wsj/nnet_a_online/decode/wer_* | utils/best_wer.sh
#%WER 1.60 [ 201 / 12533, 22 ins, 46 del, 133 sub ] exp/nnet2_online_wsj/nnet_a_online/decode/wer_3
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_a_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 8.02 [ 1005 / 12533, 74 ins, 155 del, 776 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_ug/wer_6
# and with per-utterance decoding:
# %WER 1.72 [ 216 / 12533, 26 ins, 45 del, 145 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_utt/wer_3
# %WER 8.40 [ 1053 / 12533, 85 ins, 158 del, 810 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug_utt/wer_6
# %WER 8.47 [ 1061 / 12533, 88 ins, 157 del, 816 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_ug_utt/wer_6
# %WER 1.70 [ 213 / 12533, 24 ins, 46 del, 143 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_utt/wer_3
#, here when we retrain the whole thing:
# %WER 1.32 [ 165 / 12533, 14 ins, 34 del, 117 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode/wer_3
# %WER 7.20 [ 902 / 12533, 78 ins, 127 del, 697 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug/wer_6
#%WER 1.42 [ 178 / 12533, 16 ins, 44 del, 118 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode/wer_4
#%WER 7.08 [ 887 / 12533, 74 ins, 133 del, 680 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_ug/wer_6
# and with per-utterance decoding:
# %WER 1.38 [ 173 / 12533, 19 ins, 32 del, 122 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_per_utt/wer_3
# %WER 7.44 [ 932 / 12533, 57 ins, 163 del, 712 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug_per_utt/wer_8
# and the same with per-utterance decoding:
# %WER 1.56 [ 196 / 12533, 31 ins, 26 del, 139 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_per_utt/wer_2
# %WER 7.86 [ 985 / 12533, 59 ins, 171 del, 755 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_ug_per_utt/wer_8
# And this is a suitable baseline: a system trained on RM only.
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11
#a11:s5: grep WER exp/nnet2_online/nnet_a_online/decode/wer_* | utils/best_wer.sh
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_a_online/decode/wer_8
#a11:s5: grep WER exp/nnet2_online/nnet_a_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_a_online/decode_ug/wer_11

View file

@ -50,12 +50,14 @@ fi
if [ $stage -le 3 ]; then
# We extract iVectors on all the train_nodup data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each as one speaker.
# train the system on.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_nodup data/train_nodup_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
--utts-per-spk-max 2 \
data/train_nodup exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_nodup2 || exit 1;
data/train_nodup_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_nodup2 || exit 1;
fi

View file

@ -0,0 +1,194 @@
#!/bin/bash
# This script trains a Switchboard system starting from a neural net trained for
# Fisher English. It builds a
# neural net for online decoding on top of the network we previously trained on
# WSJ, by keeping everything but the last layer of that network and then
# training just the last layer on our data.
stage=0
set -e
train_stage=-10
use_gpu=true
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online_wsj/nnet_gpu
trainfeats=exp/nnet2_online_wsj/wsj_activations_train_gpu
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_gpu_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a_gpu
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online_wsj/nnet
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
fi
if [ $stage -le 0 ]; then
echo "$0: dumping activations from WSJ model"
steps/online/nnet2/dump_nnet_activations.sh --cmd "$train_cmd" --nj 30 \
data/train $srcdir $trainfeats
fi
if [ $stage -le 1 ]; then
echo "$0: training 0-hidden-layer model on top of WSJ activations"
steps/nnet2/retrain_fast.sh --stage $train_stage \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--cmd "$decode_cmd" \
--num-jobs-nnet 4 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
$trainfeats/data data/lang exp/tri3b_ali $dir
fi
if [ $stage -le 2 ]; then
echo "$0: formatting combined model for online decoding."
steps/online/nnet2/prepare_online_decoding_retrain.sh $srcdir $dir ${dir}_online
fi
# Note: at this point it might be possible to further train the combined model
# by doing backprop through all of it. We haven't implemented this yet.
if [ $stage -le 3 ]; then
# do online decoding with the combined model.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph data/test ${dir}_online/decode &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph_ug data/test ${dir}_online/decode_ug || exit 1;
wait
fi
if [ $stage -le 4 ]; then
# do online per-utterance decoding with the combined model.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true \
exp/tri3b/graph data/test ${dir}_online/decode_utt &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true \
exp/tri3b/graph_ug data/test ${dir}_online/decode_ug_utt || exit 1;
wait
fi
## From this point on we try something else: we try training all the layers of
## the model on this dataset. First we need to create a combined version of the
## model.
if [ $stage -le 5 ]; then
steps/nnet2/create_appended_model.sh $srcdir_orig $dir ${dir}_combined_init
# Set the learning rate in this initial value to our guess of a suitable value.
# note: we initially tried 0.005, and this gave us WERs of (1.40, 1.48, 7.24, 7.70) vs.
# (1.32, 1.38, 7.20, 7.44) with a learning rate of 0.01.
initial_learning_rate=0.01
nnet-am-copy --learning-rate=$initial_learning_rate ${dir}_combined_init/final.mdl ${dir}_combined_init/final.mdl
fi
# In order to train the combined model, we'll need to dump iVectors.
if [ $stage -le 6 ]; then
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
data/train_max2 $ivector_src exp/nnet2_online_wsj/ivectors || exit 1;
fi
if [ $stage -le 7 ]; then
# assume left and right context of model are identical.
splice_width=$(nnet-am-info exp/nnet2_online_wsj/nnet_gpu_combined_init/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1;
# Note: in general the get_egs.sh script would get things like the LDA matrix
# from exp/tri3b_ali, which would be the wrong thing to do as we want to get
# them from the original model dir. In this case we're using raw MFCC
# features so it's not an issue. But in general we'd probably have to create
# a temporary dir and copy or link both the alignments and feature-related
# things to it.
steps/nnet2/get_egs.sh --cmd "$train_cmd" \
--feat-type raw --cmvn-opts "--norm-means=false --norm-vars=false" \
--online-ivector-dir exp/nnet2_online_wsj/ivectors \
--num-jobs-nnet 4 --splice-width $splice_width \
data/train data/lang exp/tri3b_ali ${dir}_combined
fi
if [ $stage -le 8 ]; then
steps/nnet2/train_more.sh --learning-rate-factor 0.1 --cmd "$train_cmd" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
${dir}_combined_init/final.mdl ${dir}_combined/egs ${dir}_combined
fi
if [ $stage -le 9 ]; then
# Create an online-decoding dir corresponding to what we just trained above.
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang $ivector_src \
${dir}_combined ${dir}_combined_online || exit 1;
fi
if [ $stage -le 10 ]; then
# do the online decoding on top of the retrained _combined_online model, and
# also the per-utterance version of the online decoding.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph data/test ${dir}_combined_online/decode &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph_ug data/test ${dir}_combined_online/decode_ug &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true exp/tri3b/graph data/test ${dir}_combined_online/decode_per_utt &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true exp/tri3b/graph_ug data/test ${dir}_combined_online/decode_ug_per_utt || exit 1;
wait
fi
exit 0;
# Here are the results when we just retrain the last layer:
# grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
#%WER 1.61 [ 202 / 12533, 22 ins, 46 del, 134 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_3
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 7.99 [ 1002 / 12533, 74 ins, 153 del, 775 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_6
# and with per-utterance decoding:
# %WER 1.72 [ 216 / 12533, 26 ins, 45 del, 145 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_utt/wer_3
# %WER 8.40 [ 1053 / 12533, 85 ins, 158 del, 810 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug_utt/wer_6
#, here when we retrain the whole thing:
# %WER 1.32 [ 165 / 12533, 14 ins, 34 del, 117 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode/wer_3
# %WER 7.20 [ 902 / 12533, 78 ins, 127 del, 697 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug/wer_6
# and with per-utterance decoding:
# %WER 1.38 [ 173 / 12533, 19 ins, 32 del, 122 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_per_utt/wer_3
# %WER 7.44 [ 932 / 12533, 57 ins, 163 del, 712 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug_per_utt/wer_8
# And this is a suitable baseline: a system trained on RM only.
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11

View file

@ -55,9 +55,13 @@ fi
if [ $stage -le 3 ]; then
# We extract iVectors on all the train_si284 data, which will be what we
# train the system on.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_si284 data/train_si284_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
--utts-per-spk-max 2
data/train_si284 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_si284 || exit 1;
data/train_si284_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_si284 || exit 1;
fi
@ -78,8 +82,8 @@ if [ $stage -le 4 ]; then
# wouldn't be able to decode in real-time using a CPU.
#
# I copied the learning rates from ../nnet2/run_5d.sh
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 8 --num-epochs-extra 4 \
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
--num-epochs 12 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train_si284 \
--cmvn-opts "--norm-means=false --norm-vars=false" \

View file

@ -32,7 +32,7 @@ src1=$1
src2=$2
dir=$3
for f in $src1/final.mdl $src1/cmvn_opts $src2/tree $src2/final.mdl; do
for f in $src1/final.mdl $src2/tree $src2/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

View file

@ -60,10 +60,12 @@ if [ $# != 4 ]; then
fi
data=$1
lang=$2
lang=$2 # kept for historical reasons, but never used.
alidir=$3
dir=$4
# Check some files.
[ ! -z "$online_ivector_dir" ] && \
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
@ -73,13 +75,8 @@ for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/
done
# Set some variables.
oov=`cat $lang/oov.int`
num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj
@ -189,14 +186,22 @@ mkdir -p $dir/egs
if [ $stage -le 2 ]; then
echo "Getting validation and training subset examples."
rm $dir/.error 2>/dev/null
echo "$0: extracting validation and training-subset alignments."
set -o pipefail;
for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
copy-int-vector ark:- ark,t:- | \
utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) | \
gzip -c >$dir/ali_special.gz || exit 1;
set +o pipefail; # unset the pipefail option.
all_ids=$(seq -s, $nj) # e.g. 1,2,...39,40
$cmd $dir/log/create_valid_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
"ark,s,cs:gunzip -c $alidir/ali.{$all_ids}.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/egs/valid_all.egs" || touch $dir/.error &
$cmd $dir/log/create_train_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
"ark,s,cs:gunzip -c $alidir/ali.{$all_ids}.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/egs/train_subset_all.egs" || touch $dir/.error &
wait;
[ -f $dir/.error ] && exit 1;
@ -220,12 +225,10 @@ if [ $stage -le 2 ]; then
for f in $dir/egs/{combine,train_diagnostic,valid_diagnostic}.egs; do
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
done
rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs
rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs $dir/ali_special.gz
fi
if [ $stage -le 3 ]; then
mkdir -p $dir/temp
# Other scripts might need to know the following info:
echo $num_jobs_nnet >$dir/egs/num_jobs_nnet
echo $iters_per_epoch >$dir/egs/iters_per_epoch
@ -279,9 +282,6 @@ if [ $stage -le 5 ]; then
echo "Shuffling the order of training examples"
echo "(in order to avoid stressing the disk, these won't all run at once)."
# note, the "|| true" below is a workaround for NFS bugs
# we encountered running this script with Debian-7, NFS-v4.
for n in `seq 0 $[$iters_per_epoch-1]`; do
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \

View file

@ -392,7 +392,6 @@ echo Done
if $cleanup; then
echo Cleaning up data
if [ $egs_dir == "$dir/egs" ]; then
echo Removing training examples
rm $dir/egs/egs*
steps/nnet2/remove_egs.sh $dir/egs
fi
fi

View file

@ -121,8 +121,6 @@ if [ $# != 4 ]; then
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --num-iters-final <#iters|20> # Number of final iterations to give to nnet-combine-fast to "
echo " # interpolate parameters (the weights are learned with a validation set)"
echo " --first-component-power <power|1.0> # Power applied to output of first p-norm layer... setting this to"
echo " # 0.5 seems to help under some circumstances."
echo " --egs-opts <opts> # Extra options to pass to get_egs.sh"
echo " --lda-opts <opts> # Extra options to pass to get_lda.sh"
echo " --stage <stage|-9> # Used to run a partially-completed training process from somewhere in"

View file

@ -0,0 +1,478 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
# 2013 Xiaohui Zhang
# 2013 Guoguo Chen
# Apache 2.0.
# train_pnorm_simple.sh is a modified version of train_pnorm_fast.sh. Like
# train_pnorm_fast.sh, it uses the `online' preconditioning, which is faster
# (especially on GPUs). The difference is that the learning-rate schedule is
# simpler, with the learning rate exponentially decreasing during training,
# and no phase where the learning rate is constant.
#
# Also, the final model-combination is done a bit differently: we combine models
# over typically a whole epoch, and because that would be too many iterations to
# easily be able to combine over, we arrange the iterations into groups (20
# groups by default) and average over each group.
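# As a minimal sketch of the schedule (this just restates the formula used further down
# in this script): on iteration x out of num_iters, the learning rate is
#   lr(x) = initial_learning_rate * exp(x * log(final_learning_rate / initial_learning_rate) / num_iters),
# i.e. a geometric interpolation between the initial and final rates. For example, with
# the default rates 0.04 and 0.004, half-way through training:
#   perl -e '($x,$n,$i,$f)=@ARGV; print $i*exp($x*log($f/$i)/$n), "\n";' 50 100 0.04 0.004
#   # prints roughly 0.0126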
# Begin configuration section.
cmd=run.pl
num_epochs=15 # Number of epochs of training;
# the number of iterations is worked out from this.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update.
samples_per_iter=400000 # each iteration of training, see this many samples
# per job. This option is passed to get_egs.sh
num_jobs_nnet=16 # Number of neural net jobs to run in parallel. This option
# is passed to get_egs.sh.
get_egs_stage=0
online_ivector_dir=
max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
# to the final 'combine' stage, but these models will themselves be averages of
# iteration-number ranges.
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
# (the point of this is to get data in different minibatches on different iterations,
# since in the preconditioning method, 2 samples in the same minibatch can
# affect each other's gradients.)
add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20 # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G"
# by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
transform_dir= # If supplied, overrides alidir
cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied.
# only relevant for "raw" features, not lda.
feat_type= # Can be used to force "raw" features.
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of training"
echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
echo " # data, 0.01 for large data"
echo " --final-learning-rate <final-learning-rate|0.004> # Learning rate at end of training, e.g. 0.004 for small"
echo " # data, 0.001 for large data"
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
echo " # per context-dependent state. Try a number several times #states."
echo " --num-jobs-nnet <num-jobs|8> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate."
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size)."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames)"
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --stage <stage|-4> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
data=$1
lang=$2
alidir=$3
dir=$4
# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# Set some variables.
num_leaves=`tree-info $alidir/tree 2>/dev/null | awk '{print $2}'` || exit 1
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir
extra_opts=()
[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
[ -z "$transform_dir" ] && transform_dir=$alidir
extra_opts+=(--transform-dir $transform_dir)
extra_opts+=(--splice-width $splice_width)
if [ $stage -le -4 ]; then
echo "$0: calling get_lda.sh"
steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi
# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
echo "$0: calling get_egs.sh"
steps/nnet2/get_egs.sh $egs_opts "${extra_opts[@]}" \
--samples-per-iter $samples_per_iter \
--num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
--cmd "$cmd" $egs_opts --io-opts "$io_opts" \
$data $lang $alidir $dir || exit 1;
fi
if [ -z $egs_dir ]; then
egs_dir=$dir/egs
fi
iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1;
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;
if ! [ $num_hidden_layers -ge 1 ]; then
echo "Invalid num-hidden-layers $num_hidden_layers"
exit 1
fi
if [ $stage -le -2 ]; then
echo "$0: initializing neural net";
lda_mat=$dir/lda.mat
tot_input_dim=$[$feat_dim+$ivector_dim]
online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
stddev=`perl -e "print 1.0/sqrt($pnorm_input_dim);"`
cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$tot_input_dim left-context=$splice_width right-context=$splice_width const-component-dim=$ivector_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$lda_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF
# to hidden.config it will write the part of the config corresponding to a
# single hidden layer; we need this to add new layers.
cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF
$cmd $dir/log/nnet_init.log \
nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
$dir/0.mdl || exit 1;
fi
if [ $stage -le -1 ]; then
echo "Training transition probabilities and setting priors"
$cmd $dir/log/train_trans.log \
nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
|| exit 1;
fi
num_iters=$[$num_epochs * $iters_per_epoch];
echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
if [ $num_threads -eq 1 ]; then
parallel_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
parallel_train_opts=
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
fi
# First work out how many models we want to combine over in the final
# nnet-combine-fast invocation. This equals
# min(max(max_models_combine, iters_per_epoch),
# 2/3 * iters_after_mixup)
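# Worked example with hypothetical values: if max_models_combine=20, iters_per_epoch=50
# and (num_iters - mix_up_iter - 1) = 120, then max(20, 50) = 50 and (2/3)*120 = 80, so
# num_models_combine = min(50, 80) = 50; since that exceeds max_models_combine, the
# combination stage below will first average those 50 models into 20 groups.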
num_models_combine=$max_models_combine
if [ $num_models_combine -lt $iters_per_epoch ]; then
num_models_combine=$iters_per_epoch
fi
iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3]
if [ $num_models_combine -gt $iters_after_mixup_23 ]; then
num_models_combine=$iters_after_mixup_23
fi
first_model_combine=$[$num_iters-$num_models_combine+1]
x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
# Set off jobs doing some diagnostics, in the background.
$cmd $dir/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
$cmd $dir/log/progress.$x.log \
nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
ark:$egs_dir/train_diagnostic.egs '&&' \
nnet-am-info $dir/$x.mdl &
fi
echo "Training neural net (pass $x)"
if [ $x -gt 0 ] && \
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
[ $[($x-1) % $add_layers_period] -eq 0 ]; then
mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
else
mdl=$dir/$x.mdl
fi
if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
# on iteration zero or when we just added a layer, use a smaller minibatch
# size and just one job: the model-averaging doesn't seem to be helpful
# when the model is changing too fast (i.e. it worsens the objective
# function), and the smaller minibatch size will help to keep
# the update stable.
this_minibatch_size=$[$minibatch_size/2];
do_average=false
else
this_minibatch_size=$minibatch_size
do_average=true
fi
$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$parallel_suffix $parallel_train_opts \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;
nnets_list=
for n in `seq 1 $num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done
learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;
if $do_average; then
# average the output of the different jobs.
$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
else
# choose the best from the different jobs.
n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
$fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
$best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
[ -z "$n" ] && echo "Error getting best model" && exit 1;
$cmd $dir/log/select.$x.log \
nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
fi
if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
# mix up.
echo Mixing up from $num_leaves to $mix_up components
$cmd $dir/log/mix_up.$x.log \
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
fi
rm $nnets_list
[ ! -f $dir/$[$x+1].mdl ] && exit 1;
if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
[ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then
rm $dir/$[$x-1].mdl
fi
fi
x=$[$x+1]
done
if [ $stage -le $num_iters ]; then
echo "Doing final combination to produce final.mdl"
# Now do combination.
nnets_list=()
# the if..else..fi statement below sets 'nnets_list'.
if [ $max_models_combine -lt $num_models_combine ]; then
# The number of models to combine is too large, e.g. > 20. In this case,
# each argument to nnet-combine-fast will be an average of multiple models.
cur_offset=0 # current offset from first_model_combine.
for n in $(seq $max_models_combine); do
next_offset=$[($n*$num_models_combine)/$max_models_combine]
sub_list=""
for o in $(seq $cur_offset $[$next_offset-1]); do
iter=$[$first_model_combine+$o]
mdl=$dir/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
sub_list="$sub_list $mdl"
done
nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
cur_offset=$next_offset
done
else
nnets_list=
for n in $(seq 0 $[num_models_combine-1]); do
iter=$[$first_model_combine+$n]
mdl=$dir/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
nnets_list[$n]=$mdl
done
fi
# Below, use --use-gpu=no to stop nnet-combine-fast from using a GPU, as with
# many models it can run out of memory; we use --num-threads=$combine_num_threads
# to speed it up on the CPU (this isn't ideal...)
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
# Setting --initial-model to a large value makes it initialize the combination
# with the average of all the models. It's important not to start with a
# single model: due to the invariance to scaling that these nonlinearities
# give us, we would get zero diagonal entries in the Fisher matrix that
# nnet-combine-fast uses for scaling, which, after flooring and inversion,
# would give the initially-chosen model much higher learning rates than the
# others, and this prevents the optimization from working well.
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
--num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
$dir/final.mdl || exit 1;
# Normalize stddev for affine or block affine layers that are followed by a
# pnorm layer and then a normalize layer.
$cmd $dir/log/normalize.log \
nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;
# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
$cmd $dir/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi
if [ $stage -le $[$num_iters+1] ]; then
echo "Getting average posterior for purposes of adjusting the priors."
# Note: this just uses CPUs, using a smallish subset of data.
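# The averaged posterior computed here is used below by nnet-adjust-priors to set the
# model's priors; at decoding time the network outputs are divided by these priors, and
# the average posterior of the trained network is typically better calibrated for this
# than the raw label frequencies in the alignments.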
rm $dir/post.*.vec 2>/dev/null
$cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;
sleep 3; # make sure there is time for $dir/post.*.vec to appear.
$cmd $dir/log/vector_sum.log \
vector-sum $dir/post.*.vec $dir/post.vec || exit 1;
rm $dir/post.*.vec;
echo "Re-adjusting priors based on computed posteriors"
$cmd $dir/log/adjust_priors.log \
nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
fi
if [ ! -f $dir/final.mdl ]; then
echo "$0: $dir/final.mdl does not exist."
# we don't want to clean up if the training didn't succeed.
exit 1;
fi
sleep 2
echo Done
if $cleanup; then
echo Cleaning up data
if [ $egs_dir == "$dir/egs" ]; then
steps/nnet2/remove_egs.sh $dir/egs
fi
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ]; then
# delete all but every 100th model and the last one; the intermediate models used
# in the final combination are no longer needed now that final.mdl exists.
rm $dir/$x.mdl
fi
done
fi


@ -0,0 +1,81 @@
#!/bin/bash
# Copyright 2013-2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
# This script is like utils/copy_data_dir.sh in that it copies a data-dir,
# but it supports the --utts-per-spk-max option. If that option is set (i.e.
# not -1), it modifies the utt2spk and spk2utt files by splitting each speaker
# into multiple versions, so that each speaker has no more than
# --utts-per-spk-max utterances.
# begin configuration section
utts_per_spk_max=-1
# end configuration section
. utils/parse_options.sh
if [ $# != 2 ]; then
echo "Usage: "
echo " $0 [options] <srcdir> <destdir>"
echo "e.g.:"
echo " $0 --utts-per-spk-max 2 data/train data/train-max2"
echo "Options"
echo " --utts-per-spk-max=n # number of utterances per speaker maximum,"
echo " # default -1 (meaning no maximum). E.g. 2."
exit 1;
fi
export LC_ALL=C
srcdir=$1
destdir=$2
if [ ! -f $srcdir/utt2spk ]; then
echo "$0: no such file $srcdir/utt2spk"
exit 1;
fi
set -e;
set -o pipefail
mkdir -p $destdir
if [ "$utts_per_spk_max" != -1 ]; then
# create spk2utt file with reduced number of utterances per speaker.
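# In the awk command below, each speaker's utterance list is chopped into groups of
# about utts_per_spk_max utterances; a non-integer maximum is rounded up with
# probability equal to its fractional part, and each group becomes a new "speaker"
# whose id is the original speaker-id plus a numeric suffix such as -000001
# (the counter is printed in hex).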
awk -v max=$utts_per_spk_max '{ n=2; count=0;
while(n<=NF) {
int_max=int(max)+ (rand() < (max-int(max))?1:0);
nmax=n+int_max; count++; printf("%s-%06x", $1, count);
for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
<$srcdir/spk2utt >$destdir/spk2utt
utils/spk2utt_to_utt2spk.pl <$destdir/spk2utt >$destdir/utt2spk
if [ -f $srcdir/cmvn.scp ]; then
# below, the first apply_map command outputs a cmvn.scp indexed by utt;
# the second one outputs a cmvn.scp indexed by new speaker-id.
utils/apply_map.pl -f 2 $srcdir/cmvn.scp <$srcdir/utt2spk | \
utils/apply_map.pl -f 1 $destdir/utt2spk | sort | uniq > $destdir/cmvn.scp
echo "$0: mapping cmvn.scp, but you may want to recompute it if it's needed,"
echo " as it would probably change."
fi
if [ -f $srcdir/spk2gender ]; then
utils/apply_map.pl -f 2 $srcdir/spk2gender <$srcdir/utt2spk | \
utils/apply_map.pl -f 1 $destdir/utt2spk | sort | uniq >$destdir/spk2gender
fi
else
cp $srcdir/spk2utt $srcdir/utt2spk $destdir/
[ -f $srcdir/spk2gender ] && cp $srcdir/spk2gender $destdir/
[ -f $srcdir/cmvn.scp ] && cp $srcdir/cmvn.scp $destdir/
fi
for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
[ -f $srcdir/$f ] && cp $srcdir/$f $destdir/
done
echo "$0: copied data from $srcdir to $destdir, with --utts-per-spk-max $utts_per_spk_max"
utils/validate_data_dir.sh $destdir


@ -98,6 +98,9 @@ fi
if [ $stage -le 2 ]; then
echo "$0: dumping neural net activations"
# The next line is a no-op unless $dir/feats/storage/ exists; see utils/create_split_dir.pl.
for j in $(seq $nj); do utils/create_data_link.pl $dir/feats/feats.$j.ark; done
if [ -f $data/segments ]; then
wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else


@ -14,12 +14,8 @@
# for online decoding.
# Rather than treating each utterance separately, it carries forward
# information from one utterance to the next, within the speaker. However,
# take note of the option "utts-per-spk-max", which splits speakers up into
# "fake speakers" with at most two utterances in them. This means that more
# iVectors are estimated starting from an uninformative starting point, than
# if we used the real speaker labels (which may have many utterances each);
# it's a compromise between per-utterance and per-speaker iVector estimation.
# information from one utterance to the next, within the speaker.
# Begin configuration section.
nj=30
@ -36,13 +32,9 @@ posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
# used when training the iVector extractor, but more important
# that this match the value used when you do real online decoding
# with the neural nets trained with these iVectors.
utts_per_spk_max=-1 # Maximum utterances per "fake-speaker." With the default
# of -1 no fake-speakers are used. Note: this does not have to
# be an integer; if it's noninteger, it will be rounded in a
# randomized way to one of the two integers it's close to.
# This is useful in the "perturbed-feature" recipe to encourage
# that different perturbed versions of the same speaker get
# split into fake-speakers differently.
#utts_per_spk_max=-1 # This option is no longer supported, you should use
# steps/online/nnet2/copy_data_dir.sh with the --utts-per-spk-max
# option to make a copy of the data dir.
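# e.g. (hypothetical destination path):
#   steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train-max2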
compress=true # If true, compress the iVectors stored on disk (it's lossy
# compression, as used for feature matrices).
@ -112,7 +104,6 @@ echo "--posterior-scale=$posterior_scale" >>$ieconf
echo "--max-remembered-frames=1000" >>$ieconf # the default
ns=$(wc -l <$data/spk2utt)
if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
echo "$0: you seem to have just one speaker in your database. This is probably not a good idea."
@ -121,29 +112,10 @@ if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
utts_per_spk_max=1
fi
spk2utt=""
if [ "$utts_per_spk_max" != -1 ]; then
mkdir -p $dir/spk2utt_fake
for job in $(seq $nj); do
# create fake spk2utt files with reduced number of utterances per speaker,
# so the network is well adapted to using iVectors from small amounts of
# training data.
# the if (rand() % 2 == 0)
awk -v max=$utts_per_spk_max '{ n=2; count=0;
while(n<=NF) {
int_max=int(max)+ (rand() < (max-int(max))?1:0);
nmax=n+int_max; count++; printf("%s-%06x", $1, count);
for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
<$sdata/$job/spk2utt >$dir/spk2utt_fake/spk2utt.$job
done
spk2utt="ark:$dir/spk2utt_fake/spk2utt.JOB"
else
spk2utt="ark:$sdata/JOB/spk2utt"
fi
for n in $(seq $nj); do
# This will do nothing unless the directorys $dir/storage exists;
# This will do nothing unless the directory $dir/storage exists;
# it can be used to distribute the data among multiple machines.
utils/create_data_link.pl $dir/ivector_online.$n.ark
done
@ -151,7 +123,7 @@ done
if [ $stage -le 0 ]; then
echo "$0: extracting iVectors"
$cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
ivector-extract-online2 --config=$ieconf "$spk2utt" scp:$sdata/JOB/feats.scp ark:- \| \
ivector-extract-online2 --config=$ieconf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/feats.scp ark:- \| \
copy-feats --compress=$compress ark:- \
ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
fi


@ -0,0 +1,285 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This is modified from ../../nnet2/get_egs.sh.
# This script combines nnet-example extraction with feature extraction directly
# from wave files; it uses the program online2-wav-dump-features to do all parts
# of feature extraction: MFCC/PLP/fbank, possibly plus pitch, plus iVectors.
# This script is intended mostly for cross-system training for online decoding,
# where you initialize the nnet from an existing, larger system.
# Begin configuration section.
cmd=run.pl
num_utts_subset=300 # number of utterances in validation and training
# subsets used for shrinkage and diagnostics
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=10000 # # train frames for the above.
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
samples_per_iter=400000 # each iteration of training, see this many samples
# per job. This is just a guideline; it will pick a number
# that divides the number of samples in the entire data.
transform_dir= # If supplied, overrides alidir
num_jobs_nnet=16 # Number of neural net jobs to run in parallel
stage=0
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
random_copy=false
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/online/nnet2/get_egs.sh [opts] <data> <ali-dir> <online-nnet-dir> <exp-dir>"
echo " e.g.: steps/online/nnet2/get_egs.sh data/train exp/tri3_ali exp/nnet2_online/nnet_a_gpu_online/ exp/tri4_nnet"
echo "In <online-nnet-dir>, it looks for final.mdl (need to compute required left and right context),"
echo "and a configuration file conf/online_nnet2_decoding.conf which describes the features."
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-jobs-nnet <num-jobs;16> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate."
echo " --samples-per-iter <#samples;400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --feat-type <lda|raw> # (by default it tries to guess). The feature type you want"
echo " # to use as input to the neural net."
echo " --splice-width <width;4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
echo " # very end."
echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
data=$1
alidir=$2
online_nnet_dir=$3
dir=$4
mdl=$online_nnet_dir/final.mdl # only needed for left and right context.
feature_conf=$online_nnet_dir/conf/online_nnet2_decoding.conf
for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $feature_conf $mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log
cp $alidir/tree $dir
grep -v '^--endpoint' $feature_conf >$dir/feature.conf || exit 1;
# Get list of validation utterances.
mkdir -p $dir/valid $dir/train_subset
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
> $dir/valid/uttlist || exit 1;
if [ -f $data/utt2uniq ]; then
echo "File $data/utt2uniq exists, so augmenting valid/uttlist to"
echo "include all perturbed versions of the same 'real' utterances."
mv $dir/valid/uttlist $dir/valid/uttlist.tmp
utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
cat $dir/valid/uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid/uttlist
rm $dir/uniq2utt $dir/valid/uttlist.tmp
fi
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid/uttlist | \
head -$num_utts_subset > $dir/train_subset/uttlist || exit 1;
for subdir in valid train_subset; do
# In order for the iVector extraction to work right, we need to process all
# utterances of the speakers which have utterances in valid/uttlist, and the
# same for train_subset/uttlist. We produce $dir/valid/uttlist_extended which
# will contain all utterances of all speakers which have utterances in
# $dir/valid/uttlist, and the same for $dir/train_subset/.
utils/filter_scp.pl $dir/$subdir/uttlist <$data/utt2spk | awk '{print $2}' > $dir/$subdir/spklist || exit 1;
utils/filter_scp.pl -f 2 $dir/$subdir/spklist <$data/utt2spk >$dir/$subdir/utt2spk || exit 1;
utils/utt2spk_to_spk2utt.pl <$dir/$subdir/utt2spk >$dir/$subdir/spk2utt || exit 1;
awk '{print $1}' <$dir/$subdir/utt2spk >$dir/$subdir/uttlist_extended || exit 1;
rm $dir/$subdir/spklist
done
if [ -f $data/segments ]; then
# note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
# previous utterances within a speaker, we do the filtering after extracting the features.
echo "$0 [info]: segments file exists: using that."
feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/segments | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/segments | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
else
echo "$0 [info]: no segments file exists, using wav.scp."
feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt scp:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt scp:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
fi
ivector_dim=$(online2-wav-dump-features --config=$dir/feature.conf --print-ivector-dim=true) || exit 1;
! [ $ivector_dim -ge 0 ] && echo "$0: error getting iVector dim" && exit 1;
if [ $stage -le 0 ]; then
echo "$0: working out number of frames of training data"
num_frames=$(steps/nnet2/get_num_frames.sh $data)
echo $num_frames > $dir/num_frames
else
num_frames=`cat $dir/num_frames` || exit 1;
fi
# Working out number of iterations per epoch.
iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1;
[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1
samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)]
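# For example (hypothetical numbers): with 40M frames of training data, num_jobs_nnet=16
# and samples_per_iter=400000, iters_per_epoch = round(40e6 / (400000*16)) = 6, and
# samples_per_iter_real = 40e6 / (16*6), about 416666.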
echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations,"
echo "$0: giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."
# Making soft links to storage directories. This is a no-op unless
# the subdirectory $dir/egs/storage/ exists. See utils/create_split_dir.pl.
for x in `seq 1 $num_jobs_nnet`; do
for y in `seq 0 $[$iters_per_epoch-1]`; do
utils/create_data_link.pl $dir/egs/egs.$x.$y.ark
utils/create_data_link.pl $dir/egs/egs_tmp.$x.$y.ark
done
for y in `seq 1 $nj`; do
utils/create_data_link.pl $dir/egs/egs_orig.$x.$y.ark
done
done
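# remove() below deletes each of its arguments; if an argument is a symlink (e.g. one
# created by utils/create_data_link.pl above), the file it points to is deleted as well.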
remove () { for x in $*; do [ -L $x ] && rm $(readlink -f $x); rm $x; done }
set -o pipefail
left_context=$(nnet-am-info $mdl | grep '^left-context' | awk '{print $2}') || exit 1;
right_context=$(nnet-am-info $mdl | grep '^right-context' | awk '{print $2}') || exit 1;
nnet_context_opts="--left-context=$left_context --right-context=$right_context"
set +o pipefail
mkdir -p $dir/egs
if [ $stage -le 2 ]; then
rm $dir/.error 2>/dev/null
echo "$0: extracting validation and training-subset alignments."
set -o pipefail;
for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
copy-int-vector ark:- ark,t:- | \
utils/filter_scp.pl <(cat $dir/valid/uttlist $dir/train_subset/uttlist) | \
gzip -c >$dir/ali_special.gz || exit 1;
set +o pipefail; # unset the pipefail option.
echo "Getting validation and training subset examples."
$cmd $dir/log/create_valid_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/egs/valid_all.egs" || touch $dir/.error &
$cmd $dir/log/create_train_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/egs/train_subset_all.egs" || touch $dir/.error &
wait;
[ -f $dir/.error ] && exit 1;
echo "Getting subsets of validation examples for diagnostics and combination."
$cmd $dir/log/create_valid_subset_combine.log \
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/egs/valid_all.egs \
ark:$dir/egs/valid_combine.egs || touch $dir/.error &
$cmd $dir/log/create_valid_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/valid_all.egs \
ark:$dir/egs/valid_diagnostic.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_combine.log \
nnet-subset-egs --n=$num_train_frames_combine ark:$dir/egs/train_subset_all.egs \
ark:$dir/egs/train_combine.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/train_subset_all.egs \
ark:$dir/egs/train_diagnostic.egs || touch $dir/.error &
wait
[ -f $dir/.error ] && echo "Error detected while creating egs" && exit 1;
cat $dir/egs/valid_combine.egs $dir/egs/train_combine.egs > $dir/egs/combine.egs
for f in $dir/egs/{combine,train_diagnostic,valid_diagnostic}.egs; do
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
done
rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs $dir/ali_special.gz
fi
if [ $stage -le 3 ]; then
# Other scripts might need to know the following info:
echo $num_jobs_nnet >$dir/egs/num_jobs_nnet
echo $iters_per_epoch >$dir/egs/iters_per_epoch
echo $samples_per_iter_real >$dir/egs/samples_per_iter
echo "Creating training examples";
# in $dir/egs, create $num_jobs_nnet separate files with training examples.
# The order is not randomized at this point.
egs_list=
for n in `seq 1 $num_jobs_nnet`; do
egs_list="$egs_list ark:$dir/egs/egs_orig.$n.JOB.ark"
done
echo "Generating training examples on disk"
# The examples will go round-robin to egs_list.
$cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$feats" \
"ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
nnet-copy-egs ark:- $egs_list || exit 1;
fi
if [ $stage -le 4 ]; then
echo "$0: rearranging examples into parts for different parallel jobs"
# combine all the "egs_orig.JOB.*.ark" (over the $nj splits of the data) and
# then split into multiple parts egs_tmp.JOB.*.ark (one per archive index
# 0 .. $iters_per_epoch-1), which get shuffled into egs.JOB.*.ark below.
if [ $iters_per_epoch -eq 1 ]; then
echo "$0: Since iters-per-epoch == 1, just concatenating the data."
for n in `seq 1 $num_jobs_nnet`; do
cat $dir/egs/egs_orig.$n.*.ark > $dir/egs/egs_tmp.$n.0.ark || exit 1;
remove $dir/egs/egs_orig.$n.*.ark
done
else # We'll have to split it up using nnet-copy-egs.
egs_list=
for n in `seq 0 $[$iters_per_epoch-1]`; do
egs_list="$egs_list ark:$dir/egs/egs_tmp.JOB.$n.ark"
done
# note, the "|| true" below is a workaround for NFS bugs
# we encountered running this script with Debian-7, NFS-v4.
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \
nnet-copy-egs --random=$random_copy --srand=JOB \
"ark:cat $dir/egs/egs_orig.JOB.*.ark|" $egs_list || exit 1;
remove $dir/egs/egs_orig.*.*.ark 2>/dev/null
fi
fi
if [ $stage -le 5 ]; then
# Next, shuffle the order of the examples in each of those files.
# Each one should not be too large, so we can do this in memory.
echo "Shuffling the order of training examples"
echo "(in order to avoid stressing the disk, these won't all run at once)."
for n in `seq 0 $[$iters_per_epoch-1]`; do
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \
ark:$dir/egs/egs_tmp.JOB.$n.ark ark:$dir/egs/egs.JOB.$n.ark
remove $dir/egs/egs_tmp.*.$n.ark
done
fi
echo "$0: Finished preparing training examples"


@ -79,6 +79,12 @@ cp $srcdir/final.mdl $dir/ || exit 1;
if [ ! -z "$iedir" ]; then
mkdir -p $dir/ivector_extractor/
cp $iedir/final.{mat,ie,dubm} $iedir/global_cmvn.stats $dir/ivector_extractor/ || exit 1;
# The following things won't be needed directly by the online decoding, but
# will allow us to run prepare_online_decoding.sh again with
# $dir/ivector_extractor/ as the input directory (useful in certain
# cross-system training scenarios).
cp $iedir/splice_opts $iedir/online_cmvn.conf $dir/ivector_extractor/ || exit 1;
fi


@ -46,6 +46,7 @@ creates a link such as
Usage: utils/create_data_link.pl <data-archive>
e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark
See also utils/remove_data_links.sh
EOU
GetOptions();


@ -30,7 +30,7 @@ Allowed options:
--suffix : Common suffix to <actual_storage_dirs> (string, default = "")
See also create_data_link.pl, which is intended to work with the resulting
directory structure.
directory structure, and remove_data_links.sh
EOU
my $suffix="";


@ -20,11 +20,11 @@
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is zero, by default, but can be changed by using \
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
$exclude = 0;
$field = 0;
$field = 1;
$shifted = 0;
do {
@ -42,7 +42,13 @@ do {
} while ($shifted);
if(@ARGV < 1 || @ARGV > 2) {
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp ";
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
"Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
"Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
"only the lines that were *not* in id_list.\n" .
"Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
"If your older scripts (written before Oct 2014) stopped working and you used the\n" .
"-f option, add 1 to the argument.\n";
}
@ -54,12 +60,27 @@ while(<F>) {
$seen{$A[0]} = 1;
}
if ($field == 1) { # Treat this as special case, since it is common.
while(<>) {
$_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
# $1 is what we filter on.
if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
print $_;
}
}
} else {
while(<>) {
@A = split;
@A > 0 || die "Invalid scp file line $_";
@A >= $field || die "Invalid scp file line $_";
if((!$exclude && $seen{$A[$field]}) || ($exclude && !defined $seen{$A[$field]})) {
if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
print $_;
}
}
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
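# the following should print "bar 2" (--exclude inverts the filter):
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl --exclude <(echo foo)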


@ -0,0 +1,53 @@
#!/bin/bash
# This program searches within a directory for soft links that
# appear to be created by 'create_data_link.pl' to a 'storage/' subdirectory,
# and it removes both the soft links and the things they point to.
# for instance, if you have a soft link
# foo/egs/1.1.egs -> storage/2/1.1.egs
# it will remove both foo/egs/storage/2/1.1.egs, and foo/egs/1.1.egs.
ret=0
dry_run=false
if [ "$1" == "--dry-run" ]; then
dry_run=true
shift
fi
if [ $# == 0 ]; then
echo "Usage: $0 [--dry-run] <list-of-directories>"
echo "e.g.: $0 exp/nnet4a/egs/"
echo " Removes from any subdirectories of the command-line arguments, soft links that "
echo " appear to have been created by utils/create_data_link.pl, as well as the things"
echo " that those soft links point to. Will typically be called on a directory prior"
echo " to 'rm -r' on that directory, to ensure that data that was distributed on other"
echo " volumes also gets deleted."
echo " With --dry-run, just prints what it would do."
exit 1;
fi
for dir in $*; do
if [ ! -d $dir ]; then
echo "$0: not a directory: $dir"
ret=1
else
for subdir in $(find $dir -type d); do
if [ -d $subdir/storage ]; then
for x in $(ls $subdir); do
f=$subdir/$x
if [ -L $f ] && [[ $(readlink $f) == storage/* ]]; then
target=$subdir/$(readlink $f)
if $dry_run; then
echo rm $f $target
else
rm $f $target
fi
fi
done
fi
done
fi
done
exit $ret


@ -39,7 +39,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Extract segments from a large audio file in WAV format.\n"
"Usage: extract-segments [options] <wav-rspecifier> <segments-file> <wav-wspecifier>\n"
"e.g. extract-segments wav.scp segments ark:- | <some other program>\n"
"e.g. extract-segments scp:wav.scp segments ark:- | <some other program>\n"
" segments-file format: segment_id wav_file_name start_time end_time [channel]\n"
" e.g.: spkabc_seg1 spkabc_recording1 1.10 2.36 1\n"
" If channel is not provided as last element, expects mono.\n"


@ -235,6 +235,7 @@ class LmExampleDeterministicOnDemandFst: public DeterministicOnDemandFst<Arc> {
Label bos_symbol,
Label eos_symbol);
virtual StateId Start() { return start_state_; }
/// We don't bother caching the final-probs, just the arcs.


@ -141,6 +141,7 @@ struct OnlineNnet2FeaturePipelineInfo {
bool use_ivectors;
OnlineIvectorExtractionInfo ivector_extractor_info;
int32 IvectorDim() { return ivector_extractor_info.extractor.IvectorDim(); }
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineNnet2FeaturePipelineInfo);
};


@ -42,7 +42,8 @@ int main(int argc, char *argv[]) {
"Usage: online2-wav-dump-features [options] <spk2utt-rspecifier> <wav-rspecifier> <feature-wspecifier>\n"
"The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n"
"you want to generate features utterance by utterance.\n"
"See steps/online/nnet2/dump_nnet_activations.sh for an example.\n";
"Alternate usage: online2-wav-dump-features [options] --print-ivector-dim=true\n"
"See steps/online/nnet2/{dump_nnet_activations,get_egs.sh} for examples.\n";
ParseOptions po(usage);
@ -50,24 +51,34 @@ int main(int argc, char *argv[]) {
// as well as the basic features.
OnlineNnet2FeaturePipelineConfig feature_config;
BaseFloat chunk_length_secs = 0.05;
bool print_ivector_dim = false;
po.Register("chunk-length", &chunk_length_secs,
"Length of chunk size in seconds, that we process.");
po.Register("print-ivector-dim", &print_ivector_dim,
"If true, print iVector dimension (possibly zero) and exit. This "
"version requires no arguments.");
feature_config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 3) {
if (!print_ivector_dim && po.NumArgs() != 3) {
po.PrintUsage();
return 1;
}
OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
if (print_ivector_dim) {
std::cout << feature_info.IvectorDim() << std::endl;
exit(0);
}
std::string spk2utt_rspecifier = po.GetArg(1),
wav_rspecifier = po.GetArg(2),
feats_wspecifier = po.GetArg(3);
OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
int32 num_done = 0, num_err = 0;
int64 num_frames_tot = 0;


@ -1,7 +1,7 @@
// online2bin/online2-wav-nnet2-am-compute.cc
// Copyright 2014 Johns Hopkins University (author: Daniel Povey)
// David Snyder
// 2014 David Snyder
// See ../../COPYING for clarification regarding multiple authors
//