Mirror of https://github.com/mozilla/kaldi.git
trunk: Adding DNN-based speaker recognition recipe in egs/sre10
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@5223 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Parent: 189c77419e
Commit: 55d8f863f3
@ -0,0 +1,94 @@
#!/bin/bash
|
||||
|
||||
# Copyright 2013 Daniel Povey
|
||||
# 2014-2015 David Snyder
|
||||
# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
|
||||
# 2015 Johns Hopkins University (Author: Daniel Povey)
|
||||
# Apache 2.0.
|
||||
|
||||
# This script extracts iVectors for a set of utterances, given
|
||||
# features and a trained DNN-based iVector extractor.
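# Illustrative invocation (hedged: the installed path/name of this script and
# the data/experiment directories below are hypothetical, shown only to make
# the five positional arguments explicit):
#
#   sid/extract_ivectors_dnn.sh --nj 40 --cmd "$train_cmd" \
#     exp/extractor_dnn exp/dnn/final.mdl data/train data/train_dnn \
#     exp/ivectors_train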
|
||||
|
||||
# Begin configuration section.
|
||||
nj=30
|
||||
cmd="run.pl"
|
||||
stage=0
|
||||
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
|
||||
posterior_scale=1.0 # This scale helps to control for successive features being highly
|
||||
# correlated. E.g. try 0.1 or 0.3.
|
||||
# End configuration section.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
if [ -f path.sh ]; then . ./path.sh; fi
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
|
||||
if [ $# != 5 ]; then
|
||||
echo "Usage: $0 <extractor-dir> <data> <ivector-dir>"
|
||||
echo " e.g.: $0 exp/extractor_2048_male data/train_male exp/ivectors_male"
|
||||
echo "main options (for others, see top of script file)"
|
||||
echo " --config <config-file> # config containing options"
|
||||
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
|
||||
echo " --num-iters <#iters|10> # Number of iterations of E-M"
|
||||
echo " --nj <n|10> # Number of jobs (also see num-processes and num-threads)"
|
||||
echo " --num-threads <n|8> # Number of threads for each process"
|
||||
echo " --stage <stage|0> # To control partial reruns"
|
||||
echo " --num-gselect <n|20> # Number of Gaussians to select using"
|
||||
echo " # diagonal model."
|
||||
echo " --min-post <min-post|0.025> # Pruning threshold for posteriors"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
srcdir=$1
|
||||
nnet=$2
|
||||
data=$3
|
||||
data_dnn=$4
|
||||
dir=$5
|
||||
|
||||
for f in $srcdir/final.ie $srcdir/final.ubm $data/feats.scp ; do
|
||||
[ ! -f $f ] && echo "No such file $f" && exit 1;
|
||||
done
|
||||
|
||||
# Set various variables.
|
||||
mkdir -p $dir/log
|
||||
sdata=$data/split$nj;
|
||||
utils/split_data.sh $data $nj || exit 1;
|
||||
|
||||
sdata_dnn=$data_dnn/split$nj;
|
||||
utils/split_data.sh $data_dnn $nj || exit 1;
|
||||
|
||||
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
|
||||
|
||||
splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options
|
||||
|
||||
## Set up features.
|
||||
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"
|
||||
|
||||
nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.scp ark:- |"
|
||||
|
||||
if [ $stage -le 0 ]; then
|
||||
echo "$0: extracting iVectors"
|
||||
$cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
|
||||
nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \
|
||||
\| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \
|
||||
\| logprob-to-post --min-post=$min_post ark:- ark:- \| \
|
||||
scale-post ark:- $posterior_scale ark:- \| \
|
||||
ivector-extract --verbose=2 $srcdir/final.ie "$feats" ark,s,cs:- \
|
||||
ark,scp,t:$dir/ivector.JOB.ark,$dir/ivector.JOB.scp || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
echo "$0: combining iVectors across jobs"
|
||||
for j in $(seq $nj); do cat $dir/ivector.$j.scp; done >$dir/ivector.scp || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
# Be careful here: the speaker-level iVectors are now length-normalized,
|
||||
# even if they are otherwise the same as the utterance-level ones.
|
||||
echo "$0: computing mean of iVectors for each speaker and length-normalizing"
|
||||
$cmd $dir/log/speaker_mean.log \
|
||||
ivector-normalize-length scp:$dir/ivector.scp ark:- \| \
|
||||
ivector-mean ark:$data/spk2utt ark:- ark:- ark,t:$dir/num_utts.ark \| \
|
||||
ivector-normalize-length ark:- ark,scp:$dir/spk_ivector.ark,$dir/spk_ivector.scp || exit 1;
|
||||
fi
@ -0,0 +1,79 @@
#!/bin/bash
|
||||
# Copyright 2015 David Snyder
|
||||
# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
|
||||
# 2015 Johns Hopkins University (Author: Daniel Povey)
|
||||
# Apache 2.0
|
||||
|
||||
# This script derives a full-covariance UBM from DNN posteriors and
|
||||
# speaker recognition features.
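# Note (added for clarity): each DNN output unit (senone) plays the role of one
# component of the resulting UBM, so the --num-components option below must
# equal the DNN's output dimension (5297 in this setup).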
|
||||
|
||||
# Begin configuration section.
|
||||
nj=40
|
||||
cmd="run.pl"
|
||||
stage=-2
|
||||
delta_window=3
|
||||
delta_order=2
|
||||
num_components=5297
|
||||
# End configuration section.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
if [ -f path.sh ]; then . ./path.sh; fi
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
if [ $# != 4 ]; then
|
||||
echo "Usage: steps/init_full_ubm_from_dnn.sh <data-speaker-id> <data-dnn> <dnn-model> <new-ubm-dir>"
|
||||
echo "Initializes a full-covariance UBM from DNN posteriors and speaker recognition features."
|
||||
echo " e.g.: steps/init_full_ubm_from_dnn.sh data/train data/train_dnn exp/dnn/final.mdl exp/full_ubm"
|
||||
echo "main options (for others, see top of script file)"
|
||||
echo " --config <config-file> # config containing options"
|
||||
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
|
||||
echo " --nj <n|16> # number of parallel training jobs"
|
||||
echo " --delta-window <n|3> # delta window size"
|
||||
echo " --delta-order <n|2> # delta order"
|
||||
echo " --number-components <n|5297> # number of components in the final GMM needs"
|
||||
echo " # to be equal to the size of the DNN output layer."
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
data=$1
|
||||
data_dnn=$2
|
||||
nnet=$3
|
||||
dir=$4
|
||||
|
||||
for f in $data/feats.scp $data/vad.scp; do
|
||||
[ ! -f $f ] && echo "No such file $f" && exit 1;
|
||||
done
|
||||
|
||||
mkdir -p $dir/log
|
||||
echo $nj > $dir/num_jobs
|
||||
sdata=$data/split$nj;
|
||||
utils/split_data.sh $data $nj || exit 1;
|
||||
|
||||
sdata_dnn=$data_dnn/split$nj;
|
||||
utils/split_data.sh $data_dnn $nj || exit 1;
|
||||
|
||||
delta_opts="--delta-window=$delta_window --delta-order=$delta_order"
|
||||
echo $delta_opts > $dir/delta_opts
|
||||
|
||||
logdir=$dir/log
|
||||
|
||||
nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.scp ark:- |"
|
||||
|
||||
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | \
|
||||
apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | \
|
||||
select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"
|
||||
|
||||
$cmd JOB=1:$nj $logdir/make_stats.JOB.log \
|
||||
nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \
|
||||
\| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \
|
||||
\| logprob-to-post ark:- ark:- \| \
|
||||
fgmm-global-acc-stats-post ark:- $num_components "$feats" \
|
||||
$dir/stats.JOB.acc || exit 1;
|
||||
|
||||
$cmd $dir/log/init.log \
|
||||
fgmm-global-init-from-accs --verbose=2 \
|
||||
"fgmm-global-sum-accs - $dir/stats.*.acc |" $num_components \
|
||||
$dir/final.ubm || exit 1;
|
||||
|
||||
exit 0;
@ -0,0 +1,181 @@
#!/bin/bash
|
||||
|
||||
# Copyright 2013 Daniel Povey
|
||||
# 2014-2015 David Snyder
|
||||
# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
|
||||
# 2015 Johns Hopkins University (Author: Daniel Povey)
|
||||
# Apache 2.0.
|
||||
|
||||
# This script trains the i-vector extractor using a DNN-based UBM. It also requires
|
||||
# an fGMM, usually created by the script sid/init_full_ubm_from_dnn.sh.
|
||||
# Note: there are 3 separate levels of parallelization: num_threads, num_processes,
|
||||
# and num_jobs. This may seem a bit excessive. It has to do with minimizing
|
||||
# memory usage and disk I/O, subject to various constraints. The "num_threads"
|
||||
# is how many threads a program uses; the "num_processes" is the number of separate
|
||||
# processes a single job spawns, and then sums the accumulators in memory.
|
||||
# Our recommendation:
|
||||
# - Set num_threads to the minimum of 4 and the number of virtual cores your machine has
#   (because of needing to lock various global quantities, the program can't
#   use many more than 4 threads with good CPU utilization).
|
||||
# - Set num_processes to the number of virtual cores on each machine you have, divided by
|
||||
# num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue
|
||||
# that's busy with other people's jobs, it may be wise to set it to rather less
|
||||
# than this maximum though, or your jobs won't get scheduled. And if memory is
|
||||
# tight you need to be careful; in our normal setup, each process uses about 5G.
|
||||
# - Set num_jobs to as many of the jobs (each using $num_threads * $num_processes CPUs)
|
||||
# your queue will let you run at one time, but don't go much more than 10 or 20, or
|
||||
# summing the accumulators will possibly get slow. If you have a lot of data, you
|
||||
# may want more jobs, though.
|
||||
|
||||
# Begin configuration section.
|
||||
nj=10 # this is the number of separate queue jobs we run, but each one
|
||||
# contains num_processes sub-jobs.. the real number of threads we
|
||||
# run is nj * num_processes * num_threads, and the number of
|
||||
# separate pieces of data is nj * num_processes.
|
||||
num_threads=4
|
||||
num_processes=4 # each job runs this many processes, each with --num-threads threads
|
||||
cmd="run.pl"
|
||||
stage=-4
|
||||
num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select
|
||||
ivector_dim=400 # dimension of the extracted i-vector
|
||||
use_weights=false # set to true to turn on the regression of log-weights on the ivector.
|
||||
num_iters=10
|
||||
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
|
||||
num_samples_for_weights=3 # smaller than the default for speed (relates to a sampling method)
|
||||
cleanup=true
|
||||
posterior_scale=1.0 # This scale helps to control for successive features being highly
|
||||
# correlated. E.g. try 0.1 or 0.3
|
||||
sum_accs_opt=
|
||||
# End configuration section.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
if [ -f path.sh ]; then . ./path.sh; fi
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
|
||||
if [ $# != 5 ]; then
|
||||
echo "Usage: $0 <fgmm-model> <dnn-model> <data-speaker-id> <data-dnn> <extractor-dir>"
|
||||
echo " e.g.: $0 exp/sup_ubm/final.ubm exp/dnn/final.mdl data/train data/train_dnn exp/extractor_male"
|
||||
echo "main options (for others, see top of script file)"
|
||||
echo " --config <config-file> # config containing options"
|
||||
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
|
||||
echo " --num-iters <#iters|10> # Number of iterations of E-M"
|
||||
echo " --nj <n|10> # Number of jobs (also see num-processes and num-threads)"
|
||||
echo " --num-processes <n|4> # Number of processes for each queue job (relates"
|
||||
echo " # to summing accs in memory)"
|
||||
echo " --num-threads <n|4> # Number of threads for each process (can't be usefully"
|
||||
echo " # increased much above 4)"
|
||||
echo " --stage <stage|-4> # To control partial reruns"
|
||||
echo " --num-gselect <n|20> # Number of Gaussians to select using"
|
||||
echo " # diagonal model."
|
||||
echo " --sum-accs-opt <option|''> # Option e.g. '-l hostname=a15' to localize"
|
||||
echo " # sum-accs process to nfs server."
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
fgmm_model=$1
|
||||
nnet=$2
|
||||
data=$3
|
||||
data_dnn=$4
|
||||
dir=$5
|
||||
|
||||
srcdir=$(dirname $fgmm_model)
|
||||
|
||||
for f in $fgmm_model $data/feats.scp ; do
|
||||
[ ! -f $f ] && echo "No such file $f" && exit 1;
|
||||
done
|
||||
|
||||
# Set various variables.
|
||||
mkdir -p $dir/log
|
||||
nj_full=$[$nj*$num_processes]
|
||||
sdata=$data/split$nj_full;
|
||||
utils/split_data.sh $data $nj_full || exit 1;
|
||||
|
||||
sdata_dnn=$data_dnn/split$nj_full;
|
||||
utils/split_data.sh $data_dnn $nj_full || exit 1;
|
||||
|
||||
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
|
||||
if [ -f $srcdir/delta_opts ]; then
|
||||
cp $srcdir/delta_opts $dir/ 2>/dev/null
|
||||
fi
|
||||
|
||||
splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options
|
||||
|
||||
parallel_opts="-pe smp $[$num_threads*$num_processes]"
|
||||
## Set up features.
|
||||
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"
|
||||
|
||||
nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.scp ark:- |"
|
||||
|
||||
|
||||
# Initialize the i-vector extractor using the FGMM input
|
||||
if [ $stage -le -2 ]; then
|
||||
cp $fgmm_model $dir/final.ubm || exit 1;
|
||||
$cmd $dir/log/convert.log \
|
||||
fgmm-global-to-gmm $dir/final.ubm $dir/final.dubm || exit 1;
|
||||
$cmd $dir/log/init.log \
|
||||
ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \
|
||||
$dir/final.ubm $dir/0.ie || exit 1;
|
||||
fi
|
||||
|
||||
# Do Gaussian selection and posterior extraction
|
||||
|
||||
if [ $stage -le -1 ]; then
|
||||
echo $nj_full > $dir/num_jobs
|
||||
echo "$0: doing DNN posterior computation"
|
||||
$cmd JOB=1:$nj_full $dir/log/post.JOB.log \
|
||||
nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \
|
||||
\| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \
|
||||
\| logprob-to-post --min-post=$min_post ark,s,cs:- ark:- \| \
|
||||
scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1;
|
||||
|
||||
else
|
||||
if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then
|
||||
echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
x=0
|
||||
while [ $x -lt $num_iters ]; do
|
||||
if [ $stage -le $x ]; then
|
||||
rm $dir/.error 2>/dev/null
|
||||
|
||||
Args=() # bash array of training commands for 1:nj, that put accs to stdout.
|
||||
for j in $(seq $nj_full); do
|
||||
Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads --num-samples-for-weights=$num_samples_for_weights $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g`
|
||||
done
|
||||
|
||||
echo "Accumulating stats (pass $x)"
|
||||
for g in $(seq $nj); do
|
||||
start=$[$num_processes*($g-1)+1]
|
||||
$cmd $parallel_opts $dir/log/acc.$x.$g.log \
|
||||
ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \
|
||||
$dir/acc.$x.$g || touch $dir/.error &
|
||||
done
|
||||
wait
|
||||
[ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;
|
||||
accs=""
|
||||
for j in $(seq $nj); do
|
||||
accs+="$dir/acc.$x.$j "
|
||||
done
|
||||
echo "Summing accs (pass $x)"
|
||||
$cmd $sum_accs_opt $dir/log/sum_acc.$x.log \
|
||||
ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1;
|
||||
echo "Updating model (pass $x)"
|
||||
nt=$[$num_threads*$num_processes] # use the same number of threads that
|
||||
# each accumulation process uses, since we
|
||||
# can be sure the queue will support this many.
|
||||
$cmd -pe smp $nt $dir/log/update.$x.log \
|
||||
ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1;
|
||||
rm $dir/acc.$x.*
|
||||
if $cleanup; then
|
||||
rm $dir/acc.$x
|
||||
# rm $dir/$x.ie
|
||||
fi
|
||||
fi
|
||||
x=$[$x+1]
|
||||
done
|
||||
|
||||
ln -s $x.ie $dir/final.ie
@ -0,0 +1,5 @@
This directory contains DNN scripts based on the nnet2 recipes found in
|
||||
the ASR examples (e.g., fisher_english). The scripts have been modified
|
||||
for speaker recognition purposes. Most of the scripts are lightly modified
|
||||
versions of those appearing in the steps or local directories of
|
||||
egs/fisher_english.
@ -0,0 +1,62 @@
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
||||
mkdir -p data/lang_test
|
||||
|
||||
arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
|
||||
[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
|
||||
|
||||
mkdir -p data/lang_test
|
||||
cp -r data/lang/* data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
echo "First few lines of lexicon FST:"
|
||||
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head
|
||||
|
||||
echo Performing further checks
|
||||
|
||||
# Checking that G.fst is determinizable.
|
||||
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.
|
||||
|
||||
# Checking that L_disambig.fst is determinizable.
|
||||
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
|
||||
|
||||
# Checking that disambiguated lexicon times G is determinizable
|
||||
# Note: we do this with fstdeterminizestar not fstdeterminize, as
|
||||
# fstdeterminize was taking forever (presumably related to a bug
# in this version of OpenFst that makes determinization slow in
# some cases).
|
||||
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
|
||||
fstdeterminizestar >/dev/null || echo Error
|
||||
|
||||
# Checking that LG is stochastic:
|
||||
fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
|
||||
fstisstochastic || echo "[log:] LG is not stochastic"
|
||||
|
||||
|
||||
echo "$0 succeeded"
@ -0,0 +1,211 @@
#!/bin/bash
|
||||
|
||||
# Copyright 2013 Johns Hopkins University (Author: Daniel Povey)
|
||||
# Apache 2.0.
|
||||
|
||||
stage=0
|
||||
|
||||
calldata=
|
||||
while test $# -gt 0
|
||||
do
|
||||
case "$1" in
|
||||
--calldata) calldata=1
|
||||
;;
|
||||
*) break;
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
. utils/parse_options.sh
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "$0 [--calldata] <fisher-dir-1> [<fisher-dir-2> ...]"
|
||||
echo " e.g.: $0 /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19\\"
|
||||
echo " /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13"
|
||||
echo " (We also support a single directory that has the contents of all of them)"
|
||||
echo " If specified, --calldata will be used to map Kaldi speaker ID to real"
|
||||
echo " speaker PIN released with the Fisher corpus."
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
# Check that the arguments are all absolute pathnames.
|
||||
|
||||
for dir in $*; do
|
||||
case $dir in /*) ;; *)
|
||||
echo "$0: all arguments must be absolute pathnames."; exit 1;
|
||||
esac
|
||||
done
|
||||
|
||||
# First check we have the right things in there...
|
||||
#
|
||||
rm -r data/local/data/links 2>/dev/null
|
||||
mkdir -p data/local/data/links || exit 1;
|
||||
|
||||
for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \
|
||||
fe_03_p2_sph1 fe_03_p2_sph3 fe_03_p2_sph5 fe_03_p2_sph7 fe_03_p1_sph2 \
|
||||
fe_03_p1_sph4 fe_03_p1_sph6 fe_03_p1_tran fe_03_p2_sph2 fe_03_p2_sph4 \
|
||||
fe_03_p2_sph6 fe_03_p2_tran; do
|
||||
found_subdir=false
|
||||
for dir in $*; do
|
||||
if [ -d $dir/$subdir ]; then
|
||||
found_subdir=true
|
||||
ln -s $dir/$subdir data/local/data/links
|
||||
else
|
||||
new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/)
|
||||
if [ -d $dir/$new_style_subdir ]; then
|
||||
found_subdir=true
|
||||
ln -s $dir/$new_style_subdir data/local/data/links/$subdir
|
||||
fi
|
||||
fi
|
||||
done
|
||||
if ! $found_subdir; then
|
||||
echo "$0: could not find the subdirectory $subdir in any of $*"
|
||||
exit 1;
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
tmpdir=`pwd`/data/local/data
|
||||
links=data/local/data/links
|
||||
|
||||
. ./path.sh # Needed for KALDI_ROOT
|
||||
|
||||
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
|
||||
|
||||
if [ ! -x $sph2pipe ]; then
|
||||
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
# (1) Get transcripts in one file, and clean them up ..
|
||||
|
||||
if [ $stage -le 0 ]; then
|
||||
|
||||
find $links/fe_03_p1_tran/data $links/fe_03_p2_tran/data -name '*.txt' > $tmpdir/transcripts.flist
|
||||
|
||||
for dir in fe_03_p{1,2}_sph{1,2,3,4,5,6,7}; do
|
||||
find $links/$dir/ -name '*.sph'
|
||||
done > $tmpdir/sph.flist
|
||||
|
||||
n=`cat $tmpdir/transcripts.flist | wc -l`
|
||||
if [ $n -ne 11699 ]; then
|
||||
echo "Expected to find 11699 transcript files in the Fisher data, found $n"
|
||||
exit 1;
|
||||
fi
|
||||
n=`cat $tmpdir/sph.flist | wc -l`
|
||||
if [ $n -ne 11699 ]; then
|
||||
echo "Expected to find 11699 .sph files in the Fisher data, found $n"
|
||||
exit 1;
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
mkdir -p data/train_all_asr
|
||||
|
||||
|
||||
## fe_03_00004.sph
|
||||
## Transcribed at the LDC
|
||||
#
|
||||
#7.38 8.78 A: an- so the topic is
|
||||
|
||||
echo -n > $tmpdir/text.1 || exit 1;
|
||||
|
||||
perl -e '
|
||||
use File::Basename;
|
||||
($tmpdir)=@ARGV;
|
||||
open(F, "<$tmpdir/transcripts.flist") || die "Opening list of transcripts";
|
||||
open(R, "|sort >data/train_all_asr/reco2file_and_channel") || die "Opening reco2file_and_channel";
|
||||
open(T, ">$tmpdir/text.1") || die "Opening text output";
|
||||
while (<F>) {
|
||||
$file = $_;
|
||||
m:([^/]+)\.txt: || die "Bad filename $_";
|
||||
$call_id = $1;
|
||||
print R "$call_id-A $call_id A\n";
|
||||
print R "$call_id-B $call_id B\n";
|
||||
open(I, "<$file") || die "Opening file $_";
|
||||
|
||||
$line1 = <I>;
|
||||
$line1 =~ m/# (.+)\.sph/ || die "Bad first line $line1 in file $file";
|
||||
$call_id eq $1 || die "Mismatch call-id $call_id vs $1\n";
|
||||
while (<I>) {
|
||||
if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.+\S|\S)\s*$/) {
|
||||
$start = sprintf("%06d", $1 * 100.0);
|
||||
$end = sprintf("%06d", $2 * 100.0);
|
||||
length($end) > 6 && die "Time too long $end in file $file";
|
||||
$side = $3;
|
||||
$words = $4;
|
||||
$utt_id = "${call_id}-$side-$start-$end";
|
||||
print T "$utt_id $words\n" || die "Error writing to text file";
|
||||
}
|
||||
}
|
||||
}
|
||||
close(R); close(T) ' $tmpdir || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
sort $tmpdir/text.1 | grep -v '((' | \
|
||||
awk '{if (NF > 1){ print; }}' | \
|
||||
sed 's:\[laugh\]:[laughter]:g' | \
|
||||
sed 's:\[sigh\]:[noise]:g' | \
|
||||
sed 's:\[cough\]:[noise]:g' | \
|
||||
sed 's:\[sigh\]:[noise]:g' | \
|
||||
sed 's:\[mn\]:[noise]:g' | \
|
||||
sed 's:\[breath\]:[noise]:g' | \
|
||||
sed 's:\[lipsmack\]:[noise]:g' > $tmpdir/text.2
|
||||
cp $tmpdir/text.2 data/train_all_asr/text
|
||||
# create segments file and utt2spk file...
|
||||
! cat data/train_all_asr/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > data/train_all_asr/utt2spk \
|
||||
&& echo "Error producing utt2spk file" && exit 1;
|
||||
|
||||
cat data/train_all_asr/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; $s = sprintf("%.2f", 0.01*$3);
|
||||
$e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' > data/train_all_asr/segments
|
||||
|
||||
utils/utt2spk_to_spk2utt.pl <data/train_all_asr/utt2spk > data/train_all_asr/spk2utt
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
for f in `cat $tmpdir/sph.flist`; do
|
||||
# convert to absolute path
|
||||
readlink -e $f
|
||||
done > $tmpdir/sph_abs.flist
|
||||
|
||||
cat $tmpdir/sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp
|
||||
cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
|
||||
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \
|
||||
sort -k1,1 -u > data/train_all_asr/wav.scp || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
# get the spk2gender information. This is not a standard part of our
|
||||
# file formats
|
||||
# The files "filetable2fe_03_p2_sph1 fe_03_05852.sph ff
|
||||
cat $links/fe_03_p1_sph{1,2,3,4,5,6,7}/filetable.txt \
|
||||
$links/fe_03_p2_sph{1,2,3,4,5,6,7}/docs/filetable2.txt | \
|
||||
perl -ane 'm:^\S+ (\S+)\.sph ([fm])([fm]): || die "bad line $_;"; print "$1-A $2\n", "$1-B $3\n"; ' | \
|
||||
sort | uniq | utils/filter_scp.pl data/train_all_asr/spk2utt > data/train_all_asr/spk2gender
|
||||
|
||||
if [ ! -s data/train_all_asr/spk2gender ]; then
|
||||
echo "It looks like our first try at getting the spk2gender info did not work."
|
||||
echo "(possibly older distribution?) Trying something else."
|
||||
cat $links/fe_03_p1_tran/doc/fe_03_p1_filelist.tbl $links/fe_03_p2_tran/doc/fe_03_p2_filelist.tbl | \
|
||||
perl -ane 'm:fe_03_p[12]_sph\d\t(\d+)\t([mf])([mf]): || die "Bad line $_";
|
||||
print "fe_03_$1-A $2\n", "fe_03_$1-B $3\n"; ' | \
|
||||
sort | uniq | utils/filter_scp.pl data/train_all_asr/spk2utt > data/train_all_asr/spk2gender
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ ! -z "$calldata" ]; then # fix speaker IDs
|
||||
cat $links/fe_03_p{1,2}_tran/doc/*calldata.tbl > $tmpdir/combined-calldata.tbl
|
||||
local/fisher_fix_speakerid.pl $tmpdir/combined-calldata.tbl data/train_all_asr
|
||||
utils/utt2spk_to_spk2utt.pl data/train_all_asr/utt2spk.new > data/train_all_asr/spk2utt.new
|
||||
# patch files
|
||||
for f in spk2utt utt2spk text segments spk2gender; do
|
||||
cp data/train_all_asr/$f data/train_all_asr/$f.old || exit 1;
|
||||
cp data/train_all_asr/$f.new data/train_all_asr/$f || exit 1;
|
||||
done
|
||||
rm $tmpdir/combined-calldata.tbl
|
||||
fi
|
||||
|
||||
echo "Data preparation succeeded"
@ -0,0 +1,114 @@
#!/usr/bin/perl -w
|
||||
|
||||
# Author: Peng Qi (pengqi@cs.stanford.edu)
|
||||
# This script maps Fisher speaker IDs to the true physical speakers
# and fixes the utterance IDs accordingly. It is expected to be run one level of
# directory above.
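# For example (hypothetical IDs): if the calldata table maps conversation
# fe_03_00004, channel A, to speaker PIN 12345, then utterance
# "fe_03_00004-A-000738-000878" is rewritten as
# "fe_03_12345-00004-A-000738-000878" with speaker ID "fe_03_12345".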
|
||||
|
||||
sub trim {
|
||||
(my $s = $_[0]) =~ s/^\s+|\s+$//g;
|
||||
return $s;
|
||||
}
|
||||
|
||||
if ($#ARGV != 1) {
|
||||
print "Usage: swbd1_fix_speakerid.pl <fisher-calldata-tbl-file> <data-dir>\n";
|
||||
print "E.g.: swbd1_fix_speakerid.pl data/local/train/combined-calldata.tbl data/train_all\n";
|
||||
}
|
||||
|
||||
$tab_file = $ARGV[0];
|
||||
$dir = $ARGV[1];
|
||||
|
||||
%conv_to_spk = ();
|
||||
|
||||
open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n";
|
||||
|
||||
while (my $line = <$conv_tab>) {
|
||||
chomp $line;
|
||||
|
||||
my @fields = split "," , $line;
|
||||
#$fields[0] = trim($fields[0]);
|
||||
$fields[5] = trim($fields[5]);
|
||||
$fields[10] = trim($fields[10]);
|
||||
$conv_to_spk{'fe_03_' . $fields[0] . '-A'} = $fields[5];
|
||||
$conv_to_spk{'fe_03_' . $fields[0] . '-B'} = $fields[10];
|
||||
}
|
||||
|
||||
close($conv_tab);
|
||||
|
||||
# fix utt2spk
|
||||
|
||||
%missingconv = ();
|
||||
|
||||
open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n";
|
||||
open(my $utt2spk_new, '>', $dir . '/utt2spk.new');
|
||||
|
||||
while (my $line = <$utt2spk>) {
|
||||
chomp $line;
|
||||
|
||||
my @fields = split " " , $line;
|
||||
my $convid = substr $fields[0], 0, 13;
|
||||
|
||||
if (exists $conv_to_spk{ $convid }) {
|
||||
my $spkid = $conv_to_spk{ $convid };
|
||||
$spkid = "fe_03_" . $spkid;
|
||||
my $newuttid = $spkid . '-' . (substr $fields[0], 6);
|
||||
|
||||
print $utt2spk_new "$newuttid $spkid\n";
|
||||
} else {
|
||||
my $convid = substr $convid, 6, 5;
|
||||
$missingconv{$convid} = 1;
|
||||
|
||||
print $utt2spk_new $fields[0]." ".$fields[1]."\n";
|
||||
}
|
||||
}
|
||||
|
||||
close($utt2spk);
|
||||
close($utt2spk_new);
|
||||
|
||||
foreach my $conv (keys %missingconv) {
|
||||
print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n"
|
||||
}
|
||||
|
||||
# fix spk2gender
|
||||
|
||||
if (open(my $spk2gender, '<', $dir . '/spk2gender')) {
|
||||
open(my $spk2gender_new, '>', $dir . '/spk2gender.new');
|
||||
|
||||
while (my $line = <$spk2gender>) {
|
||||
chomp $line;
|
||||
|
||||
my @fields = split " ", $line;
|
||||
my $convid = $fields[0];
|
||||
|
||||
if (exists $conv_to_spk{ $convid }) {
|
||||
my $spkid = $conv_to_spk{ $convid };
|
||||
$spkid = "fe_03_" . $spkid;
|
||||
|
||||
print $spk2gender_new $spkid." ".$fields[1]."\n";
|
||||
} else {
|
||||
print $spk2gender_new $fields[0]." ".$fields[1]."\n";
|
||||
}
|
||||
}
|
||||
|
||||
close($spk2gender);
|
||||
close($spk2gender_new);
|
||||
}
|
||||
|
||||
# fix segments and text
|
||||
|
||||
foreach my $file ('segments','text') {
|
||||
open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n";
|
||||
open(my $newfile, '>', "$dir/$file.new");
|
||||
|
||||
while (my $line = <$oldfile>) {
|
||||
chomp $line;
|
||||
|
||||
my $convid = substr $line, 0, 13;
|
||||
if (exists $conv_to_spk{$convid}) {
|
||||
my $spkid = $conv_to_spk{$convid};
|
||||
print $newfile "fe_03_$spkid-" . (substr $line, 6) . "\n";
|
||||
} else {
|
||||
print $newfile "$line\n";
|
||||
}
|
||||
}
|
||||
}
@ -0,0 +1,182 @@
#!/bin/bash
|
||||
#
|
||||
|
||||
# To be run from one directory above this script.
|
||||
|
||||
## This script prepares the dictionary from the CMU pronunciation dictionary and
## the word list seen in the Fisher training transcripts (data/train_all_asr/text).
## It takes no corpus-directory arguments.
|
||||
|
||||
. path.sh
|
||||
|
||||
# The parts of the output of this that will be needed are
|
||||
# [in data/local/dict/ ]
|
||||
# lexicon.txt
|
||||
# extra_questions.txt
|
||||
# nonsilence_phones.txt
|
||||
# optional_silence.txt
|
||||
# silence_phones.txt
|
||||
|
||||
|
||||
#check existing directories
|
||||
[ $# != 0 ] && echo "Usage: local/dnn/fisher_prepare_dict.sh" && exit 1;
|
||||
|
||||
dir=data/local/dict
|
||||
mkdir -p $dir
|
||||
echo "Getting CMU dictionary"
|
||||
svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict
|
||||
|
||||
# silence phones, one per line.
|
||||
for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt
|
||||
echo sil > $dir/optional_silence.txt
|
||||
|
||||
# For this setup we're discarding stress.
|
||||
cat $dir/cmudict/cmudict.0.7a.symbols | sed s/[0-9]//g | \
|
||||
tr '[A-Z]' '[a-z]' | perl -ane 's:\r::; print;' | sort | uniq > $dir/nonsilence_phones.txt
|
||||
|
||||
# An extra question will be added by including the silence phones in one class.
|
||||
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
|
||||
|
||||
grep -v ';;;' $dir/cmudict/cmudict.0.7a | tr '[A-Z]' '[a-z]' | \
|
||||
perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; s: : :; print; }' | \
|
||||
perl -ane '@A = split(" ", $_); for ($n = 1; $n<@A;$n++) { $A[$n] =~ s/[0-9]//g; } print join(" ", @A) . "\n";' | \
|
||||
sort | uniq > $dir/lexicon1_raw_nosil.txt || exit 1;
|
||||
|
||||
# Add prons for laughter, noise, oov
|
||||
for w in `grep -v sil $dir/silence_phones.txt`; do
|
||||
echo "[$w] $w"
|
||||
done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
|
||||
|
||||
|
||||
# This is just for diagnostics:
|
||||
cat data/train_all_asr/text | \
|
||||
awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \
|
||||
sort -nr > $dir/word_counts
|
||||
|
||||
cat $dir/word_counts | awk '{print $2}' > $dir/word_list
|
||||
|
||||
# between lexicon2_raw and lexicon3_expand we limit it to the words seen in
|
||||
# the Fisher data.
|
||||
utils/filter_scp.pl $dir/word_list $dir/lexicon2_raw.txt > $dir/lexicon3_expand.txt
|
||||
|
||||
# From lexicon2_raw to lexicon3_expand, we also expand the vocab for acronyms
|
||||
# like c._n._n. and other underscore-containing things as long as the new vocab
|
||||
# could be divided into finite parts contained in lexicon2_raw
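# For example (illustrative): if the word list contains "c._n._n." and both
# "c." and "n." already have pronunciations in lexicon2_raw.txt, the perl
# snippet below emits "c._n._n." with those pronunciations concatenated.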
|
||||
cat $dir/lexicon2_raw.txt | \
|
||||
perl -e 'while(<STDIN>) { @A=split; $w = shift @A; $pron{$w} = join(" ", @A); }
|
||||
($w) = @ARGV; open(W, "<$w") || die "Error opening word-counts from $w";
|
||||
while(<W>) { # reading in words we saw in training data..
|
||||
($c, $w) = split;
|
||||
if (!defined $pron{$w}) {
|
||||
@A = split("_", $w);
|
||||
if (@A > 1) {
|
||||
$this_pron = "";
|
||||
$pron_ok = 1;
|
||||
foreach $a (@A) {
|
||||
if (defined($pron{$a})) { $this_pron = $this_pron . "$pron{$a} "; }
|
||||
else { $pron_ok = 0; print STDERR "Not handling word $w, count is $c\n"; last; }
|
||||
}
|
||||
if ($pron_ok) { $new_pron{$w} = $this_pron; }
|
||||
}
|
||||
}
|
||||
}
|
||||
foreach $w (keys %new_pron) { print "$w $new_pron{$w}\n"; }' \
|
||||
$dir/word_counts >> $dir/lexicon3_expand.txt || exit 1;
|
||||
|
||||
|
||||
cat $dir/lexicon3_expand.txt \
|
||||
<( echo "mm m"
|
||||
echo "<unk> oov" ) > $dir/lexicon4_extra.txt
|
||||
|
||||
|
||||
cp $dir/lexicon4_extra.txt $dir/lexicon.txt
|
||||
rm $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists.
|
||||
|
||||
awk '{print $1}' $dir/lexicon.txt | \
|
||||
perl -e '($word_counts)=@ARGV;
|
||||
open(W, "<$word_counts")||die "opening word-counts $word_counts";
|
||||
while(<STDIN>) { chop; $seen{$_}=1; }
|
||||
while(<W>) {
|
||||
($c,$w) = split;
|
||||
if (!defined $seen{$w}) { print; }
|
||||
} ' $dir/word_counts > $dir/oov_counts.txt
|
||||
|
||||
echo "*Highest-count OOVs are:"
|
||||
head -n 20 $dir/oov_counts.txt
|
||||
|
||||
utils/validate_dict_dir.pl $dir
|
||||
exit 0;
|
||||
|
||||
|
||||
|
||||
srcdir=data/local/train_asr # This is where we downloaded some stuff..
|
||||
dir=data/local/dict
|
||||
mkdir -p $dir
|
||||
srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
|
||||
|
||||
# assume swbd_p1_data_prep.sh was done already.
|
||||
[ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1;
|
||||
|
||||
#(2a) Dictionary preparation:
|
||||
# Pre-processing (Upper-case, remove comments)
|
||||
awk 'BEGIN{getline}($0 !~ /^#/) {$0=toupper($0); print}' \
|
||||
$srcdict | sort | awk '($0 !~ /^[:space:]*$/) {print}' \
|
||||
> $dir/lexicon1.txt || exit 1;
|
||||
|
||||
|
||||
cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
|
||||
grep -v SIL > $dir/nonsilence_phones.txt || exit 1;
|
||||
|
||||
( echo SIL; echo SPN; echo NSN; echo LAU ) > $dir/silence_phones.txt
|
||||
|
||||
echo SIL > $dir/optional_silence.txt
|
||||
|
||||
# No "extra questions" in the input to this setup, as we don't
|
||||
# have stress or tone.
|
||||
echo -n >$dir/extra_questions.txt
|
||||
|
||||
# Add to the lexicon the silences, noises etc.
|
||||
(echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';
|
||||
echo '<UNK> SPN' ) | \
|
||||
cat - $dir/lexicon1.txt > $dir/lexicon2.txt || exit 1;
|
||||
|
||||
|
||||
# Map the words in the lexicon. That is-- for each word in the lexicon, we map it
|
||||
# to a new written form. The transformations we do are:
|
||||
# remove laughter markings, e.g.
|
||||
# [LAUGHTER-STORY] -> STORY
|
||||
# Remove partial-words, e.g.
|
||||
# -[40]1K W AH N K EY
|
||||
# becomes -1K
|
||||
# and
|
||||
# -[AN]Y IY
|
||||
# becomes
|
||||
# -Y
|
||||
# -[A]B[OUT]- B
|
||||
# becomes
|
||||
# -B-
|
||||
# Also, curly braces, which appear to be used for "nonstandard"
|
||||
# words or non-words, are removed, e.g.
|
||||
# {WOLMANIZED} W OW L M AX N AY Z D
|
||||
# -> WOLMANIZED
|
||||
# Also, mispronounced words, e.g.
|
||||
# [YEAM/YEAH] Y AE M
|
||||
# are changed to just e.g. YEAM, i.e. the orthography
|
||||
# of the mispronounced version.
|
||||
# Note-- this is only really to be used in training. The main practical
|
||||
# reason is to avoid having tons of disambiguation symbols, which
|
||||
# we otherwise would get because there are many partial words with
|
||||
# the same phone sequences (most problematic: S).
|
||||
# Also, map
|
||||
# THEM_1 EH M -> THEM
|
||||
# so that multiple pronunciations just have alternate entries
|
||||
# in the lexicon.
|
||||
|
||||
local/dnn/swbd_map_words.pl -f 1 $dir/lexicon2.txt | sort | uniq > $dir/lexicon3.txt || exit 1;
|
||||
|
||||
cp $dir/lexicon3.txt $dir/lexicon.txt # This is the final lexicon.
|
||||
|
||||
echo Prepared input dictionary and phone-sets for Switchboard phase 1.
@ -0,0 +1,111 @@
#!/bin/bash
|
||||
|
||||
|
||||
# To be run from one directory above this script.
|
||||
|
||||
|
||||
text=data/train_all_asr/text
|
||||
lexicon=data/local/dict/lexicon.txt
|
||||
|
||||
for f in "$text" "$lexicon"; do
|
||||
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
|
||||
done
|
||||
|
||||
# This script takes no arguments. It assumes you have already run
|
||||
# fisher_data_prep.sh and fisher_prepare_dict.sh
|
||||
# It takes as input the files
|
||||
#data/train_all/text
|
||||
#data/local/dict/lexicon.txt
|
||||
|
||||
dir=data/local/lm
|
||||
mkdir -p $dir
|
||||
export LC_ALL=C # You'll get errors about things being not sorted, if you
|
||||
# have a different locale.
|
||||
export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
|
||||
( # First make sure the kaldi_lm toolkit is installed.
|
||||
cd ../../../tools || exit 1;
|
||||
if [ -d kaldi_lm ]; then
|
||||
echo Not installing the kaldi_lm toolkit since it is already there.
|
||||
else
|
||||
echo Downloading and installing the kaldi_lm tools
|
||||
if [ ! -f kaldi_lm.tar.gz ]; then
|
||||
wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
|
||||
fi
|
||||
tar -xvzf kaldi_lm.tar.gz || exit 1;
|
||||
cd kaldi_lm
|
||||
make || exit 1;
|
||||
echo Done making the kaldi_lm tools
|
||||
fi
|
||||
) || exit 1;
|
||||
|
||||
mkdir -p $dir
|
||||
|
||||
|
||||
cleantext=$dir/text.no_oov
|
||||
|
||||
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
|
||||
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<unk> ");} } printf("\n");}' \
|
||||
> $cleantext || exit 1;
|
||||
|
||||
|
||||
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
|
||||
sort -nr > $dir/word.counts || exit 1;
|
||||
|
||||
|
||||
# Get counts from acoustic training transcripts, and add one-count
|
||||
# for each word in the lexicon (but not silence, we don't want it
|
||||
# in the LM-- we'll add it optionally later).
|
||||
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
|
||||
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
|
||||
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
|
||||
|
||||
# note: we probably won't really make use of <unk> as there aren't any OOVs
|
||||
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<unk>" > $dir/word_map \
|
||||
|| exit 1;
|
||||
|
||||
# note: ignore 1st field of train.txt, it's the utterance-id.
|
||||
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
|
||||
{ for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
|
||||
|| exit 1;
|
||||
|
||||
train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
|
||||
|
||||
# Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332
|
||||
|
||||
# note: output is
|
||||
# data/local/lm/3gram-mincount/lm_unpruned.gz
|
||||
|
||||
|
||||
exit 0
|
||||
|
||||
|
||||
# From here is some commands to do a baseline with SRILM (assuming
|
||||
# you have it installed).
|
||||
heldout_sent=10000 # Don't change this if you want result to be comparable with
|
||||
# kaldi_lm results
|
||||
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
|
||||
mkdir -p $sdir
|
||||
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
|
||||
head -$heldout_sent > $sdir/heldout
|
||||
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
|
||||
tail -n +$heldout_sent > $sdir/train
|
||||
|
||||
cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist
|
||||
|
||||
|
||||
ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
|
||||
-map-unk "<unk>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
|
||||
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
|
||||
|
||||
# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
|
||||
# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
|
||||
# 0 zeroprobs, logprob= -165170 ppl= 71.7609 ppl1= 123.258
|
||||
|
||||
|
||||
# Note: perplexity SRILM gives to Kaldi-LM model is similar to what kaldi-lm reports above.
|
||||
# Difference in WSJ must have been due to different treatment of <unk>.
|
||||
ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
|
||||
|
||||
# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
|
||||
# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
|
||||
# 0 zeroprobs, logprob= -164990 ppl= 71.4278 ppl1= 122.614
@ -0,0 +1,321 @@
#!/bin/bash
|
||||
|
||||
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
|
||||
# 2015 David Snyder
|
||||
# Apache 2.0.
|
||||
#
|
||||
# This script is based off of get_egs2.sh in ../../steps/nnet2/, but has been
|
||||
# modified for speaker recognition purposes to use a sliding window CMN.
|
||||
#
|
||||
# This script, which will generally be called from other neural-net training
|
||||
# scripts, extracts the training examples used to train the neural net (and also
|
||||
# the validation examples used for diagnostics), and puts them in separate archives.
|
||||
#
|
||||
# This script differs from get_egs.sh in that it dumps egs with several frames
|
||||
# of labels, controlled by the frames_per_eg config variable (default: 8). This
|
||||
# takes many times less disk space because typically we have 4 to 7 frames of
|
||||
# context on the left and right, and this ends up getting shared. This is at
|
||||
# the expense of slightly higher disk I/O during training time.
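# Illustrative arithmetic with the defaults below: with left_context=4,
# right_context=4 and frames_per_eg=8, one example stores 8+4+4 = 16 input
# frames for 8 labels, whereas 8 single-frame examples would store
# 8 * (1+4+4) = 72 frames.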
|
||||
#
|
||||
# We also have a simpler way of dividing the egs up into pieces, with one level
|
||||
# of index, so we have $dir/egs.{0,1,2,...}.ark instead of having two levels of
|
||||
# indexes. The extra files we write to $dir that explain the structure are
|
||||
# $dir/info/num_archives, which contains the number of files egs.*.ark, and
|
||||
# $dir/info/frames_per_eg, which contains the number of frames of labels per eg
|
||||
# (e.g. 7), and $dir/samples_per_archive. These replace the files
|
||||
# iters_per_epoch and num_jobs_nnet and egs_per_iter that the previous script
|
||||
# wrote to. This script takes the directory where the "egs" are located as the
|
||||
# argument, not the directory one level up.
|
||||
|
||||
# Begin configuration section.
|
||||
cmd=run.pl
|
||||
feat_type= # e.g. set it to "raw" to use raw MFCC
|
||||
frames_per_eg=8 # number of frames of labels per example. more->less disk space and
|
||||
# less time preparing egs, but more I/O during training.
|
||||
# note: the script may reduce this if reduce_frames_per_eg is true.
|
||||
left_context=4 # amount of left-context per eg
|
||||
right_context=4 # amount of right-context per eg
|
||||
|
||||
reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg
|
||||
# if there is only one archive and even with the
|
||||
# reduced frames_per_eg, the number of
|
||||
# samples_per_iter that would result is less than or
|
||||
# equal to the user-specified value.
|
||||
num_utts_subset=300 # number of utterances in validation and training
|
||||
# subsets used for shrinkage and diagnostics.
|
||||
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
|
||||
num_train_frames_combine=10000 # # train frames for the above.
|
||||
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
|
||||
samples_per_iter=400000 # each iteration of training, see this many samples
|
||||
# per job. This is just a guideline; it will pick a number
|
||||
# that divides the number of samples in the entire data.
|
||||
|
||||
transform_dir= # If supplied, overrides alidir as the place to find fMLLR transforms
|
||||
postdir= # If supplied, we will use posteriors in it as soft training targets.
|
||||
|
||||
stage=0
|
||||
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
|
||||
random_copy=false
|
||||
online_ivector_dir= # can be used if we are including speaker information as iVectors.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
if [ -f path.sh ]; then . ./path.sh; fi
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
|
||||
if [ $# != 3 ]; then
|
||||
echo "Usage: $0 [opts] <data> <ali-dir> <egs-dir>"
|
||||
echo " e.g.: $0 data/train exp/tri3_ali exp/tri4_nnet/egs"
|
||||
echo ""
|
||||
echo "Main options (for others, see top of script file)"
|
||||
echo " --config <config-file> # config file containing options"
|
||||
echo " --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
|
||||
echo " --samples-per-iter <#samples;400000> # Number of samples of data to process per iteration, per"
|
||||
echo " # process."
|
||||
echo " --feat-type <lda|raw> # (by default it tries to guess). The feature type you want"
|
||||
echo " # to use as input to the neural net."
|
||||
echo " --frames-per-eg <frames;8> # number of frames per eg on disk"
|
||||
echo " --left-context <width;4> # Number of frames on left side to append for feature input"
|
||||
echo " --right-context <width;4> # Number of frames on right side to append for feature input"
|
||||
echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
|
||||
echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
|
||||
echo " # very end."
|
||||
echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
|
||||
echo " # the middle."
|
||||
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
data=$1
|
||||
alidir=$2
|
||||
dir=$3
|
||||
|
||||
|
||||
# Check some files.
|
||||
[ ! -z "$online_ivector_dir" ] && \
|
||||
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
|
||||
|
||||
for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
|
||||
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
|
||||
done
|
||||
|
||||
|
||||
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
|
||||
|
||||
sdata=$data/split$nj
|
||||
utils/split_data.sh $data $nj
|
||||
|
||||
mkdir -p $dir/log $dir/info
|
||||
cp $alidir/tree $dir
|
||||
|
||||
# Get list of validation utterances.
|
||||
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
|
||||
> $dir/valid_uttlist || exit 1;
|
||||
|
||||
if [ -f $data/utt2uniq ]; then
|
||||
echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
|
||||
echo "include all perturbed versions of the same 'real' utterances."
|
||||
mv $dir/valid_uttlist $dir/valid_uttlist.tmp
|
||||
utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
|
||||
cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
|
||||
sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
|
||||
awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist
|
||||
rm $dir/uniq2utt $dir/valid_uttlist.tmp
|
||||
fi
|
||||
|
||||
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
|
||||
utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1;
|
||||
|
||||
[ -z "$transform_dir" ] && transform_dir=$alidir
|
||||
|
||||
## Set up features.
|
||||
if [ -z $feat_type ]; then
|
||||
if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
|
||||
fi
|
||||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
|
||||
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
|
||||
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
|
||||
;;
|
||||
lda)
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
# caution: the top-level nnet training script should copy these to its own dir now.
|
||||
cp $alidir/{splice_opts,final.mat} $dir || exit 1;
|
||||
feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
|
||||
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
|
||||
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
||||
if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
|
||||
echo "$0: using transforms from $transform_dir"
|
||||
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
|
||||
valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
|
||||
train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
|
||||
fi
|
||||
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
|
||||
echo "$0: using raw-fMLLR transforms from $transform_dir"
|
||||
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
|
||||
valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |"
|
||||
train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |"
|
||||
fi
|
||||
if [ ! -z "$online_ivector_dir" ]; then
|
||||
feats_one="$(echo "$feats" | sed s:JOB:1:g)"
|
||||
ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
|
||||
echo $ivector_dim > $dir/info/ivector_dim
|
||||
ivectors_opt="--const-feat-dim=$ivector_dim"
|
||||
ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
|
||||
feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
|
||||
valid_feats="$valid_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
|
||||
train_subset_feats="$train_subset_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
|
||||
else
|
||||
echo 0 >$dir/info/ivector_dim
|
||||
fi
|
||||
|
||||
if [ $stage -le 0 ]; then
|
||||
echo "$0: working out number of frames of training data"
|
||||
num_frames=$(steps/nnet2/get_num_frames.sh $data)
|
||||
echo $num_frames > $dir/info/num_frames
|
||||
else
|
||||
num_frames=`cat $dir/info/num_frames` || exit 1;
|
||||
fi
|
||||
|
||||
# the + 1 is to round up, not down... we assume it doesn't divide exactly.
|
||||
num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1]
|
||||
# (for small data)- while reduce_frames_per_eg == true and the number of
|
||||
# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it
|
||||
# by 1.
|
||||
reduced=false
|
||||
while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \
|
||||
[ $[$num_frames/(($frames_per_eg-1)*$samples_per_iter)] -eq 0 ]; do
|
||||
frames_per_eg=$[$frames_per_eg-1]
|
||||
num_archives=1
|
||||
reduced=true
|
||||
done
|
||||
$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small."
|
||||
|
||||
echo $num_archives >$dir/info/num_archives
|
||||
echo $frames_per_eg >$dir/info/frames_per_eg
|
||||
|
||||
# Working out number of egs per archive
|
||||
egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)]
|
||||
! [ $egs_per_archive -le $samples_per_iter ] && \
|
||||
echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \
|
||||
&& exit 1;
|
||||
|
||||
echo $egs_per_archive > $dir/info/egs_per_archive
|
||||
|
||||
echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
|
||||
echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"
|
||||
|
||||
# Making soft links to storage directories. This is a no-op unless
|
||||
# the subdirectory $dir/storage/ exists. See utils/create_split_dir.pl
|
||||
for x in `seq $num_archives`; do
|
||||
utils/create_data_link.pl $dir/egs.$x.ark
|
||||
for y in `seq $nj`; do
|
||||
utils/create_data_link.pl $dir/egs_orig.$x.$y.ark
|
||||
done
|
||||
done
|
||||
|
||||
nnet_context_opts="--left-context=$left_context --right-context=$right_context"
|
||||
|
||||
echo $left_context > $dir/info/left_context
|
||||
echo $right_context > $dir/info/right_context
|
||||
if [ $stage -le 2 ]; then
|
||||
echo "$0: Getting validation and training subset examples."
|
||||
rm $dir/.error 2>/dev/null
|
||||
echo "$0: ... extracting validation and training-subset alignments."
|
||||
set -o pipefail;
|
||||
for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
|
||||
copy-int-vector ark:- ark,t:- | \
|
||||
utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) | \
|
||||
gzip -c >$dir/ali_special.gz || exit 1;
|
||||
set +o pipefail; # unset the pipefail option.
|
||||
|
||||
$cmd $dir/log/create_valid_subset.log \
|
||||
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
|
||||
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
|
||||
"ark:$dir/valid_all.egs" || touch $dir/.error &
|
||||
$cmd $dir/log/create_train_subset.log \
|
||||
nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
|
||||
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
|
||||
"ark:$dir/train_subset_all.egs" || touch $dir/.error &
|
||||
wait;
|
||||
[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
|
||||
echo "... Getting subsets of validation examples for diagnostics and combination."
|
||||
$cmd $dir/log/create_valid_subset_combine.log \
|
||||
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \
|
||||
ark:$dir/valid_combine.egs || touch $dir/.error &
|
||||
$cmd $dir/log/create_valid_subset_diagnostic.log \
|
||||
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \
|
||||
ark:$dir/valid_diagnostic.egs || touch $dir/.error &
|
||||
|
||||
$cmd $dir/log/create_train_subset_combine.log \
|
||||
nnet-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \
|
||||
ark:$dir/train_combine.egs || touch $dir/.error &
|
||||
$cmd $dir/log/create_train_subset_diagnostic.log \
|
||||
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \
|
||||
ark:$dir/train_diagnostic.egs || touch $dir/.error &
|
||||
wait
|
||||
sleep 5 # wait for file system to sync.
|
||||
cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs
|
||||
|
||||
for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do
|
||||
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
|
||||
done
|
||||
rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs $dir/ali_special.gz
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ]; then
|
||||
# create egs_orig.*.*.ark; the first index goes to $num_archives,
|
||||
# the second to $nj (which is the number of jobs in the original alignment
|
||||
# dir)
|
||||
|
||||
egs_list=
|
||||
for n in $(seq $num_archives); do
|
||||
egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark"
|
||||
done
|
||||
echo "$0: Generating training examples on disk"
|
||||
# The examples will go round-robin to egs_list.
|
||||
if [ ! -z $postdir ]; then
|
||||
$cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
|
||||
nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \
|
||||
scp:$postdir/post.JOB.scp ark:- \| \
|
||||
nnet-copy-egs ark:- $egs_list || exit 1;
|
||||
else
|
||||
$cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
|
||||
nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \
|
||||
"ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
|
||||
nnet-copy-egs ark:- $egs_list || exit 1;
|
||||
fi
|
||||
fi
|
||||
if [ $stage -le 4 ]; then
|
||||
echo "$0: recombining and shuffling order of archives on disk"
|
||||
# combine all the "egs_orig.JOB.*.ark" (over the $nj splits of the data) and
|
||||
# shuffle the order, writing to the egs.JOB.ark
|
||||
|
||||
egs_list=
|
||||
for n in $(seq $nj); do
|
||||
egs_list="$egs_list $dir/egs_orig.JOB.$n.ark"
|
||||
done
|
||||
|
||||
$cmd $io_opts $extra_opts JOB=1:$num_archives $dir/log/shuffle.JOB.log \
|
||||
nnet-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 5 ]; then
|
||||
echo "$0: removing temporary archives"
|
||||
for x in `seq $num_archives`; do
|
||||
for y in `seq $nj`; do
|
||||
file=$dir/egs_orig.$x.$y.ark
|
||||
[ -L $file ] && rm $(readlink -f $file)
|
||||
rm $file
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
echo "$0: Finished preparing training examples"
|
|
@ -0,0 +1,181 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).
|
||||
# 2015 David Snyder
|
||||
# Apache 2.0.
|
||||
#
|
||||
# This script is based on get_lda.sh in ../../steps/nnet2/, but has been
# modified for speaker recognition purposes to use a sliding-window CMN.
|
||||
#
|
||||
# This script, which will generally be called from other neural-net training
|
||||
# scripts, extracts the training examples used to train the neural net (and also
|
||||
# the validation examples used for diagnostics), and puts them in separate archives.
|
||||
|
||||
# Begin configuration section.
|
||||
cmd=run.pl
|
||||
|
||||
feat_type=
|
||||
stage=0
|
||||
splice_width=4 # meaning +- 4 frames on each side for second LDA
|
||||
left_context= # left context for second LDA
|
||||
right_context= # right context for second LDA
|
||||
rand_prune=4.0 # Relates to a speedup we do for LDA.
|
||||
within_class_factor=0.0001 # This affects the scaling of the transform rows...
|
||||
# sorry for no explanation, you'll have to see the code.
|
||||
transform_dir= # If supplied, overrides alidir
|
||||
num_feats=10000 # maximum number of feature files to use. Beyond a certain point it just
|
||||
# gets silly to use more data.
|
||||
lda_dim= # This defaults to no dimension reduction.
|
||||
online_ivector_dir=
|
||||
ivector_randomize_prob=0.0 # if >0.0, randomizes iVectors during training with
|
||||
# this prob per iVector.
|
||||
ivector_dir=
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
if [ -f path.sh ]; then . ./path.sh; fi
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
|
||||
if [ $# != 4 ]; then
|
||||
echo "Usage: steps/nnet2/get_lda.sh [opts] <data> <lang> <ali-dir> <exp-dir>"
|
||||
echo " e.g.: steps/nnet2/get_lda.sh data/train data/lang exp/tri3_ali exp/tri4_nnet"
|
||||
echo " As well as extracting the examples, this script will also do the LDA computation,"
|
||||
echo " if --est-lda=true (default:true)"
|
||||
echo ""
|
||||
echo "Main options (for others, see top of script file)"
|
||||
echo " --config <config-file> # config file containing options"
|
||||
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
|
||||
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
|
||||
echo " # (note: we splice processed, typically 40-dimensional frames"
|
||||
echo " --left-context <width;4> # Number of frames on left side to append for feature input, overrides splice-width"
|
||||
echo " --right-context <width;4> # Number of frames on right side to append for feature input, overrides splice-width"
|
||||
echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
|
||||
echo " # the middle."
|
||||
echo " --online-vector-dir <dir|none> # Directory produced by"
|
||||
echo " # steps/online/nnet2/extract_ivectors_online.sh"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
data=$1
|
||||
lang=$2
|
||||
alidir=$3
|
||||
dir=$4
|
||||
|
||||
[ -z "$left_context" ] && left_context=$splice_width
|
||||
[ -z "$right_context" ] && right_context=$splice_width
|
||||
|
||||
[ ! -z "$online_ivector_dir" ] && \
|
||||
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
|
||||
|
||||
# Check some files.
|
||||
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
|
||||
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
|
||||
done
|
||||
|
||||
|
||||
# Set some variables.
|
||||
oov=`cat $lang/oov.int`
|
||||
num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;
|
||||
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
|
||||
|
||||
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
|
||||
# in this dir we'll have just one job.
|
||||
sdata=$data/split$nj
|
||||
utils/split_data.sh $data $nj
|
||||
|
||||
mkdir -p $dir/log
|
||||
echo $nj > $dir/num_jobs
|
||||
cp $alidir/tree $dir
|
||||
|
||||
[ -z "$transform_dir" ] && transform_dir=$alidir
|
||||
|
||||
## Set up features. Note: these are different from the normal features
|
||||
## because we have one rspecifier that has the features for the entire
|
||||
## training set, not separate ones for each batch.
|
||||
if [ -z $feat_type ]; then
|
||||
if [ -f $alidir/final.mat ] && ! [ -f $alidir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
|
||||
fi
|
||||
echo "$0: feature type is $feat_type"
|
||||
|
||||
|
||||
# If we have more than $num_feats feature files (default: 10k),
|
||||
# we use a random subset. This won't affect the transform much, and will
|
||||
# spare us an unnecessary pass over the data. Probably 10k is
|
||||
# way too much, but for small datasets this phase is quite fast.
|
||||
N=$[$num_feats/$nj]
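# e.g. with the default num_feats=10000 and (hypothetically) nj=40 alignment
# jobs, each split contributes at most N=250 utterances to the LDA accumulation.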
|
||||
|
||||
case $feat_type in
|
||||
raw) feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
|
||||
;;
|
||||
lda)
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
cp $alidir/{splice_opts,final.mat} $dir || exit 1;
|
||||
feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
||||
if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
|
||||
echo "$0: using transforms from $transform_dir"
|
||||
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
|
||||
fi
|
||||
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
|
||||
echo "$0: using raw-fMLLR transforms from $transform_dir"
|
||||
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
|
||||
fi
|
||||
|
||||
|
||||
feats_one="$(echo "$feats" | sed s:JOB:1:g)"
|
||||
# note: feat_dim is the raw, un-spliced feature dim without the iVectors.
|
||||
feat_dim=$(feat-to-dim "$feats_one" -) || exit 1;
|
||||
# by default: no dim reduction.
|
||||
|
||||
spliced_feats="$feats splice-feats --left-context=$left_context --right-context=$right_context ark:- ark:- |"
|
||||
|
||||
if [ ! -z "$online_ivector_dir" ]; then
|
||||
ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
|
||||
# note: subsample-feats, with negative value of n, repeats each feature n times.
|
||||
spliced_feats="$spliced_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | ivector-randomize --randomize-prob=$ivector_randomize_prob ark:- ark:- |' ark:- |"
|
||||
ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
|
||||
else
|
||||
ivector_dim=0
|
||||
fi
|
||||
echo $ivector_dim >$dir/ivector_dim
|
||||
|
||||
if [ -z "$lda_dim" ]; then
|
||||
spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)"
|
||||
lda_dim=$(feat-to-dim "$spliced_feats_one" -) || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 0 ]; then
|
||||
echo "$0: Accumulating LDA statistics."
|
||||
rm $dir/lda.*.acc 2>/dev/null # in case any left over from before.
|
||||
$cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
|
||||
ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
|
||||
weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
|
||||
acc-lda --rand-prune=$rand_prune $alidir/final.mdl "$spliced_feats" ark,s,cs:- \
|
||||
$dir/lda.JOB.acc || exit 1;
|
||||
fi
|
||||
|
||||
echo $feat_dim > $dir/feat_dim
|
||||
echo $lda_dim > $dir/lda_dim
|
||||
echo $ivector_dim > $dir/ivector_dim
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1;
|
||||
rm $dir/lda.*.acc
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ]; then
|
||||
# There are various things that we sometimes (but not always) need
|
||||
# the within-class covariance and its Cholesky factor for, and we
|
||||
# write these to disk just in case.
|
||||
nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
|
||||
--write-within-covar=$dir/within_covar.spmat \
|
||||
--within-class-factor=$within_class_factor --dim=$lda_dim \
|
||||
$dir/lda.mat $dir/lda.acc \
|
||||
2>$dir/log/lda_est.log || exit 1;
|
||||
fi
|
||||
|
||||
echo "$0: Finished estimating LDA"
|
|
@ -0,0 +1,50 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Remove excess utterances once they appear more than a specified
|
||||
# number of times with the same transcription, in a data set.
|
||||
# E.g. useful for removing excess "uh-huh" from training.
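# Sketch of the selection rule implemented below: a transcription seen n times is
# kept with probability max-count/n once n exceeds max-count, so on average about
# max-count copies survive (e.g. max-count=100, n=1000 -> each copy kept with
# probability 0.1).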
|
||||
|
||||
if [ $# != 3 ]; then
|
||||
echo "Usage: remove_dup_utts.sh max-count src-data-dir dest-data-dir"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
maxcount=$1
|
||||
srcdir=$2
|
||||
destdir=$3
|
||||
mkdir -p $destdir
|
||||
|
||||
[ ! -f $srcdir/text ] && echo "Invalid input directory $srcdir" && exit 1;
|
||||
|
||||
cp $srcdir/* $destdir
|
||||
cat $srcdir/text | \
|
||||
perl -e '
|
||||
$maxcount = shift @ARGV;
|
||||
@all = ();
|
||||
$p1 = 103349; $p2 = 71147; $k = 0;
|
||||
sub random { # our own random number generator: predictable.
|
||||
$k = ($k + $p1) % $p2;
|
||||
return ($k / $p2);
|
||||
}
|
||||
while(<>) {
|
||||
push @all, $_;
|
||||
@A = split(" ", $_);
|
||||
shift @A;
|
||||
$text = join(" ", @A);
|
||||
$count{$text} ++;
|
||||
}
|
||||
foreach $line (@all) {
|
||||
@A = split(" ", $line);
|
||||
shift @A;
|
||||
$text = join(" ", @A);
|
||||
$n = $count{$text};
|
||||
if ($n < $maxcount || random() < ($maxcount / $n)) {
|
||||
print $line;
|
||||
}
|
||||
}' $maxcount >$destdir/text
|
||||
|
||||
echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`"
|
||||
|
||||
echo "Using fix_data_dir.sh to reconcile the other files."
|
||||
utils/fix_data_dir.sh $destdir
|
||||
rm -r $destdir/.backup
|
|
@ -0,0 +1,28 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Make the features.
|
||||
|
||||
. cmd.sh
|
||||
|
||||
stage=1
|
||||
set -e
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
mkdir -p exp/nnet2_online
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
# this shows how you can split across multiple file-systems. we'll split the
|
||||
# MFCC dir across multiple locations. You might want to be careful here, if you
|
||||
# have multiple copies of Kaldi checked out and run the same recipe, not to let
|
||||
# them overwrite each other.
|
||||
mfccdir=mfcc
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
|
||||
date=$(date +'%m_%d_%H_%M')
|
||||
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5/$mfccdir/storage $mfccdir/storage
|
||||
fi
|
||||
utils/copy_data_dir.sh data/train_asr data/train_hires_asr
|
||||
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
|
||||
--cmd "$train_cmd" data/train_hires_asr exp/make_hires/train $mfccdir || exit 1;
|
||||
fi
|
|
@ -0,0 +1,71 @@
|
|||
#!/bin/bash
|
||||
|
||||
# This script is based on run_nnet2_multisplice.sh in
|
||||
# egs/fisher_english/s5/local/online. It has been modified
|
||||
# for speaker recognition.
|
||||
|
||||
. cmd.sh
|
||||
|
||||
|
||||
stage=1
|
||||
train_stage=-10
|
||||
use_gpu=true
|
||||
set -e
|
||||
. cmd.sh
|
||||
. ./path.sh
|
||||
. ./utils/parse_options.sh
|
||||
|
||||
|
||||
# assume use_gpu=true since it would be way too slow otherwise.
|
||||
|
||||
if ! cuda-compiled; then
|
||||
cat <<EOF && exit 1
|
||||
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
|
||||
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
|
||||
where "nvcc" is installed.
|
||||
EOF
|
||||
fi
|
||||
parallel_opts="-l gpu=1"
|
||||
num_threads=1
|
||||
minibatch_size=512
|
||||
dir=exp/nnet2_online/nnet_ms_a
|
||||
mkdir -p exp/nnet2_online
|
||||
|
||||
|
||||
# Stages 1 through 5 are done in run_nnet2_common.sh,
|
||||
# so it can be shared with other similar scripts.
|
||||
local/dnn/run_nnet2_common.sh --stage $stage
|
||||
|
||||
if [ $stage -le 6 ]; then
|
||||
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
|
||||
utils/create_split_dir.pl /export/b0{6,7,8,9}/$USER/kaldi-data/egs/fisher_english/s5/$dir/egs/storage $dir/egs/storage
|
||||
fi
|
||||
|
||||
# Because we have a lot of data here and we don't want the training to take
|
||||
# too long, we reduce the number of epochs from the defaults (15 + 5) to (3 +
|
||||
# 1). The option "--io-opts '-tc 12'" is to have more than the default number
|
||||
# (5) of jobs dumping the egs to disk; this is OK since we're splitting our
|
||||
# data across four filesystems for speed.
|
||||
|
||||
|
||||
local/dnn/train_multisplice_accel2.sh --stage $train_stage \
|
||||
--feat-type raw \
|
||||
--splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer3/-3:3 layer4/-7:2" \
|
||||
--num-epochs 6 \
|
||||
--num-hidden-layers 6 \
|
||||
--num-jobs-initial 3 --num-jobs-final 18 \
|
||||
--num-threads "$num_threads" \
|
||||
--minibatch-size "$minibatch_size" \
|
||||
--parallel-opts "$parallel_opts" \
|
||||
--mix-up 10500 \
|
||||
--initial-effective-lrate 0.0015 --final-effective-lrate 0.00015 \
|
||||
--cmd "$decode_cmd" \
|
||||
--egs-dir "$common_egs_dir" \
|
||||
--pnorm-input-dim 3500 \
|
||||
--pnorm-output-dim 350 \
|
||||
data/train_hires_asr data/lang exp/tri5a $dir || exit 1;
|
||||
|
||||
fi
|
||||
|
||||
exit 0;
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
#!/bin/bash
|
||||
|
||||
# This script is based on egs/fisher_english/s5/run.sh. It trains a
|
||||
# multisplice time-delay neural network used in the DNN-based speaker
|
||||
# recognition recipes.
|
||||
|
||||
# It's best to run the commands in this one by one.
|
||||
|
||||
. cmd.sh
|
||||
. path.sh
|
||||
mfccdir=`pwd`/mfcc
|
||||
set -e
|
||||
|
||||
# the next command produces the data in local/train_all_asr
|
||||
local/dnn/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \
|
||||
/export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13
|
||||
# You could also try specifying the --calldata argument to this command as below.
|
||||
# If specified, the script will use actual speaker personal identification
|
||||
# numbers released with the dataset, i.e. real speaker IDs. Note: --calldata has
|
||||
# to be the first argument of this script.
|
||||
# local/fisher_data_prep.sh --calldata /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \
|
||||
# /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13
|
||||
|
||||
# at BUT:
|
||||
# local/fisher_data_prep.sh /mnt/matylda6/jhu09/qpovey/FISHER/LDC2005T19 /mnt/matylda2/data/FISHER/
|
||||
|
||||
local/dnn/fisher_prepare_dict.sh
|
||||
|
||||
utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
|
||||
|
||||
local/dnn/fisher_train_lms.sh
|
||||
local/dnn/fisher_create_test_lang.sh
|
||||
|
||||
# Use the first 4k sentences as dev set. Note: when we trained the LM, we used
|
||||
# the 1st 10k sentences as dev set, so the 1st 4k won't have been used in the
|
||||
# LM training data. However, they will be in the lexicon, plus speakers
|
||||
# may overlap, so it's still not quite equivalent to a test set.
|
||||
|
||||
utils/fix_data_dir.sh data/train_all_asr
|
||||
|
||||
steps/make_mfcc.sh --nj 40 --cmd "$train_cmd" --mfcc-config conf/mfcc_asr.conf \
|
||||
data/train_all_asr exp/make_mfcc/train_all_asr $mfccdir || exit 1;
|
||||
|
||||
utils/fix_data_dir.sh data/train_all_asr
|
||||
utils/validate_data_dir.sh data/train_all_asr
|
||||
|
||||
|
||||
# The dev and test sets are each about 3.3 hours long. These are not carefully
|
||||
# done; there may be some speaker overlap with each other and with the training
|
||||
# set. Note: in our LM-training setup we excluded the first 10k utterances (they
|
||||
# were used for tuning but not for training), so the LM was not (directly) trained
|
||||
# on either the dev or test sets.
|
||||
utils/subset_data_dir.sh --first data/train_all_asr 10000 data/dev_and_test_asr
|
||||
utils/subset_data_dir.sh --first data/dev_and_test_asr 5000 data/dev_asr
|
||||
utils/subset_data_dir.sh --last data/dev_and_test_asr 5000 data/test_asr
|
||||
rm -r data/dev_and_test_asr
|
||||
|
||||
steps/compute_cmvn_stats.sh data/dev_asr exp/make_mfcc/dev_asr $mfccdir
|
||||
steps/compute_cmvn_stats.sh data/test_asr exp/make_mfcc/test_asr $mfccdir
|
||||
|
||||
n=$[`cat data/train_all_asr/segments | wc -l` - 10000]
|
||||
utils/subset_data_dir.sh --last data/train_all_asr $n data/train_asr
|
||||
steps/compute_cmvn_stats.sh data/train_asr exp/make_mfcc/train_asr $mfccdir
|
||||
|
||||
|
||||
# Now-- there are 1.6 million utterances, and we want to start the monophone training
|
||||
# on relatively short utterances (easier to align), but not only the very shortest
|
||||
# ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random
|
||||
# utterances from those.
|
||||
|
||||
utils/subset_data_dir.sh --shortest data/train_asr 100000 data/train_asr_100kshort
|
||||
utils/subset_data_dir.sh data/train_asr_100kshort 10000 data/train_asr_10k
|
||||
local/dnn/remove_dup_utts.sh 100 data/train_asr_10k data/train_asr_10k_nodup
|
||||
utils/subset_data_dir.sh --speakers data/train_asr 30000 data/train_asr_30k
|
||||
utils/subset_data_dir.sh --speakers data/train_asr 100000 data/train_asr_100k
|
||||
|
||||
|
||||
# The next commands are not necessary for the scripts to run, but increase
|
||||
# efficiency of data access by putting the mfcc's of the subset
|
||||
# in a contiguous place in a file.
|
||||
( . path.sh;
|
||||
# make sure mfccdir is defined as above..
|
||||
cp data/train_asr_10k_nodup/feats.scp{,.bak}
|
||||
copy-feats scp:data/train_asr_10k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_10k_nodup.ark,$mfccdir/kaldi_fish_10k_nodup.scp \
|
||||
&& cp $mfccdir/kaldi_fish_10k_nodup.scp data/train_asr_10k_nodup/feats.scp
|
||||
)
|
||||
( . path.sh;
|
||||
# make sure mfccdir is defined as above..
|
||||
cp data/train_asr_30k/feats.scp{,.bak}
|
||||
copy-feats scp:data/train_asr_30k/feats.scp ark,scp:$mfccdir/kaldi_fish_30k.ark,$mfccdir/kaldi_fish_30k.scp \
|
||||
&& cp $mfccdir/kaldi_fish_30k.scp data/train_asr_30k/feats.scp
|
||||
)
|
||||
( . path.sh;
|
||||
# make sure mfccdir is defined as above..
|
||||
cp data/train_asr_100k/feats.scp{,.bak}
|
||||
copy-feats scp:data/train_asr_100k/feats.scp ark,scp:$mfccdir/kaldi_fish_100k.ark,$mfccdir/kaldi_fish_100k.scp \
|
||||
&& cp $mfccdir/kaldi_fish_100k.scp data/train_asr_100k/feats.scp
|
||||
)
|
||||
|
||||
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
|
||||
data/train_asr_10k_nodup data/lang exp/mono0a
|
||||
|
||||
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
|
||||
data/train_asr_30k data/lang exp/mono0a exp/mono0a_ali || exit 1;
|
||||
|
||||
steps/train_deltas.sh --cmd "$train_cmd" \
|
||||
2500 20000 data/train_asr_30k data/lang exp/mono0a_ali exp/tri1 || exit 1;
|
||||
|
||||
|
||||
(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
|
||||
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
|
||||
exp/tri1/graph data/dev exp/tri1/decode_dev)&
|
||||
|
||||
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
|
||||
data/train_asr_30k data/lang exp/tri1 exp/tri1_ali || exit 1;
|
||||
|
||||
steps/train_deltas.sh --cmd "$train_cmd" \
|
||||
2500 20000 data/train_asr_30k data/lang exp/tri1_ali exp/tri2 || exit 1;
|
||||
|
||||
(
|
||||
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1;
|
||||
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
|
||||
exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1;
|
||||
)&
|
||||
|
||||
|
||||
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
|
||||
data/train_asr_100k data/lang exp/tri2 exp/tri2_ali || exit 1;
|
||||
|
||||
# Train tri3a, which is LDA+MLLT, on 100k data.
|
||||
steps/train_lda_mllt.sh --cmd "$train_cmd" \
|
||||
--splice-opts "--left-context=3 --right-context=3" \
|
||||
5000 40000 data/train_asr_100k data/lang exp/tri2_ali exp/tri3a || exit 1;
|
||||
(
|
||||
utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
|
||||
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
|
||||
exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1;
|
||||
)&
|
||||
|
||||
|
||||
# Next we'll use fMLLR and train with SAT (i.e. on
|
||||
# fMLLR features)
|
||||
|
||||
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
||||
data/train_asr_100k data/lang exp/tri3a exp/tri3a_ali || exit 1;
|
||||
|
||||
steps/train_sat.sh --cmd "$train_cmd" \
|
||||
5000 100000 data/train_asr_100k data/lang exp/tri3a_ali exp/tri4a || exit 1;
|
||||
|
||||
(
|
||||
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
|
||||
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
|
||||
exp/tri4a/graph data/dev exp/tri4a/decode_dev
|
||||
)&
|
||||
|
||||
|
||||
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
||||
data/train_asr data/lang exp/tri4a exp/tri4a_ali || exit 1;
|
||||
|
||||
|
||||
steps/train_sat.sh --cmd "$train_cmd" \
|
||||
7000 300000 data/train_asr data/lang exp/tri4a_ali exp/tri5a || exit 1;
|
||||
|
||||
(
|
||||
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
|
||||
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
|
||||
exp/tri5a/graph data/dev exp/tri5a/decode_dev
|
||||
)&
|
||||
|
||||
# this will help find issues with the lexicon.
|
||||
# steps/cleanup/debug_lexicon.sh --nj 300 --cmd "$train_cmd" data/train_asr_100k data/lang exp/tri5a data/local/dict/lexicon.txt exp/debug_lexicon_100k
|
||||
|
||||
## The following is based on the best current neural net recipe.
|
||||
local/dnn/run_nnet2_multisplice.sh
|
|
@ -0,0 +1,641 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
|
||||
# 2013 Xiaohui Zhang
|
||||
# 2013 Guoguo Chen
|
||||
# 2014 Vimal Manohar
|
||||
# 2014 Vijayaditya Peddinti
|
||||
# Apache 2.0.
|
||||
|
||||
# This is a modified version of train_multisplice_accel2.sh in
|
||||
# steps/nnet2/ for speaker recognition. The main difference is
|
||||
# that it uses different get_lda.sh and get_egs2.sh scripts.
|
||||
#
|
||||
# The original train_multisplice_accel2.sh was a modified version of
|
||||
# train_pnorm_multisplice2.sh (still using pnorm). The "accel" refers to the
|
||||
# fact that we increase the number of jobs during training (from
|
||||
# --num-jobs-initial to --num-jobs-final). We dropped "pnorm" from the name as
|
||||
# it was getting too long.
|
||||
|
||||
|
||||
# Begin configuration section.
|
||||
cmd=run.pl
|
||||
num_epochs=15 # Number of epochs of training;
|
||||
# the number of iterations is worked out from this.
|
||||
initial_effective_lrate=0.01
|
||||
final_effective_lrate=0.001
|
||||
bias_stddev=0.5
|
||||
pnorm_input_dim=3000
|
||||
pnorm_output_dim=300
|
||||
minibatch_size=128 # by default use a smallish minibatch size for neural net
|
||||
# training; this controls instability which would otherwise
|
||||
# be a problem with multi-threaded update.
|
||||
|
||||
samples_per_iter=400000 # each iteration of training, see this many samples
|
||||
# per job. This option is passed to get_egs.sh
|
||||
num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training
|
||||
num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training
|
||||
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
|
||||
# more than enough.
|
||||
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
|
||||
get_egs_stage=0
|
||||
online_ivector_dir=
|
||||
remove_egs=true # set to false to disable removing egs.
|
||||
|
||||
max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
|
||||
# to the final 'combine' stage, but these models will themselves be averages of
|
||||
# iteration-number ranges.
|
||||
|
||||
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
|
||||
# on each iter. You could set it to 0 or to a large value for complete
|
||||
# randomization, but this would both consume memory and cause spikes in
|
||||
# disk I/O. Smaller is easier on disk and memory but less random. It's
|
||||
# not a huge deal though, as samples are anyway randomized right at the start.
|
||||
# (the point of this is to get data in different minibatches on different iterations,
|
||||
# since in the preconditioning method, 2 samples in the same minibatch can
|
||||
# affect each other's gradients.)
|
||||
|
||||
add_layers_period=2 # by default, add new layers every 2 iterations.
|
||||
num_hidden_layers=3
|
||||
stage=-4
|
||||
exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage
|
||||
|
||||
splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3"
|
||||
# Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
|
||||
# note: hidden layers can be composed of one or more components,
# so hidden-layer indexing is different from the component count
|
||||
|
||||
|
||||
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't
|
||||
randprune=4.0 # speeds up LDA.
|
||||
alpha=4.0 # relates to preconditioning.
|
||||
update_period=4 # relates to online preconditioning: says how often we update the subspace.
|
||||
num_samples_history=2000 # relates to online preconditioning
|
||||
max_change_per_sample=0.075
|
||||
precondition_rank_in=20 # relates to online preconditioning
|
||||
precondition_rank_out=80 # relates to online preconditioning
|
||||
|
||||
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
|
||||
# specified.)
|
||||
num_threads=16
|
||||
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G"
|
||||
# by default we use 16 threads; this lets the queue know.
|
||||
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
|
||||
combine_num_threads=8
|
||||
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
|
||||
cleanup=true
|
||||
egs_dir=
|
||||
lda_opts=
|
||||
lda_dim=
|
||||
egs_opts=
|
||||
transform_dir= # If supplied, overrides alidir
|
||||
feat_type= # Can be used to force "raw" features.
|
||||
align_cmd= # The cmd that is passed to steps/nnet2/align.sh
|
||||
align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
|
||||
realign_times= # List of times on which we realign. Each time is
|
||||
# floating point number strictly between 0 and 1, which
|
||||
# will be multiplied by the num-iters to get an iteration
|
||||
# number.
|
||||
num_jobs_align=30 # Number of jobs for realignment
|
||||
# End configuration section.
|
||||
frames_per_eg=8 # to be passed on to get_egs2.sh
|
||||
|
||||
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
if [ -f path.sh ]; then . ./path.sh; fi
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
if [ $# != 4 ]; then
|
||||
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
|
||||
echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
|
||||
echo ""
|
||||
echo "Main options (for others, see top of script file)"
|
||||
echo " --config <config-file> # config file containing options"
|
||||
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
|
||||
echo " --num-epochs <#epochs|15> # Number of epochs of training"
|
||||
echo " --initial-effective-lrate <lrate|0.02> # effective learning rate at start of training."
|
||||
echo " --final-effective-lrate <lrate|0.004> # effective learning rate at end of training."
|
||||
echo " # data, 0.00025 for large data"
|
||||
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
|
||||
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
|
||||
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
|
||||
echo " # per context-dependent state. Try a number several times #states."
|
||||
echo " --num-jobs-initial <num-jobs|1> # Number of parallel jobs to use for neural net training, at the start."
|
||||
echo " --num-jobs-final <num-jobs|8> # Number of parallel jobs to use for neural net training, at the end"
|
||||
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
|
||||
echo " # as well as speed; may interact with batch size; if you increase"
|
||||
echo " # this, you may want to decrease the batch size."
|
||||
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
|
||||
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
|
||||
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
|
||||
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
|
||||
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
|
||||
echo " # should not get too large, e.g. >2k)."
|
||||
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
|
||||
echo " # process."
|
||||
echo " --splice-indexes <string|layer0/-4:-3:-2:-1:0:1:2:3:4> "
|
||||
echo " # Frame indices used for each splice layer."
|
||||
echo " # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
|
||||
echo " # (note: we splice processed, typically 40-dimensional frames"
|
||||
echo " --lda-dim <dim|''> # Dimension to reduce spliced features to with LDA"
|
||||
echo " --realign-epochs <list-of-epochs|''> # A list of space-separated epoch indices the beginning of which"
|
||||
echo " # realignment is to be done"
|
||||
echo " --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
|
||||
echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment"
|
||||
echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment"
|
||||
echo " --stage <stage|-4> # Used to run a partially-completed training process from somewhere in"
|
||||
echo " # the middle."
|
||||
|
||||
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
data=$1
|
||||
lang=$2
|
||||
alidir=$3
|
||||
dir=$4
|
||||
|
||||
if [ ! -z "$realign_times" ]; then
|
||||
[ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1
|
||||
[ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1
|
||||
fi
|
||||
|
||||
# Check some files.
|
||||
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
|
||||
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
|
||||
done
|
||||
|
||||
|
||||
# Set some variables.
|
||||
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
|
||||
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
|
||||
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1
|
||||
|
||||
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
|
||||
# in this dir we'll have just one job.
|
||||
sdata=$data/split$nj
|
||||
utils/split_data.sh $data $nj
|
||||
|
||||
mkdir -p $dir/log
|
||||
echo $nj > $dir/num_jobs
|
||||
cp $alidir/tree $dir
|
||||
|
||||
# process the splice_inds string, to get a layer-wise context string
|
||||
# to be processed by the nnet-components
|
||||
# this would be mainly used by SpliceComponent|SpliceMaxComponent
|
||||
python steps/nnet2/make_multisplice_configs.py contexts --splice-indexes "$splice_indexes" $dir || exit -1;
|
||||
context_string=$(cat $dir/vars) || exit -1
|
||||
echo $context_string
|
||||
eval $context_string || exit -1; #
|
||||
# initializes variables used by get_lda.sh and get_egs.sh
|
||||
# get_lda.sh : first_left_context, first_right_context,
|
||||
# get_egs.sh : nnet_left_context & nnet_right_context
|
||||
|
||||
extra_opts=()
|
||||
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
|
||||
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
|
||||
[ -z "$transform_dir" ] && transform_dir=$alidir
|
||||
extra_opts+=(--transform-dir $transform_dir)
|
||||
|
||||
if [ $stage -le -4 ]; then
|
||||
echo "$0: calling get_lda.sh"
|
||||
local/dnn/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1;
|
||||
fi
|
||||
# these files will have been written by get_lda.sh
|
||||
feat_dim=$(cat $dir/feat_dim) || exit 1;
|
||||
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
|
||||
lda_dim=$(cat $dir/lda_dim) || exit 1;
|
||||
|
||||
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
|
||||
|
||||
extra_opts+=(--left-context $nnet_left_context )
|
||||
extra_opts+=(--right-context $nnet_right_context )
|
||||
echo "$0: calling get_egs2.sh"
|
||||
local/dnn/get_egs2.sh $egs_opts "${extra_opts[@]}" \
|
||||
--samples-per-iter $samples_per_iter --stage $get_egs_stage \
|
||||
--io-opts "$io_opts" \
|
||||
--cmd "$cmd" $egs_opts \
|
||||
--frames-per-eg $frames_per_eg \
|
||||
$data $alidir $dir/egs || exit 1;
|
||||
fi
|
||||
|
||||
if [ -z $egs_dir ]; then
  egs_dir=$dir/egs
else
  # confirm that the provided egs_dir has the necessary context
  egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
  egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
  echo $egs_left_context $nnet_left_context $egs_right_context $nnet_right_context
  ([[ $egs_left_context -lt $nnet_left_context ]] || [[ $egs_right_context -lt $nnet_right_context ]]) &&
    echo "Provided egs_dir $egs_dir does not have sufficient context to train the neural network." && exit -1;
fi
|
||||
|
||||
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
|
||||
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }
|
||||
|
||||
# num_archives_expanded considers each separate label-position from
|
||||
# 0..frames_per_eg-1 to be a separate archive.
|
||||
num_archives_expanded=$[$num_archives*$frames_per_eg]
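# e.g. (illustrative numbers) num_archives=10 and frames_per_eg=8 give
# num_archives_expanded=80 distinct (archive, frame-offset) combinations.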
|
||||
|
||||
[ $num_jobs_initial -gt $num_jobs_final ] && \
|
||||
echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1;
|
||||
|
||||
[ $num_jobs_final -gt $num_archives_expanded ] && \
|
||||
echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1;
|
||||
|
||||
if ! [ $num_hidden_layers -ge 1 ]; then
|
||||
echo "Invalid num-hidden-layers $num_hidden_layers"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $stage -le -2 ]; then
|
||||
echo "$0: initializing neural net";
|
||||
lda_mat=$dir/lda.mat
|
||||
tot_input_dim=$[$feat_dim+$ivector_dim]
|
||||
|
||||
online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
|
||||
|
||||
initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);")
|
||||
|
||||
# create the config files for nnet initialization
|
||||
python steps/nnet2/make_multisplice_configs.py \
|
||||
--splice-indexes "$splice_indexes" \
|
||||
--total-input-dim $tot_input_dim \
|
||||
--ivector-dim $ivector_dim \
|
||||
--lda-mat "$lda_mat" \
|
||||
--lda-dim $lda_dim \
|
||||
--pnorm-input-dim $pnorm_input_dim \
|
||||
--pnorm-output-dim $pnorm_output_dim \
|
||||
--online-preconditioning-opts "$online_preconditioning_opts" \
|
||||
--initial-learning-rate $initial_lrate \
|
||||
--bias-stddev $bias_stddev \
|
||||
--num-hidden-layers $num_hidden_layers \
|
||||
--num-targets $num_leaves \
|
||||
configs $dir || exit -1;
|
||||
|
||||
$cmd $dir/log/nnet_init.log \
|
||||
nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
|
||||
$dir/0.mdl || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le -1 ]; then
|
||||
echo "Training transition probabilities and setting priors"
|
||||
$cmd $dir/log/train_trans.log \
|
||||
nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
|
||||
|| exit 1;
|
||||
fi
|
||||
|
||||
# set num_iters so that as close as possible, we process the data $num_epochs
|
||||
# times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives_expanded,
|
||||
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
|
||||
|
||||
num_archives_to_process=$[$num_epochs*$num_archives_expanded]
|
||||
num_archives_processed=0
|
||||
num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]
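# Worked example with illustrative values: num_epochs=6, num_archives_expanded=80,
# num_jobs_initial=3, num_jobs_final=18:
#   num_archives_to_process = 6*80 = 480
#   num_iters = (480*2)/(3+18) = 960/21 = 45 (integer division)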
|
||||
|
||||
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]

! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
  && echo "$0: Insufficient epochs" && exit 1
|
||||
|
||||
|
||||
# mix up at the iteration where we've processed about half the data; this keeps
|
||||
# the overall training procedure fairly invariant to the number of initial and
|
||||
# final jobs.
|
||||
# j = initial, k = final, n = num-iters, x = half-of-data epoch,
|
||||
# p is proportion of data we want to process (e.g. p=0.5 here).
|
||||
# solve for x if the amount of data processed by epoch x is p
|
||||
# times the amount by iteration n.
|
||||
# put this in wolfram alpha:
|
||||
# solve { x*j + (k-j)*x*x/(2*n) = p * (j*n + (k-j)*n/2), {x} }
|
||||
# got: x = (j n-sqrt(-n^2 (j^2 (p-1)-k^2 p)))/(j-k) and j!=k and n!=0
|
||||
# simplified manually to: n * (sqrt((1-p) j^2 + p k^2) - j) / (k-j)
|
||||
mix_up_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters 0.5)
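# Worked example (same illustrative values as above): j=3, k=18, n=45, p=0.5:
#   sqrt((1-0.5)*3^2 + 0.5*18^2) = sqrt(166.5) ~= 12.90
#   mix_up_iter = int(0.5 + 45*(12.90 - 3)/(18 - 3)) = int(0.5 + 29.7) = 30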
|
||||
! [ $mix_up_iter -gt $finish_add_layers_iter ] && \
|
||||
echo "Mix-up-iter is $mix_up_iter, should be greater than $finish_add_layers_iter -> add more epochs?" \
|
||||
&& exit 1;
|
||||
|
||||
echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
|
||||
[ $mix_up -gt 0 ] && echo "$0: Will mix up on iteration $mix_up_iter"
|
||||
|
||||
if [ $num_threads -eq 1 ]; then
|
||||
parallel_suffix="-simple" # this enables us to use GPU code if
|
||||
# we have just one thread.
|
||||
parallel_train_opts=
|
||||
if ! cuda-compiled; then
|
||||
echo "$0: WARNING: you are running with one thread but you have not compiled"
|
||||
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
|
||||
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
|
||||
fi
|
||||
else
|
||||
parallel_suffix="-parallel"
|
||||
parallel_train_opts="--num-threads=$num_threads"
|
||||
fi
|
||||
|
||||
|
||||
approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]
|
||||
# First work out how many models we want to combine over in the final
|
||||
# nnet-combine-fast invocation. This equals
|
||||
# min(max(max_models_combine, approx_iters_per_epoch_final),
|
||||
# 2/3 * iters_after_mixup)
|
||||
num_models_combine=$max_models_combine
|
||||
if [ $num_models_combine -lt $approx_iters_per_epoch_final ]; then
|
||||
num_models_combine=$approx_iters_per_epoch_final
|
||||
fi
|
||||
iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3]
|
||||
if [ $num_models_combine -gt $iters_after_mixup_23 ]; then
|
||||
num_models_combine=$iters_after_mixup_23
|
||||
fi
|
||||
first_model_combine=$[$num_iters-$num_models_combine+1]
|
||||
|
||||
x=0
|
||||
|
||||
|
||||
for realign_time in $realign_times; do
|
||||
# Work out the iterations on which we will re-align, if the --realign-times
|
||||
# option was used. This is slightly approximate.
|
||||
! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);" && \
|
||||
echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1.";
|
||||
# the next formula is based on the one for mix_up_iter above.
|
||||
realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
|
||||
realign_this_iter[$realign_iter]=$realign_time
|
||||
done
|
||||
|
||||
cur_egs_dir=$egs_dir
|
||||
|
||||
while [ $x -lt $num_iters ]; do
|
||||
[ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;
|
||||
|
||||
this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")
|
||||
|
||||
ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
|
||||
this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");
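# Illustrative example (not this recipe's settings): with ilr=0.01, flr=0.001 and
# half the archives processed (np/nt=0.5), the base rate is
# 0.01*exp(0.5*log(0.1)) ~= 0.00316; if this_num_jobs=10 at that point, each job
# is given ~0.0316, so the averaged model sees an effective rate of ~0.00316.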
|
||||
|
||||
echo "On iteration $x, learning rate is $this_learning_rate."
|
||||
|
||||
if [ ! -z "${realign_this_iter[$x]}" ]; then
|
||||
prev_egs_dir=$cur_egs_dir
|
||||
cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
|
||||
fi
|
||||
|
||||
if [ $x -ge 0 ] && [ $stage -le $x ]; then
|
||||
if [ ! -z "${realign_this_iter[$x]}" ]; then
|
||||
time=${realign_this_iter[$x]}
|
||||
|
||||
echo "Getting average posterior for purposes of adjusting the priors."
|
||||
# Note: this just uses CPUs, using a smallish subset of data.
|
||||
# always use the first egs archive, which makes the script simpler;
|
||||
# we're using different random subsets of it.
|
||||
rm $dir/post.$x.*.vec 2>/dev/null
|
||||
$cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
|
||||
nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \
|
||||
nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
|
||||
nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
|
||||
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
|
||||
|
||||
sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear.
|
||||
|
||||
$cmd $dir/log/vector_sum.$x.log \
|
||||
vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
|
||||
rm $dir/post.$x.*.vec;
|
||||
|
||||
echo "Re-adjusting priors based on computed posteriors"
|
||||
$cmd $dir/log/adjust_priors.$x.log \
|
||||
nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;
|
||||
|
||||
sleep 2
|
||||
|
||||
steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
|
||||
--transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
|
||||
--iter $x $data $lang $dir $dir/ali_$time || exit 1
|
||||
|
||||
steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$time \
|
||||
$prev_egs_dir $cur_egs_dir || exit 1
|
||||
|
||||
if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
|
||||
steps/nnet2/remove_egs.sh $prev_egs_dir
|
||||
fi
|
||||
fi
|
||||
|
||||
# Set off jobs doing some diagnostics, in the background.
|
||||
# Use the egs dir from the previous iteration for the diagnostics
|
||||
$cmd $dir/log/compute_prob_valid.$x.log \
|
||||
nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
|
||||
$cmd $dir/log/compute_prob_train.$x.log \
|
||||
nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
|
||||
if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
|
||||
$cmd $dir/log/progress.$x.log \
|
||||
nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
|
||||
ark:$cur_egs_dir/train_diagnostic.egs '&&' \
|
||||
nnet-am-info $dir/$x.mdl &
|
||||
fi
|
||||
|
||||
echo "Training neural net (pass $x)"
|
||||
|
||||
if [ $x -gt 0 ] && \
|
||||
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
|
||||
[ $[$x%$add_layers_period] -eq 0 ]; then
|
||||
do_average=false # if we've just added a new hidden layer, don't do averaging but take the best.
|
||||
cur_num_hidden_layers=$[$x/$add_layers_period];
|
||||
mdl="nnet-init --srand=$x $dir/hidden_${cur_num_hidden_layers}.config - | nnet-insert $dir/$x.mdl - - | nnet-am-copy --learning-rate=$this_learning_rate - -|"
|
||||
else
|
||||
do_average=true
|
||||
if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
|
||||
mdl="nnet-am-copy --learning-rate=$this_learning_rate $dir/$x.mdl -|"
|
||||
fi
|
||||
if $do_average; then
|
||||
this_minibatch_size=$minibatch_size
|
||||
else
|
||||
# on iteration zero or when we just added a layer, use a smaller minibatch
|
||||
# size and just one job: the model-averaging doesn't seem to be helpful
|
||||
# when the model is changing too fast (i.e. it worsens the objective
|
||||
# function), and the smaller minibatch size will help to keep
|
||||
# the update stable.
|
||||
this_minibatch_size=$[$minibatch_size/2];
|
||||
fi
|
||||
|
||||
rm $dir/.error 2>/dev/null
|
||||
|
||||
|
||||
( # this sub-shell is so that when we "wait" below,
|
||||
# we only wait for the training jobs that we just spawned,
|
||||
# not the diagnostic jobs that we spawned above.
|
||||
|
||||
# We can't easily use a single parallel SGE job to do the main training,
|
||||
# because the computation of which archive and which --frame option
|
||||
# to use for each job is a little complex, so we spawn each one separately.
|
||||
for n in $(seq $this_num_jobs); do
|
||||
k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
|
||||
# the other indexes from.
|
||||
archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
|
||||
frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
|
||||
# index; this increases more slowly than the archive index because the
|
||||
# same archive with different frame indexes will give similar gradients,
|
||||
# so we want to separate them in time.
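# Illustrative index computation: with num_archives=4 and frames_per_eg=8,
# k=9 gives archive = (9%4)+1 = 2 and frame = (9/4)%8 = 2.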
|
||||
|
||||
$cmd $parallel_opts $dir/log/train.$x.$n.log \
|
||||
nnet-train$parallel_suffix $parallel_train_opts \
|
||||
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
|
||||
"ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
|
||||
$dir/$[$x+1].$n.mdl || touch $dir/.error &
|
||||
done
|
||||
wait
|
||||
)
|
||||
# the error message below is not that informative, but $cmd will
|
||||
# have printed a more specific one.
|
||||
[ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;
|
||||
|
||||
nnets_list=
|
||||
for n in `seq 1 $this_num_jobs`; do
|
||||
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
|
||||
done
|
||||
|
||||
if $do_average; then
|
||||
# average the output of the different jobs.
|
||||
$cmd $dir/log/average.$x.log \
|
||||
nnet-am-average $nnets_list $dir/$[$x+1].mdl || exit 1;
|
||||
else
|
||||
# choose the best from the different jobs.
|
||||
n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
|
||||
$fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
|
||||
undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
|
||||
close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
|
||||
$best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
|
||||
[ -z "$n" ] && echo "Error getting best model" && exit 1;
|
||||
cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
|
||||
fi
|
||||
|
||||
if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
|
||||
# mix up.
|
||||
echo Mixing up from $num_leaves to $mix_up components
|
||||
$cmd $dir/log/mix_up.$x.log \
|
||||
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
|
||||
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
|
||||
fi
|
||||
rm $nnets_list
|
||||
[ ! -f $dir/$[$x+1].mdl ] && exit 1;
|
||||
if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
|
||||
[ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then
|
||||
rm $dir/$[$x-1].mdl
|
||||
fi
|
||||
fi
|
||||
x=$[$x+1]
|
||||
num_archives_processed=$[$num_archives_processed+$this_num_jobs]
|
||||
done
|
||||
|
||||
|
||||
if [ $stage -le $num_iters ]; then
|
||||
echo "Doing final combination to produce final.mdl"
|
||||
|
||||
# Now do combination.
|
||||
nnets_list=()
|
||||
# the if..else..fi statement below sets 'nnets_list'.
|
||||
if [ $max_models_combine -lt $num_models_combine ]; then
|
||||
# The number of models to combine is too large, e.g. > 20. In this case,
|
||||
# each argument to nnet-combine-fast will be an average of multiple models.
|
||||
cur_offset=0 # current offset from first_model_combine.
|
||||
for n in $(seq $max_models_combine); do
|
||||
next_offset=$[($n*$num_models_combine)/$max_models_combine]
|
||||
sub_list=""
|
||||
for o in $(seq $cur_offset $[$next_offset-1]); do
|
||||
iter=$[$first_model_combine+$o]
|
||||
mdl=$dir/$iter.mdl
|
||||
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
|
||||
sub_list="$sub_list $mdl"
|
||||
done
|
||||
nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
|
||||
cur_offset=$next_offset
|
||||
done
|
||||
else
|
||||
nnets_list=
|
||||
for n in $(seq 0 $[num_models_combine-1]); do
|
||||
iter=$[$first_model_combine+$n]
|
||||
mdl=$dir/$iter.mdl
|
||||
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
|
||||
nnets_list[$n]=$mdl
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
|
||||
# if there are many models it can give an out-of-memory error; set num-threads to 8
|
||||
# to speed it up (this isn't ideal...)
|
||||
num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
|
||||
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
|
||||
[ $mb -gt 512 ] && mb=512
|
||||
# Setting --initial-model to a large value makes it initialize the combination
|
||||
# with the average of all the models. It's important not to start with a
|
||||
# single model, or, due to the invariance to scaling that these nonlinearities
|
||||
# give us, we get zero diagonal entries in the fisher matrix that
|
||||
# nnet-combine-fast uses for scaling, which after flooring and inversion, has
|
||||
# the effect that the initial model chosen gets much higher learning rates
|
||||
# than the others. This prevents the optimization from working well.
|
||||
$cmd $combine_parallel_opts $dir/log/combine.log \
|
||||
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
|
||||
--num-threads=$combine_num_threads \
|
||||
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
|
||||
$dir/final.mdl || exit 1;
|
||||
|
||||
# Normalize stddev for affine or block affine layers that are followed by a
|
||||
# pnorm layer and then a normalize layer.
|
||||
$cmd $dir/log/normalize.log \
|
||||
nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;
|
||||
|
||||
# Compute the probability of the final, combined model with
|
||||
# the same subset we used for the previous compute_probs, as the
|
||||
# different subsets will lead to different probs.
|
||||
$cmd $dir/log/compute_prob_valid.final.log \
|
||||
nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
|
||||
$cmd $dir/log/compute_prob_train.final.log \
|
||||
nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
|
||||
fi
|
||||
|
||||
if [ $stage -le $[$num_iters+1] ]; then
|
||||
echo "Getting average posterior for purposes of adjusting the priors."
|
||||
# Note: this just uses CPUs, using a smallish subset of data.
|
||||
rm $dir/post.$x.*.vec 2>/dev/null
|
||||
$cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
|
||||
nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
|
||||
nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
|
||||
nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
|
||||
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
|
||||
|
||||
sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear.
|
||||
|
||||
$cmd $dir/log/vector_sum.$x.log \
|
||||
vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
|
||||
|
||||
rm $dir/post.$x.*.vec;
|
||||
|
||||
echo "Re-adjusting priors based on computed posteriors"
|
||||
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
      # delete all but every 100th model; don't delete the ones which combine to form the final model.
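      # (Worked example with a hypothetical num_iters=250: this keeps 0.mdl,
      # 100.mdl, 200.mdl and the final 250.mdl, and removes everything else.)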
      rm $dir/$x.mdl
    fi
  done
fi

@ -1,13 +1,12 @@
#!/bin/bash
# Copyright 2015   David Snyder
#           2015   Johns Hopkins University (Author: Daniel Garcia-Romero)
#           2015   Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
#
# See README.txt for more info on data required.
# Results (EERs) are inline in comments below.

# This example script is still a bit of a mess, and needs to be
# cleaned up, but it shows you all the basic ingredients.

. cmd.sh
. path.sh
set -e

@ -0,0 +1,28 @@
# "queue.pl" uses qsub.  The options to it are
# options to qsub.  If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine).

#a) JHU cluster options
export train_cmd="queue.pl -l arch=*64*"
export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G"
#export cuda_cmd="..."
export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G"

#b) BUT cluster options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"
#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"

#c) run it locally...
#export train_cmd=run.pl
#export decode_cmd=run.pl
export cuda_cmd=run.pl
#export mkgraph_cmd=run.pl

@ -0,0 +1,3 @@
beam=11.0 # beam for decoding.  Was 13.0 in the scripts.
first_beam=8.0 # beam for 1st-pass decoding in SAT.
lattice_beam=6.0

@ -0,0 +1,2 @@
beam=13.0 # beam for decoding.  Was 13.0 in the scripts.
lattice_beam=8.0 # this has most effect on size of the lattices.

@ -0,0 +1,6 @@
--sample-frequency=8000
--frame-length=25 # the default is 25
--low-freq=20 # the default.
--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case).
--num-ceps=20 # higher than the default which is 12.
--snip-edges=false

@ -0,0 +1,3 @@
--use-energy=false   # only non-default option.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
--snip-edges=false

@ -0,0 +1,11 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false   # use average of log energy, not energy.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
--snip-edges=false

@ -0,0 +1,2 @@
--vad-energy-threshold=5.5
--vad-energy-mean-scale=0.5

@ -0,0 +1 @@
../v1/local/

@ -0,0 +1,263 @@
#!/bin/bash
# Copyright 2015   David Snyder
#           2015   Johns Hopkins University (Author: Daniel Garcia-Romero)
#           2015   Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
#
# See README.txt for more info on data required.
# Results (EERs) are inline in comments below.
#
# This example script shows how to replace the GMM-UBM
# with a DNN trained for ASR.  It also demonstrates the
# use of the DNN to create a supervised-GMM.

. cmd.sh
. path.sh
set -e
mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc
trials_female=data/sre10_test_female/trials
trials_male=data/sre10_test_male/trials
trials=data/sre10_test/trials
nnet=exp/nnet2_online/nnet_ms_a/final.mdl
num_components=5297

# Train a DNN on about 1800 hours of the English portion of Fisher.
local/dnn/train_dnn.sh

# Prepare the SRE 2010 evaluation data.
local/make_sre_2010_test.pl /export/corpora5/SRE/SRE2010/eval/ data/
local/make_sre_2010_train.pl /export/corpora5/SRE/SRE2010/eval/ data/

# Prepare a collection of NIST SRE data prior to 2010. This is
# used to train the PLDA model and is also combined with SWB
# for UBM and i-vector extractor training data.
local/make_sre.sh data

# Prepare SWB for UBM and i-vector extractor training.
local/make_swbd2_phase2.pl /export/corpora5/LDC/LDC99S79 \
  data/swbd2_phase2_train
local/make_swbd2_phase3.pl /export/corpora5/LDC/LDC2002S06 \
  data/swbd2_phase3_train
local/make_swbd_cellular1.pl /export/corpora5/LDC/LDC2001S13 \
  data/swbd_cellular1_train
local/make_swbd_cellular2.pl /export/corpora5/LDC/LDC2004S07 \
  data/swbd_cellular2_train

utils/combine_data.sh data/train \
  data/swbd_cellular1_train data/swbd_cellular2_train \
  data/swbd2_phase2_train data/swbd2_phase3_train data/sre

cp -r data/train data/train_dnn
cp -r data/sre data/sre_dnn
cp -r data/sre10_train data/sre10_train_dnn
cp -r data/sre10_test data/sre10_test_dnn

# Extract speaker recognition features.
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
  data/train exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
  data/sre exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
  data/sre10_train exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
  data/sre10_test exp/make_mfcc $mfccdir

# Extract DNN features.
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
  data/train_dnn exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
  data/sre_dnn exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
  data/sre10_train_dnn exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
  data/sre10_test_dnn exp/make_mfcc $mfccdir

for name in sre_dnn sre10_train_dnn sre10_test_dnn train_dnn sre sre10_train sre10_test train; do
  utils/fix_data_dir.sh data/${name}
done

# Compute VAD decisions. These will be shared across both sets of features.
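# (Descriptive note, added for clarity: both mfcc.conf and mfcc_hires.conf use
# --snip-edges=false and the default 10 ms frame shift, so each utterance gets
# the same number of frames in data/<name> and data/<name>_dnn; that is why the
# VAD computed here can simply be copied to the _dnn directories below.)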
sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
  data/train exp/make_vad $vaddir
sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
  data/sre exp/make_vad $vaddir
sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
  data/sre10_train exp/make_vad $vaddir
sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
  data/sre10_test exp/make_vad $vaddir

for name in sre sre10_train sre10_test train; do
  cp data/${name}/vad.scp data/${name}_dnn/vad.scp
  cp data/${name}/utt2spk data/${name}_dnn/utt2spk
  cp data/${name}/spk2utt data/${name}_dnn/spk2utt
  utils/fix_data_dir.sh data/${name}
  utils/fix_data_dir.sh data/${name}_dnn
done

# Subset training data for faster sup-GMM initialization.
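# (Descriptive note, added for clarity: the --utt-list option of
# utils/subset_data_dir.sh restricts the second subset to exactly the
# utterances chosen for data/train_dnn_32k, keeping the two feature views of
# the 32k subset in sync.)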
utils/subset_data_dir.sh data/train_dnn 32000 data/train_dnn_32k
utils/fix_data_dir.sh data/train_dnn_32k
utils/subset_data_dir.sh --utt-list data/train_dnn_32k/utt2spk data/train data/train_32k
utils/fix_data_dir.sh data/train_32k

# Initialize a full GMM from the DNN posteriors and speaker recognition
# features. This can be used either alone as a UBM, or to initialize the
# i-vector extractor in a DNN-based system.
sid/init_full_ubm_from_dnn.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" \
  data/train_32k \
  data/train_dnn_32k $nnet exp/full_ubm
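# (Optional sanity check, not part of the original recipe: fgmm-global-info
# prints the number of Gaussians and the feature dimension of the UBM; the
# Gaussian count should equal the number of DNN senones, i.e. $num_components.)
# fgmm-global-info exp/full_ubm/final.ubm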

# Train an i-vector extractor based on just the supervised-GMM.
sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=70G,ram_free=70G" \
  --ivector-dim 600 \
  --num-iters 5 exp/full_ubm/final.ubm data/train \
  exp/extractor_sup_gmm

# Train an i-vector extractor based on the DNN-UBM.
sid/train_ivector_extractor_dnn.sh --cmd "$train_cmd -l mem_free=80G,ram_free=80G" \
  --min-post 0.015 \
  --ivector-dim 600 \
  --num-iters 5 exp/full_ubm/final.ubm $nnet \
  data/train \
  data/train_dnn \
  exp/extractor_dnn

# Extract i-vectors from the extractor with the sup-GMM UBM.
sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \
  exp/extractor_sup_gmm data/sre10_train \
  exp/ivectors_sre10_train_sup_gmm

sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \
  exp/extractor_sup_gmm data/sre10_test \
  exp/ivectors_sre10_test_sup_gmm

sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \
  exp/extractor_sup_gmm data/sre \
  exp/ivectors_sre_sup_gmm

# Extract i-vectors using the extractor with the DNN-UBM.
sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \
  exp/extractor_dnn \
  $nnet \
  data/sre10_test \
  data/sre10_test_dnn \
  exp/ivectors_sre10_test_dnn

sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \
  exp/extractor_dnn \
  $nnet \
  data/sre10_train \
  data/sre10_train_dnn \
  exp/ivectors_sre10_train_dnn

sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \
  exp/extractor_dnn \
  $nnet \
  data/sre \
  data/sre_dnn \
  exp/ivectors_sre_dnn

# Separate the i-vectors into male and female partitions and calculate
# i-vector means used by the scoring scripts.
local/scoring_common.sh data/sre data/sre10_train data/sre10_test \
  exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm \
  exp/ivectors_sre10_test_sup_gmm

local/scoring_common.sh data/sre data/sre10_train data/sre10_test \
  exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn \
  exp/ivectors_sre10_test_dnn

# The commented out scripts show how to do cosine scoring with and without
# first reducing the i-vector dimensionality with LDA. PLDA tends to work
# best, so we don't focus on the scores obtained here.
#
# local/cosine_scoring.sh data/sre10_train data/sre10_test \
#  exp/ivectors_sre10_train exp/ivectors_sre10_test $trials local/scores_gmm_2048_ind_pooled
# local/lda_scoring.sh data/sre data/sre10_train data/sre10_test \
#  exp/ivectors_sre exp/ivectors_sre10_train exp/ivectors_sre10_test $trials local/scores_gmm_2048_ind_pooled

# Create a gender independent PLDA model and do scoring with the sup-GMM system.
local/plda_scoring.sh data/sre data/sre10_train data/sre10_test \
  exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm \
  exp/ivectors_sre10_test_sup_gmm $trials local/scores_sup_gmm_ind_pooled
local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_female data/sre10_test_female \
  exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_female \
  exp/ivectors_sre10_test_sup_gmm_female $trials_female local/scores_sup_gmm_ind_female
local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_male data/sre10_test_male \
  exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_male \
  exp/ivectors_sre10_test_sup_gmm_male $trials_male local/scores_sup_gmm_ind_male

# Create gender dependent PLDA models and do scoring with the sup-GMM system.
local/plda_scoring.sh data/sre_female data/sre10_train_female data/sre10_test_female \
  exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_female \
  exp/ivectors_sre10_test_sup_gmm_female $trials_female local/scores_sup_gmm_dep_female
local/plda_scoring.sh data/sre_male data/sre10_train_male data/sre10_test_male \
  exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_male \
  exp/ivectors_sre10_test_sup_gmm_male $trials_male local/scores_sup_gmm_dep_male
mkdir -p local/scores_sup_gmm_dep_pooled
cat local/scores_sup_gmm_dep_male/plda_scores local/scores_sup_gmm_dep_female/plda_scores \
  > local/scores_sup_gmm_dep_pooled/plda_scores

# Create a gender independent PLDA model and do scoring with the DNN system.
local/plda_scoring.sh data/sre data/sre10_train data/sre10_test \
  exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn \
  exp/ivectors_sre10_test_dnn $trials local/scores_dnn_ind_pooled
local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_female data/sre10_test_female \
  exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_female \
  exp/ivectors_sre10_test_dnn_female $trials_female local/scores_dnn_ind_female
local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_male data/sre10_test_male \
  exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_male \
  exp/ivectors_sre10_test_dnn_male $trials_male local/scores_dnn_ind_male

# Create gender dependent PLDA models and do scoring with the DNN system.
local/plda_scoring.sh data/sre_female data/sre10_train_female data/sre10_test_female \
  exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_female \
  exp/ivectors_sre10_test_dnn_female $trials_female local/scores_dnn_dep_female
local/plda_scoring.sh data/sre_male data/sre10_train_male data/sre10_test_male \
  exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_male \
  exp/ivectors_sre10_test_dnn_male $trials_male local/scores_dnn_dep_male
mkdir -p local/scores_dnn_dep_pooled
cat local/scores_dnn_dep_male/plda_scores local/scores_dnn_dep_female/plda_scores \
  > local/scores_dnn_dep_pooled/plda_scores


# Sup-GMM PLDA EER
# ind pooled: 1.94
# ind female: 1.98
# ind male: 1.79
# dep female: 1.87
# dep male: 1.30
# dep pooled: 1.65
echo "Sup-GMM-$num_components EER"
for x in ind dep; do
  for y in female male pooled; do
    eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_sup_gmm_${x}_${y}/plda_scores) 2> /dev/null`
    echo "${x} ${y}: $eer"
  done
done

# DNN PLDA EER
# ind pooled: 1.20
# ind female: 1.46
# ind male: 0.87
# dep female: 1.43
# dep male: 0.72
# dep pooled: 1.09
echo "DNN-$num_components EER"
for x in ind dep; do
  for y in female male pooled; do
    eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_dnn_${x}_${y}/plda_scores) 2> /dev/null`
    echo "${x} ${y}: $eer"
  done
done

# In comparison, here is the EER for an unsupervised GMM-based system
# with 5297 components (the same as the number of senones in the DNN):
# GMM-5297 PLDA EER
# ind pooled: 2.42
# ind female: 2.43
# ind male: 2.40
# dep female: 2.16
# dep male: 1.53
# dep pooled: 2.00

@ -0,0 +1 @@
../v1/sid

@ -0,0 +1 @@
../v1/steps

@ -0,0 +1 @@
../v1/utils

@ -1,6 +1,8 @@
// fgmmbin/fgmm-global-acc-stats-post.cc

// Copyright 2015   David Snyder
//           2015   Johns Hopkins University (Author: Daniel Povey)
//           2015   Johns Hopkins University (Author: Daniel Garcia-Romero)

// See ../../COPYING for clarification regarding multiple authors
//

@ -1,6 +1,8 @@
// fgmmbin/fgmm-global-init-from-accs.cc

// Copyright 2015   David Snyder
//           2015   Johns Hopkins University (Author: Daniel Povey)
//           2015   Johns Hopkins University (Author: Daniel Garcia-Romero)

// See ../../COPYING for clarification regarding multiple authors
//