trunk: Adding DNN-based speaker recognition recipe in egs/sre10

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@5223 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
David Snyder 2015-07-10 17:53:28 +00:00
Parent 189c77419e
Commit 55d8f863f3
31 changed files with 2833 additions and 3 deletions


@@ -0,0 +1,94 @@
#!/bin/bash
# Copyright 2013 Daniel Povey
# 2014-2015 David Snyder
# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
# 2015 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This script extracts iVectors for a set of utterances, given
# features and a trained DNN-based iVector extractor.
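#
# A rough sketch of what happens below (derived from the extraction command in
# this script): the DNN computes per-frame log-posteriors from the features in
# <data-dnn>, non-speech frames are removed using the VAD decisions in <data>,
# the posteriors are pruned with --min-post and scaled with --posterior-scale,
# and ivector-extract then accumulates them against the speaker-recognition
# features in <data> using <extractor-dir>/final.ie.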
# Begin configuration section.
nj=30
cmd="run.pl"
stage=0
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
posterior_scale=1.0 # This scale helps to control for successive features being highly
# correlated. E.g. try 0.1 or 0.3.
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
echo "Usage: $0 <extractor-dir> <data> <ivector-dir>"
echo " e.g.: $0 exp/extractor_2048_male data/train_male exp/ivectors_male"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-iters <#iters|10> # Number of iterations of E-M"
echo " --nj <n|10> # Number of jobs (also see num-processes and num-threads)"
echo " --num-threads <n|8> # Number of threads for each process"
echo " --stage <stage|0> # To control partial reruns"
echo " --num-gselect <n|20> # Number of Gaussians to select using"
echo " # diagonal model."
echo " --min-post <min-post|0.025> # Pruning threshold for posteriors"
exit 1;
fi
srcdir=$1
nnet=$2
data=$3
data_dnn=$4
dir=$5
for f in $srcdir/final.ie $srcdir/final.ubm $data/feats.scp ; do
[ ! -f $f ] && echo "No such file $f" && exit 1;
done
# Set various variables.
mkdir -p $dir/log
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;
sdata_dnn=$data_dnn/split$nj;
utils/split_data.sh $data_dnn $nj || exit 1;
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options
## Set up features.
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"
nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.scp ark:- |"
if [ $stage -le 0 ]; then
echo "$0: extracting iVectors"
$cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \
\| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \
\| logprob-to-post --min-post=$min_post ark:- ark:- \| \
scale-post ark:- $posterior_scale ark:- \| \
ivector-extract --verbose=2 $srcdir/final.ie "$feats" ark,s,cs:- \
ark,scp,t:$dir/ivector.JOB.ark,$dir/ivector.JOB.scp || exit 1;
fi
if [ $stage -le 1 ]; then
echo "$0: combining iVectors across jobs"
for j in $(seq $nj); do cat $dir/ivector.$j.scp; done >$dir/ivector.scp || exit 1;
fi
if [ $stage -le 2 ]; then
# Be careful here: the speaker-level iVectors are now length-normalized,
# even if they are otherwise the same as the utterance-level ones.
echo "$0: computing mean of iVectors for each speaker and length-normalizing"
$cmd $dir/log/speaker_mean.log \
ivector-normalize-length scp:$dir/ivector.scp ark:- \| \
ivector-mean ark:$data/spk2utt ark:- ark:- ark,t:$dir/num_utts.ark \| \
ivector-normalize-length ark:- ark,scp:$dir/spk_ivector.ark,$dir/spk_ivector.scp || exit 1;
fi


@@ -0,0 +1,79 @@
#!/bin/bash
# Copyright 2015 David Snyder
# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
# 2015 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script derives a full-covariance UBM from DNN posteriors and
# speaker recognition features.
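#
# The number of Gaussians in the resulting UBM equals the number of DNN output
# units (senones), so --num-components must match the DNN's output dimension.
# A hedged sketch of one way to check this (assuming nnet-am-info reports an
# output-dim field for the nnet2 model):
#
#   nnet-am-info exp/dnn/final.mdl | grep output-dim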
# Begin configuration section.
nj=40
cmd="run.pl"
stage=-2
delta_window=3
delta_order=2
num_components=5297
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/init_full_ubm_from_dnn.sh <data-speaker-id> <data-dnn> <dnn-model> <new-ubm-dir>"
echo "Initializes a full-covariance UBM from DNN posteriors and speaker recognition features."
echo " e.g.: steps/init_full_ubm_from_dnn.sh data/train data/train_dnn exp/dnn/final.mdl exp/full_ubm"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --nj <n|16> # number of parallel training jobs"
echo " --delta-window <n|3> # delta window size"
echo " --delta-order <n|2> # delta order"
echo " --number-components <n|5297> # number of components in the final GMM needs"
echo " # to be equal to the size of the DNN output layer."
exit 1;
fi
data=$1
data_dnn=$2
nnet=$3
dir=$4
for f in $data/feats.scp $data/vad.scp; do
[ ! -f $f ] && echo "No such file $f" && exit 1;
done
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;
sdata_dnn=$data_dnn/split$nj;
utils/split_data.sh $data_dnn $nj || exit 1;
delta_opts="--delta-window=$delta_window --delta-order=$delta_order"
echo $delta_opts > $dir/delta_opts
logdir=$dir/log
nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.scp ark:- |"
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | \
apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | \
select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"
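# Note on the two feature pipelines above: "nnet_feats" are the (CMN-normalized)
# features the DNN was trained on and are only used to compute posteriors, while
# "feats" are the speaker-recognition features (deltas + sliding CMVN + VAD) on
# which the full-covariance statistics are actually accumulated.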
$cmd JOB=1:$nj $logdir/make_stats.JOB.log \
nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \
\| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \
\| logprob-to-post ark:- ark:- \| \
fgmm-global-acc-stats-post ark:- $num_components "$feats" \
$dir/stats.JOB.acc || exit 1;
$cmd $dir/log/init.log \
fgmm-global-init-from-accs --verbose=2 \
"fgmm-global-sum-accs - $dir/stats.*.acc |" $num_components \
$dir/final.ubm || exit 1;
exit 0;


@@ -0,0 +1,181 @@
#!/bin/bash
# Copyright 2013 Daniel Povey
# 2014-2015 David Snyder
# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
# 2015 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This script trains the i-vector extractor using a DNN-based UBM. It also requires
# an fGMM, usually created by the script sid/init_full_ubm_from_dnn.sh.
# Note: there are 3 separate levels of parallelization: num_threads, num_processes,
# and num_jobs. This may seem a bit excessive. It has to do with minimizing
# memory usage and disk I/O, subject to various constraints. The "num_threads"
# is how many threads a program uses; the "num_processes" is the number of separate
# processes a single job spawns, and then sums the accumulators in memory.
# Our recommendation:
# - Set num_threads to the minimum of 4 and the number of virtual cores your machine has.
# (because of needing to lock various global quantities, the program can't
# use many more than 4 threads with good CPU utilization).
# - Set num_processes to the number of virtual cores on each machine you have, divided by
# num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue
# that's busy with other people's jobs, it may be wise to set it to rather less
# than this maximum though, or your jobs won't get scheduled. And if memory is
# tight you need to be careful; in our normal setup, each process uses about 5G.
# - Set num_jobs to as many of the jobs (each using $num_threads * $num_processes CPUs)
# your queue will let you run at one time, but don't go much more than 10 or 20, or
# summing the accumulators will possibly get slow. If you have a lot of data, you
# may want more jobs, though.
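# For example, with the defaults below (nj=10, num_processes=4, num_threads=4),
# the data is split into nj * num_processes = 40 pieces, each queue job asks for
# num_threads * num_processes = 16 slots, and at roughly 5G per process a job
# needs on the order of 20G of memory. These numbers are just an illustration of
# the arithmetic; tune them for your own queue.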
# Begin configuration section.
nj=10 # this is the number of separate queue jobs we run, but each one
# contains num_processes sub-jobs.. the real number of threads we
# run is nj * num_processes * num_threads, and the number of
# separate pieces of data is nj * num_processes.
num_threads=4
num_processes=4 # each job runs this many processes, each with --num-threads threads
cmd="run.pl"
stage=-4
num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select
ivector_dim=400 # dimension of the extracted i-vector
use_weights=false # set to true to turn on the regression of log-weights on the ivector.
num_iters=10
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
num_samples_for_weights=3 # smaller than the default for speed (relates to a sampling method)
cleanup=true
posterior_scale=1.0 # This scale helps to control for successive features being highly
# correlated. E.g. try 0.1 or 0.3
sum_accs_opt=
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
echo "Usage: $0 <fgmm-model> <dnn-model> <data-speaker-id> <data-dnn> <extractor-dir>"
echo " e.g.: $0 exp/sup_ubm/final.ubm exp/dnn/final.mdl data/train data/train_dnn exp/extractor_male"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-iters <#iters|10> # Number of iterations of E-M"
echo " --nj <n|10> # Number of jobs (also see num-processes and num-threads)"
echo " --num-processes <n|4> # Number of processes for each queue job (relates"
echo " # to summing accs in memory)"
echo " --num-threads <n|4> # Number of threads for each process (can't be usefully"
echo " # increased much above 4)"
echo " --stage <stage|-4> # To control partial reruns"
echo " --num-gselect <n|20> # Number of Gaussians to select using"
echo " # diagonal model."
echo " --sum-accs-opt <option|''> # Option e.g. '-l hostname=a15' to localize"
echo " # sum-accs process to nfs server."
exit 1;
fi
fgmm_model=$1
nnet=$2
data=$3
data_dnn=$4
dir=$5
srcdir=$(dirname $fgmm_model)
for f in $fgmm_model $data/feats.scp ; do
[ ! -f $f ] && echo "No such file $f" && exit 1;
done
# Set various variables.
mkdir -p $dir/log
nj_full=$[$nj*$num_processes]
sdata=$data/split$nj_full;
utils/split_data.sh $data $nj_full || exit 1;
sdata_dnn=$data_dnn/split$nj_full;
utils/split_data.sh $data_dnn $nj_full || exit 1;
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
if [ -f $srcdir/delta_opts ]; then
cp $srcdir/delta_opts $dir/ 2>/dev/null
fi
splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options
parallel_opts="-pe smp $[$num_threads*$num_processes]"
## Set up features.
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"
nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.scp ark:- |"
# Initialize the i-vector extractor using the FGMM input
if [ $stage -le -2 ]; then
cp $fgmm_model $dir/final.ubm || exit 1;
$cmd $dir/log/convert.log \
fgmm-global-to-gmm $dir/final.ubm $dir/final.dubm || exit 1;
$cmd $dir/log/init.log \
ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \
$dir/final.ubm $dir/0.ie || exit 1;
fi
# Compute DNN posteriors (this takes the place of Gaussian selection and posterior extraction in the GMM-based recipe)
if [ $stage -le -1 ]; then
echo $nj_full > $dir/num_jobs
echo "$0: doing DNN posterior computation"
$cmd JOB=1:$nj_full $dir/log/post.JOB.log \
nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \
\| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \
\| logprob-to-post --min-post=$min_post ark,s,cs:- ark:- \| \
scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1;
else
if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then
echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)"
exit 1
fi
fi
x=0
while [ $x -lt $num_iters ]; do
if [ $stage -le $x ]; then
rm $dir/.error 2>/dev/null
Args=() # bash array of training commands for 1:nj, that put accs to stdout.
for j in $(seq $nj_full); do
Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads --num-samples-for-weights=$num_samples_for_weights $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g`
done
echo "Accumulating stats (pass $x)"
for g in $(seq $nj); do
start=$[$num_processes*($g-1)+1]
$cmd $parallel_opts $dir/log/acc.$x.$g.log \
ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \
$dir/acc.$x.$g || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;
accs=""
for j in $(seq $nj); do
accs+="$dir/acc.$x.$j "
done
echo "Summing accs (pass $x)"
$cmd $sum_accs_opt $dir/log/sum_acc.$x.log \
ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1;
echo "Updating model (pass $x)"
nt=$[$num_threads*$num_processes] # use the same number of threads that
# each accumulation process uses, since we
# can be sure the queue will support this many.
$cmd -pe smp $nt $dir/log/update.$x.log \
ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1;
rm $dir/acc.$x.*
if $cleanup; then
rm $dir/acc.$x
# rm $dir/$x.ie
fi
fi
x=$[$x+1]
done
ln -s $x.ie $dir/final.ie


@@ -0,0 +1,5 @@
This directory contains DNN scripts based on the nnet2 recipes found in
the ASR examples (e.g., fisher_english). The scripts have been modified
for speaker recognition purposes. Most of the scripts are lightly modified
versions of those appearing in the steps or local directories of
egs/fisher_english.


@@ -0,0 +1,62 @@
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
mkdir -p data/lang_test
arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
mkdir -p data/lang_test
cp -r data/lang/* data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
echo "First few lines of lexicon FST:"
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head
echo Performing further checks
# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.
# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
# Checking that disambiguated lexicon times G is determinizable
# Note: we do this with fstdeterminizestar not fstdeterminize, as
# fstdeterminize was taking forever (presumably relates to a bug
# in this version of OpenFst that makes determinization slow for
# some cases).
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
fstdeterminizestar >/dev/null || echo Error
# Checking that LG is stochastic:
fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
fstisstochastic || echo "[log:] LG is not stochastic"
echo "$0 succeeded"


@@ -0,0 +1,211 @@
#!/bin/bash
# Copyright 2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
stage=0
calldata=
while test $# -gt 0
do
case "$1" in
--calldata) calldata=1
;;
*) break;
;;
esac
shift
done
. utils/parse_options.sh
if [ $# -eq 0 ]; then
echo "$0 [--calldata] <fisher-dir-1> [<fisher-dir-2> ...]"
echo " e.g.: $0 /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19\\"
echo " /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13"
echo " (We also support a single directory that has the contents of all of them)"
echo " If specified, --calldata will be used to map Kaldi speaker ID to real"
echo " speaker PIN released with the Fisher corpus."
exit 1;
fi
# Check that the arguments are all absolute pathnames.
for dir in $*; do
case $dir in /*) ;; *)
echo "$0: all arguments must be absolute pathnames."; exit 1;
esac
done
# First check we have the right things in there...
#
rm -r data/local/data/links 2>/dev/null
mkdir -p data/local/data/links || exit 1;
for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \
fe_03_p2_sph1 fe_03_p2_sph3 fe_03_p2_sph5 fe_03_p2_sph7 fe_03_p1_sph2 \
fe_03_p1_sph4 fe_03_p1_sph6 fe_03_p1_tran fe_03_p2_sph2 fe_03_p2_sph4 \
fe_03_p2_sph6 fe_03_p2_tran; do
found_subdir=false
for dir in $*; do
if [ -d $dir/$subdir ]; then
found_subdir=true
ln -s $dir/$subdir data/local/data/links
else
new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/)
if [ -d $dir/$new_style_subdir ]; then
found_subdir=true
ln -s $dir/$new_style_subdir data/local/data/links/$subdir
fi
fi
done
if ! $found_subdir; then
echo "$0: could not find the subdirectory $subdir in any of $*"
exit 1;
fi
done
tmpdir=`pwd`/data/local/data
links=data/local/data/links
. ./path.sh # Needed for KALDI_ROOT
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
# (1) Get transcripts in one file, and clean them up ..
if [ $stage -le 0 ]; then
find $links/fe_03_p1_tran/data $links/fe_03_p2_tran/data -name '*.txt' > $tmpdir/transcripts.flist
for dir in fe_03_p{1,2}_sph{1,2,3,4,5,6,7}; do
find $links/$dir/ -name '*.sph'
done > $tmpdir/sph.flist
n=`cat $tmpdir/transcripts.flist | wc -l`
if [ $n -ne 11699 ]; then
echo "Expected to find 11699 transcript files in the Fisher data, found $n"
exit 1;
fi
n=`cat $tmpdir/sph.flist | wc -l`
if [ $n -ne 11699 ]; then
echo "Expected to find 11699 .sph files in the Fisher data, found $n"
exit 1;
fi
fi
if [ $stage -le 1 ]; then
mkdir -p data/train_all_asr
## fe_03_00004.sph
## Transcribed at the LDC
#
#7.38 8.78 A: an- so the topic is
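# For reference (this follows from the parsing code below): a line like
# "7.38 8.78 A: an- so the topic is" in fe_03_00004.txt becomes the utterance
# fe_03_00004-A-000738-000878, i.e. <call-id>-<side>-<start>-<end> with times
# in centiseconds, zero-padded to 6 digits.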
echo -n > $tmpdir/text.1 || exit 1;
perl -e '
use File::Basename;
($tmpdir)=@ARGV;
open(F, "<$tmpdir/transcripts.flist") || die "Opening list of transcripts";
open(R, "|sort >data/train_all_asr/reco2file_and_channel") || die "Opening reco2file_and_channel";
open(T, ">$tmpdir/text.1") || die "Opening text output";
while (<F>) {
$file = $_;
m:([^/]+)\.txt: || die "Bad filename $_";
$call_id = $1;
print R "$call_id-A $call_id A\n";
print R "$call_id-B $call_id B\n";
open(I, "<$file") || die "Opening file $_";
$line1 = <I>;
$line1 =~ m/# (.+)\.sph/ || die "Bad first line $line1 in file $file";
$call_id eq $1 || die "Mismatch call-id $call_id vs $1\n";
while (<I>) {
if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.+\S|\S)\s*$/) {
$start = sprintf("%06d", $1 * 100.0);
$end = sprintf("%06d", $2 * 100.0);
length($end) > 6 && die "Time too long $end in file $file";
$side = $3;
$words = $4;
$utt_id = "${call_id}-$side-$start-$end";
print T "$utt_id $words\n" || die "Error writing to text file";
}
}
}
close(R); close(T) ' $tmpdir || exit 1;
fi
if [ $stage -le 2 ]; then
sort $tmpdir/text.1 | grep -v '((' | \
awk '{if (NF > 1){ print; }}' | \
sed 's:\[laugh\]:[laughter]:g' | \
sed 's:\[sigh\]:[noise]:g' | \
sed 's:\[cough\]:[noise]:g' | \
sed 's:\[sigh\]:[noise]:g' | \
sed 's:\[mn\]:[noise]:g' | \
sed 's:\[breath\]:[noise]:g' | \
sed 's:\[lipsmack\]:[noise]:g' > $tmpdir/text.2
cp $tmpdir/text.2 data/train_all_asr/text
# create segments file and utt2spk file...
! cat data/train_all_asr/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > data/train_all_asr/utt2spk \
&& echo "Error producing utt2spk file" && exit 1;
cat data/train_all_asr/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; $s = sprintf("%.2f", 0.01*$3);
$e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' > data/train_all_asr/segments
utils/utt2spk_to_spk2utt.pl <data/train_all_asr/utt2spk > data/train_all_asr/spk2utt
fi
if [ $stage -le 3 ]; then
for f in `cat $tmpdir/sph.flist`; do
# convert to absolute path
readlink -e $f
done > $tmpdir/sph_abs.flist
cat $tmpdir/sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp
cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \
sort -k1,1 -u > data/train_all_asr/wav.scp || exit 1;
fi
if [ $stage -le 4 ]; then
# get the spk2gender information. This is not a standard part of our
# file formats
# The files "filetable.txt" / "filetable2.txt" contain lines like:
# fe_03_p2_sph1 fe_03_05852.sph ff
cat $links/fe_03_p1_sph{1,2,3,4,5,6,7}/filetable.txt \
$links/fe_03_p2_sph{1,2,3,4,5,6,7}/docs/filetable2.txt | \
perl -ane 'm:^\S+ (\S+)\.sph ([fm])([fm]): || die "bad line $_;"; print "$1-A $2\n", "$1-B $3\n"; ' | \
sort | uniq | utils/filter_scp.pl data/train_all_asr/spk2utt > data/train_all_asr/spk2gender
if [ ! -s data/train_all_asr/spk2gender ]; then
echo "It looks like our first try at getting the spk2gender info did not work."
echo "(possibly older distribution?) Trying something else."
cat $links/fe_03_p1_tran/doc/fe_03_p1_filelist.tbl $links/fe_03_p2_tran/doc/fe_03_p2_filelist.tbl | \
perl -ane 'm:fe_03_p[12]_sph\d\t(\d+)\t([mf])([mf]): || die "Bad line $_";
print "fe_03_$1-A $2\n", "fe_03_$1-B $3\n"; ' | \
sort | uniq | utils/filter_scp.pl data/train_all_asr/spk2utt > data/train_all_asr/spk2gender
fi
fi
if [ ! -z "$calldata" ]; then # fix speaker IDs
cat $links/fe_03_p{1,2}_tran/doc/*calldata.tbl > $tmpdir/combined-calldata.tbl
local/fisher_fix_speakerid.pl $tmpdir/combined-calldata.tbl data/train_all_asr
utils/utt2spk_to_spk2utt.pl data/train_all_asr/utt2spk.new > data/train_all_asr/spk2utt.new
# patch files
for f in spk2utt utt2spk text segments spk2gender; do
cp data/train_all_asr/$f data/train_all_asr/$f.old || exit 1;
cp data/train_all_asr/$f.new data/train_all_asr/$f || exit 1;
done
rm $tmpdir/combined-calldata.tbl
fi
echo "Data preparation succeeded"


@@ -0,0 +1,114 @@
#!/usr/bin/perl -w
# Author: Peng Qi (pengqi@cs.stanford.edu)
# This script maps Fisher speaker IDs to the true physical speakers (the
# speaker PINs released with the corpus) and fixes the utterance IDs
# accordingly. It is expected to be run from one directory level above.
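#
# For example (with a hypothetical PIN): if combined-calldata.tbl says that
# side A of conversation fe_03_00004 was spoken by PIN 12345, then speaker
# fe_03_00004-A becomes fe_03_12345, and an utterance such as
# fe_03_00004-A-000738-000878 is renamed to fe_03_12345-00004-A-000738-000878
# (the "fe_03_" prefix of the old ID is dropped and the new speaker ID is
# prepended), as done in the utt2spk/segments/text fixes below.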
sub trim {
(my $s = $_[0]) =~ s/^\s+|\s+$//g;
return $s;
}
if ($#ARGV != 1) {
print "Usage: swbd1_fix_speakerid.pl <fisher-calldata-tbl-file> <data-dir>\n";
print "E.g.: swbd1_fix_speakerid.pl data/local/train/combined-calldata.tbl data/train_all\n";
}
$tab_file = $ARGV[0];
$dir = $ARGV[1];
%conv_to_spk = ();
open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n";
while (my $line = <$conv_tab>) {
chomp $line;
my @fields = split "," , $line;
#$fields[0] = trim($fields[0]);
$fields[5] = trim($fields[5]);
$fields[10] = trim($fields[10]);
$conv_to_spk{'fe_03_' . $fields[0] . '-A'} = $fields[5];
$conv_to_spk{'fe_03_' . $fields[0] . '-B'} = $fields[10];
}
close($conv_tab);
# fix utt2spk
%missingconv = ();
open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n";
open(my $utt2spk_new, '>', $dir . '/utt2spk.new');
while (my $line = <$utt2spk>) {
chomp $line;
my @fields = split " " , $line;
my $convid = substr $fields[0], 0, 13;
if (exists $conv_to_spk{ $convid }) {
my $spkid = $conv_to_spk{ $convid };
$spkid = "fe_03_" . $spkid;
my $newuttid = $spkid . '-' . (substr $fields[0], 6);
print $utt2spk_new "$newuttid $spkid\n";
} else {
my $convid = substr $convid, 6, 5;
$missingconv{$convid} = 1;
print $utt2spk_new $fields[0]." ".$fields[1]."\n";
}
}
close($utt2spk);
close($utt2spk_new);
foreach my $conv (keys %missingconv) {
print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n"
}
# fix spk2gender
if (open(my $spk2gender, '<', $dir . '/spk2gender')) {
open(my $spk2gender_new, '>', $dir . '/spk2gender.new');
while (my $line = <$spk2gender>) {
chomp $line;
my @fields = split " ", $line;
my $convid = $fields[0];
if (exists $conv_to_spk{ $convid }) {
my $spkid = $conv_to_spk{ $convid };
$spkid = "fe_03_" . $spkid;
print $spk2gender_new $spkid." ".$fields[1]."\n";
} else {
print $spk2gender_new $fields[0]." ".$fields[1]."\n";
}
}
close($spk2gender);
close($spk2gender_new);
}
# fix segments and text
foreach my $file ('segments','text') {
open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n";
open(my $newfile, '>', "$dir/$file.new");
while (my $line = <$oldfile>) {
chomp $line;
my $convid = substr $line, 0, 13;
if (exists $conv_to_spk{$convid}) {
my $spkid = $conv_to_spk{$convid};
print $newfile "fe_03_$spkid-" . (substr $line, 6) . "\n";
} else {
print $newfile "$line\n";
}
}
}


@@ -0,0 +1,182 @@
#!/bin/bash
#
# To be run from one directory above this script.
## This script prepares the dictionary directory data/local/dict. It checks out
## the CMU pronunciation dictionary via svn and builds a lexicon restricted to
## (and expanded from) the words seen in the Fisher training transcripts in
## data/train_all_asr/text, so fisher_data_prep.sh must already have been run.
. path.sh
# The parts of the output of this that will be needed are
# [in data/local/dict/ ]
# lexicon.txt
# extra_questions.txt
# nonsilence_phones.txt
# optional_silence.txt
# silence_phones.txt
#check existing directories
[ $# != 0 ] && echo "Usage: local/dnn/fisher_prepare_dict.sh" && exit 1;
dir=data/local/dict
mkdir -p $dir
echo "Getting CMU dictionary"
svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict
# silence phones, one per line.
for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt
echo sil > $dir/optional_silence.txt
# For this setup we're discarding stress.
cat $dir/cmudict/cmudict.0.7a.symbols | sed s/[0-9]//g | \
tr '[A-Z]' '[a-z]' | perl -ane 's:\r::; print;' | sort | uniq > $dir/nonsilence_phones.txt
# An extra question will be added by including the silence phones in one class.
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
grep -v ';;;' $dir/cmudict/cmudict.0.7a | tr '[A-Z]' '[a-z]' | \
perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; s: : :; print; }' | \
perl -ane '@A = split(" ", $_); for ($n = 1; $n<@A;$n++) { $A[$n] =~ s/[0-9]//g; } print join(" ", @A) . "\n";' | \
sort | uniq > $dir/lexicon1_raw_nosil.txt || exit 1;
# Add prons for laughter, noise, oov
for w in `grep -v sil $dir/silence_phones.txt`; do
echo "[$w] $w"
done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
# This is just for diagnostics:
cat data/train_all_asr/text | \
awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \
sort -nr > $dir/word_counts
cat $dir/word_counts | awk '{print $2}' > $dir/word_list
# between lexicon2_raw and lexicon3_expand we limit it to the words seen in
# the Fisher data.
utils/filter_scp.pl $dir/word_list $dir/lexicon2_raw.txt > $dir/lexicon3_expand.txt
# From lexicon2_raw to lexicon3_expand, we also expand the vocab for acronyms
# like c._n._n. and other underscore-containing things as long as the new vocab
# could be divided into finite parts contained in lexicon2_raw
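# For example (hypothetical entry): if the word list contains c._n._n. and
# lexicon2_raw has prons for c. and n., the code below emits a pron for
# c._n._n. by concatenating the prons of c., n. and n.; words with any
# missing part are reported to stderr and skipped.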
cat $dir/lexicon2_raw.txt | \
perl -e 'while(<STDIN>) { @A=split; $w = shift @A; $pron{$w} = join(" ", @A); }
($w) = @ARGV; open(W, "<$w") || die "Error opening word-counts from $w";
while(<W>) { # reading in words we saw in training data..
($c, $w) = split;
if (!defined $pron{$w}) {
@A = split("_", $w);
if (@A > 1) {
$this_pron = "";
$pron_ok = 1;
foreach $a (@A) {
if (defined($pron{$a})) { $this_pron = $this_pron . "$pron{$a} "; }
else { $pron_ok = 0; print STDERR "Not handling word $w, count is $c\n"; last; }
}
if ($pron_ok) { $new_pron{$w} = $this_pron; }
}
}
}
foreach $w (keys %new_pron) { print "$w $new_pron{$w}\n"; }' \
$dir/word_counts >> $dir/lexicon3_expand.txt || exit 1;
cat $dir/lexicon3_expand.txt \
<( echo "mm m"
echo "<unk> oov" ) > $dir/lexicon4_extra.txt
cp $dir/lexicon4_extra.txt $dir/lexicon.txt
rm $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists.
awk '{print $1}' $dir/lexicon.txt | \
perl -e '($word_counts)=@ARGV;
open(W, "<$word_counts")||die "opening word-counts $word_counts";
while(<STDIN>) { chop; $seen{$_}=1; }
while(<W>) {
($c,$w) = split;
if (!defined $seen{$w}) { print; }
} ' $dir/word_counts > $dir/oov_counts.txt
echo "*Highest-count OOVs are:"
head -n 20 $dir/oov_counts.txt
utils/validate_dict_dir.pl $dir
exit 0;
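# Note: the script exits above, so everything below this point is unreachable;
# it appears to be leftover Switchboard dictionary-preparation code kept for
# reference.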
srcdir=data/local/train_asr # This is where we downloaded some stuff..
dir=data/local/dict
mkdir -p $dir
srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
# assume swbd_p1_data_prep.sh was done already.
[ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1;
#(2a) Dictionary preparation:
# Pre-processing (Upper-case, remove comments)
awk 'BEGIN{getline}($0 !~ /^#/) {$0=toupper($0); print}' \
$srcdict | sort | awk '($0 !~ /^[:space:]*$/) {print}' \
> $dir/lexicon1.txt || exit 1;
cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
grep -v SIL > $dir/nonsilence_phones.txt || exit 1;
( echo SIL; echo SPN; echo NSN; echo LAU ) > $dir/silence_phones.txt
echo SIL > $dir/optional_silence.txt
# No "extra questions" in the input to this setup, as we don't
# have stress or tone.
echo -n >$dir/extra_questions.txt
# Add to the lexicon the silences, noises etc.
(echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';
echo '<UNK> SPN' ) | \
cat - $dir/lexicon1.txt > $dir/lexicon2.txt || exit 1;
# Map the words in the lexicon. That is-- for each word in the lexicon, we map it
# to a new written form. The transformations we do are:
# remove laughter markings, e.g.
# [LAUGHTER-STORY] -> STORY
# Remove partial-words, e.g.
# -[40]1K W AH N K EY
# becomes -1K
# and
# -[AN]Y IY
# becomes
# -Y
# -[A]B[OUT]- B
# becomes
# -B-
# Also, curly braces, which appear to be used for "nonstandard"
# words or non-words, are removed, e.g.
# {WOLMANIZED} W OW L M AX N AY Z D
# -> WOLMANIZED
# Also, mispronounced words, e.g.
# [YEAM/YEAH] Y AE M
# are changed to just e.g. YEAM, i.e. the orthography
# of the mispronounced version.
# Note-- this is only really to be used in training. The main practical
# reason is to avoid having tons of disambiguation symbols, which
# we otherwise would get because there are many partial words with
# the same phone sequences (most problematic: S).
# Also, map
# THEM_1 EH M -> THEM
# so that multiple pronunciations just have alternate entries
# in the lexicon.
local/dnn/swbd_map_words.pl -f 1 $dir/lexicon2.txt | sort | uniq > $dir/lexicon3.txt || exit 1;
cp $dir/lexicon3.txt $dir/lexicon.txt # This is the final lexicon.
echo Prepared input dictionary and phone-sets for Switchboard phase 1.


@@ -0,0 +1,111 @@
#!/bin/bash
# To be run from one directory above this script.
text=data/train_all_asr/text
lexicon=data/local/dict/lexicon.txt
for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# This script takes no arguments. It assumes you have already run
# fisher_data_prep.sh and fisher_prepare_dict.sh
# It takes as input the files
#data/train_all/text
#data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir
export LC_ALL=C # You'll get errors about things not being sorted if you
# have a different locale.
export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
( # First make sure the kaldi_lm toolkit is installed.
cd ../../../tools || exit 1;
if [ -d kaldi_lm ]; then
echo Not installing the kaldi_lm toolkit since it is already there.
else
echo Downloading and installing the kaldi_lm tools
if [ ! -f kaldi_lm.tar.gz ]; then
wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;
cd kaldi_lm
make || exit 1;
echo Done making the kaldi_lm tools
fi
) || exit 1;
mkdir -p $dir
cleantext=$dir/text.no_oov
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<unk> ");} } printf("\n");}' \
> $cleantext || exit 1;
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
# note: we probably won't really make use of <unk> as there aren't any OOVs
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<unk>" > $dir/word_map \
|| exit 1;
# note: ignore 1st field of train.txt, it's the utterance-id.
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
{ for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
|| exit 1;
train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
# Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332
# note: output is
# data/local/lm/3gram-mincount/lm_unpruned.gz
exit 0
# From here is some commands to do a baseline with SRILM (assuming
# you have it installed).
heldout_sent=10000 # Don't change this if you want result to be comparable with
# kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $sdir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $sdir/train
cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist
ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
-map-unk "<unk>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
# 0 zeroprobs, logprob= -165170 ppl= 71.7609 ppl1= 123.258
# Note: perplexity SRILM gives to Kaldi-LM model is similar to what kaldi-lm reports above.
# Difference in WSJ must have been due to different treatment of <unk>.
ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
# 0 zeroprobs, logprob= -164990 ppl= 71.4278 ppl1= 122.614


@@ -0,0 +1,321 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
# 2015 David Snyder
# Apache 2.0.
#
# This script is based on get_egs2.sh in ../../steps/nnet2/, but has been
# modified for speaker recognition purposes to use a sliding-window CMN.
#
# This script, which will generally be called from other neural-net training
# scripts, extracts the training examples used to train the neural net (and also
# the validation examples used for diagnostics), and puts them in separate archives.
#
# This script differs from get_egs.sh in that it dumps egs with several frames
# of labels, controlled by the frames_per_eg config variable (default: 8). This
# takes many times less disk space because typically we have 4 to 7 frames of
# context on the left and right, and this ends up getting shared. This is at
# the expense of slightly higher disk I/O during training time.
#
# We also have a simpler way of dividing the egs up into pieces, with one level
# of index, so we have $dir/egs.{0,1,2,...}.ark instead of having two levels of
# indexes. The extra files we write to $dir that explain the structure are
# $dir/info/num_archives, which contains the number of files egs.*.ark, and
# $dir/info/frames_per_eg, which contains the number of frames of labels per eg
# (e.g. 7), and $dir/samples_per_archive. These replace the files
# iters_per_epoch and num_jobs_nnet and egs_per_iter that the previous script
# wrote to. This script takes the directory where the "egs" are located as the
# argument, not the directory one level up.
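#
# A worked example of the arithmetic used below (illustration only): with
# num_frames = 50,000,000, frames_per_eg = 8 and samples_per_iter = 400,000,
# num_archives = 50000000 / (8 * 400000) + 1 = 16, and each archive then holds
# egs_per_archive = 50000000 / (8 * 16) = 390,625 examples, which is <=
# samples_per_iter as the script requires.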
# Begin configuration section.
cmd=run.pl
feat_type= # e.g. set it to "raw" to use raw MFCC
frames_per_eg=8 # number of frames of labels per example. more->less disk space and
# less time preparing egs, but more I/O during training.
# note: the script may reduce this if reduce_frames_per_eg is true.
left_context=4 # amount of left-context per eg
right_context=4 # amount of right-context per eg
reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg
# if there is only one archive and even with the
# reduced frames_per_eg, the number of
# samples_per_iter that would result is less than or
# equal to the user-specified value.
num_utts_subset=300 # number of utterances in validation and training
# subsets used for shrinkage and diagnostics.
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=10000 # # train frames for the above.
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
samples_per_iter=400000 # each iteration of training, see this many samples
# per job. This is just a guideline; it will pick a number
# that divides the number of samples in the entire data.
transform_dir= # If supplied, overrides alidir as the place to find fMLLR transforms
postdir= # If supplied, we will use posteriors in it as soft training targets.
stage=0
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
random_copy=false
online_ivector_dir= # can be used if we are including speaker information as iVectors.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: $0 [opts] <data> <ali-dir> <egs-dir>"
echo " e.g.: $0 data/train exp/tri3_ali exp/tri4_nnet/egs"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
echo " --samples-per-iter <#samples;400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --feat-type <lda|raw> # (by default it tries to guess). The feature type you want"
echo " # to use as input to the neural net."
echo " --frames-per-eg <frames;8> # number of frames per eg on disk"
echo " --left-context <width;4> # Number of frames on left side to append for feature input"
echo " --right-context <width;4> # Number of frames on right side to append for feature input"
echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
echo " # very end."
echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
data=$1
alidir=$2
dir=$3
# Check some files.
[ ! -z "$online_ivector_dir" ] && \
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log $dir/info
cp $alidir/tree $dir
# Get list of validation utterances.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
> $dir/valid_uttlist || exit 1;
if [ -f $data/utt2uniq ]; then
echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
echo "include all perturbed versions of the same 'real' utterances."
mv $dir/valid_uttlist $dir/valid_uttlist.tmp
utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist
rm $dir/uniq2utt $dir/valid_uttlist.tmp
fi
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1;
[ -z "$transform_dir" ] && transform_dir=$alidir
## Set up features.
if [ -z $feat_type ]; then
if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"
case $feat_type in
raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
;;
lda)
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
# caution: the top-level nnet training script should copy these to its own dir now.
cp $alidir/{splice_opts,final.mat} $dir || exit 1;
feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
echo "$0: using transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
fi
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
echo "$0: using raw-fMLLR transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |"
train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |"
fi
if [ ! -z "$online_ivector_dir" ]; then
feats_one="$(echo "$feats" | sed s:JOB:1:g)"
ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
echo $ivector_dim > $dir/info/ivector_dim
ivectors_opt="--const-feat-dim=$ivector_dim"
ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
valid_feats="$valid_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
train_subset_feats="$train_subset_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
else
echo 0 >$dir/info/ivector_dim
fi
if [ $stage -le 0 ]; then
echo "$0: working out number of frames of training data"
num_frames=$(steps/nnet2/get_num_frames.sh $data)
echo $num_frames > $dir/info/num_frames
else
num_frames=`cat $dir/info/num_frames` || exit 1;
fi
# the + 1 is to round up, not down... we assume it doesn't divide exactly.
num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1]
# (for small data)- while reduce_frames_per_eg == true and the number of
# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it
# by 1.
reduced=false
while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \
[ $[$num_frames/(($frames_per_eg-1)*$samples_per_iter)] -eq 0 ]; do
frames_per_eg=$[$frames_per_eg-1]
num_archives=1
reduced=true
done
$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small."
echo $num_archives >$dir/info/num_archives
echo $frames_per_eg >$dir/info/frames_per_eg
# Working out number of egs per archive
egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)]
! [ $egs_per_archive -le $samples_per_iter ] && \
echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \
&& exit 1;
echo $egs_per_archive > $dir/info/egs_per_archive
echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"
# Making soft links to storage directories. This is a no-op unless
# the subdirectory $dir/storage/ exists. See utils/create_split_dir.pl
for x in `seq $num_archives`; do
utils/create_data_link.pl $dir/egs.$x.ark
for y in `seq $nj`; do
utils/create_data_link.pl $dir/egs_orig.$x.$y.ark
done
done
nnet_context_opts="--left-context=$left_context --right-context=$right_context"
echo $left_context > $dir/info/left_context
echo $right_context > $dir/info/right_context
if [ $stage -le 2 ]; then
echo "$0: Getting validation and training subset examples."
rm $dir/.error 2>/dev/null
echo "$0: ... extracting validation and training-subset alignments."
set -o pipefail;
for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
copy-int-vector ark:- ark,t:- | \
utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) | \
gzip -c >$dir/ali_special.gz || exit 1;
set +o pipefail; # unset the pipefail option.
$cmd $dir/log/create_valid_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/valid_all.egs" || touch $dir/.error &
$cmd $dir/log/create_train_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/train_subset_all.egs" || touch $dir/.error &
wait;
[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
echo "... Getting subsets of validation examples for diagnostics and combination."
$cmd $dir/log/create_valid_subset_combine.log \
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \
ark:$dir/valid_combine.egs || touch $dir/.error &
$cmd $dir/log/create_valid_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \
ark:$dir/valid_diagnostic.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_combine.log \
nnet-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \
ark:$dir/train_combine.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \
ark:$dir/train_diagnostic.egs || touch $dir/.error &
wait
sleep 5 # wait for file system to sync.
cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs
for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
done
rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs $dir/ali_special.gz
fi
if [ $stage -le 3 ]; then
# create egs_orig.*.*.ark; the first index goes to $num_archives,
# the second to $nj (which is the number of jobs in the original alignment
# dir)
egs_list=
for n in $(seq $num_archives); do
egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark"
done
echo "$0: Generating training examples on disk"
# The examples will go round-robin to egs_list.
if [ ! -z $postdir ]; then
$cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \
scp:$postdir/post.JOB.scp ark:- \| \
nnet-copy-egs ark:- $egs_list || exit 1;
else
$cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \
"ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
nnet-copy-egs ark:- $egs_list || exit 1;
fi
fi
if [ $stage -le 4 ]; then
echo "$0: recombining and shuffling order of archives on disk"
# combine all the "egs_orig.JOB.*.ark" (over the $nj splits of the data) and
# shuffle the order, writing to the egs.JOB.ark
egs_list=
for n in $(seq $nj); do
egs_list="$egs_list $dir/egs_orig.JOB.$n.ark"
done
$cmd $io_opts $extra_opts JOB=1:$num_archives $dir/log/shuffle.JOB.log \
nnet-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1;
fi
if [ $stage -le 5 ]; then
echo "$0: removing temporary archives"
for x in `seq $num_archives`; do
for y in `seq $nj`; do
file=$dir/egs_orig.$x.$y.ark
[ -L $file ] && rm $(readlink -f $file)
rm $file
done
done
fi
echo "$0: Finished preparing training examples"

egs/sre10/v1/local/dnn/get_lda.sh Executable file

@@ -0,0 +1,181 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).
# 2015 David Snyder
# Apache 2.0.
#
# This script is based on get_lda.sh in ../../steps/nnet2/, but has been
# modified for speaker recognition purposes to use a sliding-window CMN.
#
# This script, which will generally be called from other neural-net training
# scripts, computes the LDA-like feature transform (and associated statistics
# such as the within-class covariance) used to preprocess the spliced features.
# Begin configuration section.
cmd=run.pl
feat_type=
stage=0
splice_width=4 # meaning +- 4 frames on each side for second LDA
left_context= # left context for second LDA
right_context= # right context for second LDA
rand_prune=4.0 # Relates to a speedup we do for LDA.
within_class_factor=0.0001 # This affects the scaling of the transform rows...
# sorry for no explanation, you'll have to see the code.
transform_dir= # If supplied, overrides alidir
num_feats=10000 # maximum number of feature files to use. Beyond a certain point it just
# gets silly to use more data.
lda_dim= # This defaults to no dimension reduction.
online_ivector_dir=
ivector_randomize_prob=0.0 # if >0.0, randomizes iVectors during training with
# this prob per iVector.
ivector_dir=
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/nnet2/get_lda.sh [opts] <data> <lang> <ali-dir> <exp-dir>"
echo " e.g.: steps/nnet2/get_lda.sh data/train data/lang exp/tri3_ali exp/tri4_nnet"
echo " As well as extracting the examples, this script will also do the LDA computation,"
echo " if --est-lda=true (default:true)"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --left-context <width;4> # Number of frames on left side to append for feature input, overrides splice-width"
echo " --right-context <width;4> # Number of frames on right side to append for feature input, overrides splice-width"
echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
echo " --online-vector-dir <dir|none> # Directory produced by"
echo " # steps/online/nnet2/extract_ivectors_online.sh"
exit 1;
fi
data=$1
lang=$2
alidir=$3
dir=$4
[ -z "$left_context" ] && left_context=$splice_width
[ -z "$right_context" ] && right_context=$splice_width
[ ! -z "$online_ivector_dir" ] && \
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# Set some variables.
oov=`cat $lang/oov.int`
num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir
[ -z "$transform_dir" ] && transform_dir=$alidir
## Set up features. Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.
if [ -z $feat_type ]; then
if [ -f $alidir/final.mat ] && ! [ -f $alidir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"
# If we have more than $num_feats feature files (default: 10k),
# we use a random subset. This won't affect the transform much, and will
# spare us an unnecessary pass over the data. Probably 10k is
# way too much, but for small datasets this phase is quite fast.
N=$[$num_feats/$nj]
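# For example (illustration only): if the alignment directory used nj=40 jobs,
# each job here reads at most N = 10000 / 40 = 250 utterances.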
case $feat_type in
raw) feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
;;
lda)
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
cp $alidir/{splice_opts,final.mat} $dir || exit 1;
feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
echo "$0: using transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
echo "$0: using raw-fMLLR transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
fi
feats_one="$(echo "$feats" | sed s:JOB:1:g)"
# note: feat_dim is the raw, un-spliced feature dim without the iVectors.
feat_dim=$(feat-to-dim "$feats_one" -) || exit 1;
# by default: no dim reduction.
spliced_feats="$feats splice-feats --left-context=$left_context --right-context=$right_context ark:- ark:- |"
if [ ! -z "$online_ivector_dir" ]; then
ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
# note: subsample-feats, with negative value of n, repeats each feature n times.
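# (i.e. with --n=-$ivector_period each iVector is repeated ivector_period times,
# so the iVector stream ends up with roughly one row per frame and can be
# pasted onto the spliced features.)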
spliced_feats="$spliced_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | ivector-randomize --randomize-prob=$ivector_randomize_prob ark:- ark:- |' ark:- |"
ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
else
ivector_dim=0
fi
echo $ivector_dim >$dir/ivector_dim
if [ -z "$lda_dim" ]; then
spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)"
lda_dim=$(feat-to-dim "$spliced_feats_one" -) || exit 1;
fi
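# If lda_dim was not specified, it defaults to the full spliced dimension above.
# A rough worked example (hypothetical dims): with feat_dim=40,
# left_context=right_context=2 and ivector_dim=0, the spliced dimension is
# 40*(2+2+1)+0=200, so lda_dim=200 and the LDA-like transform below does no
# dimensionality reduction.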
if [ $stage -le 0 ]; then
echo "$0: Accumulating LDA statistics."
rm $dir/lda.*.acc 2>/dev/null # in case any left over from before.
$cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
acc-lda --rand-prune=$rand_prune $alidir/final.mdl "$spliced_feats" ark,s,cs:- \
$dir/lda.JOB.acc || exit 1;
fi
echo $feat_dim > $dir/feat_dim
echo $lda_dim > $dir/lda_dim
echo $ivector_dim > $dir/ivector_dim
if [ $stage -le 1 ]; then
sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1;
rm $dir/lda.*.acc
fi
if [ $stage -le 2 ]; then
# There are various things that we sometimes (but not always) need
# the within-class covariance and its Cholesky factor for, and we
# write these to disk just in case.
nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
--write-within-covar=$dir/within_covar.spmat \
--within-class-factor=$within_class_factor --dim=$lda_dim \
$dir/lda.mat $dir/lda.acc \
2>$dir/log/lda_est.log || exit 1;
fi
echo "$0: Finished estimating LDA"

Просмотреть файл

@ -0,0 +1,50 @@
#!/bin/bash
# Remove excess utterances once they appear more than a specified
# number of times with the same transcription, in a data set.
# E.g. useful for removing excess "uh-huh" from training.
if [ $# != 3 ]; then
echo "Usage: remove_dup_utts.sh max-count src-data-dir dest-data-dir"
exit 1;
fi
maxcount=$1
srcdir=$2
destdir=$3
mkdir -p $destdir
[ ! -f $srcdir/text ] && echo "Invalid input directory $srcdir" && exit 1;
cp $srcdir/* $destdir
cat $srcdir/text | \
perl -e '
$maxcount = shift @ARGV;
@all = ();
$p1 = 103349; $p2 = 71147; $k = 0;
sub random { # our own random number generator: predictable.
$k = ($k + $p1) % $p2;
return ($k / $p2);
}
while(<>) {
push @all, $_;
@A = split(" ", $_);
shift @A;
$text = join(" ", @A);
$count{$text} ++;
}
foreach $line (@all) {
@A = split(" ", $line);
shift @A;
$text = join(" ", @A);
$n = $count{$text};
if ($n < $maxcount || random() < ($maxcount / $n)) {
print $line;
}
}' $maxcount >$destdir/text
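# A rough worked example (hypothetical counts): with max-count=100, a transcription
# seen 1000 times is kept with probability 100/1000=0.1, so roughly 100 of its
# copies survive, while any transcription seen at most 100 times is kept in full.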
echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`"
echo "Using fix_data_dir.sh to reconcile the other files."
utils/fix_data_dir.sh $destdir
rm -r $destdir/.backup

Просмотреть файл

@ -0,0 +1,28 @@
#!/bin/bash
# Make the features.
. cmd.sh
stage=1
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
mkdir -p exp/nnet2_online
if [ $stage -le 1 ]; then
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. Be careful here: if you have multiple
# copies of Kaldi checked out and run the same recipe, don't let them
# overwrite each other.
mfccdir=mfcc
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5/$mfccdir/storage $mfccdir/storage
fi
utils/copy_data_dir.sh data/train_asr data/train_hires_asr
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/train_hires_asr exp/make_hires/train $mfccdir || exit 1;
fi

Просмотреть файл

@ -0,0 +1,71 @@
#!/bin/bash
# This script is based on run_nnet2_multisplice.sh in
# egs/fisher_english/s5/local/online. It has been modified
# for speaker recognition.
. cmd.sh
stage=1
train_stage=-10
use_gpu=true
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
# assume use_gpu=true since it would be way too slow otherwise.
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_ms_a
mkdir -p exp/nnet2_online
# Stages 1 through 5 are done in run_nnet2_common.sh,
# so it can be shared with other similar scripts.
local/dnn/run_nnet2_common.sh --stage $stage
if [ $stage -le 6 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
utils/create_split_dir.pl /export/b0{6,7,8,9}/$USER/kaldi-data/egs/fisher_english/s5/$dir/egs/storage $dir/egs/storage
fi
# Because we have a lot of data here and we don't want the training to take
# too long, we reduce the number of epochs from the default (15) to 6. An
# option such as "--io-opts '-tc 12'" could be added to allow more than the
# default number (5) of jobs dumping the egs to disk at once; that is OK here
# since we're splitting our data across four filesystems for speed.
local/dnn/train_multisplice_accel2.sh --stage $train_stage \
--feat-type raw \
--splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer3/-3:3 layer4/-7:2" \
--num-epochs 6 \
--num-hidden-layers 6 \
--num-jobs-initial 3 --num-jobs-final 18 \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--mix-up 10500 \
--initial-effective-lrate 0.0015 --final-effective-lrate 0.00015 \
--cmd "$decode_cmd" \
--egs-dir "$common_egs_dir" \
--pnorm-input-dim 3500 \
--pnorm-output-dim 350 \
data/train_hires_asr data/lang exp/tri5a $dir || exit 1;
fi
exit 0;

Просмотреть файл

@ -0,0 +1,174 @@
#!/bin/bash
# This script is based on egs/fisher_english/s5/run.sh. It trains a
# multisplice time-delay neural network used in the DNN-based speaker
# recognition recipes.
# It's best to run the commands in this one by one.
. cmd.sh
. path.sh
mfccdir=`pwd`/mfcc
set -e
# the next command produces the data in data/train_all_asr
local/dnn/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \
/export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13
# You could also try specifying the --calldata argument to this command as below.
# If specified, the script will use actual speaker personal identification
# numbers released with the dataset, i.e. real speaker IDs. Note: --calldata has
# to be the first argument of this script.
# local/fisher_data_prep.sh --calldata /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \
# /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13
# at BUT:
# local/fisher_data_prep.sh /mnt/matylda6/jhu09/qpovey/FISHER/LDC2005T19 /mnt/matylda2/data/FISHER/
local/dnn/fisher_prepare_dict.sh
utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
local/dnn/fisher_train_lms.sh
local/dnn/fisher_create_test_lang.sh
# Use the first 10k sentences as dev and test sets. Note: when we trained the LM,
# we used the 1st 10k sentences as dev set, so these won't have been used in the
# LM training data. However, they will be in the lexicon, plus speakers
# may overlap, so it's still not quite equivalent to a test set.
utils/fix_data_dir.sh data/train_all_asr
steps/make_mfcc.sh --nj 40 --cmd "$train_cmd" --mfcc-config conf/mfcc_asr.conf \
data/train_all_asr exp/make_mfcc/train_all_asr $mfccdir || exit 1;
utils/fix_data_dir.sh data/train_all_asr
utils/validate_data_dir.sh data/train_all_asr
# The dev and test sets are each about 3.3 hours long. These are not carefully
# done; there may be some speaker overlap with each other and with the training
# set. Note: in our LM-training setup we excluded the first 10k utterances (they
# were used for tuning but not for training), so the LM was not (directly) trained
# on either the dev or test sets.
utils/subset_data_dir.sh --first data/train_all_asr 10000 data/dev_and_test_asr
utils/subset_data_dir.sh --first data/dev_and_test_asr 5000 data/dev_asr
utils/subset_data_dir.sh --last data/dev_and_test_asr 5000 data/test_asr
rm -r data/dev_and_test_asr
steps/compute_cmvn_stats.sh data/dev_asr exp/make_mfcc/dev_asr $mfccdir
steps/compute_cmvn_stats.sh data/test_asr exp/make_mfcc/test_asr $mfccdir
n=$[`cat data/train_all_asr/segments | wc -l` - 10000]
utils/subset_data_dir.sh --last data/train_all_asr $n data/train_asr
steps/compute_cmvn_stats.sh data/train_asr exp/make_mfcc/train_asr $mfccdir
# Now-- there are 1.6 million utterances, and we want to start the monophone training
# on relatively short utterances (easier to align), but not only the very shortest
# ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random
# utterances from those.
utils/subset_data_dir.sh --shortest data/train_asr 100000 data/train_asr_100kshort
utils/subset_data_dir.sh data/train_asr_100kshort 10000 data/train_asr_10k
local/dnn/remove_dup_utts.sh 100 data/train_asr_10k data/train_asr_10k_nodup
utils/subset_data_dir.sh --speakers data/train_asr 30000 data/train_asr_30k
utils/subset_data_dir.sh --speakers data/train_asr 100000 data/train_asr_100k
# The next commands are not necessary for the scripts to run, but increase
# efficiency of data access by putting the mfcc's of the subset
# in a contiguous place in a file.
( . path.sh;
# make sure mfccdir is defined as above..
cp data/train_asr_10k_nodup/feats.scp{,.bak}
copy-feats scp:data/train_asr_10k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_10k_nodup.ark,$mfccdir/kaldi_fish_10k_nodup.scp \
&& cp $mfccdir/kaldi_fish_10k_nodup.scp data/train_asr_10k_nodup/feats.scp
)
( . path.sh;
# make sure mfccdir is defined as above..
cp data/train_asr_30k/feats.scp{,.bak}
copy-feats scp:data/train_asr_30k/feats.scp ark,scp:$mfccdir/kaldi_fish_30k.ark,$mfccdir/kaldi_fish_30k.scp \
&& cp $mfccdir/kaldi_fish_30k.scp data/train_asr_30k/feats.scp
)
( . path.sh;
# make sure mfccdir is defined as above..
cp data/train_asr_100k/feats.scp{,.bak}
copy-feats scp:data/train_asr_100k/feats.scp ark,scp:$mfccdir/kaldi_fish_100k.ark,$mfccdir/kaldi_fish_100k.scp \
&& cp $mfccdir/kaldi_fish_100k.scp data/train_asr_100k/feats.scp
)
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
data/train_asr_10k_nodup data/lang exp/mono0a
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train_asr_30k data/lang exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
2500 20000 data/train_asr_30k data/lang exp/mono0a_ali exp/tri1 || exit 1;
(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri1/graph data/dev_asr exp/tri1/decode_dev)&
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train_asr_30k data/lang exp/tri1 exp/tri1_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
2500 20000 data/train_asr_30k data/lang exp/tri1_ali exp/tri2 || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri2/graph data/dev_asr exp/tri2/decode_dev || exit 1;
)&
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train_asr_100k data/lang exp/tri2 exp/tri2_ali || exit 1;
# Train tri3a, which is LDA+MLLT, on 100k data.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
5000 40000 data/train_asr_100k data/lang exp/tri2_ali exp/tri3a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri3a/graph data/dev_asr exp/tri3a/decode_dev || exit 1;
)&
# Next we'll use fMLLR and train with SAT (i.e. on
# fMLLR features)
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_asr_100k data/lang exp/tri3a exp/tri3a_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
5000 100000 data/train_asr_100k data/lang exp/tri3a_ali exp/tri4a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri4a/graph data/dev_asr exp/tri4a/decode_dev
)&
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_asr data/lang exp/tri4a exp/tri4a_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
7000 300000 data/train_asr data/lang exp/tri4a_ali exp/tri5a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/dev_asr exp/tri5a/decode_dev
)&
# this will help find issues with the lexicon.
# steps/cleanup/debug_lexicon.sh --nj 300 --cmd "$train_cmd" data/train_asr_100k data/lang exp/tri5a data/local/dict/lexicon.txt exp/debug_lexicon_100k
## The following is based on the best current neural net recipe.
local/dnn/run_nnet2_multisplice.sh

Просмотреть файл

@ -0,0 +1,641 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
# 2013 Xiaohui Zhang
# 2013 Guoguo Chen
# 2014 Vimal Manohar
# 2014 Vijayaditya Peddinti
# Apache 2.0.
# This is a modified version of train_multisplice_accel2.sh in
# steps/nnet2/ for speaker recognition. The main difference is
# that it uses different get_lda.sh and get_egs2.sh scripts.
#
# The original train_multisplice_accel2.sh was a modified version of
# train_pnorm_multisplice2.sh (still using pnorm). The "accel" refers to the
# fact that we increase the number of jobs during training (from
# --num-jobs-initial to --num-jobs-final). We dropped "pnorm" from the name as
# it was getting too long.
# Begin configuration section.
cmd=run.pl
num_epochs=15 # Number of epochs of training;
# the number of iterations is worked out from this.
initial_effective_lrate=0.01
final_effective_lrate=0.001
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update.
samples_per_iter=400000 # each iteration of training, see this many samples
# per job. This option is passed to get_egs.sh
num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training
num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0
online_ivector_dir=
remove_egs=true # set to false to disable removing egs.
max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
# to the final 'combine' stage, but these models will themselves be averages of
# iteration-number ranges.
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
# (the point of this is to get data in different minibatches on different iterations,
# since in the preconditioning method, 2 samples in the same minibatch can
# affect each others' gradients.
add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4
exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage
splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3"
# Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
# note: a hidden layer may be composed of one or more components,
# so hidden-layer indexing is different from the component count
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20 # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G"
# by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
transform_dir= # If supplied, overrides alidir
feat_type= # Can be used to force "raw" features.
align_cmd= # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_times= # List of times on which we realign. Each time is
# floating point number strictly between 0 and 1, which
# will be multiplied by the num-iters to get an iteration
# number.
num_jobs_align=30 # Number of jobs for realignment
# End configuration section.
frames_per_eg=8 # to be passed on to get_egs2.sh
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of training"
echo " --initial-effective-lrate <lrate|0.02> # effective learning rate at start of training."
echo " --final-effective-lrate <lrate|0.004> # effective learning rate at end of training."
echo " # data, 0.00025 for large data"
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
echo " # per context-dependent state. Try a number several times #states."
echo " --num-jobs-initial <num-jobs|1> # Number of parallel jobs to use for neural net training, at the start."
echo " --num-jobs-final <num-jobs|8> # Number of parallel jobs to use for neural net training, at the end"
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --splice-indexes <string|layer0/-4:-3:-2:-1:0:1:2:3:4> "
echo " # Frame indices used for each splice layer."
echo " # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --lda-dim <dim|''> # Dimension to reduce spliced features to with LDA"
echo " --realign-epochs <list-of-epochs|''> # A list of space-separated epoch indices the beginning of which"
echo " # realignment is to be done"
echo " --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment"
echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment"
echo " --stage <stage|-4> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
data=$1
lang=$2
alidir=$3
dir=$4
if [ ! -z "$realign_times" ]; then
[ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1
[ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1
fi
# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# Set some variables.
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir
# process the splice_inds string, to get a layer-wise context string
# to be processed by the nnet-components
# this would be mainly used by SpliceComponent|SpliceMaxComponent
python steps/nnet2/make_multisplice_configs.py contexts --splice-indexes "$splice_indexes" $dir || exit -1;
context_string=$(cat $dir/vars) || exit -1
echo $context_string
eval $context_string || exit -1; #
# initializes variables used by get_lda.sh and get_egs.sh
# get_lda.sh : first_left_context, first_right_context,
# get_egs.sh : nnet_left_context & nnet_right_context
extra_opts=()
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
[ -z "$transform_dir" ] && transform_dir=$alidir
extra_opts+=(--transform-dir $transform_dir)
if [ $stage -le -4 ]; then
echo "$0: calling get_lda.sh"
local/dnn/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi
# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
extra_opts+=(--left-context $nnet_left_context )
extra_opts+=(--right-context $nnet_right_context )
echo "$0: calling get_egs2.sh"
local/dnn/get_egs2.sh $egs_opts "${extra_opts[@]}" \
--samples-per-iter $samples_per_iter --stage $get_egs_stage \
--io-opts "$io_opts" \
--cmd "$cmd" $egs_opts \
--frames-per-eg $frames_per_eg \
$data $alidir $dir/egs || exit 1;
fi
if [ -z $egs_dir ]; then
egs_dir=$dir/egs
# confirm that the provided egs_dir has the necessary context
egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
echo $egs_left_context $nnet_left_context $egs_right_context $nnet_right_context
([[ $egs_left_context -lt $nnet_left_context ]] || [[ $egs_right_context -lt $nnet_right_context ]]) &&
echo "Provided egs_dir $egs_dir does not have sufficient context to train the neural network." && exit -1;
fi
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }
# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]
[ $num_jobs_initial -gt $num_jobs_final ] && \
echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1;
[ $num_jobs_final -gt $num_archives_expanded ] && \
echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1;
if ! [ $num_hidden_layers -ge 1 ]; then
echo "Invalid num-hidden-layers $num_hidden_layers"
exit 1
fi
if [ $stage -le -2 ]; then
echo "$0: initializing neural net";
lda_mat=$dir/lda.mat
tot_input_dim=$[$feat_dim+$ivector_dim]
online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);")
# create the config files for nnet initialization
python steps/nnet2/make_multisplice_configs.py \
--splice-indexes "$splice_indexes" \
--total-input-dim $tot_input_dim \
--ivector-dim $ivector_dim \
--lda-mat "$lda_mat" \
--lda-dim $lda_dim \
--pnorm-input-dim $pnorm_input_dim \
--pnorm-output-dim $pnorm_output_dim \
--online-preconditioning-opts "$online_preconditioning_opts" \
--initial-learning-rate $initial_lrate \
--bias-stddev $bias_stddev \
--num-hidden-layers $num_hidden_layers \
--num-targets $num_leaves \
configs $dir || exit -1;
$cmd $dir/log/nnet_init.log \
nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
$dir/0.mdl || exit 1;
fi
if [ $stage -le -1 ]; then
echo "Training transition probabilities and setting priors"
$cmd $dir/log/train_trans.log \
nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
|| exit 1;
fi
# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded,
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
num_archives_to_process=$[$num_epochs*$num_archives_expanded]
num_archives_processed=0
num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]
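# A rough worked example (hypothetical values): with num_epochs=6, num_archives=15
# and frames_per_eg=8, num_archives_expanded=120 and num_archives_to_process=720;
# with num_jobs_initial=3 and num_jobs_final=18 this gives
# num_iters=(720*2)/(3+18)=68 (integer division).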
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
&& echo "$0: Insufficient epochs" && exit 1
# mix up at the iteration where we've processed about half the data; this keeps
# the overall training procedure fairly invariant to the number of initial and
# final jobs.
# j = initial, k = final, n = num-iters, x = half-of-data epoch,
# p is proportion of data we want to process (e.g. p=0.5 here).
# solve for x if the amount of data processed by epoch x is p
# times the amount by iteration n.
# put this in wolfram alpha:
# solve { x*j + (k-j)*x*x/(2*n) = p * (j*n + (k-j)*n/2), {x} }
# got: x = (j n-sqrt(-n^2 (j^2 (p-1)-k^2 p)))/(j-k) and j!=k and n!=0
# simplified manually to: n * (sqrt(((1-p)j^2 + p k^2)/2) - j)/(j-k)
mix_up_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters 0.5)
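# A rough worked example (hypothetical values, continuing the numbers above):
# with j=3, k=18, n=68 and p=0.5, mix_up_iter = int(0.5 + 68*(sqrt(0.5*9+0.5*324)-3)/15)
# = 45, which is safely greater than finish_add_layers_iter=6*2=12, so the check
# below passes.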
! [ $mix_up_iter -gt $finish_add_layers_iter ] && \
echo "Mix-up-iter is $mix_up_iter, should be greater than $finish_add_layers_iter -> add more epochs?" \
&& exit 1;
echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
[ $mix_up -gt 0 ] && echo "$0: Will mix up on iteration $mix_up_iter"
if [ $num_threads -eq 1 ]; then
parallel_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
parallel_train_opts=
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
fi
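# For example: with num_threads=1 the training step below invokes
# nnet-train-simple (which can use the GPU), while with the default
# num_threads=16 it invokes "nnet-train-parallel --num-threads=16" on the CPU.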
approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]
# First work out how many models we want to combine over in the final
# nnet-combine-fast invocation. This equals
# min(max(max_models_combine, approx_iters_per_epoch_final),
# 2/3 * iters_after_mixup)
num_models_combine=$max_models_combine
if [ $num_models_combine -lt $approx_iters_per_epoch_final ]; then
num_models_combine=$approx_iters_per_epoch_final
fi
iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3]
if [ $num_models_combine -gt $iters_after_mixup_23 ]; then
num_models_combine=$iters_after_mixup_23
fi
first_model_combine=$[$num_iters-$num_models_combine+1]
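# A rough worked example (hypothetical values, continuing the numbers above):
# approx_iters_per_epoch_final=120/18=6, so num_models_combine starts at
# max_models_combine=20; iters_after_mixup_23=((68-45-1)*2)/3=14 then caps it
# at 14, giving first_model_combine=68-14+1=55.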
x=0
for realign_time in $realign_times; do
# Work out the iterations on which we will re-align, if the --realign-times
# option was used. This is slightly approximate.
! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);" && \
echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1.";
# the next formula is based on the one for mix_up_iter above.
realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
realign_this_iter[$realign_iter]=$realign_time
done
cur_egs_dir=$egs_dir
while [ $x -lt $num_iters ]; do
[ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;
this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")
ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");
echo "On iteration $x, learning rate is $this_learning_rate."
if [ ! -z "${realign_this_iter[$x]}" ]; then
prev_egs_dir=$cur_egs_dir
cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
fi
if [ $x -ge 0 ] && [ $stage -le $x ]; then
if [ ! -z "${realign_this_iter[$x]}" ]; then
time=${realign_this_iter[$x]}
echo "Getting average posterior for purposes of adjusting the priors."
# Note: this just uses CPUs, using a smallish subset of data.
# always use the first egs archive, which makes the script simpler;
# we're using different random subsets of it.
rm $dir/post.$x.*.vec 2>/dev/null
$cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \
nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear.
$cmd $dir/log/vector_sum.$x.log \
vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
rm $dir/post.$x.*.vec;
echo "Re-adjusting priors based on computed posteriors"
$cmd $dir/log/adjust_priors.$x.log \
nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;
sleep 2
steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
--transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
--iter $x $data $lang $dir $dir/ali_$time || exit 1
steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$time \
$prev_egs_dir $cur_egs_dir || exit 1
if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
steps/nnet2/remove_egs.sh $prev_egs_dir
fi
fi
# Set off jobs doing some diagnostics, in the background.
# Use the egs dir from the previous iteration for the diagnostics
$cmd $dir/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
$cmd $dir/log/progress.$x.log \
nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
ark:$cur_egs_dir/train_diagnostic.egs '&&' \
nnet-am-info $dir/$x.mdl &
fi
echo "Training neural net (pass $x)"
if [ $x -gt 0 ] && \
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
[ $[$x%$add_layers_period] -eq 0 ]; then
do_average=false # if we've just added a hidden layer, don't do averaging; take the best.
cur_num_hidden_layers=$[$x/$add_layers_period];
mdl="nnet-init --srand=$x $dir/hidden_${cur_num_hidden_layers}.config - | nnet-insert $dir/$x.mdl - - | nnet-am-copy --learning-rate=$this_learning_rate - -|"
else
do_average=true
if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
mdl="nnet-am-copy --learning-rate=$this_learning_rate $dir/$x.mdl -|"
fi
if $do_average; then
this_minibatch_size=$minibatch_size
else
# on iteration zero or when we just added a layer, use a smaller minibatch
# size and just one job: the model-averaging doesn't seem to be helpful
# when the model is changing too fast (i.e. it worsens the objective
# function), and the smaller minibatch size will help to keep
# the update stable.
this_minibatch_size=$[$minibatch_size/2];
fi
rm $dir/.error 2>/dev/null
( # this sub-shell is so that when we "wait" below,
# we only wait for the training jobs that we just spawned,
# not the diagnostic jobs that we spawned above.
# We can't easily use a single parallel SGE job to do the main training,
# because the computation of which archive and which --frame option
# to use for each job is a little complex, so we spawn each one separately.
for n in $(seq $this_num_jobs); do
k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
# the other indexes from.
archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
# index; this increases more slowly than the archive index because the
# same archive with different frame indexes will give similar gradients,
# so we want to separate them in time.
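# A rough worked example (hypothetical values): with num_archives=15 and
# frames_per_eg=8, the job with k=20 reads archive (20%15)+1=6 and uses frame
# index (20/15)%8=1.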
$cmd $parallel_opts $dir/log/train.$x.$n.log \
nnet-train$parallel_suffix $parallel_train_opts \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
"ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
$dir/$[$x+1].$n.mdl || touch $dir/.error &
done
wait
)
# the error message below is not that informative, but $cmd will
# have printed a more specific one.
[ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;
nnets_list=
for n in `seq 1 $this_num_jobs`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done
if $do_average; then
# average the output of the different jobs.
$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list $dir/$[$x+1].mdl || exit 1;
else
# choose the best from the different jobs.
n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
$fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
$best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
[ -z "$n" ] && echo "Error getting best model" && exit 1;
cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
fi
if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
# mix up.
echo Mixing up from $num_leaves to $mix_up components
$cmd $dir/log/mix_up.$x.log \
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
fi
rm $nnets_list
[ ! -f $dir/$[$x+1].mdl ] && exit 1;
if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
[ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then
rm $dir/$[$x-1].mdl
fi
fi
x=$[$x+1]
num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done
if [ $stage -le $num_iters ]; then
echo "Doing final combination to produce final.mdl"
# Now do combination.
nnets_list=()
# the if..else..fi statement below sets 'nnets_list'.
if [ $max_models_combine -lt $num_models_combine ]; then
# The number of models to combine is too large, e.g. > 20. In this case,
# each argument to nnet-combine-fast will be an average of multiple models.
cur_offset=0 # current offset from first_model_combine.
for n in $(seq $max_models_combine); do
next_offset=$[($n*$num_models_combine)/$max_models_combine]
sub_list=""
for o in $(seq $cur_offset $[$next_offset-1]); do
iter=$[$first_model_combine+$o]
mdl=$dir/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
sub_list="$sub_list $mdl"
done
nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
cur_offset=$next_offset
done
else
nnets_list=
for n in $(seq 0 $[num_models_combine-1]); do
iter=$[$first_model_combine+$n]
mdl=$dir/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
nnets_list[$n]=$mdl
done
fi
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
# if there are many models it can give out-of-memory error; set num-threads to 8
# to speed it up (this isn't ideal...)
num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
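# A rough worked example (hypothetical count): if combine.egs holds 3000 examples
# and combine_num_threads=8, then mb=(3000+8-1)/8=375, which is below the 512 cap.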
# Setting --initial-model to a large value makes it initialize the combination
# with the average of all the models. It's important not to start with a
# single model, or, due to the invariance to scaling that these nonlinearities
# give us, we get zero diagonal entries in the fisher matrix that
# nnet-combine-fast uses for scaling, which after flooring and inversion, has
# the effect that the initial model chosen gets much higher learning rates
# than the others. This prevents the optimization from working well.
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
--num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
$dir/final.mdl || exit 1;
# Normalize stddev for affine or block affine layers that are followed by a
# pnorm layer and then a normalize layer.
$cmd $dir/log/normalize.log \
nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;
# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
$cmd $dir/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi
if [ $stage -le $[$num_iters+1] ]; then
echo "Getting average posterior for purposes of adjusting the priors."
# Note: this just uses CPUs, using a smallish subset of data.
rm $dir/post.$x.*.vec 2>/dev/null
$cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear.
$cmd $dir/log/vector_sum.$x.log \
vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
rm $dir/post.$x.*.vec;
echo "Re-adjusting priors based on computed posteriors"
$cmd $dir/log/adjust_priors.final.log \
nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi
if [ ! -f $dir/final.mdl ]; then
echo "$0: $dir/final.mdl does not exist."
# we don't want to clean up if the training didn't succeed.
exit 1;
fi
sleep 2
echo Done
if $cleanup; then
echo Cleaning up data
if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then
steps/nnet2/remove_egs.sh $cur_egs_dir
fi
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
# delete all but every 100th model; don't delete the ones which combine to form the final model.
rm $dir/$x.mdl
fi
done
fi

Просмотреть файл

@ -1,13 +1,12 @@
#!/bin/bash
# Copyright 2015 David Snyder
# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
# 2015 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
#
# See README.txt for more info on data required.
# Results (EERs) are inline in comments below.
# This example script is still a bit of a mess, and needs to be
# cleaned up, but it shows you all the basic ingredients.
. cmd.sh
. path.sh
set -e

28
egs/sre10/v2/cmd.sh Executable file
Просмотреть файл

@ -0,0 +1,28 @@
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine).
#a) JHU cluster options
export train_cmd="queue.pl -l arch=*64*"
export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G"
#export cuda_cmd="..."
export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G"
#b) BUT cluster options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"
#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"
#c) run it locally...
#export train_cmd=run.pl
#export decode_cmd=run.pl
export cuda_cmd=run.pl
#export mkgraph_cmd=run.pl

Просмотреть файл

@ -0,0 +1,3 @@
beam=11.0 # beam for decoding. Was 13.0 in the scripts.
first_beam=8.0 # beam for 1st-pass decoding in SAT.
lattice_beam=6.0

Просмотреть файл

@ -0,0 +1,2 @@
beam=13.0 # beam for decoding. Was 13.0 in the scripts.
lattice_beam=8.0 # this has most effect on size of the lattices.

Просмотреть файл

@ -0,0 +1,6 @@
--sample-frequency=8000
--frame-length=25 # the default is 25
--low-freq=20 # the default.
--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case).
--num-ceps=20 # higher than the default which is 12.
--snip-edges=false

Просмотреть файл

@ -0,0 +1,3 @@
--use-energy=false # only non-default option.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
--snip-edges=false

Просмотреть файл

@ -0,0 +1,11 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-200 # high cutoff frequency, relative to the Nyquist of 4000 (=3800)
--snip-edges=false

Просмотреть файл

@ -0,0 +1,2 @@
--vad-energy-threshold=5.5
--vad-energy-mean-scale=0.5

1
egs/sre10/v2/local Symbolic link
Просмотреть файл

@ -0,0 +1 @@
../v1/local/

263
egs/sre10/v2/run.sh Executable file
Просмотреть файл

@ -0,0 +1,263 @@
#!/bin/bash
# Copyright 2015 David Snyder
# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
# 2015 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
#
# See README.txt for more info on data required.
# Results (EERs) are inline in comments below.
#
# This example script shows how to replace the GMM-UBM
# with a DNN trained for ASR. It also demonstrates the
# using the DNN to create a supervised-GMM.
. cmd.sh
. path.sh
set -e
mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc
trials_female=data/sre10_test_female/trials
trials_male=data/sre10_test_male/trials
trials=data/sre10_test/trials
nnet=exp/nnet2_online/nnet_ms_a/final.mdl
num_components=5297
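# Note: num_components should equal the number of senones (pdfs) in the DNN's
# decision tree. One way to check it (assuming exp/tri5a is the tree/alignment
# directory used by local/dnn/train_dnn.sh, as in that script):
#   tree-info exp/tri5a/tree | grep num-pdfs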
# Train a DNN on about 1800 hours of the English portion of Fisher.
local/dnn/train_dnn.sh
# Prepare the SRE 2010 evaluation data.
local/make_sre_2010_test.pl /export/corpora5/SRE/SRE2010/eval/ data/
local/make_sre_2010_train.pl /export/corpora5/SRE/SRE2010/eval/ data/
# Prepare a collection of NIST SRE data prior to 2010. This is
# used to train the PLDA model and is also combined with SWB
# for UBM and i-vector extractor training data.
local/make_sre.sh data
# Prepare SWB for UBM and i-vector extractor training.
local/make_swbd2_phase2.pl /export/corpora5/LDC/LDC99S79 \
data/swbd2_phase2_train
local/make_swbd2_phase3.pl /export/corpora5/LDC/LDC2002S06 \
data/swbd2_phase3_train
local/make_swbd_cellular1.pl /export/corpora5/LDC/LDC2001S13 \
data/swbd_cellular1_train
local/make_swbd_cellular2.pl /export/corpora5/LDC/LDC2004S07 \
data/swbd_cellular2_train
utils/combine_data.sh data/train \
data/swbd_cellular1_train data/swbd_cellular2_train \
data/swbd2_phase2_train data/swbd2_phase3_train data/sre
cp -r data/train data/train_dnn
cp -r data/sre data/sre_dnn
cp -r data/sre10_train data/sre10_train_dnn
cp -r data/sre10_test data/sre10_test_dnn
# Extract speaker recognition features.
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
data/train exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
data/sre exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
data/sre10_train exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
data/sre10_test exp/make_mfcc $mfccdir
# Extract DNN features.
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
data/train_dnn exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
data/sre_dnn exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
data/sre10_train_dnn exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
data/sre10_test_dnn exp/make_mfcc $mfccdir
for name in sre_dnn sre10_train_dnn sre10_test_dnn train_dnn sre sre10_train sre10_test train; do
utils/fix_data_dir.sh data/${name}
done
# Compute VAD decisions. These will be shared across both sets of features.
sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
data/train exp/make_vad $vaddir
sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
data/sre exp/make_vad $vaddir
sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
data/sre10_train exp/make_vad $vaddir
sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
data/sre10_test exp/make_vad $vaddir
for name in sre sre10_train sre10_test train; do
cp data/${name}/vad.scp data/${name}_dnn/vad.scp
cp data/${name}/utt2spk data/${name}_dnn/utt2spk
cp data/${name}/spk2utt data/${name}_dnn/spk2utt
utils/fix_data_dir.sh data/${name}
utils/fix_data_dir.sh data/${name}_dnn
done
# Subset training data for faster sup-GMM initialization.
utils/subset_data_dir.sh data/train_dnn 32000 data/train_dnn_32k
utils/fix_data_dir.sh data/train_dnn_32k
utils/subset_data_dir.sh --utt-list data/train_dnn_32k/utt2spk data/train data/train_32k
utils/fix_data_dir.sh data/train_32k
# Initialize a full GMM from the DNN posteriors and speaker recognition
# features. This can be used on its own as a UBM, or to initialize the
# i-vector extractor in a DNN-based system.
sid/init_full_ubm_from_dnn.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" \
data/train_32k \
data/train_dnn_32k $nnet exp/full_ubm
# Train an i-vector extractor based on just the supervised-GMM.
sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=70G,ram_free=70G" \
--ivector-dim 600 \
--num-iters 5 exp/full_ubm/final.ubm data/train \
exp/extractor_sup_gmm
# Train an i-vector extractor based on the DNN-UBM.
sid/train_ivector_extractor_dnn.sh --cmd "$train_cmd -l mem_free=80G,ram_free=80G" \
--min-post 0.015 \
--ivector-dim 600 \
--num-iters 5 exp/full_ubm/final.ubm $nnet \
data/train \
data/train_dnn \
exp/extractor_dnn
# Extract i-vectors from the extractor with the sup-GMM UBM.
sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \
exp/extractor_sup_gmm data/sre10_train \
exp/ivectors_sre10_train_sup_gmm
sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \
exp/extractor_sup_gmm data/sre10_test \
exp/ivectors_sre10_test_sup_gmm
sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \
exp/extractor_sup_gmm data/sre \
exp/ivectors_sre_sup_gmm
# Extract i-vectors using the extractor with the DNN-UBM.
sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \
exp/extractor_dnn \
$nnet \
data/sre10_test \
data/sre10_test_dnn \
exp/ivectors_sre10_test_dnn
sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \
exp/extractor_dnn \
$nnet \
data/sre10_train \
data/sre10_train_dnn \
exp/ivectors_sre10_train_dnn
sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \
exp/extractor_dnn \
$nnet \
data/sre \
data/sre_dnn \
exp/ivectors_sre_dnn
# Separate the i-vectors into male and female partitions and calculate
# i-vector means used by the scoring scripts.
local/scoring_common.sh data/sre data/sre10_train data/sre10_test \
exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm \
exp/ivectors_sre10_test_sup_gmm
local/scoring_common.sh data/sre data/sre10_train data/sre10_test \
exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn \
exp/ivectors_sre10_test_dnn
# The commented out scripts show how to do cosine scoring with and without
# first reducing the i-vector dimensionality with LDA. PLDA tends to work
# best, so we don't focus on the scores obtained here.
#
# local/cosine_scoring.sh data/sre10_train data/sre10_test \
# exp/ivectors_sre10_train exp/ivectors_sre10_test $trials local/scores_gmm_2048_ind_pooled
# local/lda_scoring.sh data/sre data/sre10_train data/sre10_test \
# exp/ivectors_sre exp/ivectors_sre10_train exp/ivectors_sre10_test $trials local/scores_gmm_2048_ind_pooled
# Create a gender independent PLDA model and do scoring with the sup-GMM system.
local/plda_scoring.sh data/sre data/sre10_train data/sre10_test \
exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm \
exp/ivectors_sre10_test_sup_gmm $trials local/scores_sup_gmm_ind_pooled
local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_female data/sre10_test_female \
exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_female \
exp/ivectors_sre10_test_sup_gmm_female $trials_female local/scores_sup_gmm_ind_female
local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_male data/sre10_test_male \
exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_male \
exp/ivectors_sre10_test_sup_gmm_male $trials_male local/scores_sup_gmm_ind_male
# Create gender dependent PLDA models and do scoring with the sup-GMM system.
local/plda_scoring.sh data/sre_female data/sre10_train_female data/sre10_test_female \
exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_female \
exp/ivectors_sre10_test_sup_gmm_female $trials_female local/scores_sup_gmm_dep_female
local/plda_scoring.sh data/sre_male data/sre10_train_male data/sre10_test_male \
exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_male \
exp/ivectors_sre10_test_sup_gmm_male $trials_male local/scores_sup_gmm_dep_male
mkdir -p local/scores_sup_gmm_dep_pooled
cat local/scores_sup_gmm_dep_male/plda_scores local/scores_sup_gmm_dep_female/plda_scores \
> local/scores_sup_gmm_dep_pooled/plda_scores
# Create a gender independent PLDA model and do scoring with the DNN system.
local/plda_scoring.sh data/sre data/sre10_train data/sre10_test \
exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn \
exp/ivectors_sre10_test_dnn $trials local/scores_dnn_ind_pooled
local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_female data/sre10_test_female \
exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_female \
exp/ivectors_sre10_test_dnn_female $trials_female local/scores_dnn_ind_female
local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_male data/sre10_test_male \
exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_male \
exp/ivectors_sre10_test_dnn_male $trials_male local/scores_dnn_ind_male
# Create gender dependent PLDA models and do scoring with the DNN system.
local/plda_scoring.sh data/sre_female data/sre10_train_female data/sre10_test_female \
exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_female \
exp/ivectors_sre10_test_dnn_female $trials_female local/scores_dnn_dep_female
local/plda_scoring.sh data/sre_male data/sre10_train_male data/sre10_test_male \
exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_male \
exp/ivectors_sre10_test_dnn_male $trials_male local/scores_dnn_dep_male
mkdir -p local/scores_dnn_dep_pooled
cat local/scores_dnn_dep_male/plda_scores local/scores_dnn_dep_female/plda_scores \
> local/scores_dnn_dep_pooled/plda_scores
# Sup-GMM PLDA EER
# ind pooled: 1.94
# ind female: 1.98
# ind male: 1.79
# dep female: 1.87
# dep male: 1.30
# dep pooled: 1.65
echo "Sup-GMM-$num_components EER"
for x in ind dep; do
for y in female male pooled; do
eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_sup_gmm_${x}_${y}/plda_scores) 2> /dev/null`
echo "${x} ${y}: $eer"
done
done
# DNN PLDA EER
# ind pooled: 1.20
# ind female: 1.46
# ind male: 0.87
# dep female: 1.43
# dep male: 0.72
# dep pooled: 1.09
echo "DNN-$num_components EER"
for x in ind dep; do
for y in female male pooled; do
eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_dnn_${x}_${y}/plda_scores) 2> /dev/null`
echo "${x} ${y}: $eer"
done
done
# In comparison, here is the EER for an unsupervised GMM-based system
# with 5297 components (the same as the number of senones in the DNN):
# GMM-5297 PLDA EER
# ind pooled: 2.42
# ind female: 2.43
# ind male: 2.40
# dep female: 2.16
# dep male: 1.53
# dep pooled: 2.00

1
egs/sre10/v2/sid Symbolic link
Просмотреть файл

@ -0,0 +1 @@
../v1/sid

1
egs/sre10/v2/steps Symbolic link
Просмотреть файл

@ -0,0 +1 @@
../v1/steps

1
egs/sre10/v2/utils Symbolic link
Просмотреть файл

@ -0,0 +1 @@
../v1/utils

Просмотреть файл

@ -1,6 +1,8 @@
// fgmmbin/fgmm-global-acc-stats-post.cc
// Copyright 2015 David Snyder
// 2015 Johns Hopkins University (Author: Daniel Povey)
// 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
// See ../../COPYING for clarification regarding multiple authors
//

Просмотреть файл

@ -1,6 +1,8 @@
// fgmmbin/fgmm-global-init-from-accs.cc
// Copyright 2015 David Snyder
// 2015 Johns Hopkins University (Author: Daniel Povey)
// 2015 Johns Hopkins University (Author: Daniel Garcia-Romero)
// See ../../COPYING for clarification regarding multiple authors
//