trunk: merging various changes from sandbox/language_id.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4180 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-07-20 05:23:18 +00:00
Родитель a66bfe959d 369bb365a6
Коммит 80299dfdba
20 изменённых файлов: 751 добавлений и 26 удалений

Просмотреть файл

@ -0,0 +1,5 @@
--sample-frequency=8000
--frame-length=20 # the default is 25.
--low-freq=20 # the default.
--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case).
--num-ceps=13

116
egs/lre/v1/lid/get_vtln_warps.sh Executable file
Просмотреть файл

@ -0,0 +1,116 @@
#!/bin/bash
# Copyright 2014  Daniel Povey
# Apache 2.0
#
# This script takes a data directory and a directory computed by
# ./train_lvtln_model.sh, and it computes per-utterance warp-factors utt2warp.
# It expects vad.scp to exist in the data directory.  Note: like
# train_lvtln_model.sh, it uses features of the speaker-id type, i.e. double
# delta features with sliding window cepstral mean normalization.

# Begin configuration.
stage=-1
config=
cmd=run.pl
logdet_scale=0.0
subsample=5  # We use every 5th frame by default; this is more
             # CPU-efficient.
nj=4
cleanup=true
num_gselect=25
num_iters=5  # number of iters of transform estimation
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <data-dir> <vtln-dir> <exp-dir>"
  echo "e.g.: $0 data/train_novtln exp/vtln exp/train_warps"
  echo "where <vtln-dir> is produced by train_lvtln_model.sh"
  echo "Output is <exp-dir>/utt2warp"
  echo "main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs."
  echo "  --nj <num-jobs>                                   # number of jobs to use (default 4)"
  echo "  --config <config-file>                            # config containing options"
  echo "  --stage <stage>                                   # stage to do partial re-run from."
  exit 1;
fi

data=$1
vtlndir=$2
dir=$3

# Fail early if required inputs are missing.  vad.scp is included here because
# the feature pipeline below runs select-voiced-frames on it.
# (Fix: the message previously said "train_deltas.sh", a copy-paste leftover,
# and vad.scp was not checked despite being required.)
for f in $data/feats.scp $data/vad.scp $data/spk2utt $vtlndir/final.lvtln $vtlndir/final.dubm $vtlndir/final.ali_dubm; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

if [ -f $data/utt2warp ]; then
  echo "$0: source data directory $data appears to already have VTLN.";
  exit 1;
fi

mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;

cmvn_sliding_opts="--norm-vars=false --center=true --cmn-window=300"
# don't change $cmvn_sliding_opts, it should probably match the
# options used in ../sid/train_diag_ubm.sh and ./train_lvtln_model.sh

# Speaker-independent features: double-deltas + sliding-window CMN, voiced
# frames only, subsampled by $subsample for speed.
sifeats="ark,s,cs:add-deltas scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding $cmvn_sliding_opts ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"

if [ $stage -le -1 ]; then
  echo "$0: computing Gaussian selection info."
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$num_gselect $vtlndir/final.ali_dubm "$sifeats" \
      "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

feats="$sifeats"

x=0
while [ $x -lt $num_iters ]; do
  if [ $stage -le $x ]; then
    echo "$0: pass $x of computing LVTLN transforms"
    # On pass 0 posteriors come from the model matched to speaker-independent
    # features (final.ali_dubm); on later passes from final.dubm, since the
    # features have been transformed.
    if [ $x -eq 0 ]; then ubm=$vtlndir/final.ali_dubm; else ubm=$vtlndir/final.dubm; fi
    $cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \
      gmm-global-gselect-to-post "$ubm" "$feats" \
        "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
      gmm-global-est-lvtln-trans \
         --logdet-scale=$logdet_scale --verbose=1 \
        $vtlndir/final.dubm $vtlndir/final.lvtln "$sifeats" ark,s,cs:- \
        ark:$dir/trans.$x.JOB ark,t:$dir/warp.$x.JOB || exit 1
    # consolidate the warps into one file.
    for j in $(seq $nj); do cat $dir/warp.$x.$j; done > $dir/warp.$x
    rm $dir/warp.$x.*
  fi
  # Features for the next pass are adapted with this pass's transforms.
  feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |"
  x=$[$x+1]
done

# The per-utterance warps from the final pass become the output.
ln -sf warp.$[$x-1] $dir/utt2warp

$cleanup && rm $dir/gselect.*.gz $dir/trans.*

echo "$0: Distribution of classes for one job is below."
grep 'Distribution of classes' $dir/log/lvtln.$[$x-1].1.log

echo "$0: created warp factors in $dir/utt2warp"

# Summarize warning messages...
utils/summarize_warnings.pl $dir/log

echo "$0: Done getting VTLN warps in $dir"

Просмотреть файл

@ -0,0 +1,280 @@
#!/bin/bash
# Copyright 2014  Daniel Povey
# Apache 2.0
#
# This training script computes some things you will need in order to
# extract VTLN-warped features.  It takes as input the data directory
# and an already-trained diagonal-covariance UBM.  Note: although this
# script is in the lid/ directory, because it is intended to be
# used in language identification, but it uses features of the
# same type as those used in the speaker-id scripts (see ../sid/),
# i.e. double-delta features, rather than the "shifted delta cepstra"
# features commonly used in language id.
#
# This script works with either mfcc or plp features; for plp features, you will
# need to set the --base-feat-type option.  Regardless, you will need to set the
# --mfcc-config or --plp-config option if your feature-extraction config is not
# called conf/${base_feat_type}.conf.  The output of this script will be in
# $dir/final.lvtln and $dir/final.dubm and $dir/final.ali_dubm; the directory
# can be passed to ./get_vtln_warps.sh to get VTLN warps for a data directory,
# or (for data passed to this script) you can use the warping factors this
# script outputs in $dir/final.warp

# Begin configuration.
stage=-4  # This allows restarting partway through, when something went wrong.
config=
cmd=run.pl
num_iters=15  # Number of iterations of training.
num_utt_lvtln_init=400;  # number of utterances (subset) to initialize
                         # LVTLN transform.  Not too critical.
min_warp=0.85
max_warp=1.25
warp_step=0.01
base_feat_type=mfcc  # or could be plp.
mfcc_config=conf/mfcc.conf  # default, can be overridden.
plp_config=conf/plp.conf  # default, can be overridden.
logdet_scale=0.0
subsample=5  # We use every 5th frame by default; this is more
             # CPU-efficient.
min_gaussian_weight=0.0001  # does not matter; inherited from diag-ubm training script.
nj=4
cleanup=true
num_gselect=15
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# Number of discrete warp classes, and the index of the class whose warp
# factor is (closest to) 1.0, i.e. no warping.
num_classes=$(perl -e "print int(1.5 + ($max_warp - $min_warp) / $warp_step);") || exit 1;
default_class=$(perl -e "print int(0.5 + (1.0 - $min_warp) / $warp_step);") || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <data-dir> <diag-ubm-dir> <exp-dir>"
  echo "e.g.: $0 data/train_vtln exp/diag_ubm_vtln exp/vtln"
  echo "main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs."
  echo "  --nj <num-jobs>                                   # number of jobs to use (default 4)"
  echo "  --config <config-file>                            # config containing options"
  echo "  --stage <stage>                                   # stage to do partial re-run from."
  echo "  --num-iters <num-iters>                           # number of iterations of training"
  echo "  --base-feat-type <feat-type>                      # mfcc or plp, mfcc is default"
  echo "  --mfcc-config <config>                            # config for MFCC extraction, default is"
  echo "                                                    # conf/mfcc.conf"
  echo "  --plp-config <config>                             # config for PLP extraction, default is"
  echo "                                                    # conf/plp.conf"
  exit 1;
fi

data=$1
ubmdir=$2
dir=$3

# Fail early on missing inputs.  vad.scp, wav.scp and utt2spk are all used
# unconditionally below, so check them here too.
# (Fix: the message previously said "train_deltas.sh", a copy-paste leftover.)
for f in $data/feats.scp $data/vad.scp $data/wav.scp $data/utt2spk $ubmdir/final.dubm; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;

cmvn_sliding_opts="--norm-vars=false --center=true --cmn-window=300"
# don't change $cmvn_sliding_opts, it should probably match the
# options used in ../sid/train_diag_ubm.sh.

# Speaker-independent features: double-deltas + sliding-window CMN, voiced
# frames only, subsampled by $subsample for speed.
sifeats="ark,s,cs:add-deltas scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding $cmvn_sliding_opts ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"

# for the subsets of features that we use to estimate the linear transforms, we
# don't bother with CMN.  This will give us wrong offsets on the transforms,
# but it won't matter because we will allow an arbitrary bias term when we apply
# these transforms.
# you need to define CLASS when invoking $cmd on featsub_warped.
featsub_warped="ark:add-deltas ark:$dir/feats.CLASS.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
featsub_unwarped="ark:add-deltas ark:$dir/feats.$default_class.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"

if [ -f $data/utt2warp ]; then
  echo "$0: source data directory $data appears to already have VTLN.";
  exit 1;
fi

# create a small subset of utterances for purposes of initializing the LVTLN transform
# utils/shuffle_list.pl is deterministic, unlike sort -R.
cat $data/utt2spk | awk '{print $1}' | utils/shuffle_list.pl | \
   head -n $num_utt_lvtln_init > $dir/utt_subset

if [ $stage -le -4 ]; then
  echo "$0: computing warped subset of features"
  if [ -f $data/segments ]; then
    echo "$0 [info]: segments file exists: using that."
    subset_feats="utils/filter_scp.pl $dir/utt_subset $data/segments | extract-segments scp:$data/wav.scp - ark:- "
  else
    echo "$0 [info]: no segments file exists: using wav.scp directly."
    subset_feats="utils/filter_scp.pl $dir/utt_subset $data/wav.scp | wav-copy scp:- ark:- "
  fi
  rm $dir/.error 2>/dev/null
  # Extract features for the utterance subset once per warp class, each at
  # that class's warp factor; jobs run in the background in parallel.
  for c in $(seq 0 $[$num_classes-1]); do
    this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
    config_name=${base_feat_type}_config  # e.g. mfcc_config or plp_config
    this_config=$(eval echo \$$config_name)  # e.g. conf/mfcc.conf or conf/plp.conf by default.
    $cmd $dir/log/compute_warped_feats.$c.log \
      $subset_feats \| compute-${base_feat_type}-feats --verbose=2 \
        --config=$this_config --vtln-warp=$this_warp ark:- ark:- \| \
      copy-feats --compress=true ark:- ark:$dir/feats.$c.ark || touch $dir/.error &
  done
  wait;
  if [ -f $dir/.error ]; then
    echo "$0: Computing warped features failed: check $dir/log/compute_warped_feats.*.log"
    exit 1;
  fi
fi

# Sanity check: the unwarped features we just computed should closely match
# the features already on disk; if not, --base-feat-type (or the config) is
# probably wrong.  Note: this requires $dir/feats.$default_class.ark, i.e.
# stage -4 must have been run at some point in this $dir.
if ! utils/filter_scp.pl $dir/utt_subset $data/feats.scp | \
  compare-feats --threshold=0.98 scp:- ark:$dir/feats.$default_class.ark >&/dev/null; then
  echo "$0: features stored on disk differ from those computed with no warping."
  echo "   Possibly your feature type is wrong (--base-feat-type option)"
  exit 1;
fi

if [ -f $data/segments ]; then
  subset_utts="ark:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else
  echo "$0 [info]: no segments file exists: using wav.scp directly."
  subset_utts="ark:wav-copy scp:$sdata/JOB/wav.scp ark:- |"
fi
# NOTE(review): $subset_utts does not appear to be used below; looks like a
# leftover from the script this was adapted from -- verify before removing.

if [ $stage -le -3 ]; then
  echo "$0: initializing base LVTLN transforms in $dir/0.lvtln (ignore warnings below)"
  dim=$(feat-to-dim "$featsub_unwarped" - ) || exit 1;
  $cmd $dir/log/init_lvtln.log \
    gmm-init-lvtln --dim=$dim --num-classes=$num_classes --default-class=$default_class \
      $dir/0.lvtln || exit 1;
  # Estimate one linear transform per warp class, mapping unwarped features
  # to the corresponding warped features.
  for c in $(seq 0 $[$num_classes-1]); do
    this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
    orig_feats=ark:$dir/feats.$default_class.ark
    warped_feats=ark:$dir/feats.$c.ark
    logfile=$dir/log/train_special.$c.log
    this_featsub_warped="$(echo $featsub_warped | sed s/CLASS/$c/)"
    if ! gmm-train-lvtln-special --warp=$this_warp --normalize-var=true \
      $c $dir/0.lvtln $dir/0.lvtln \
      "$featsub_unwarped" "$this_featsub_warped" 2>$logfile; then
      echo "$0: Error training LVTLN transform, see $logfile";
      exit 1;
    fi
  done
fi

cp $ubmdir/final.dubm $dir/0.dubm

if [ $stage -le -2 ]; then
  echo "$0: computing Gaussian selection info."
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$num_gselect $ubmdir/final.dubm "$sifeats" \
      "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

if [ $stage -le -1 ]; then
  echo "$0: computing initial LVTLN transforms"  # do this per-utt.
  $cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \
    gmm-global-gselect-to-post $dir/0.dubm "$sifeats" \
      "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
    gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
      $dir/0.dubm $dir/0.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans.0.JOB ark,t:$dir/warp.0.JOB || exit 1
  # consolidate the warps into one file.
  for j in $(seq $nj); do cat $dir/warp.0.$j; done > $dir/warp.0
  rm $dir/warp.0.*
fi

x=0
while [ $x -lt $num_iters ]; do
  # Adapted features using the transforms from the previous pass.
  feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |"
  # First update the model.
  if [ $stage -le $x ]; then
    echo "$0: Updating model on pass $x"
    # Accumulate stats.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
        $dir/$x.dubm "$feats" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-global-est --remove-low-count-gaussians=false --min-gaussian-weight=$min_gaussian_weight \
        $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \
        $dir/$[$x+1].dubm || exit 1;
    $cleanup && rm $dir/$x.*.acc $dir/$x.dubm
  fi
  # Now update the LVTLN transforms (and warps.)
  if [ $stage -le $x ]; then
    echo "$0: re-estimating LVTLN transforms on pass $x"
    $cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \
      gmm-global-gselect-to-post $dir/$[$x+1].dubm "$feats" \
        "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
      gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
        $dir/$[$x+1].dubm $dir/0.lvtln "$sifeats" ark,s,cs:- \
        ark:$dir/trans.$[$x+1].JOB ark,t:$dir/warp.$[$x+1].JOB || exit 1
    # consolidate the warps into one file.
    for j in $(seq $nj); do cat $dir/warp.$[$x+1].$j; done > $dir/warp.$[$x+1]
    rm $dir/warp.$[$x+1].*
    $cleanup && rm $dir/trans.$x.*
  fi
  x=$[$x+1]
done

feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |"

if [ $stage -le $x ]; then
  # Accumulate stats for "alignment model"-- this model is computed with the
  # speaker-independent features, but matches Gaussian-for-Gaussian with the
  # final speaker-adapted model.
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    gmm-global-acc-stats-twofeats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
      $dir/$x.dubm "$feats" "$sifeats" $dir/$x.JOB.acc || exit 1
  [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
  # Update model.
  $cmd $dir/log/est_alimdl.log \
    gmm-global-est --min-gaussian-weight=$min_gaussian_weight \
      --remove-low-count-gaussians=false $dir/$x.dubm \
      "gmm-global-sum-accs - $dir/$x.*.acc|" $dir/$x.ali_dubm || exit 1;
  $cleanup && rm $dir/$x.*.acc
fi

if true; then  # Diagnostics
  ln -sf warp.$x $dir/final.warp
  if [ -f $data/spk2gender ]; then
    # To make it easier to eyeball the male and female speakers' warps
    # separately, separate them out.
    # NOTE(review): final.warp is keyed by utterance while spk2gender is keyed
    # by speaker; this filtering only works when utt-ids equal spk-ids (common
    # in LRE setups) -- verify for your data.
    for g in m f; do  # means: for gender in male female
      cat $dir/final.warp | \
        utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') > $dir/final.warp.$g
      echo -n "The last few warp factors for gender $g are: "
      tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}';
      echo
    done
  fi
fi

ln -sf $x.dubm $dir/final.dubm
ln -sf $x.ali_dubm $dir/final.ali_dubm
ln -sf 0.lvtln $dir/final.lvtln

# Summarize warning messages...
utils/summarize_warnings.pl $dir/log

echo "$0: Done training LVTLN model in $dir"

Просмотреть файл

@ -1,5 +1,6 @@
#!/bin/bash
# Copyright 2014 David Snyder
# 2014 Daniel Povey
# Apache 2.0.
#
# An incomplete run.sh for this example.
@ -54,6 +55,40 @@ local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
# max_voiced=3000
# local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train
use_vtln=true
if $use_vtln; then
  for t in train lre07; do
    # Make a copy of each data dir without VTLN info.  Use -T so the copy goes
    # TO data/${t}_novtln even on a re-run (and fix the original "cp -rt",
    # whose -t option made data/${t} the *target*, reversing the copy).
    cp -rT data/${t} data/${t}_novtln
    rm -r data/${t}_novtln/{split,.backup,spk2warp} 2>/dev/null || true
    steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 100 --cmd "$train_cmd" \
       data/${t}_novtln exp/make_mfcc $mfccdir
    lid/compute_vad_decision.sh data/${t}_novtln exp/make_mfcc $mfccdir
  done

  # Vtln-related things:
  # We'll use a subset of utterances to train the GMM we'll use for VTLN
  # warping.
  # (Fix: this previously created data/train_novtln_5k but everything below
  # referred to data/train_5k_novtln; use one name consistently.)
  utils/subset_data_dir.sh data/train_novtln 5000 data/train_5k_novtln

  # for the features we use to estimate VTLN warp factors, we use more cepstra
  # (13 instead of just 7); this needs to be tuned.
  steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 50 --cmd "$train_cmd" \
     data/train_5k_novtln exp/make_mfcc $mfccdir

  # note, we're using the speaker-id version of the train_diag_ubm.sh script, which
  # uses double-delta instead of SDC features.  We train a 256-Gaussian UBM; this
  # has to be tuned.
  sid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k_novtln 256 \
      exp/diag_ubm_vtln
  lid/train_lvtln_model.sh --mfcc-config conf/mfcc_vtln.conf --nj 30 --cmd "$train_cmd" \
     data/train_5k_novtln exp/diag_ubm_vtln exp/vtln

  for t in lre07 train; do
    lid/get_vtln_warps.sh --nj 100 --cmd "$train_cmd" \
       data/${t}_novtln exp/vtln exp/${t}_warps
    # (Fix: was "cp ... $data/$t/" but $data is not defined in this script.)
    cp exp/${t}_warps/utt2warp data/$t/
  done
fi
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 100 --cmd "$train_cmd" \
data/train exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \

1
egs/lre/v1/sid Symbolic link
Просмотреть файл

@ -0,0 +1 @@
../../sre08/v1/sid

Просмотреть файл

@ -23,11 +23,11 @@ echo "$0 $@" # Print the command line for logging
fake=false
two_channel=false
if [ $1 == "--fake" ]; then
if [ "$1" == "--fake" ]; then
fake=true
shift
fi
if [ $1 == "--two-channel" ]; then
if [ "$1" == "--two-channel" ]; then
two_channel=true
shift
fi

Просмотреть файл

@ -65,6 +65,9 @@ utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
if [ -f $data/spk2warp ]; then
echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
vtln_opts="--vtln-map=ark:$data/utt2warp"
fi
if [ -f $data/segments ]; then

Просмотреть файл

@ -71,14 +71,21 @@ else
postprocess_config_opt=
fi
# note: in general, the double-parenthesis construct in bash "((" is "C-style
# syntax" where we can get rid of the $ for variable names, and omit spaces.
# The "for" loop in this style is a special construct.
if [ -f $data/spk2warp ]; then
echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
vtln_opts="--vtln-map=ark:$data/utt2warp"
fi
if [ -f $data/segments ]; then
echo "$0 [info]: segments file exists: using that."
split_segments=""
# note: in general, the double-parenthesis construct in bash "((" is "C-style
# syntax" where we can get rid of the $ for variable names, and omit spaces.
# The "for" loop in this style is a special construct.
for ((n=1; n<=nj; n++)); do
split_segments="$split_segments $logdir/segments.$n"
done
@ -86,7 +93,7 @@ if [ -f $data/segments ]; then
utils/split_scp.pl $data/segments $split_segments || exit 1;
rm $logdir/.error 2>/dev/null
mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-mfcc-feats --verbose=2 --config=$mfcc_config ark:- ark:- |"
mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- |"
pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
$cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
@ -104,7 +111,7 @@ else
utils/split_scp.pl $scp $split_scps || exit 1;
mfcc_feats="ark:compute-mfcc-feats --verbose=2 --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
mfcc_feats="ark:compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
$cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \

Просмотреть файл

@ -65,6 +65,9 @@ utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;
if [ -f $data/spk2warp ]; then
echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
vtln_opts="--vtln-map=ark:$data/utt2warp"
fi
if [ -f $data/segments ]; then

Просмотреть файл

@ -78,6 +78,9 @@ fi
if [ -f $data/spk2warp ]; then
echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
vtln_opts="--vtln-map=ark:$data/utt2warp"
fi
if [ -f $data/segments ]; then

Просмотреть файл

@ -5,7 +5,7 @@
# This training script trains linear-VTLN models starting from an existing
# system based on either LDA+MLLT or delta+delta-delta features.
# Works with either mfcc or plp features, but you need to set the
# --base-feature-type option.
# --base-feat-type option.
# The resulting system can be used with align_lvtln.sh and/or decode_lvtln.sh
# to get VTLN warping factors for data, for warped data extraction, or (for
# the training data) you can use the warping factors this script outputs
@ -65,7 +65,7 @@ alidir=$5
dir=$6
for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt $data/wav.scp; do
[ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
numgauss=$numleaves
@ -111,7 +111,7 @@ else
fi
if [ -f $data/utt2warp ]; then
echo "$0: source directory appears to already have VTLN.";
echo "$0: source data directory $data appears to already have VTLN.";
exit 1;
fi

Просмотреть файл

@ -98,6 +98,8 @@ function do_filtering {
[ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp
[ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang
[ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
[ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp
[ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp
[ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text
[ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
[ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp

Просмотреть файл

@ -6,15 +6,15 @@ no_wav=false
no_text=false
for x in `seq 3`; do
if [ $1 == "--no-feats" ]; then
if [ "$1" == "--no-feats" ]; then
no_feats=true
shift;
fi
if [ $1 == "--no-text" ]; then
if [ "$1" == "--no-text" ]; then
no_text=true
shift;
fi
if [ $1 == "--no-wav" ]; then
if [ "$1" == "--no-wav" ]; then
no_wav=true
shift;
fi
@ -242,6 +242,20 @@ if [ -f $data/spk2warp ]; then
fi
fi
if [ -f $data/utt2warp ]; then
  check_sorted_and_uniq $data/utt2warp
  # Each line must be "<utt-id> <warp-factor>" with the warp in a plausible
  # range around 1.0.
  # (Fix: the error message previously said "spk2warp", but this block
  # validates utt2warp.)
  ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
     echo "Mal-formed utt2warp file" && exit 1;
  # The utterance list in utt2warp must exactly match utt2spk.
  cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp
  cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
  if ! cmp -s $tmpdir/utts{,.utt2warp}; then
    echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts{,.utt2warp}
    exit 1;
  fi
fi
# check some optionally-required things
for f in vad.scp utt2lang; do
if [ -f $data/$f ]; then

Просмотреть файл

@ -27,7 +27,8 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \
gmm-est-basis-fmllr-gpost gmm-latgen-tracking gmm-latgen-faster-parallel \
gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost gmm-global-init-from-feats \
gmm-global-info gmm-latgen-faster-regtree-fmllr gmm-est-fmllr-global \
gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post
gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post \
gmm-global-est-lvtln-trans
OBJFILES =

Просмотреть файл

@ -2,6 +2,7 @@
// Copyright 2009-2011 Microsoft Corporation
// 2014 Guoguo Chen
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -31,8 +32,9 @@ int main(int argc, char *argv[]) {
using namespace kaldi;
try {
const char *usage =
"Accumulate stats for GMM training, computing posteriors with one set of features\n"
"Accumulate stats for GMM training, computing posteriors with one set of features\n"
"but accumulating statistics with another.\n"
"First features are used to get posteriors, second to accumulate stats\n"
"Usage: gmm-acc-stats-twofeats [options] <model-in> <feature1-rspecifier> <feature2-rspecifier> <posteriors-rspecifier> <stats-out>\n"
"e.g.: \n"
" gmm-acc-stats-twofeats 1.mdl 1.ali scp:train.scp scp:train_new.scp ark:1.ali 1.acc\n";

Просмотреть файл

@ -1,6 +1,7 @@
// gmmbin/gmm-est-lvtln-trans.cc
// Copyright 2009-2011 Microsoft Corporation; Saarland University
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -125,8 +126,8 @@ int main(int argc, char *argv[]) {
const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
const GaussPost &gpost = gpost_reader.Value(utt);
if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
KALDI_WARN << "GauPost vector has wrong size " << (gpost.size())
<< " vs. " << (feats.NumRows());
KALDI_WARN << "GauPost vector has wrong size " << gpost.size()
<< " vs. " << feats.NumRows();
num_other_error++;
continue;
}
@ -172,8 +173,8 @@ int main(int argc, char *argv[]) {
const GaussPost &gpost = gpost_reader.Value(utt);
if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
KALDI_WARN << "GauPost has wrong size " << (gpost.size())
<< " vs. " << (feats.NumRows());
KALDI_WARN << "GauPost has wrong size " << gpost.size()
<< " vs. " << feats.NumRows();
num_other_error++;
continue;
}
@ -219,7 +220,7 @@ int main(int argc, char *argv[]) {
<< " with no gposts, " << num_other_error << " with other errors.";
KALDI_LOG << "Overall LVTLN auxf impr per frame is "
<< (tot_lvtln_impr / tot_t) << " over " << tot_t << " frames.";
return 0;
return (num_done == 0 ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;

Просмотреть файл

@ -1,6 +1,7 @@
// gmmbin/gmm-global-acc-stats-twofeats.cc
// Copyright 2009-2011 Microsoft Corporation; Saarland University
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -32,6 +33,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Accumulate stats for training a diagonal-covariance GMM, two-feature version\n"
"First features are used to get posteriors, second to accumulate stats\n"
"Usage: gmm-global-acc-stats-twofeats [options] <model-in> "
"<feature1-rspecifier> <feature2-rspecifier> <stats-out>\n"
"e.g.: gmm-global-acc-stats-twofeats 1.mdl scp:train.scp scp:train2.scp 1.acc\n";

Просмотреть файл

@ -0,0 +1,235 @@
// gmmbin/gmm-global-est-lvtln-trans.cc
// Copyright 2009-2011 Microsoft Corporation; Saarland University
// 2014 Daniel Povey
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <string>
using std::string;
#include <vector>
using std::vector;
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "transform/lvtln.h"
#include "hmm/posterior.h"
namespace kaldi {
void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
const Posterior &post,
const DiagGmm &gmm,
FmllrDiagGmmAccs *spk_stats) {
KALDI_ASSERT(static_cast<int32>(post.size()) == feats.NumRows());
for (size_t i = 0; i < post.size(); i++) {
std::vector<int32> gselect(post[i].size());
Vector<BaseFloat> this_post(post[i].size());
for (size_t j = 0; j < post[i].size(); j++) {
int32 g = post[i][j].first;
BaseFloat weight = post[i][j].second;
gselect[j] = g;
this_post(j) = weight;
}
spk_stats->AccumulateFromPosteriorsPreselect(gmm, gselect,
feats.Row(i),
this_post);
}
}
}
int main(int argc, char *argv[]) {
try {
typedef kaldi::int32 int32;
using namespace kaldi;
const char *usage =
"Estimate linear-VTLN transforms, either per utterance or for "
"the supplied set of speakers (spk2utt option); this version\n"
"is for a global diagonal GMM (also known as a UBM). Reads posteriors\n"
"indicating Gaussian indexes in the UBM.\n"
"\n"
"Usage: gmm-global-est-lvtln-trans [options] <gmm-in> <lvtln-in> "
"<feature-rspecifier> <gpost-rspecifier> <lvtln-trans-wspecifier> [<warp-wspecifier>]\n"
"e.g.: gmm-global-est-lvtln-trans 0.ubm 0.lvtln '$feats' ark,s,cs:- ark:1.trans ark:1.warp\n"
"(where the <gpost-rspecifier> will likely come from gmm-global-get-post or\n"
"gmm-global-gselect-to-post\n";
ParseOptions po(usage);
string spk2utt_rspecifier;
BaseFloat logdet_scale = 1.0;
std::string norm_type = "offset";
po.Register("norm-type", &norm_type, "type of fMLLR applied (\"offset\"|\"none\"|\"diag\")");
po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
"utterance-list map");
po.Register("logdet-scale", &logdet_scale, "Scale on log-determinant term in auxiliary function");
po.Read(argc, argv);
if (po.NumArgs() < 5 || po.NumArgs() > 6) {
po.PrintUsage();
exit(1);
}
string
model_rxfilename = po.GetArg(1),
lvtln_rxfilename = po.GetArg(2),
feature_rspecifier = po.GetArg(3),
post_rspecifier = po.GetArg(4),
trans_wspecifier = po.GetArg(5),
warp_wspecifier = po.GetOptArg(6);
DiagGmm gmm;
ReadKaldiObject(model_rxfilename, &gmm);
LinearVtln lvtln;
ReadKaldiObject(lvtln_rxfilename, &lvtln);
RandomAccessPosteriorReader post_reader(post_rspecifier);
double tot_lvtln_impr = 0.0, tot_t = 0.0;
BaseFloatMatrixWriter transform_writer(trans_wspecifier);
BaseFloatWriter warp_writer(warp_wspecifier);
std::vector<int32> class_counts(lvtln.NumClasses(), 0);
int32 num_done = 0, num_no_post = 0, num_other_error = 0;
if (spk2utt_rspecifier != "") { // per-speaker adaptation
SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
FmllrDiagGmmAccs spk_stats(lvtln.Dim());
string spk = spk2utt_reader.Key();
const vector<string> &uttlist = spk2utt_reader.Value();
for (size_t i = 0; i < uttlist.size(); i++) {
std::string utt = uttlist[i];
if (!feature_reader.HasKey(utt)) {
KALDI_WARN << "Did not find features for utterance " << utt;
continue;
}
if (!post_reader.HasKey(utt)) {
KALDI_WARN << "Did not find posteriors for utterance " << utt;
num_no_post++;
continue;
}
const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
const Posterior &post = post_reader.Value(utt);
if (static_cast<int32>(post.size()) != feats.NumRows()) {
KALDI_WARN << "Posterior vector has wrong size " << post.size()
<< " vs. " << feats.NumRows();
num_other_error++;
continue;
}
AccumulateForUtterance(feats, post, gmm, &spk_stats);
num_done++;
} // end looping over all utterances of the current speaker
BaseFloat impr, spk_tot_t;
{ // Compute the transform and write it out.
Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
int32 class_idx;
lvtln.ComputeTransform(spk_stats,
norm_type,
logdet_scale,
&transform,
&class_idx,
NULL,
&impr,
&spk_tot_t);
class_counts[class_idx]++;
transform_writer.Write(spk, transform);
if (warp_wspecifier != "")
warp_writer.Write(spk, lvtln.GetWarp(class_idx));
}
KALDI_LOG << "For speaker " << spk << ", auxf-impr from LVTLN is "
<< (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.";
tot_lvtln_impr += impr;
tot_t += spk_tot_t;
} // end looping over speakers
} else { // per-utterance adaptation
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
for (; !feature_reader.Done(); feature_reader.Next()) {
string utt = feature_reader.Key();
if (!post_reader.HasKey(utt)) {
KALDI_WARN << "Did not find posterior for utterance "
<< utt;
num_no_post++;
continue;
}
const Matrix<BaseFloat> &feats = feature_reader.Value();
const Posterior &post = post_reader.Value(utt);
if (static_cast<int32>(post.size()) != feats.NumRows()) {
KALDI_WARN << "Posterior has wrong size " << post.size()
<< " vs. " << feats.NumRows();
num_other_error++;
continue;
}
num_done++;
FmllrDiagGmmAccs spk_stats(lvtln.Dim());
AccumulateForUtterance(feats, post, gmm,
&spk_stats);
BaseFloat impr, utt_tot_t = spk_stats.beta_;
{ // Compute the transform and write it out.
Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
int32 class_idx;
lvtln.ComputeTransform(spk_stats,
norm_type,
logdet_scale,
&transform,
&class_idx,
NULL,
&impr,
&utt_tot_t);
class_counts[class_idx]++;
transform_writer.Write(utt, transform);
if (warp_wspecifier != "")
warp_writer.Write(utt, lvtln.GetWarp(class_idx));
}
KALDI_LOG << "For utterance " << utt << ", auxf-impr from LVTLN is "
<< (impr/utt_tot_t) << ", over " << utt_tot_t << " frames.";
tot_lvtln_impr += impr;
tot_t += utt_tot_t;
}
}
{
std::ostringstream s;
for (size_t i = 0; i < class_counts.size(); i++)
s << ' ' << class_counts[i];
KALDI_LOG << "Distribution of classes is: " << s.str();
}
KALDI_LOG << "Done " << num_done << " files, " << num_no_post
<< " with no posteriors, " << num_other_error << " with other errors.";
KALDI_LOG << "Overall LVTLN auxf impr per frame is "
<< (tot_lvtln_impr / tot_t) << " over " << tot_t << " frames.";
return (num_done == 0 ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}

Просмотреть файл

@ -100,7 +100,7 @@ class FmllrDiagGmmAccs: public AffineXformStats {
const VectorBase<BaseFloat> &posteriors);
/// Accumulate stats for a GMM, given supplied posteriors. The "posteriors"
/// vector should be have the same size as "gselect".n
/// vector should have the same size as "gselect".
void AccumulateFromPosteriorsPreselect(
const DiagGmm &gmm,
const std::vector<int32> &gselect,

Просмотреть файл

@ -57,7 +57,18 @@ void LinearVtln::Read(std::istream &is, bool binary) {
ExpectToken(is, binary, "<warp>");
ReadBasicType(is, binary, &(warps_[i]));
}
ExpectToken(is, binary, "</LinearVtln>");
std::string token;
ReadToken(is, binary, &token);
if (token == "</LinearVtln>") {
// the older code had a bug in that it wasn't writing or reading
// default_class_. The following guess at its value is likely to be
// correct.
default_class_ = (sz + 1) / 2;
} else {
KALDI_ASSERT(token == "<DefaultClass>");
ReadBasicType(is, binary, &default_class_);
ExpectToken(is, binary, "</LinearVtln>");
}
}
void LinearVtln::Write(std::ostream &os, bool binary) const {
@ -76,6 +87,8 @@ void LinearVtln::Write(std::ostream &os, bool binary) const {
WriteBasicType(os, binary, warps_[i]);
if(!binary) os << "\n";
}
WriteToken(os, binary, "<DefaultClass>");
WriteBasicType(os, binary, default_class_);
WriteToken(os, binary, "</LinearVtln>");
}
@ -97,11 +110,13 @@ void LinearVtln::ComputeTransform(const FmllrDiagGmmAccs &accs,
if (accs.beta_ == 0.0) {
KALDI_WARN << "no stats, returning default transform";
*class_idx = default_class_;
int32 dim = Dim();
KALDI_ASSERT(Ws != NULL && Ws->NumRows() == dim && Ws->NumCols() == dim+1);
Ws->Range(0, dim, 0, dim).CopyFromMat(A_[default_class_]);
Ws->Range(0, dim, dim, 1).SetZero(); // Set last column to zero.
if (Ws) {
KALDI_ASSERT(Ws->NumRows() == dim && Ws->NumCols() == dim+1);
Ws->Range(0, dim, 0, dim).CopyFromMat(A_[default_class_]);
Ws->Range(0, dim, dim, 1).SetZero(); // Set last column to zero.
}
if (class_idx) *class_idx = default_class_;
if (logdet_out) *logdet_out = logdets_[default_class_];
if (objf_impr) *objf_impr = 0;
if (count) *count = 0;