зеркало из https://github.com/mozilla/kaldi.git
sandbox/language_id: getting VTLN model estimation working given a UBM.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@4173 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
25344d93b0
Коммит
4fd9c20c6a
|
@ -0,0 +1,5 @@
|
|||
--sample-frequency=8000
|
||||
--frame-length=20 # the default is 25.
|
||||
--low-freq=20 # the default.
|
||||
--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case).
|
||||
--num-ceps=13
|
|
@ -0,0 +1,282 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2014 Daniel Povey
|
||||
# Apache 2.0
|
||||
|
||||
#
|
||||
# This training script computes some things you will need in order to
|
||||
# extract VTLN-warped features. It takes as input the data directory
|
||||
# and an already-trained diagonal-covariance UBM. Note: although this
|
||||
# script is in the lid/ directory, because it is intended to be
|
||||
# used in language identification, it uses features of the
|
||||
# same type as those used in the speaker-id scripts (see ../sid/),
|
||||
# i.e. double-delta features, rather than the "shifted delta cepstra"
|
||||
# features commonly used in language id.
|
||||
#
|
||||
# This script works with either mfcc or plp features; for plp features, you will
|
||||
# need to set the --base-feat-type option. Regardless, you will need to set the
|
||||
# --mfcc-config or --plp-config option if your feature-extraction config is not
|
||||
# called conf/${base_feat_type}.conf. The output of this script will be in
|
||||
# $dir/final.lvtln and $dir/final.dubm and $dir/final.ali_dubm; the directory
|
||||
# can be passed to ./get_vtln_warps.sh to get VTLN warps for a data directory,
|
||||
# or (for data passed to this script) you can use the warping factors this
|
||||
# script outputs in $dir/final.warp
|
||||
#
|
||||
|
||||
# Begin configuration.
|
||||
stage=-4 # This allows restarting partway through, after something went wrong.
|
||||
config=
|
||||
cmd=run.pl
|
||||
num_iters=15 # Number of iterations of training.
|
||||
num_utt_lvtln_init=400; # number of utterances (subset) to initialize
|
||||
# LVTLN transform. Not too critical.
|
||||
min_warp=0.85
|
||||
max_warp=1.25
|
||||
warp_step=0.01
|
||||
base_feat_type=mfcc # or could be PLP.
|
||||
mfcc_config=conf/mfcc.conf # default, can be overridden.
|
||||
plp_config=conf/plp.conf # default, can be overridden.
|
||||
logdet_scale=0.0
|
||||
subsample=5 # We use every 5th frame by default; this is more
|
||||
# CPU-efficient.
|
||||
min_gaussian_weight=0.0001 # does not matter; inherited from diag-ubm training script.
|
||||
nj=4
|
||||
cleanup=true
|
||||
num_gselect=15
|
||||
# End configuration.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
[ -f path.sh ] && . ./path.sh;
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
num_classes=$(perl -e "print int(1.5 + ($max_warp - $min_warp) / $warp_step);") || exit 1;
|
||||
default_class=$(perl -e "print int(0.5 + (1.0 - $min_warp) / $warp_step);") || exit 1;
|
||||
|
||||
if [ $# != 3 ]; then
|
||||
echo "Usage: $0 <data-dir> <diag-ubm-dir> <exp-dir>"
|
||||
echo "e.g.: $0 data/train_vtln exp/diag_ubm_vtln exp/vtln"
|
||||
echo "main options (for others, see top of script file)"
|
||||
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
|
||||
echo " --nj <num-jobs> # number of jobs to use (default 4)"
|
||||
echo " --config <config-file> # config containing options"
|
||||
echo " --stage <stage> # stage to do partial re-run from."
|
||||
echo " --num-iters <num-iters> # number of iterations of training"
|
||||
echo " --base-feat-type <feat-type> # mfcc or plp, mfcc is default"
|
||||
echo " --mfcc-config <config> # config for MFCC extraction, default is"
|
||||
echo " # conf/mfcc.conf"
|
||||
echo " --plp-config <config> # config for PLP extraction, default is"
|
||||
echo " # conf/plp.conf"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
data=$1
|
||||
ubmdir=$2
|
||||
dir=$3
|
||||
|
||||
for f in $data/feats.scp $ubmdir/final.dubm; do
|
||||
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
|
||||
done
|
||||
|
||||
|
||||
mkdir -p $dir/log
|
||||
echo $nj > $dir/num_jobs
|
||||
|
||||
sdata=$data/split$nj;
|
||||
split_data.sh $data $nj || exit 1;
|
||||
|
||||
cmvn_sliding_opts="--norm-vars=false --center=true --cmn-window=300"
|
||||
# don't change $cmvn_sliding_opts, it should probably match the
|
||||
# options used in ../sid/train_diag_ubm.sh.
|
||||
sifeats="ark,s,cs:add-deltas scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding $cmvn_sliding_opts ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
|
||||
|
||||
|
||||
# for the subsets of features that we use to estimate the linear transforms, we
|
||||
# don't bother with CMN. This will give us wrong offsets on the transforms,
|
||||
# but it won't matter because we will allow an arbitrary bias term when we apply
|
||||
# these transforms.
|
||||
|
||||
# you need to define CLASS when invoking $cmd on featsub_warped.
|
||||
featsub_warped="ark:add-deltas ark:$dir/feats.CLASS.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
|
||||
featsub_unwarped="ark:add-deltas ark:$dir/feats.$default_class.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
|
||||
|
||||
|
||||
if [ -f $data/utt2warp ]; then
|
||||
echo "$0: source data directory $data appears to already have VTLN.";
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
# create a small subset of utterances for purposes of initializing the LVTLN transform
|
||||
# utils/shuffle_list.pl is deterministic, unlike sort -R.
|
||||
cat $data/utt2spk | awk '{print $1}' | utils/shuffle_list.pl | \
|
||||
head -n $num_utt_lvtln_init > $dir/utt_subset
|
||||
|
||||
if [ $stage -le -4 ]; then
|
||||
echo "$0: computing warped subset of features"
|
||||
if [ -f $data/segments ]; then
|
||||
echo "$0 [info]: segments file exists: using that."
|
||||
subset_feats="utils/filter_scp.pl $dir/utt_subset $data/segments | extract-segments scp:$data/wav.scp - ark:- "
|
||||
else
|
||||
echo "$0 [info]: no segments file exists: using wav.scp directly."
|
||||
subset_feats="utils/filter_scp.pl $dir/utt_subset $data/wav.scp | wav-copy scp:- ark:- "
|
||||
fi
|
||||
rm $dir/.error 2>/dev/null
|
||||
for c in $(seq 0 $[$num_classes-1]); do
|
||||
this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
|
||||
config_name=${base_feat_type}_config # e.g. mfcc_config or plp_config
|
||||
this_config=$(eval echo \$$config_name) # e.g. conf/mfcc.conf or conf/plp.conf by default.
|
||||
$cmd $dir/log/compute_warped_feats.$c.log \
|
||||
$subset_feats \| compute-${base_feat_type}-feats --verbose=2 \
|
||||
--config=$this_config --vtln-warp=$this_warp ark:- ark:- \| \
|
||||
copy-feats --compress=true ark:- ark:$dir/feats.$c.ark || touch $dir/.error &
|
||||
done
|
||||
wait;
|
||||
if [ -f $dir/.error ]; then
|
||||
echo "$0: Computing warped features failed: check $dir/log/compute_warped_feats.*.log"
|
||||
exit 1;
|
||||
fi
|
||||
fi
|
||||
|
||||
if ! utils/filter_scp.pl $dir/utt_subset $data/feats.scp | \
|
||||
compare-feats --threshold=0.98 scp:- ark:$dir/feats.$default_class.ark >&/dev/null; then
|
||||
echo "$0: features stored on disk differ from those computed with no warping."
|
||||
echo " Possibly your feature type is wrong (--base-feat-type option)"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
if [ -f $data/segments ]; then
|
||||
subset_utts="ark:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
|
||||
else
|
||||
echo "$0 [info]: no segments file exists: using wav.scp directly."
|
||||
subset_utts="ark:wav-copy scp:$sdata/JOB/wav.scp ark:- |"
|
||||
fi
|
||||
|
||||
if [ $stage -le -3 ]; then
|
||||
echo "$0: initializing base LVTLN transforms in $dir/0.lvtln (ignore warnings below)"
|
||||
dim=$(feat-to-dim "$featsub_unwarped" - ) || exit 1;
|
||||
|
||||
$cmd $dir/log/init_lvtln.log \
|
||||
gmm-init-lvtln --dim=$dim --num-classes=$num_classes --default-class=$default_class \
|
||||
$dir/0.lvtln || exit 1;
|
||||
|
||||
for c in $(seq 0 $[$num_classes-1]); do
|
||||
this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
|
||||
orig_feats=ark:$dir/feats.$default_class.ark
|
||||
warped_feats=ark:$dir/feats.$c.ark
|
||||
logfile=$dir/log/train_special.$c.log
|
||||
this_featsub_warped="$(echo $featsub_warped | sed s/CLASS/$c/)"
|
||||
if ! gmm-train-lvtln-special --warp=$this_warp --normalize-var=true \
|
||||
$c $dir/0.lvtln $dir/0.lvtln \
|
||||
"$featsub_unwarped" "$this_featsub_warped" 2>$logfile; then
|
||||
echo "$0: Error training LVTLN transform, see $logfile";
|
||||
exit 1;
|
||||
fi
|
||||
done
|
||||
rm $dir/final.lvtln 2>/dev/null
|
||||
ln -s 0.lvtln $dir/final.lvtln
|
||||
fi
|
||||
|
||||
cp $ubmdir/final.dubm $dir/0.dubm
|
||||
|
||||
if [ $stage -le -2 ]; then
|
||||
echo "$0: computing Gaussian selection info."
|
||||
|
||||
$cmd JOB=1:$nj $dir/log/gselect.JOB.log \
|
||||
gmm-gselect --n=$num_gselect $ubmdir/final.dubm "$sifeats" \
|
||||
"ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
if [ $stage -le -1 ]; then
|
||||
echo "$0: computing initial LVTLN transforms" # do this per-utt.
|
||||
|
||||
$cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \
|
||||
gmm-global-gselect-to-post $dir/0.dubm "$sifeats" \
|
||||
"ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
|
||||
gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
|
||||
$dir/0.dubm $dir/0.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans.0.JOB ark,t:$dir/warp.0.JOB || exit 1
|
||||
|
||||
# consolidate the warps into one file.
|
||||
for j in $(seq $nj); do cat $dir/warp.0.$j; done > $dir/warp.0
|
||||
rm $dir/warp.0.*
|
||||
fi
|
||||
|
||||
|
||||
x=0
|
||||
while [ $x -lt $num_iters ]; do
|
||||
feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |"
|
||||
|
||||
# First update the model.
|
||||
if [ $stage -le $x ]; then
|
||||
echo "$0: Updating model on pass $x"
|
||||
# Accumulate stats.
|
||||
$cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
|
||||
gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
|
||||
$dir/$x.dubm "$feats" $dir/$x.JOB.acc || exit 1;
|
||||
|
||||
$cmd $dir/log/update.$x.log \
|
||||
gmm-global-est --remove-low-count-gaussians=false --min-gaussian-weight=$min_gaussian_weight \
|
||||
$dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \
|
||||
$dir/$[$x+1].dubm || exit 1;
|
||||
$cleanup && rm $dir/$x.*.acc $dir/$x.dubm
|
||||
fi
|
||||
|
||||
# Now update the LVTLN transforms (and warps.)
|
||||
if [ $stage -le $x ]; then
|
||||
echo "$0: re-estimating LVTLN transforms on pass $x"
|
||||
$cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \
|
||||
gmm-global-gselect-to-post $dir/$[$x+1].dubm "$feats" \
|
||||
"ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
|
||||
gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
|
||||
$dir/$[$x+1].dubm $dir/0.lvtln "$sifeats" ark,s,cs:- \
|
||||
ark:$dir/trans.$[$x+1].JOB ark,t:$dir/warp.$[$x+1].JOB || exit 1
|
||||
|
||||
# consolidate the warps into one file.
|
||||
for j in $(seq $nj); do cat $dir/warp.$[$x+1].$j; done > $dir/warp.$[$x+1]
|
||||
rm $dir/warp.$[$x+1].*
|
||||
$cleanup && rm $dir/trans.$x.*
|
||||
fi
|
||||
x=$[$x+1]
|
||||
done
|
||||
|
||||
feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |"
|
||||
|
||||
if [ $stage -le $x ]; then
|
||||
# Accumulate stats for "alignment model"-- this model is computed with the
|
||||
# speaker-independent features, but matches Gaussian-for-Gaussian with the
|
||||
# final speaker-adapted model.
|
||||
$cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
|
||||
gmm-global-acc-stats-twofeats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
|
||||
$dir/$x.dubm "$feats" "$sifeats" $dir/$x.JOB.acc || exit 1
|
||||
[ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
|
||||
# Update model.
|
||||
$cmd $dir/log/est_alimdl.log \
|
||||
gmm-global-est --min-gaussian-weight=$min_gaussian_weight \
|
||||
--remove-low-count-gaussians=false $dir/$x.dubm \
|
||||
"gmm-global-sum-accs - $dir/$x.*.acc|" $dir/$x.ali_dubm || exit 1;
|
||||
$cleanup && rm $dir/$x.*.acc
|
||||
fi
|
||||
|
||||
if true; then # Diagnostics
|
||||
ln -sf warp.$x $dir/final.warp
|
||||
if [ -f $data/spk2gender ]; then
|
||||
# To make it easier to eyeball the male and female speakers' warps
|
||||
# separately, separate them out.
|
||||
for g in m f; do # means: for gender in male female
|
||||
cat $dir/final.warp | \
|
||||
utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') > $dir/final.warp.$g
|
||||
echo -n "The last few warp factors for gender $g are: "
|
||||
tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}';
|
||||
echo
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
ln -sf $x.dubm $dir/final.dubm
|
||||
ln -sf $x.ali_dubm $dir/final.ali_dubm
|
||||
ln -sf 0.lvtln $dir/final.lvtln
|
||||
|
||||
# Summarize warning messages...
|
||||
utils/summarize_warnings.pl $dir/log
|
||||
|
||||
echo "$0: Done training LVTLN model in $dir"
|
|
@ -54,6 +54,26 @@ local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
|
|||
# max_voiced=3000
|
||||
# local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train
|
||||
|
||||
# Vtln-related things:
|
||||
# We'll use a subset of utterances to train the GMM we'll use for VTLN
|
||||
# warping.
|
||||
utils/subset_data_dir.sh data/train 5000 data/train_5k_novtln
|
||||
|
||||
# for the features we use to estimate VTLN warp factors, we use more cepstra
|
||||
# (13 instead of just 7); this needs to be tuned.
|
||||
steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 50 --cmd "$train_cmd" \
|
||||
data/train_5k_novtln exp/make_mfcc $mfccdir
|
||||
lid/compute_vad_decision.sh data/train_5k_novtln exp/make_mfcc $mfccdir
|
||||
# note, we're using the speaker-id version of the train_diag_ubm.sh script, which
|
||||
# uses double-delta instead of SDC features. We train a 256-Gaussian UBM; this
|
||||
# has to be tuned.
|
||||
sid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k_novtln 256 \
|
||||
exp/diag_ubm_vtln
|
||||
lid/train_lvtln_model.sh --mfcc-config conf/mfcc_vtln.conf --nj 30 --cmd "$train_cmd" \
|
||||
data/train_5k_novtln exp/diag_ubm_vtln exp/vtln
|
||||
|
||||
)
|
||||
|
||||
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 100 --cmd "$train_cmd" \
|
||||
data/train exp/make_mfcc $mfccdir
|
||||
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
../../sre08/v1/sid
|
|
@ -23,11 +23,11 @@ echo "$0 $@" # Print the command line for logging
|
|||
fake=false
|
||||
two_channel=false
|
||||
|
||||
if [ $1 == "--fake" ]; then
|
||||
if [ "$1" == "--fake" ]; then
|
||||
fake=true
|
||||
shift
|
||||
fi
|
||||
if [ $1 == "--two-channel" ]; then
|
||||
if [ "$1" == "--two-channel" ]; then
|
||||
two_channel=true
|
||||
shift
|
||||
fi
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
# This training script trains linear-VTLN models starting from an existing
|
||||
# system based on either LDA+MLLT or delta+delta-delta features.
|
||||
# Works with either mfcc or plp features, but you need to set the
|
||||
# --base-feature-type option.
|
||||
# --base-feat-type option.
|
||||
# The resulting system can be used with align_lvtln.sh and/or decode_lvtln.sh
|
||||
# to get VTLN warping factors for data, for warped data extraction, or (for
|
||||
# the training data) you can use the warping factors this script outputs
|
||||
|
@ -65,7 +65,7 @@ alidir=$5
|
|||
dir=$6
|
||||
|
||||
for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt $data/wav.scp; do
|
||||
[ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
|
||||
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
|
||||
done
|
||||
|
||||
numgauss=$numleaves
|
||||
|
@ -111,7 +111,7 @@ else
|
|||
fi
|
||||
|
||||
if [ -f $data/utt2warp ]; then
|
||||
echo "$0: source directory appears to already have VTLN.";
|
||||
echo "$0: source data directory $data appears to already have VTLN.";
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
|
|
|
@ -6,15 +6,15 @@ no_wav=false
|
|||
no_text=false
|
||||
|
||||
for x in `seq 3`; do
|
||||
if [ $1 == "--no-feats" ]; then
|
||||
if [ "$1" == "--no-feats" ]; then
|
||||
no_feats=true
|
||||
shift;
|
||||
fi
|
||||
if [ $1 == "--no-text" ]; then
|
||||
if [ "$1" == "--no-text" ]; then
|
||||
no_text=true
|
||||
shift;
|
||||
fi
|
||||
if [ $1 == "--no-wav" ]; then
|
||||
if [ "$1" == "--no-wav" ]; then
|
||||
no_wav=true
|
||||
shift;
|
||||
fi
|
||||
|
|
|
@ -27,7 +27,8 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \
|
|||
gmm-est-basis-fmllr-gpost gmm-latgen-tracking gmm-latgen-faster-parallel \
|
||||
gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost gmm-global-init-from-feats \
|
||||
gmm-global-info gmm-latgen-faster-regtree-fmllr gmm-est-fmllr-global \
|
||||
gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post
|
||||
gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post \
|
||||
gmm-global-est-lvtln-trans
|
||||
|
||||
OBJFILES =
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// gmmbin/gmm-est-lvtln-trans.cc
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation; Saarland University
|
||||
// 2014 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -125,8 +126,8 @@ int main(int argc, char *argv[]) {
|
|||
const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
|
||||
const GaussPost &gpost = gpost_reader.Value(utt);
|
||||
if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
|
||||
KALDI_WARN << "GauPost vector has wrong size " << (gpost.size())
|
||||
<< " vs. " << (feats.NumRows());
|
||||
KALDI_WARN << "GauPost vector has wrong size " << gpost.size()
|
||||
<< " vs. " << feats.NumRows();
|
||||
num_other_error++;
|
||||
continue;
|
||||
}
|
||||
|
@ -172,8 +173,8 @@ int main(int argc, char *argv[]) {
|
|||
const GaussPost &gpost = gpost_reader.Value(utt);
|
||||
|
||||
if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
|
||||
KALDI_WARN << "GauPost has wrong size " << (gpost.size())
|
||||
<< " vs. " << (feats.NumRows());
|
||||
KALDI_WARN << "GauPost has wrong size " << gpost.size()
|
||||
<< " vs. " << feats.NumRows();
|
||||
num_other_error++;
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,235 @@
|
|||
// gmmbin/gmm-global-est-lvtln-trans.cc
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation; Saarland University
|
||||
// 2014 Daniel Povey
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string>
|
||||
using std::string;
|
||||
#include <vector>
|
||||
using std::vector;
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "gmm/am-diag-gmm.h"
|
||||
#include "hmm/transition-model.h"
|
||||
#include "transform/lvtln.h"
|
||||
#include "hmm/posterior.h"
|
||||
|
||||
namespace kaldi {
|
||||
void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
|
||||
const Posterior &post,
|
||||
const DiagGmm &gmm,
|
||||
FmllrDiagGmmAccs *spk_stats) {
|
||||
KALDI_ASSERT(static_cast<int32>(post.size()) == feats.NumRows());
|
||||
for (size_t i = 0; i < post.size(); i++) {
|
||||
std::vector<int32> gselect(post[i].size());
|
||||
Vector<BaseFloat> this_post(post[i].size());
|
||||
for (size_t j = 0; j < post[i].size(); j++) {
|
||||
int32 g = post[i][j].first;
|
||||
BaseFloat weight = post[i][j].second;
|
||||
gselect[j] = g;
|
||||
this_post(j) = weight;
|
||||
}
|
||||
spk_stats->AccumulateFromPosteriorsPreselect(gmm, gselect,
|
||||
feats.Row(i),
|
||||
this_post);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
try {
|
||||
typedef kaldi::int32 int32;
|
||||
using namespace kaldi;
|
||||
const char *usage =
|
||||
"Estimate linear-VTLN transforms, either per utterance or for "
|
||||
"the supplied set of speakers (spk2utt option); this version\n"
|
||||
"is for a global diagonal GMM (also known as a UBM). Reads posteriors\n"
|
||||
"indicating Gaussian indexes in the UBM.\n"
|
||||
"\n"
|
||||
"Usage: gmm-global-est-lvtln-trans [options] <gmm-in> <lvtln-in> "
|
||||
"<feature-rspecifier> <gpost-rspecifier> <lvtln-trans-wspecifier> [<warp-wspecifier>]\n"
|
||||
"e.g.: gmm-global-est-lvtln-trans 0.ubm 0.lvtln '$feats' ark,s,cs:- ark:1.trans ark:1.warp\n"
|
||||
"(where the <gpost-rspecifier> will likely come from gmm-global-get-post or\n"
|
||||
"gmm-global-gselect-to-post\n";
|
||||
|
||||
ParseOptions po(usage);
|
||||
string spk2utt_rspecifier;
|
||||
BaseFloat logdet_scale = 1.0;
|
||||
std::string norm_type = "offset";
|
||||
po.Register("norm-type", &norm_type, "type of fMLLR applied (\"offset\"|\"none\"|\"diag\")");
|
||||
po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
|
||||
"utterance-list map");
|
||||
po.Register("logdet-scale", &logdet_scale, "Scale on log-determinant term in auxiliary function");
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() < 5 || po.NumArgs() > 6) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
string
|
||||
model_rxfilename = po.GetArg(1),
|
||||
lvtln_rxfilename = po.GetArg(2),
|
||||
feature_rspecifier = po.GetArg(3),
|
||||
post_rspecifier = po.GetArg(4),
|
||||
trans_wspecifier = po.GetArg(5),
|
||||
warp_wspecifier = po.GetOptArg(6);
|
||||
|
||||
DiagGmm gmm;
|
||||
ReadKaldiObject(model_rxfilename, &gmm);
|
||||
LinearVtln lvtln;
|
||||
ReadKaldiObject(lvtln_rxfilename, &lvtln);
|
||||
|
||||
|
||||
RandomAccessPosteriorReader post_reader(post_rspecifier);
|
||||
|
||||
double tot_lvtln_impr = 0.0, tot_t = 0.0;
|
||||
|
||||
BaseFloatMatrixWriter transform_writer(trans_wspecifier);
|
||||
|
||||
BaseFloatWriter warp_writer(warp_wspecifier);
|
||||
|
||||
std::vector<int32> class_counts(lvtln.NumClasses(), 0);
|
||||
int32 num_done = 0, num_no_post = 0, num_other_error = 0;
|
||||
if (spk2utt_rspecifier != "") { // per-speaker adaptation
|
||||
SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
|
||||
RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
|
||||
|
||||
for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
|
||||
FmllrDiagGmmAccs spk_stats(lvtln.Dim());
|
||||
string spk = spk2utt_reader.Key();
|
||||
const vector<string> &uttlist = spk2utt_reader.Value();
|
||||
for (size_t i = 0; i < uttlist.size(); i++) {
|
||||
std::string utt = uttlist[i];
|
||||
if (!feature_reader.HasKey(utt)) {
|
||||
KALDI_WARN << "Did not find features for utterance " << utt;
|
||||
continue;
|
||||
}
|
||||
if (!post_reader.HasKey(utt)) {
|
||||
KALDI_WARN << "Did not find posteriors for utterance " << utt;
|
||||
num_no_post++;
|
||||
continue;
|
||||
}
|
||||
const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
|
||||
const Posterior &post = post_reader.Value(utt);
|
||||
if (static_cast<int32>(post.size()) != feats.NumRows()) {
|
||||
KALDI_WARN << "Posterior vector has wrong size " << post.size()
|
||||
<< " vs. " << feats.NumRows();
|
||||
num_other_error++;
|
||||
continue;
|
||||
}
|
||||
|
||||
AccumulateForUtterance(feats, post, gmm, &spk_stats);
|
||||
|
||||
num_done++;
|
||||
} // end looping over all utterances of the current speaker
|
||||
|
||||
BaseFloat impr, spk_tot_t;
|
||||
{ // Compute the transform and write it out.
|
||||
Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
|
||||
int32 class_idx;
|
||||
lvtln.ComputeTransform(spk_stats,
|
||||
norm_type,
|
||||
logdet_scale,
|
||||
&transform,
|
||||
&class_idx,
|
||||
NULL,
|
||||
&impr,
|
||||
&spk_tot_t);
|
||||
class_counts[class_idx]++;
|
||||
transform_writer.Write(spk, transform);
|
||||
if (warp_wspecifier != "")
|
||||
warp_writer.Write(spk, lvtln.GetWarp(class_idx));
|
||||
}
|
||||
KALDI_LOG << "For speaker " << spk << ", auxf-impr from LVTLN is "
|
||||
<< (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.";
|
||||
tot_lvtln_impr += impr;
|
||||
tot_t += spk_tot_t;
|
||||
} // end looping over speakers
|
||||
} else { // per-utterance adaptation
|
||||
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
|
||||
for (; !feature_reader.Done(); feature_reader.Next()) {
|
||||
string utt = feature_reader.Key();
|
||||
if (!post_reader.HasKey(utt)) {
|
||||
KALDI_WARN << "Did not find posterior for utterance "
|
||||
<< utt;
|
||||
num_no_post++;
|
||||
continue;
|
||||
}
|
||||
const Matrix<BaseFloat> &feats = feature_reader.Value();
|
||||
const Posterior &post = post_reader.Value(utt);
|
||||
|
||||
if (static_cast<int32>(post.size()) != feats.NumRows()) {
|
||||
KALDI_WARN << "Posterior has wrong size " << post.size()
|
||||
<< " vs. " << feats.NumRows();
|
||||
num_other_error++;
|
||||
continue;
|
||||
}
|
||||
num_done++;
|
||||
|
||||
FmllrDiagGmmAccs spk_stats(lvtln.Dim());
|
||||
|
||||
AccumulateForUtterance(feats, post, gmm,
|
||||
&spk_stats);
|
||||
BaseFloat impr, utt_tot_t = spk_stats.beta_;
|
||||
{ // Compute the transform and write it out.
|
||||
Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
|
||||
int32 class_idx;
|
||||
lvtln.ComputeTransform(spk_stats,
|
||||
norm_type,
|
||||
logdet_scale,
|
||||
&transform,
|
||||
&class_idx,
|
||||
NULL,
|
||||
&impr,
|
||||
&utt_tot_t);
|
||||
class_counts[class_idx]++;
|
||||
transform_writer.Write(utt, transform);
|
||||
if (warp_wspecifier != "")
|
||||
warp_writer.Write(utt, lvtln.GetWarp(class_idx));
|
||||
}
|
||||
|
||||
KALDI_LOG << "For utterance " << utt << ", auxf-impr from LVTLN is "
|
||||
<< (impr/utt_tot_t) << ", over " << utt_tot_t << " frames.";
|
||||
tot_lvtln_impr += impr;
|
||||
tot_t += utt_tot_t;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
std::ostringstream s;
|
||||
for (size_t i = 0; i < class_counts.size(); i++)
|
||||
s << ' ' << class_counts[i];
|
||||
KALDI_LOG << "Distribution of classes is: " << s.str();
|
||||
}
|
||||
|
||||
KALDI_LOG << "Done " << num_done << " files, " << num_no_post
|
||||
<< " with no posteriors, " << num_other_error << " with other errors.";
|
||||
KALDI_LOG << "Overall LVTLN auxf impr per frame is "
|
||||
<< (tot_lvtln_impr / tot_t) << " over " << tot_t << " frames.";
|
||||
return 0;
|
||||
} catch(const std::exception &e) {
|
||||
std::cerr << e.what();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
|
@ -100,7 +100,7 @@ class FmllrDiagGmmAccs: public AffineXformStats {
|
|||
const VectorBase<BaseFloat> &posteriors);
|
||||
|
||||
/// Accumulate stats for a GMM, given supplied posteriors. The "posteriors"
|
||||
/// vector should be have the same size as "gselect".n
|
||||
/// vector should have the same size as "gselect".
|
||||
void AccumulateFromPosteriorsPreselect(
|
||||
const DiagGmm &gmm,
|
||||
const std::vector<int32> &gselect,
|
||||
|
|
Загрузка…
Ссылка в новой задаче