From 4fd9c20c6aac4ac035627590970dafdf1172d23c Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Sat, 19 Jul 2014 19:01:45 +0000 Subject: [PATCH] sandbox/language_id: getting VTLN model estimation working given a UBM. git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@4173 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8 --- egs/lre/v1/conf/mfcc_vtln.conf | 5 + egs/lre/v1/lid/train_lvtln_model.sh | 282 +++++++++++++++++++++++ egs/lre/v1/run.sh | 20 ++ egs/lre/v1/sid | 1 + egs/wsj/s5/steps/compute_cmvn_stats.sh | 4 +- egs/wsj/s5/steps/train_lvtln.sh | 6 +- egs/wsj/s5/utils/validate_data_dir.sh | 6 +- src/gmmbin/Makefile | 3 +- src/gmmbin/gmm-est-lvtln-trans.cc | 9 +- src/gmmbin/gmm-global-est-lvtln-trans.cc | 235 +++++++++++++++++++ src/transform/fmllr-diag-gmm.h | 2 +- 11 files changed, 559 insertions(+), 14 deletions(-) create mode 100644 egs/lre/v1/conf/mfcc_vtln.conf create mode 100755 egs/lre/v1/lid/train_lvtln_model.sh create mode 120000 egs/lre/v1/sid create mode 100644 src/gmmbin/gmm-global-est-lvtln-trans.cc diff --git a/egs/lre/v1/conf/mfcc_vtln.conf b/egs/lre/v1/conf/mfcc_vtln.conf new file mode 100644 index 000000000..4c0db12e9 --- /dev/null +++ b/egs/lre/v1/conf/mfcc_vtln.conf @@ -0,0 +1,5 @@ +--sample-frequency=8000 +--frame-length=20 # the default is 25. +--low-freq=20 # the default. +--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case). +--num-ceps=13 diff --git a/egs/lre/v1/lid/train_lvtln_model.sh b/egs/lre/v1/lid/train_lvtln_model.sh new file mode 100755 index 000000000..d1cf111aa --- /dev/null +++ b/egs/lre/v1/lid/train_lvtln_model.sh @@ -0,0 +1,282 @@ +#!/bin/bash + +# Copyright 2014 Daniel Povey +# Apache 2.0 + +# +# This training script computes some things you will need in order to +# extract VTLN-warped features. It takes as input the data directory +# and an already-trained diagonal-covariance UBM. 
Note: although this +# script is in the lid/ directory, because it is intended to be +# used in language identification, it uses features of the +# same type as those used in the speaker-id scripts (see ../sid/), +# i.e. double-delta features, rather than the "shifted delta cepstra" +# features commonly used in language id. +# +# This script works with either mfcc or plp features; for plp features, you will +# need to set the --base-feat-type option. Regardless, you will need to set the +# --mfcc-config or --plp-config option if your feature-extraction config is not +# called conf/${base_feat_type}.conf. The output of this script will be in +# $dir/final.lvtln and $dir/final.dubm and $dir/final.ali_dubm; the directory +# can be passed to ./get_vtln_warps.sh to get VTLN warps for a data directory, +# or (for data passed to this script) you can use the warping factors this +# script outputs in $dir/final.warp +# + +# Begin configuration. +stage=-4 # This allows restarting after partway, when something went wrong. +config= +cmd=run.pl +num_iters=15 # Number of iterations of training. +num_utt_lvtln_init=400; # number of utterances (subset) to initialize + # LVTLN transform. Not too critical. +min_warp=0.85 +max_warp=1.25 +warp_step=0.01 +base_feat_type=mfcc # or could be PLP. +mfcc_config=conf/mfcc.conf # default, can be overridden. +plp_config=conf/plp.conf # default, can be overridden. +logdet_scale=0.0 +subsample=5 # We use every 5th frame by default; this is more + # CPU-efficient. +min_gaussian_weight=0.0001 # does not matter; inherited from diag-ubm training script. +nj=4 +cleanup=true +num_gselect=15 +# End configuration. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh; +. 
parse_options.sh || exit 1; + +num_classes=$(perl -e "print int(1.5 + ($max_warp - $min_warp) / $warp_step);") || exit 1; +default_class=$(perl -e "print int(0.5 + (1.0 - $min_warp) / $warp_step);") || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 <data-dir> <ubm-dir> <exp-dir>" + echo "e.g.: $0 data/train_vtln exp/diag_ubm_vtln exp/vtln" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + echo " --nj # number of jobs to use (default 4)" + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + echo " --num-iters # number of iterations of training" + echo " --base-feat-type # mfcc or plp, mfcc is default" + echo " --mfcc-config # config for MFCC extraction, default is" + echo " # conf/mfcc.conf" + echo " --plp-config # config for PLP extraction, default is" + echo " # conf/plp.conf" + exit 1; +fi + +data=$1 +ubmdir=$2 +dir=$3 + +for f in $data/feats.scp $ubmdir/final.dubm; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +mkdir -p $dir/log +echo $nj > $dir/num_jobs + +sdata=$data/split$nj; +split_data.sh $data $nj || exit 1; + +cmvn_sliding_opts="--norm-vars=false --center=true --cmn-window=300" +# don't change $cmvn_sliding_opts, it should probably match the +# options used in ../sid/train_diag_ubm.sh. +sifeats="ark,s,cs:add-deltas scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding $cmvn_sliding_opts ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" + + +# for the subsets of features that we use to estimate the linear transforms, we +# don't bother with CMN. This will give us wrong offsets on the transforms, +# but it won't matter because we will allow an arbitrary bias term when we apply +# these transforms. + +# you need to define CLASS when invoking $cmd on featsub_warped. 
+featsub_warped="ark:add-deltas ark:$dir/feats.CLASS.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" +featsub_unwarped="ark:add-deltas ark:$dir/feats.$default_class.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" + + +if [ -f $data/utt2warp ]; then + echo "$0: source data directory $data appears to already have VTLN."; + exit 1; +fi + +# create a small subset of utterances for purposes of initializing the LVTLN transform +# utils/shuffle_list.pl is deterministic, unlike sort -R. +cat $data/utt2spk | awk '{print $1}' | utils/shuffle_list.pl | \ + head -n $num_utt_lvtln_init > $dir/utt_subset + +if [ $stage -le -4 ]; then + echo "$0: computing warped subset of features" + if [ -f $data/segments ]; then + echo "$0 [info]: segments file exists: using that." + subset_feats="utils/filter_scp.pl $dir/utt_subset $data/segments | extract-segments scp:$data/wav.scp - ark:- " + else + echo "$0 [info]: no segments file exists: using wav.scp directly." + subset_feats="utils/filter_scp.pl $dir/utt_subset $data/wav.scp | wav-copy scp:- ark:- " + fi + rm $dir/.error 2>/dev/null + for c in $(seq 0 $[$num_classes-1]); do + this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));") + config_name=${base_feat_type}_config # e.g. mfcc_config or plp_config + this_config=$(eval echo \$$config_name) # e.g. conf/mfcc.conf or conf/plp.conf by default. + $cmd $dir/log/compute_warped_feats.$c.log \ + $subset_feats \| compute-${base_feat_type}-feats --verbose=2 \ + --config=$this_config --vtln-warp=$this_warp ark:- ark:- \| \ + copy-feats --compress=true ark:- ark:$dir/feats.$c.ark || touch $dir/.error & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: Computing warped features failed: check $dir/log/compute_warped_feats.*.log" + exit 1; + fi +fi + +if ! 
utils/filter_scp.pl $dir/utt_subset $data/feats.scp | \ + compare-feats --threshold=0.98 scp:- ark:$dir/feats.$default_class.ark >&/dev/null; then + echo "$0: features stored on disk differ from those computed with no warping." + echo " Possibly your feature type is wrong (--base-feat-type option)" + exit 1; +fi + +if [ -f $data/segments ]; then + subset_utts="ark:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |" +else + echo "$0 [info]: no segments file exists: using wav.scp directly." + subset_utts="ark:wav-copy scp:$sdata/JOB/wav.scp ark:- |" +fi + +if [ $stage -le -3 ]; then + echo "$0: initializing base LVTLN transforms in $dir/0.lvtln (ignore warnings below)" + dim=$(feat-to-dim "$featsub_unwarped" - ) || exit 1; + + $cmd $dir/log/init_lvtln.log \ + gmm-init-lvtln --dim=$dim --num-classes=$num_classes --default-class=$default_class \ + $dir/0.lvtln || exit 1; + + for c in $(seq 0 $[$num_classes-1]); do + this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));") + orig_feats=ark:$dir/feats.$default_class.ark + warped_feats=ark:$dir/feats.$c.ark + logfile=$dir/log/train_special.$c.log + this_featsub_warped="$(echo $featsub_warped | sed s/CLASS/$c/)" + if ! gmm-train-lvtln-special --warp=$this_warp --normalize-var=true \ + $c $dir/0.lvtln $dir/0.lvtln \ + "$featsub_unwarped" "$this_featsub_warped" 2>$logfile; then + echo "$0: Error training LVTLN transform, see $logfile"; + exit 1; + fi + done + rm $dir/final.lvtln 2>/dev/null + ln -s 0.lvtln $dir/final.lvtln +fi + +cp $ubmdir/final.dubm $dir/0.dubm + +if [ $stage -le -2 ]; then + echo "$0: computing Gaussian selection info." + + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + gmm-gselect --n=$num_gselect $ubmdir/final.dubm "$sifeats" \ + "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + + +if [ $stage -le -1 ]; then + echo "$0: computing initial LVTLN transforms" # do this per-utt. 
+ + $cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \ + gmm-global-gselect-to-post $dir/0.dubm "$sifeats" \ + "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \ + gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \ + $dir/0.dubm $dir/0.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans.0.JOB ark,t:$dir/warp.0.JOB || exit 1 + + # consolidate the warps into one file. + for j in $(seq $nj); do cat $dir/warp.0.$j; done > $dir/warp.0 + rm $dir/warp.0.* +fi + + +x=0 +while [ $x -lt $num_iters ]; do + feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |" + + # First update the model. + if [ $stage -le $x ]; then + echo "$0: Updating model on pass $x" + # Accumulate stats. + $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ + gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \ + $dir/$x.dubm "$feats" $dir/$x.JOB.acc || exit 1; + + $cmd $dir/log/update.$x.log \ + gmm-global-est --remove-low-count-gaussians=false --min-gaussian-weight=$min_gaussian_weight \ + $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \ + $dir/$[$x+1].dubm || exit 1; + $cleanup && rm $dir/$x.*.acc $dir/$x.dubm + fi + + # Now update the LVTLN transforms (and warps.) + if [ $stage -le $x ]; then + echo "$0: re-estimating LVTLN transforms on pass $x" + $cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \ + gmm-global-gselect-to-post $dir/$[$x+1].dubm "$feats" \ + "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \ + gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \ + $dir/$[$x+1].dubm $dir/0.lvtln "$sifeats" ark,s,cs:- \ + ark:$dir/trans.$[$x+1].JOB ark,t:$dir/warp.$[$x+1].JOB || exit 1 + + # consolidate the warps into one file. 
+ for j in $(seq $nj); do cat $dir/warp.$[$x+1].$j; done > $dir/warp.$[$x+1] + rm $dir/warp.$[$x+1].* + $cleanup && rm $dir/trans.$x.* + fi + x=$[$x+1] +done + +feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |" + +if [ $stage -le $x ]; then + # Accumulate stats for "alignment model"-- this model is computed with the + # speaker-independent features, but matches Gaussian-for-Gaussian with the + # final speaker-adapted model. + $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \ + gmm-global-acc-stats-twofeats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \ + $dir/$x.dubm "$feats" "$sifeats" $dir/$x.JOB.acc || exit 1 + [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1; + # Update model. + $cmd $dir/log/est_alimdl.log \ + gmm-global-est --min-gaussian-weight=$min_gaussian_weight \ + --remove-low-count-gaussians=false $dir/$x.dubm \ + "gmm-global-sum-accs - $dir/$x.*.acc|" $dir/$x.ali_dubm || exit 1; + $cleanup && rm $dir/$x.*.acc +fi + +if true; then # Diagnostics + ln -sf warp.$x $dir/final.warp + if [ -f $data/spk2gender ]; then + # To make it easier to eyeball the male and female speakers' warps + # separately, separate them out. + for g in m f; do # means: for gender in male female + cat $dir/final.warp | \ + utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') > $dir/final.warp.$g + echo -n "The last few warp factors for gender $g are: " + tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}'; + echo + done + fi +fi + +ln -sf $x.dubm $dir/final.dubm +ln -sf $x.ali_dubm $dir/final.ali_dubm +ln -sf 0.lvtln $dir/final.lvtln + +# Summarize warning messages... 
+utils/summarize_warnings.pl $dir/log + +echo "$0: Done training LVTLN model in $dir" diff --git a/egs/lre/v1/run.sh b/egs/lre/v1/run.sh index af577dd0b..d21f13d9e 100755 --- a/egs/lre/v1/run.sh +++ b/egs/lre/v1/run.sh @@ -54,6 +54,26 @@ local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train # max_voiced=3000 # local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train + # Vtln-related things: + # We'll use a subset of utterances to train the GMM we'll use for VTLN + # warping. + utils/subset_data_dir.sh data/train 5000 data/train_5k_novtln + + # for the features we use to estimate VTLN warp factors, we use more cepstra + # (13 instead of just 7); this needs to be tuned. + steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 50 --cmd "$train_cmd" \ + data/train_5k_novtln exp/make_mfcc $mfccdir + lid/compute_vad_decision.sh data/train_5k_novtln exp/make_mfcc $mfccdir + # note, we're using the speaker-id version of the train_diag_ubm.sh script, which + # uses double-delta instead of SDC features. We train a 256-Gaussian UBM; this + # has to be tuned. 
+ sid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k_novtln 256 \ + exp/diag_ubm_vtln + lid/train_lvtln_model.sh --mfcc-config conf/mfcc_vtln.conf --nj 30 --cmd "$train_cmd" \ + data/train_5k_novtln exp/diag_ubm_vtln exp/vtln + +) + steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 100 --cmd "$train_cmd" \ data/train exp/make_mfcc $mfccdir steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \ diff --git a/egs/lre/v1/sid b/egs/lre/v1/sid new file mode 120000 index 000000000..893a12f30 --- /dev/null +++ b/egs/lre/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/wsj/s5/steps/compute_cmvn_stats.sh b/egs/wsj/s5/steps/compute_cmvn_stats.sh index ebcc072c8..dddec0b9a 100755 --- a/egs/wsj/s5/steps/compute_cmvn_stats.sh +++ b/egs/wsj/s5/steps/compute_cmvn_stats.sh @@ -23,11 +23,11 @@ echo "$0 $@" # Print the command line for logging fake=false two_channel=false -if [ $1 == "--fake" ]; then +if [ "$1" == "--fake" ]; then fake=true shift fi -if [ $1 == "--two-channel" ]; then +if [ "$1" == "--two-channel" ]; then two_channel=true shift fi diff --git a/egs/wsj/s5/steps/train_lvtln.sh b/egs/wsj/s5/steps/train_lvtln.sh index e5ade3ac4..a26fb1575 100755 --- a/egs/wsj/s5/steps/train_lvtln.sh +++ b/egs/wsj/s5/steps/train_lvtln.sh @@ -5,7 +5,7 @@ # This training script trains linear-VTLN models starting from an existing # system based on either LDA+MLLT or delta+delta-delta features. # Works with either mfcc or plp features, but you need to set the -# --base-feature-type option. +# --base-feat-type option. # The resulting system can be used with align_lvtln.sh and/or decode_lvtln.sh # to get VTLN warping factors for data, for warped data extraction, or (for # the training data) you can use the warping factors this script outputs @@ -65,7 +65,7 @@ alidir=$5 dir=$6 for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt $data/wav.scp; do - [ ! 
-f $f ] && echo "train_deltas.sh: no such file $f" && exit 1; + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done numgauss=$numleaves @@ -111,7 +111,7 @@ else fi if [ -f $data/utt2warp ]; then - echo "$0: source directory appears to already have VTLN."; + echo "$0: source data directory $data appears to already have VTLN."; exit 1; fi diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 4f56ed3e2..9a80162ce 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -6,15 +6,15 @@ no_wav=false no_text=false for x in `seq 3`; do - if [ $1 == "--no-feats" ]; then + if [ "$1" == "--no-feats" ]; then no_feats=true shift; fi - if [ $1 == "--no-text" ]; then + if [ "$1" == "--no-text" ]; then no_text=true shift; fi - if [ $1 == "--no-wav" ]; then + if [ "$1" == "--no-wav" ]; then no_wav=true shift; fi diff --git a/src/gmmbin/Makefile b/src/gmmbin/Makefile index 3eba1f830..cf637d189 100644 --- a/src/gmmbin/Makefile +++ b/src/gmmbin/Makefile @@ -27,7 +27,8 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \ gmm-est-basis-fmllr-gpost gmm-latgen-tracking gmm-latgen-faster-parallel \ gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost gmm-global-init-from-feats \ gmm-global-info gmm-latgen-faster-regtree-fmllr gmm-est-fmllr-global \ - gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post + gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post \ + gmm-global-est-lvtln-trans OBJFILES = diff --git a/src/gmmbin/gmm-est-lvtln-trans.cc b/src/gmmbin/gmm-est-lvtln-trans.cc index 913aea6bb..45aa90596 100644 --- a/src/gmmbin/gmm-est-lvtln-trans.cc +++ b/src/gmmbin/gmm-est-lvtln-trans.cc @@ -1,6 +1,7 @@ // gmmbin/gmm-est-lvtln-trans.cc // Copyright 2009-2011 Microsoft Corporation; Saarland University +// 2014 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -125,8 +126,8 @@ int 
main(int argc, char *argv[]) { const Matrix &feats = feature_reader.Value(utt); const GaussPost &gpost = gpost_reader.Value(utt); if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "GauPost vector has wrong size " << (gpost.size()) - << " vs. " << (feats.NumRows()); + KALDI_WARN << "GauPost vector has wrong size " << gpost.size() + << " vs. " << feats.NumRows(); num_other_error++; continue; } @@ -172,8 +173,8 @@ int main(int argc, char *argv[]) { const GaussPost &gpost = gpost_reader.Value(utt); if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "GauPost has wrong size " << (gpost.size()) - << " vs. " << (feats.NumRows()); + KALDI_WARN << "GauPost has wrong size " << gpost.size() + << " vs. " << feats.NumRows(); num_other_error++; continue; } diff --git a/src/gmmbin/gmm-global-est-lvtln-trans.cc b/src/gmmbin/gmm-global-est-lvtln-trans.cc new file mode 100644 index 000000000..18bb51095 --- /dev/null +++ b/src/gmmbin/gmm-global-est-lvtln-trans.cc @@ -0,0 +1,235 @@ +// gmmbin/gmm-global-est-lvtln-trans.cc + +// Copyright 2009-2011 Microsoft Corporation; Saarland University +// 2014 Daniel Povey + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include +using std::string; +#include +using std::vector; + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "gmm/am-diag-gmm.h" +#include "hmm/transition-model.h" +#include "transform/lvtln.h" +#include "hmm/posterior.h" + +namespace kaldi { +void AccumulateForUtterance(const Matrix &feats, + const Posterior &post, + const DiagGmm &gmm, + FmllrDiagGmmAccs *spk_stats) { + KALDI_ASSERT(static_cast(post.size()) == feats.NumRows()); + for (size_t i = 0; i < post.size(); i++) { + std::vector gselect(post[i].size()); + Vector this_post(post[i].size()); + for (size_t j = 0; j < post[i].size(); j++) { + int32 g = post[i][j].first; + BaseFloat weight = post[i][j].second; + gselect[j] = g; + this_post(j) = weight; + } + spk_stats->AccumulateFromPosteriorsPreselect(gmm, gselect, + feats.Row(i), + this_post); + } +} + + +} + +int main(int argc, char *argv[]) { + try { + typedef kaldi::int32 int32; + using namespace kaldi; + const char *usage = + "Estimate linear-VTLN transforms, either per utterance or for " + "the supplied set of speakers (spk2utt option); this version\n" + "is for a global diagonal GMM (also known as a UBM). 
Reads posteriors\n" + "indicating Gaussian indexes in the UBM.\n" + "\n" + "Usage: gmm-global-est-lvtln-trans [options] " + " []\n" + "e.g.: gmm-global-est-lvtln-trans 0.ubm 0.lvtln '$feats' ark,s,cs:- ark:1.trans ark:1.warp\n" + "(where the will likely come from gmm-global-get-post or\n" + "gmm-global-gselect-to-post\n"; + + ParseOptions po(usage); + string spk2utt_rspecifier; + BaseFloat logdet_scale = 1.0; + std::string norm_type = "offset"; + po.Register("norm-type", &norm_type, "type of fMLLR applied (\"offset\"|\"none\"|\"diag\")"); + po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to " + "utterance-list map"); + po.Register("logdet-scale", &logdet_scale, "Scale on log-determinant term in auxiliary function"); + + po.Read(argc, argv); + + if (po.NumArgs() < 5 || po.NumArgs() > 6) { + po.PrintUsage(); + exit(1); + } + + string + model_rxfilename = po.GetArg(1), + lvtln_rxfilename = po.GetArg(2), + feature_rspecifier = po.GetArg(3), + post_rspecifier = po.GetArg(4), + trans_wspecifier = po.GetArg(5), + warp_wspecifier = po.GetOptArg(6); + + DiagGmm gmm; + ReadKaldiObject(model_rxfilename, &gmm); + LinearVtln lvtln; + ReadKaldiObject(lvtln_rxfilename, &lvtln); + + + RandomAccessPosteriorReader post_reader(post_rspecifier); + + double tot_lvtln_impr = 0.0, tot_t = 0.0; + + BaseFloatMatrixWriter transform_writer(trans_wspecifier); + + BaseFloatWriter warp_writer(warp_wspecifier); + + std::vector class_counts(lvtln.NumClasses(), 0); + int32 num_done = 0, num_no_post = 0, num_other_error = 0; + if (spk2utt_rspecifier != "") { // per-speaker adaptation + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); + RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); + + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { + FmllrDiagGmmAccs spk_stats(lvtln.Dim()); + string spk = spk2utt_reader.Key(); + const vector &uttlist = spk2utt_reader.Value(); + for (size_t i = 0; i < uttlist.size(); i++) { + std::string utt = 
uttlist[i]; + if (!feature_reader.HasKey(utt)) { + KALDI_WARN << "Did not find features for utterance " << utt; + continue; + } + if (!post_reader.HasKey(utt)) { + KALDI_WARN << "Did not find posteriors for utterance " << utt; + num_no_post++; + continue; + } + const Matrix &feats = feature_reader.Value(utt); + const Posterior &post = post_reader.Value(utt); + if (static_cast(post.size()) != feats.NumRows()) { + KALDI_WARN << "Posterior vector has wrong size " << post.size() + << " vs. " << feats.NumRows(); + num_other_error++; + continue; + } + + AccumulateForUtterance(feats, post, gmm, &spk_stats); + + num_done++; + } // end looping over all utterances of the current speaker + + BaseFloat impr, spk_tot_t; + { // Compute the transform and write it out. + Matrix transform(lvtln.Dim(), lvtln.Dim()+1); + int32 class_idx; + lvtln.ComputeTransform(spk_stats, + norm_type, + logdet_scale, + &transform, + &class_idx, + NULL, + &impr, + &spk_tot_t); + class_counts[class_idx]++; + transform_writer.Write(spk, transform); + if (warp_wspecifier != "") + warp_writer.Write(spk, lvtln.GetWarp(class_idx)); + } + KALDI_LOG << "For speaker " << spk << ", auxf-impr from LVTLN is " + << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames."; + tot_lvtln_impr += impr; + tot_t += spk_tot_t; + } // end looping over speakers + } else { // per-utterance adaptation + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + if (!post_reader.HasKey(utt)) { + KALDI_WARN << "Did not find posterior for utterance " + << utt; + num_no_post++; + continue; + } + const Matrix &feats = feature_reader.Value(); + const Posterior &post = post_reader.Value(utt); + + if (static_cast(post.size()) != feats.NumRows()) { + KALDI_WARN << "Posterior has wrong size " << post.size() + << " vs. 
" << feats.NumRows(); + num_other_error++; + continue; + } + num_done++; + + FmllrDiagGmmAccs spk_stats(lvtln.Dim()); + + AccumulateForUtterance(feats, post, gmm, + &spk_stats); + BaseFloat impr, utt_tot_t = spk_stats.beta_; + { // Compute the transform and write it out. + Matrix transform(lvtln.Dim(), lvtln.Dim()+1); + int32 class_idx; + lvtln.ComputeTransform(spk_stats, + norm_type, + logdet_scale, + &transform, + &class_idx, + NULL, + &impr, + &utt_tot_t); + class_counts[class_idx]++; + transform_writer.Write(utt, transform); + if (warp_wspecifier != "") + warp_writer.Write(utt, lvtln.GetWarp(class_idx)); + } + + KALDI_LOG << "For utterance " << utt << ", auxf-impr from LVTLN is " + << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames."; + tot_lvtln_impr += impr; + tot_t += utt_tot_t; + } + } + + { + std::ostringstream s; + for (size_t i = 0; i < class_counts.size(); i++) + s << ' ' << class_counts[i]; + KALDI_LOG << "Distribution of classes is: " << s.str(); + } + + KALDI_LOG << "Done " << num_done << " files, " << num_no_post + << " with no posteriors, " << num_other_error << " with other errors."; + KALDI_LOG << "Overall LVTLN auxf impr per frame is " + << (tot_lvtln_impr / tot_t) << " over " << tot_t << " frames."; + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/transform/fmllr-diag-gmm.h b/src/transform/fmllr-diag-gmm.h index 90211de9b..1c203bdc8 100644 --- a/src/transform/fmllr-diag-gmm.h +++ b/src/transform/fmllr-diag-gmm.h @@ -100,7 +100,7 @@ class FmllrDiagGmmAccs: public AffineXformStats { const VectorBase &posteriors); /// Accumulate stats for a GMM, given supplied posteriors. The "posteriors" - /// vector should be have the same size as "gselect".n + /// vector should have the same size as "gselect". void AccumulateFromPosteriorsPreselect( const DiagGmm &gmm, const std::vector &gselect,