sandbox/language_id: getting VTLN model estimation working given a UBM.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@4173 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-07-19 19:01:45 +00:00
Родитель 25344d93b0
Коммит 4fd9c20c6a
11 изменённых файлов: 559 добавлений и 14 удалений

Просмотреть файл

@ -0,0 +1,5 @@
--sample-frequency=8000
--frame-length=20 # the default is 25.
--low-freq=20 # the default.
--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case).
--num-ceps=13

Просмотреть файл

@ -0,0 +1,282 @@
#!/bin/bash
# Copyright 2014  Daniel Povey
# Apache 2.0
#
# This training script computes some things you will need in order to
# extract VTLN-warped features.  It takes as input the data directory
# and an already-trained diagonal-covariance UBM.  Note: this script is
# in the lid/ directory because it is intended to be used in language
# identification, but it uses features of the same type as those used
# in the speaker-id scripts (see ../sid/), i.e. double-delta features,
# rather than the "shifted delta cepstra" features commonly used in
# language id.
#
# This script works with either mfcc or plp features; for plp features, you will
# need to set the --base-feat-type option.  Regardless, you will need to set the
# --mfcc-config or --plp-config option if your feature-extraction config is not
# called conf/${base_feat_type}.conf.  The output of this script will be in
# $dir/final.lvtln and $dir/final.dubm and $dir/final.ali_dubm; the directory
# can be passed to ./get_vtln_warps.sh to get VTLN warps for a data directory,
# or (for data passed to this script) you can use the warping factors this
# script outputs in $dir/final.warp

# Begin configuration.
stage=-4 # This allows restarting partway through, when something went wrong.
config=
cmd=run.pl
num_iters=15    # Number of iterations of training.
num_utt_lvtln_init=400;  # number of utterances (subset) to initialize
                         # LVTLN transform.  Not too critical.
min_warp=0.85
max_warp=1.25
warp_step=0.01
base_feat_type=mfcc  # or could be plp.
mfcc_config=conf/mfcc.conf  # default, can be overridden.
plp_config=conf/plp.conf    # default, can be overridden.
logdet_scale=0.0
subsample=5  # We use every 5th frame by default; this is more
             # CPU-efficient.
min_gaussian_weight=0.0001  # does not matter; inherited from diag-ubm training script.
nj=4
cleanup=true
num_gselect=15
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# Number of discrete warp classes, and the index of the class whose warp
# factor is 1.0 (the unwarped, "default" class).
num_classes=$(perl -e "print int(1.5 + ($max_warp - $min_warp) / $warp_step);") || exit 1;
default_class=$(perl -e "print int(0.5 + (1.0 - $min_warp) / $warp_step);") || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <data-dir> <diag-ubm-dir> <exp-dir>"
  echo "e.g.: $0 data/train_vtln exp/diag_ubm_vtln exp/vtln"
  echo "main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <num-jobs>                                  # number of jobs to use (default 4)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --num-iters <num-iters>                          # number of iterations of training"
  echo "  --base-feat-type <feat-type>                     # mfcc or plp, mfcc is default"
  echo "  --mfcc-config <config>                           # config for MFCC extraction, default is"
  echo "                                                   # conf/mfcc.conf"
  echo "  --plp-config <config>                            # config for PLP extraction, default is"
  echo "                                                   # conf/plp.conf"
  exit 1;
fi

data=$1
ubmdir=$2
dir=$3

for f in $data/feats.scp $ubmdir/final.dubm; do
  # Fixed: this message previously said "train_deltas.sh", which is the wrong
  # script name (same fix as applied to steps/train_lvtln.sh).
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;

cmvn_sliding_opts="--norm-vars=false --center=true --cmn-window=300"
# don't change $cmvn_sliding_opts, it should probably match the
# options used in ../sid/train_diag_ubm.sh.
sifeats="ark,s,cs:add-deltas scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding $cmvn_sliding_opts ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"

# for the subsets of features that we use to estimate the linear transforms, we
# don't bother with CMN.  This will give us wrong offsets on the transforms,
# but it won't matter because we will allow an arbitrary bias term when we apply
# these transforms.

# you need to define CLASS when invoking $cmd on featsub_warped.
featsub_warped="ark:add-deltas ark:$dir/feats.CLASS.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
featsub_unwarped="ark:add-deltas ark:$dir/feats.$default_class.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"

if [ -f $data/utt2warp ]; then
  echo "$0: source data directory $data appears to already have VTLN.";
  exit 1;
fi

# create a small subset of utterances for purposes of initializing the LVTLN transform
# utils/shuffle_list.pl is deterministic, unlike sort -R.
cat $data/utt2spk | awk '{print $1}' | utils/shuffle_list.pl | \
  head -n $num_utt_lvtln_init > $dir/utt_subset

if [ $stage -le -4 ]; then
  echo "$0: computing warped subset of features"
  if [ -f $data/segments ]; then
    echo "$0 [info]: segments file exists: using that."
    subset_feats="utils/filter_scp.pl $dir/utt_subset $data/segments | extract-segments scp:$data/wav.scp - ark:- "
  else
    echo "$0 [info]: no segments file exists: using wav.scp directly."
    subset_feats="utils/filter_scp.pl $dir/utt_subset $data/wav.scp | wav-copy scp:- ark:- "
  fi
  rm $dir/.error 2>/dev/null
  # Compute features for the subset at every warp factor, in parallel; each
  # background job touches $dir/.error on failure.
  for c in $(seq 0 $[$num_classes-1]); do
    this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
    config_name=${base_feat_type}_config    # e.g. mfcc_config or plp_config
    this_config=$(eval echo \$$config_name) # e.g. conf/mfcc.conf or conf/plp.conf by default.
    $cmd $dir/log/compute_warped_feats.$c.log \
      $subset_feats \| compute-${base_feat_type}-feats --verbose=2 \
      --config=$this_config --vtln-warp=$this_warp ark:- ark:- \| \
      copy-feats --compress=true ark:- ark:$dir/feats.$c.ark || touch $dir/.error &
  done
  wait;
  if [ -f $dir/.error ]; then
    echo "$0: Computing warped features failed: check $dir/log/compute_warped_feats.*.log"
    exit 1;
  fi
fi

# Sanity check: features on disk should match those recomputed with warp 1.0;
# if not, the user probably gave the wrong --base-feat-type or config.
if ! utils/filter_scp.pl $dir/utt_subset $data/feats.scp | \
  compare-feats --threshold=0.98 scp:- ark:$dir/feats.$default_class.ark >&/dev/null; then
  echo "$0: features stored on disk differ from those computed with no warping."
  echo "  Possibly your feature type is wrong (--base-feat-type option)"
  exit 1;
fi

if [ -f $data/segments ]; then
  subset_utts="ark:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else
  echo "$0 [info]: no segments file exists: using wav.scp directly."
  subset_utts="ark:wav-copy scp:$sdata/JOB/wav.scp ark:- |"
fi
# NOTE(review): $subset_utts does not appear to be used below -- possibly left
# over from the script this was adapted from; confirm before removing.

if [ $stage -le -3 ]; then
  echo "$0: initializing base LVTLN transforms in $dir/0.lvtln (ignore warnings below)"
  dim=$(feat-to-dim "$featsub_unwarped" - ) || exit 1;
  $cmd $dir/log/init_lvtln.log \
    gmm-init-lvtln --dim=$dim --num-classes=$num_classes --default-class=$default_class \
    $dir/0.lvtln || exit 1;
  # Train the per-class linear transforms, one warp factor at a time.
  for c in $(seq 0 $[$num_classes-1]); do
    this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
    orig_feats=ark:$dir/feats.$default_class.ark
    warped_feats=ark:$dir/feats.$c.ark
    logfile=$dir/log/train_special.$c.log
    this_featsub_warped="$(echo $featsub_warped | sed s/CLASS/$c/)"
    if ! gmm-train-lvtln-special --warp=$this_warp --normalize-var=true \
      $c $dir/0.lvtln $dir/0.lvtln \
      "$featsub_unwarped" "$this_featsub_warped" 2>$logfile; then
      echo "$0: Error training LVTLN transform, see $logfile";
      exit 1;
    fi
  done
  rm $dir/final.lvtln 2>/dev/null
  ln -s 0.lvtln $dir/final.lvtln
fi

cp $ubmdir/final.dubm $dir/0.dubm

if [ $stage -le -2 ]; then
  echo "$0: computing Gaussian selection info."
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$num_gselect $ubmdir/final.dubm "$sifeats" \
    "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

if [ $stage -le -1 ]; then
  echo "$0: computing initial LVTLN transforms"  # do this per-utt.
  $cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \
    gmm-global-gselect-to-post $dir/0.dubm "$sifeats" \
    "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
    gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
    $dir/0.dubm $dir/0.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans.0.JOB ark,t:$dir/warp.0.JOB || exit 1
  # consolidate the warps into one file.
  for j in $(seq $nj); do cat $dir/warp.0.$j; done > $dir/warp.0
  rm $dir/warp.0.*
fi

# Main EM loop: alternately re-estimate the UBM on adapted features and
# re-estimate the per-utterance LVTLN transforms/warps.
x=0
while [ $x -lt $num_iters ]; do
  feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |"
  # First update the model.
  if [ $stage -le $x ]; then
    echo "$0: Updating model on pass $x"
    # Accumulate stats.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
      $dir/$x.dubm "$feats" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-global-est --remove-low-count-gaussians=false --min-gaussian-weight=$min_gaussian_weight \
      $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \
      $dir/$[$x+1].dubm || exit 1;
    $cleanup && rm $dir/$x.*.acc $dir/$x.dubm
  fi
  # Now update the LVTLN transforms (and warps.)
  if [ $stage -le $x ]; then
    echo "$0: re-estimating LVTLN transforms on pass $x"
    $cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \
      gmm-global-gselect-to-post $dir/$[$x+1].dubm "$feats" \
      "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
      gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
      $dir/$[$x+1].dubm $dir/0.lvtln "$sifeats" ark,s,cs:- \
      ark:$dir/trans.$[$x+1].JOB ark,t:$dir/warp.$[$x+1].JOB || exit 1
    # consolidate the warps into one file.
    for j in $(seq $nj); do cat $dir/warp.$[$x+1].$j; done > $dir/warp.$[$x+1]
    rm $dir/warp.$[$x+1].*
    $cleanup && rm $dir/trans.$x.*
  fi
  x=$[$x+1]
done

feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |"

if [ $stage -le $x ]; then
  # Accumulate stats for "alignment model"-- this model is computed with the
  # speaker-independent features, but matches Gaussian-for-Gaussian with the
  # final speaker-adapted model.
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    gmm-global-acc-stats-twofeats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
    $dir/$x.dubm "$feats" "$sifeats" $dir/$x.JOB.acc || exit 1
  [ $(ls $dir/$x.*.acc | wc -w) -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
  # Update model.
  $cmd $dir/log/est_alimdl.log \
    gmm-global-est --min-gaussian-weight=$min_gaussian_weight \
    --remove-low-count-gaussians=false $dir/$x.dubm \
    "gmm-global-sum-accs - $dir/$x.*.acc|" $dir/$x.ali_dubm || exit 1;
  $cleanup && rm $dir/$x.*.acc
fi

if true; then  # Diagnostics
  ln -sf warp.$x $dir/final.warp
  if [ -f $data/spk2gender ]; then
    # To make it easier to eyeball the male and female speakers' warps
    # separately, separate them out.
    for g in m f; do  # means: for gender in male female
      cat $dir/final.warp | \
        utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') > $dir/final.warp.$g
      echo -n "The last few warp factors for gender $g are: "
      tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}';
      echo
    done
  fi
fi

ln -sf $x.dubm $dir/final.dubm
ln -sf $x.ali_dubm $dir/final.ali_dubm
ln -sf 0.lvtln $dir/final.lvtln

# Summarize warning messages...
utils/summarize_warnings.pl $dir/log

echo "$0: Done training LVTLN model in $dir"

Просмотреть файл

@ -54,6 +54,26 @@ local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
# max_voiced=3000
# local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train
# Vtln-related things:
# We'll use a subset of utterances to train the GMM we'll use for VTLN
# warping.
utils/subset_data_dir.sh data/train 5000 data/train_5k_novtln
# for the features we use to estimate VTLN warp factors, we use more cepstra
# (13 instead of just 7); this needs to be tuned.
steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 50 --cmd "$train_cmd" \
data/train_5k_novtln exp/make_mfcc $mfccdir
lid/compute_vad_decision.sh data/train_5k_novtln exp/make_mfcc $mfccdir
# note, we're using the speaker-id version of the train_diag_ubm.sh script, which
# uses double-delta instead of SDC features. We train a 256-Gaussian UBM; this
# has to be tuned.
sid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k_novtln 256 \
exp/diag_ubm_vtln
lid/train_lvtln_model.sh --mfcc-config conf/mfcc_vtln.conf --nj 30 --cmd "$train_cmd" \
data/train_5k_novtln exp/diag_ubm_vtln exp/vtln
)
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 100 --cmd "$train_cmd" \
data/train exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \

1
egs/lre/v1/sid Symbolic link
Просмотреть файл

@ -0,0 +1 @@
../../sre08/v1/sid

Просмотреть файл

@ -23,11 +23,11 @@ echo "$0 $@" # Print the command line for logging
fake=false
two_channel=false
if [ $1 == "--fake" ]; then
if [ "$1" == "--fake" ]; then
fake=true
shift
fi
if [ $1 == "--two-channel" ]; then
if [ "$1" == "--two-channel" ]; then
two_channel=true
shift
fi

Просмотреть файл

@ -5,7 +5,7 @@
# This training script trains linear-VTLN models starting from an existing
# system based on either LDA+MLLT or delta+delta-delta features.
# Works with either mfcc or plp features, but you need to set the
# --base-feature-type option.
# --base-feat-type option.
# The resulting system can be used with align_lvtln.sh and/or decode_lvtln.sh
# to get VTLN warping factors for data, for warped data extraction, or (for
# the training data) you can use the warping factors this script outputs
@ -65,7 +65,7 @@ alidir=$5
dir=$6
for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt $data/wav.scp; do
[ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
numgauss=$numleaves
@ -111,7 +111,7 @@ else
fi
if [ -f $data/utt2warp ]; then
echo "$0: source directory appears to already have VTLN.";
echo "$0: source data directory $data appears to already have VTLN.";
exit 1;
fi

Просмотреть файл

@ -6,15 +6,15 @@ no_wav=false
no_text=false
for x in `seq 3`; do
if [ $1 == "--no-feats" ]; then
if [ "$1" == "--no-feats" ]; then
no_feats=true
shift;
fi
if [ $1 == "--no-text" ]; then
if [ "$1" == "--no-text" ]; then
no_text=true
shift;
fi
if [ $1 == "--no-wav" ]; then
if [ "$1" == "--no-wav" ]; then
no_wav=true
shift;
fi

Просмотреть файл

@ -27,7 +27,8 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \
gmm-est-basis-fmllr-gpost gmm-latgen-tracking gmm-latgen-faster-parallel \
gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost gmm-global-init-from-feats \
gmm-global-info gmm-latgen-faster-regtree-fmllr gmm-est-fmllr-global \
gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post
gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post \
gmm-global-est-lvtln-trans
OBJFILES =

Просмотреть файл

@ -1,6 +1,7 @@
// gmmbin/gmm-est-lvtln-trans.cc
// Copyright 2009-2011 Microsoft Corporation; Saarland University
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -125,8 +126,8 @@ int main(int argc, char *argv[]) {
const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
const GaussPost &gpost = gpost_reader.Value(utt);
if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
KALDI_WARN << "GauPost vector has wrong size " << (gpost.size())
<< " vs. " << (feats.NumRows());
KALDI_WARN << "GauPost vector has wrong size " << gpost.size()
<< " vs. " << feats.NumRows();
num_other_error++;
continue;
}
@ -172,8 +173,8 @@ int main(int argc, char *argv[]) {
const GaussPost &gpost = gpost_reader.Value(utt);
if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
KALDI_WARN << "GauPost has wrong size " << (gpost.size())
<< " vs. " << (feats.NumRows());
KALDI_WARN << "GauPost has wrong size " << gpost.size()
<< " vs. " << feats.NumRows();
num_other_error++;
continue;
}

Просмотреть файл

@ -0,0 +1,235 @@
// gmmbin/gmm-global-est-lvtln-trans.cc
// Copyright 2009-2011 Microsoft Corporation; Saarland University
// 2014 Daniel Povey
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <string>
using std::string;
#include <vector>
using std::vector;
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "transform/lvtln.h"
#include "hmm/posterior.h"
namespace kaldi {
void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
const Posterior &post,
const DiagGmm &gmm,
FmllrDiagGmmAccs *spk_stats) {
KALDI_ASSERT(static_cast<int32>(post.size()) == feats.NumRows());
for (size_t i = 0; i < post.size(); i++) {
std::vector<int32> gselect(post[i].size());
Vector<BaseFloat> this_post(post[i].size());
for (size_t j = 0; j < post[i].size(); j++) {
int32 g = post[i][j].first;
BaseFloat weight = post[i][j].second;
gselect[j] = g;
this_post(j) = weight;
}
spk_stats->AccumulateFromPosteriorsPreselect(gmm, gselect,
feats.Row(i),
this_post);
}
}
}
// Command-line entry point: estimates linear-VTLN (LVTLN) transforms given a
// global diagonal GMM (UBM) and per-frame Gaussian posteriors, either
// per-utterance (the default) or per-speaker (via --spk2utt).  Writes the
// transforms, and optionally the chosen warp factors, to the given wspecifiers.
int main(int argc, char *argv[]) {
  try {
    typedef kaldi::int32 int32;
    using namespace kaldi;
    const char *usage =
        // NOTE(review): the usage text is missing a closing ")" after
        // "gmm-global-gselect-to-post" -- help-string typo only.
        "Estimate linear-VTLN transforms, either per utterance or for "
        "the supplied set of speakers (spk2utt option); this version\n"
        "is for a global diagonal GMM (also known as a UBM). Reads posteriors\n"
        "indicating Gaussian indexes in the UBM.\n"
        "\n"
        "Usage: gmm-global-est-lvtln-trans [options] <gmm-in> <lvtln-in> "
        "<feature-rspecifier> <gpost-rspecifier> <lvtln-trans-wspecifier> [<warp-wspecifier>]\n"
        "e.g.: gmm-global-est-lvtln-trans 0.ubm 0.lvtln '$feats' ark,s,cs:- ark:1.trans ark:1.warp\n"
        "(where the <gpost-rspecifier> will likely come from gmm-global-get-post or\n"
        "gmm-global-gselect-to-post\n";

    ParseOptions po(usage);
    string spk2utt_rspecifier;
    BaseFloat logdet_scale = 1.0;
    std::string norm_type = "offset";  // fMLLR variant used inside LVTLN estimation.
    po.Register("norm-type", &norm_type, "type of fMLLR applied (\"offset\"|\"none\"|\"diag\")");
    po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
                "utterance-list map");
    po.Register("logdet-scale", &logdet_scale, "Scale on log-determinant term in auxiliary function");

    po.Read(argc, argv);

    // 5 required args; the warp wspecifier (arg 6) is optional.
    if (po.NumArgs() < 5 || po.NumArgs() > 6) {
      po.PrintUsage();
      exit(1);
    }

    string
        model_rxfilename = po.GetArg(1),
        lvtln_rxfilename = po.GetArg(2),
        feature_rspecifier = po.GetArg(3),
        post_rspecifier = po.GetArg(4),
        trans_wspecifier = po.GetArg(5),
        warp_wspecifier = po.GetOptArg(6);  // empty string if not supplied.

    DiagGmm gmm;
    ReadKaldiObject(model_rxfilename, &gmm);
    LinearVtln lvtln;
    ReadKaldiObject(lvtln_rxfilename, &lvtln);

    RandomAccessPosteriorReader post_reader(post_rspecifier);

    // Running totals for the end-of-run diagnostics.
    double tot_lvtln_impr = 0.0, tot_t = 0.0;

    BaseFloatMatrixWriter transform_writer(trans_wspecifier);

    BaseFloatWriter warp_writer(warp_wspecifier);

    // How many speakers/utterances were assigned to each discrete warp class.
    std::vector<int32> class_counts(lvtln.NumClasses(), 0);

    int32 num_done = 0, num_no_post = 0, num_other_error = 0;
    if (spk2utt_rspecifier != "") {  // per-speaker adaptation
      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);

      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
        // One stats accumulator per speaker, pooled over its utterances.
        FmllrDiagGmmAccs spk_stats(lvtln.Dim());
        string spk = spk2utt_reader.Key();
        const vector<string> &uttlist = spk2utt_reader.Value();
        for (size_t i = 0; i < uttlist.size(); i++) {
          std::string utt = uttlist[i];
          if (!feature_reader.HasKey(utt)) {
            KALDI_WARN << "Did not find features for utterance " << utt;
            continue;
          }
          if (!post_reader.HasKey(utt)) {
            KALDI_WARN << "Did not find posteriors for utterance " << utt;
            num_no_post++;
            continue;
          }
          const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
          const Posterior &post = post_reader.Value(utt);
          // Posteriors must align one-to-one with feature frames.
          if (static_cast<int32>(post.size()) != feats.NumRows()) {
            KALDI_WARN << "Posterior vector has wrong size " << post.size()
                       << " vs. " << feats.NumRows();
            num_other_error++;
            continue;
          }

          AccumulateForUtterance(feats, post, gmm, &spk_stats);

          num_done++;
        }  // end looping over all utterances of the current speaker

        BaseFloat impr, spk_tot_t;
        {  // Compute the transform and write it out.
          Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
          int32 class_idx;  // index of the warp class chosen for this speaker.
          lvtln.ComputeTransform(spk_stats,
                                 norm_type,
                                 logdet_scale,
                                 &transform,
                                 &class_idx,
                                 NULL,
                                 &impr,
                                 &spk_tot_t);
          class_counts[class_idx]++;
          transform_writer.Write(spk, transform);
          if (warp_wspecifier != "")
            warp_writer.Write(spk, lvtln.GetWarp(class_idx));
        }
        KALDI_LOG << "For speaker " << spk << ",  auxf-impr from LVTLN is "
                  << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.";
        tot_lvtln_impr += impr;
        tot_t += spk_tot_t;
      }  // end looping over speakers
    } else {  // per-utterance adaptation
      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
      for (; !feature_reader.Done(); feature_reader.Next()) {
        string utt = feature_reader.Key();
        if (!post_reader.HasKey(utt)) {
          KALDI_WARN << "Did not find posterior for utterance "
                     << utt;
          num_no_post++;
          continue;
        }
        const Matrix<BaseFloat> &feats = feature_reader.Value();
        const Posterior &post = post_reader.Value(utt);

        if (static_cast<int32>(post.size()) != feats.NumRows()) {
          KALDI_WARN << "Posterior has wrong size " << post.size()
                     << " vs. " << feats.NumRows();
          num_other_error++;
          continue;
        }
        num_done++;

        // Fresh accumulator per utterance; beta_ is the total occupancy
        // (frame count weighted by posterior mass) after accumulation.
        FmllrDiagGmmAccs spk_stats(lvtln.Dim());
        AccumulateForUtterance(feats, post, gmm,
                               &spk_stats);

        BaseFloat impr, utt_tot_t = spk_stats.beta_;
        {  // Compute the transform and write it out.
          Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
          int32 class_idx;  // index of the warp class chosen for this utterance.
          lvtln.ComputeTransform(spk_stats,
                                 norm_type,
                                 logdet_scale,
                                 &transform,
                                 &class_idx,
                                 NULL,
                                 &impr,
                                 &utt_tot_t);
          class_counts[class_idx]++;
          transform_writer.Write(utt, transform);
          if (warp_wspecifier != "")
            warp_writer.Write(utt, lvtln.GetWarp(class_idx));
        }

        KALDI_LOG << "For utterance " << utt << ", auxf-impr from LVTLN is "
                  << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames.";
        tot_lvtln_impr += impr;
        tot_t += utt_tot_t;
      }
    }

    {  // Log the distribution over warp classes (useful sanity check: a
       // degenerate run puts everything in one class).
      std::ostringstream s;
      for (size_t i = 0; i < class_counts.size(); i++)
        s << ' ' << class_counts[i];
      KALDI_LOG << "Distribution of classes is: " << s.str();
    }

    KALDI_LOG << "Done " << num_done << " files, " << num_no_post
              << " with no posteriors, " << num_other_error << " with other errors.";
    KALDI_LOG << "Overall LVTLN auxf impr per frame is "
              << (tot_lvtln_impr / tot_t) << " over " << tot_t << " frames.";
    return 0;
  } catch(const std::exception &e) {
    std::cerr << e.what();
    return -1;
  }
}

Просмотреть файл

@ -100,7 +100,7 @@ class FmllrDiagGmmAccs: public AffineXformStats {
const VectorBase<BaseFloat> &posteriors);
/// Accumulate stats for a GMM, given supplied posteriors. The "posteriors"
/// vector should be have the same size as "gselect".n
/// vector should be have the same size as "gselect".
void AccumulateFromPosteriorsPreselect(
const DiagGmm &gmm,
const std::vector<int32> &gselect,