Mirror of https://github.com/mozilla/kaldi.git
trunk: online-nnet2 decoding setup: adding scripts which make it possible to estimate the iVectors per speaker, excluding silence (so not-truly-online decoding). Some code changes for iVector which allow for scaling up the prior term when the data count exceeds a certain value (this seems to be important, for some reason). And misc. code fixes.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4865 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Parent
7d8ff21d63
Commit
350d8b4123
|
@ -147,7 +147,7 @@ if [ $stage -le 13 ]; then
|
|||
done
|
||||
fi
|
||||
|
||||
exit 0;
|
||||
#exit 0;
|
||||
###### Comment out the "exit 0" above to run the multi-threaded decoding. #####
|
||||
|
||||
if [ $stage -le 14 ]; then
|
||||
|
@ -166,8 +166,8 @@ if [ $stage -le 15 ]; then
|
|||
test=dev_clean
|
||||
steps/online/nnet2/decode.sh --threaded true --do-endpointing true \
|
||||
--config conf/decode.config --cmd "$decode_cmd" --nj 30 \
|
||||
--per-utt true exp/tri6b/graph_pp_tgsmall data/$test \
|
||||
${dir}_online/decode_pp_${test}_tgsmall_utt_threaded_ep || exit 1;
|
||||
--per-utt true exp/tri6b/graph_tgsmall data/$test \
|
||||
${dir}_online/decode_${test}_tgsmall_utt_threaded_ep || exit 1;
|
||||
fi
|
||||
|
||||
exit 0;
|
||||
|
|
|
@ -134,6 +134,34 @@ if [ $stage -le 13 ]; then
|
|||
done
|
||||
fi
|
||||
|
||||
if [ $stage -le 14 ]; then
|
||||
# this does offline decoding, as stage 10, except we estimate the iVectors per
|
||||
# speaker, excluding silence (based on alignments from a GMM decoding), with a
|
||||
# different script. This is just to demonstrate that script.
|
||||
|
||||
rm exp/nnet2_online/.error 2>/dev/null
|
||||
for year in eval92 dev93; do
|
||||
steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj 8 \
|
||||
data/test_${year}_hires data/lang exp/nnet2_online/extractor \
|
||||
exp/tri4b/decode_tgpr_$year exp/nnet2_online/ivectors_spk_test_${year} || touch exp/nnet2_online/.error &
|
||||
done
|
||||
wait
|
||||
[ -f exp/nnet2_online/.error ] && echo "$0: Error getting iVectors" && exit 1;
|
||||
|
||||
for lm_suffix in bd_tgpr; do # just use the bd decoding, to avoid wasting time.
|
||||
graph_dir=exp/tri4b/graph_${lm_suffix}
|
||||
# use already-built graphs.
|
||||
for year in eval92 dev93; do
|
||||
steps/nnet2/decode.sh --nj 8 --cmd "$decode_cmd" \
|
||||
--online-ivector-dir exp/nnet2_online/ivectors_spk_test_$year \
|
||||
$graph_dir data/test_${year}_hires $dir/decode_${lm_suffix}_${year}_spk || touch exp/nnet2_online/.error &
|
||||
done
|
||||
done
|
||||
wait
|
||||
[ -f exp/nnet2_online/.error ] && echo "$0: Error decoding" && exit 1;
|
||||
fi
|
||||
|
||||
|
||||
|
||||
|
||||
exit 0;
|
||||
|
|
|
@ -0,0 +1,207 @@
|
|||
#!/bin/bash

# Copyright 2013  Daniel Povey
# Apache 2.0.


# This script computes iVectors in the same format as extract_ivectors_online.sh,
# except that they are actually not really computed online, they are first computed
# per speaker and just duplicated many times.
#
# This setup also makes it possible to use a previous decoding or alignment, to
# down-weight silence in the stats (default is --silence-weight 0.0).
#
# This is for when you use the "online-decoding" setup in an offline task, and
# you want the best possible results.


# Begin configuration section.
nj=30
cmd="run.pl"
stage=0
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
ivector_period=10
posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
                    # inter-frame correlations. Making this small during iVector
                    # extraction is equivalent to scaling up the prior, and will
                    # will tend to produce smaller iVectors where data-counts are
                    # small. It's not so important that this match the value
                    # used when training the iVector extractor, but more important
                    # that this match the value used when you do real online decoding
                    # with the neural nets trained with these iVectors.
max_count=100 # Interpret this as a number of frames times posterior scale...
              # this config ensures that once the count exceeds this (i.e.
              # 1000 frames, or 10 seconds, by default), we start to scale
              # down the stats, accentuating the prior term. This seems quite
              # important for some reason.
compress=true # If true, compress the iVectors stored on disk (it's lossy
              # compression, as used for feature matrices).
silence_weight=0.0
acwt=0.1 # used if input is a decode dir, to get best path from lattices.
mdl=final # change this if decode directory did not have ../final.mdl present.

# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


if [ $# != 4 ] && [ $# != 5 ]; then
  echo "Usage: $0 [options] <data> <lang> <extractor-dir> [<alignment-dir>|<decode-dir>] <ivector-dir>"
  echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
  echo "                                                   # Ignored if <alignment-dir> or <decode-dir> supplied."
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --num-gselect <n|5>                              # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --min-post <float;default=0.025>                 # Pruning threshold for posteriors"
  echo "  --ivector-period <int;default=10>                # How often to extract an iVector (frames)"
  echo "  --utts-per-spk-max <int;default=-1>              # Controls splitting into 'fake speakers'."
  echo "                                                   # Set to 1 if compatibility with utterance-by-utterance"
  echo "                                                   # decoding is the only factor, and to larger if you care "
  echo "                                                   # also about adaptation over several utterances."
  exit 1;
fi

# With 4 args the alignment/decode dir is omitted and silence weighting is skipped.
if [ $# -eq 4 ]; then
  data=$1
  lang=$2
  srcdir=$3
  dir=$4
else # 5 arguments
  data=$1
  lang=$2
  srcdir=$3
  ali_or_decode_dir=$4
  dir=$5
fi

for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
    $lang/phones.txt $srcdir/online_cmvn.conf $srcdir/final.mat; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

mkdir -p $dir/log
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;

if [ ! -z "$ali_or_decode_dir" ]; then
  # Turn the alignments or lattice best-paths into per-frame weights that
  # down-weight silence frames (written to $dir/weights.gz).

  nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1;

  if [ -f $ali_or_decode_dir/ali.1.gz ]; then
    # Alignment directory: model lives in the directory itself.
    if [ ! -f $ali_or_decode_dir/${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir/${mdl}.mdl to exist."
      exit 1;
    fi

    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null

      $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \
        gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/final.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi

  elif [ -f $ali_or_decode_dir/lat.1.gz ]; then
    # Decode directory: model lives one level up (../final.mdl by default).
    if [ ! -f $ali_or_decode_dir/../${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir/../${mdl}.mdl to exist."
      exit 1;
    fi

    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null

      $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \
        lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/../${mdl}.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi
  else
    echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir";
    exit 1;
  fi

fi

# Now work out the per-speaker iVectors.

sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;

echo $ivector_period > $dir/ivector_period || exit 1;
splice_opts=$(cat $srcdir/splice_opts)

# Two feature pipelines: "$gmm_feats" (with online CMVN) matches what the
# diagonal UBM expects for Gaussian selection; "$feats" (no CMVN) is what the
# iVector extractor itself consumes.
gmm_feats="ark,s,cs:apply-cmvn-online --spk2utt=ark:$sdata/JOB/spk2utt --config=$srcdir/online_cmvn.conf $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"


if [ $stage -le 1 ]; then
  # Extract one iVector per speaker (via --spk2utt); if silence weights were
  # computed above, apply them to the posteriors first with weight-post.
  if [ ! -z "$ali_or_decode_dir" ]; then
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
      --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \
      $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  else
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
      --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \
      $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  fi
fi

# get an utterance-level set of iVectors (just duplicate the speaker-level ones).
if [ $stage -le 2 ]; then
  for j in $(seq $nj); do
    utils/apply_map.pl -f 2 $dir/ivectors_spk.$j.ark <$sdata/$j/utt2spk >$dir/ivectors_utt.$j.ark || exit 1;
  done
fi

# Work out the iVector dimension from the text-form archive: subtract 3 for
# the key and the "[" and "]" tokens on the line.
ivector_dim=$[$(head -n 1 $dir/ivectors_spk.1.ark | wc -w) - 3] || exit 1;
echo "$0: iVector dim is $ivector_dim"

base_feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1;

start_dim=$base_feat_dim
end_dim=$[$base_feat_dim+$ivector_dim-1]


if [ $stage -le 3 ]; then
  # here, we are just using the original features in $sdata/JOB/feats.scp for
  # their number of rows; we use the select-feats command to remove those
  # features and retain only the iVector features.
  $cmd JOB=1:$nj $dir/log/duplicate_feats.JOB.log \
    append-vector-to-feats scp:$sdata/JOB/feats.scp ark:$dir/ivectors_utt.JOB.ark ark:- \| \
    select-feats "$start_dim-$end_dim" ark:- ark:- \| \
    subsample-feats --n=$ivector_period ark:- ark:- \| \
    copy-feats --compress=$compress ark:- \
    ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
fi

if [ $stage -le 4 ]; then
  echo "$0: combining iVectors across jobs"
  for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1;
fi

echo "$0: done extracting (pseudo-online) iVectors"
|
|
@ -32,9 +32,6 @@ posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
|
|||
# used when training the iVector extractor, but more important
|
||||
# that this match the value used when you do real online decoding
|
||||
# with the neural nets trained with these iVectors.
|
||||
#utts_per_spk_max=-1 # This option is no longer supported, you should use
|
||||
# steps/online/nnet2/copy_data_dir.sh with the --utts-per-spk-max
|
||||
# option to make a copy of the data dir.
|
||||
compress=true # If true, compress the iVectors stored on disk (it's lossy
|
||||
# compression, as used for feature matrices).
|
||||
|
||||
|
@ -58,10 +55,6 @@ if [ $# != 3 ]; then
|
|||
echo " # diagonal model."
|
||||
echo " --min-post <float;default=0.025> # Pruning threshold for posteriors"
|
||||
echo " --ivector-period <int;default=10> # How often to extract an iVector (frames)"
|
||||
echo " --utts-per-spk-max <int;default=-1> # Controls splitting into 'fake speakers'."
|
||||
echo " # Set to 1 if compatibility with utterance-by-utterance"
|
||||
echo " # decoding is the only factor, and to larger if you care "
|
||||
echo " # also about adaptation over several utterances."
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
|
@ -71,7 +64,7 @@ dir=$3
|
|||
|
||||
for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
|
||||
$srcdir/online_cmvn.conf $srcdir/final.mat; do
|
||||
[ ! -f $f ] && echo "No such file $f" && exit 1;
|
||||
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
|
||||
done
|
||||
|
||||
# Set various variables.
|
||||
|
@ -86,7 +79,7 @@ splice_opts=$(cat $srcdir/splice_opts)
|
|||
# the program ivector-extract-online2 does a bunch of stuff in memory and is
|
||||
# config-driven... this was easier in this case because the same code is
|
||||
# involved in online decoding. We need to create a config file for iVector
|
||||
# extration.
|
||||
# extraction.
|
||||
|
||||
ieconf=$dir/conf/ivector_extractor.conf
|
||||
echo -n >$ieconf
|
||||
|
@ -104,15 +97,6 @@ echo "--posterior-scale=$posterior_scale" >>$ieconf
|
|||
echo "--max-remembered-frames=1000" >>$ieconf # the default
|
||||
|
||||
|
||||
ns=$(wc -l <$data/spk2utt)
|
||||
if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
|
||||
echo "$0: you seem to have just one speaker in your database. This is probably not a good idea."
|
||||
echo " see http://kaldi.sourceforge.net/data_prep.html (search for 'bold') for why"
|
||||
echo " Setting --utts-per-spk-max to 1."
|
||||
utts_per_spk_max=1
|
||||
fi
|
||||
|
||||
|
||||
|
||||
for n in $(seq $nj); do
|
||||
# This will do nothing unless the directory $dir/storage exists;
|
||||
|
|
|
@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
|
|||
try {
|
||||
const char *usage =
|
||||
"Convert alignments to posteriors\n"
|
||||
"Usage: ali-to-post [options] alignments-rspecifier posteriors-wspecifier\n"
|
||||
"Usage: ali-to-post [options] <alignments-rspecifier> <posteriors-wspecifier>\n"
|
||||
"e.g.:\n"
|
||||
" ali-to-post ark:1.ali ark:1.post\n";
|
||||
|
||||
|
|
|
@ -38,10 +38,14 @@ int main(int argc, char *argv[]) {
|
|||
"See also: copy-feats\n";
|
||||
|
||||
bool binary = true;
|
||||
BaseFloat scale = 1.0;
|
||||
ParseOptions po(usage);
|
||||
|
||||
po.Register("binary", &binary, "Write in binary mode (only relevant if output is a wxfilename)");
|
||||
|
||||
po.Register("binary", &binary,
|
||||
"Write in binary mode (only relevant if output is a wxfilename)");
|
||||
po.Register("scale", &scale,
|
||||
"This option can be used to scale the matrices being copied.");
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 2) {
|
||||
|
@ -68,6 +72,7 @@ int main(int argc, char *argv[]) {
|
|||
if (!in_is_rspecifier) {
|
||||
Matrix<BaseFloat> mat;
|
||||
ReadKaldiObject(matrix_in_fn, &mat);
|
||||
if (scale != 1.0) mat.Scale(scale);
|
||||
Output ko(matrix_out_fn, binary);
|
||||
mat.Write(ko.Stream(), binary);
|
||||
KALDI_LOG << "Copied matrix to " << matrix_out_fn;
|
||||
|
@ -76,8 +81,15 @@ int main(int argc, char *argv[]) {
|
|||
int num_done = 0;
|
||||
BaseFloatMatrixWriter writer(matrix_out_fn);
|
||||
SequentialBaseFloatMatrixReader reader(matrix_in_fn);
|
||||
for (; !reader.Done(); reader.Next(), num_done++)
|
||||
writer.Write(reader.Key(), reader.Value());
|
||||
for (; !reader.Done(); reader.Next(), num_done++) {
|
||||
if (scale != 1.0) {
|
||||
Matrix<BaseFloat> mat(reader.Value());
|
||||
mat.Scale(scale);
|
||||
writer.Write(reader.Key(), mat);
|
||||
} else {
|
||||
writer.Write(reader.Key(), reader.Value());
|
||||
}
|
||||
}
|
||||
KALDI_LOG << "Copied " << num_done << " matrices.";
|
||||
return (num_done != 0 ? 0 : 1);
|
||||
}
|
||||
|
|
|
@ -50,9 +50,9 @@ int main(int argc, char *argv[]) {
|
|||
exit(1);
|
||||
}
|
||||
|
||||
std::string rspecifier1 = po.GetArg(1);
|
||||
std::string rspecifier2 = po.GetArg(2);
|
||||
std::string wspecifier = po.GetArg(3);
|
||||
std::string rspecifier1 = po.GetArg(1),
|
||||
rspecifier2 = po.GetArg(2),
|
||||
wspecifier = po.GetArg(3);
|
||||
|
||||
BaseFloatMatrixWriter feats_writer(wspecifier);
|
||||
SequentialBaseFloatMatrixReader feats_reader1(rspecifier1);
|
||||
|
|
|
@ -78,8 +78,8 @@ int main(int argc, char *argv[]) {
|
|||
"Usage: paste-feats <in-rspecifier1> <in-rspecifier2> [<in-rspecifier3> ...] <out-wspecifier>\n"
|
||||
" or: paste-feats <in-rxfilename1> <in-rxfilename2> [<in-rxfilename3> ...] <out-wxfilename>\n"
|
||||
" e.g. paste-feats ark:feats1.ark \"ark:select-feats 0-3 ark:feats2.ark ark:- |\" ark:feats-out.ark\n"
|
||||
" or: paste-feats foo.mat bar.mat baz.mat\n";
|
||||
|
||||
" or: paste-feats foo.mat bar.mat baz.mat\n"
|
||||
"See also: copy-feats, copy-matrix, append-vector-to-feats, concat-feats\n";
|
||||
|
||||
ParseOptions po(usage);
|
||||
|
||||
|
|
|
@ -107,8 +107,9 @@ void TestIvectorExtraction(const IvectorExtractor &extractor,
|
|||
utt_stats.AccStats(feats, post);
|
||||
|
||||
OnlineIvectorEstimationStats online_stats(extractor.IvectorDim(),
|
||||
extractor.PriorOffset());
|
||||
|
||||
extractor.PriorOffset(),
|
||||
0.0);
|
||||
|
||||
for (int32 t = 0; t < num_frames; t++) {
|
||||
online_stats.AccStats(extractor, feats.Row(t), post[t]);
|
||||
}
|
||||
|
|
|
@ -259,13 +259,13 @@ void IvectorExtractor::GetIvectorDistMean(
|
|||
for (int32 i = 0; i < I; i++) {
|
||||
double gamma = utt_stats.gamma_(i);
|
||||
if (gamma != 0.0) {
|
||||
Vector<double> x(utt_stats.X_.Row(i)); // == \gamma(i) \m_i
|
||||
SubVector<double> x(utt_stats.X_, i); // == \gamma(i) \m_i
|
||||
// next line: a += \gamma_i \M_i^T \Sigma_i^{-1} \m_i
|
||||
linear->AddMatVec(1.0, Sigma_inv_M_[i], kTrans, x, 1.0);
|
||||
}
|
||||
}
|
||||
SubVector<double> q_vec(quadratic->Data(), IvectorDim()*(IvectorDim()+1)/2);
|
||||
q_vec.AddMatVec(1.0, U_, kTrans, Vector<double>(utt_stats.gamma_), 1.0);
|
||||
q_vec.AddMatVec(1.0, U_, kTrans, utt_stats.gamma_, 1.0);
|
||||
}
|
||||
|
||||
void IvectorExtractor::GetIvectorDistPrior(
|
||||
|
@ -543,24 +543,55 @@ void OnlineIvectorEstimationStats::AccStats(
|
|||
quadratic_term_vec.AddVec(weight, U_g);
|
||||
tot_weight += weight;
|
||||
}
|
||||
if (max_count_ != 0.0) {
|
||||
// see comments in header RE max_count for explanation.
|
||||
double old_num_frames = num_frames_,
|
||||
new_num_frames = num_frames_ + tot_weight;
|
||||
double old_prior_scale = std::max(old_num_frames, max_count_) / max_count_,
|
||||
new_prior_scale = std::max(new_num_frames, max_count_) / max_count_;
|
||||
// The prior_scales are the inverses of the scales we would put on the stats
|
||||
// if we were implementing this by scaling the stats. Instead we
|
||||
// scale the prior term.
|
||||
double prior_scale_change = new_prior_scale - old_prior_scale;
|
||||
if (prior_scale_change != 0.0) {
|
||||
linear_term_(0) += prior_offset_ * prior_scale_change;
|
||||
quadratic_term_.AddToDiag(prior_scale_change);
|
||||
}
|
||||
}
|
||||
|
||||
num_frames_ += tot_weight;
|
||||
}
|
||||
|
||||
void OnlineIvectorEstimationStats::Scale(double scale) {
|
||||
KALDI_ASSERT(scale >= 0.0 && scale <= 1.0);
|
||||
double old_num_frames = num_frames_;
|
||||
num_frames_ *= scale;
|
||||
quadratic_term_.Scale(scale);
|
||||
linear_term_.Scale(scale);
|
||||
|
||||
// Scale back up the prior term, by adding in whatever we scaled down.
|
||||
linear_term_(0) += prior_offset_ * (1.0 - scale);
|
||||
quadratic_term_.AddToDiag(1.0 - scale);
|
||||
if (max_count_ == 0.0) {
|
||||
linear_term_(0) += prior_offset_ * (1.0 - scale);
|
||||
quadratic_term_.AddToDiag(1.0 - scale);
|
||||
} else {
|
||||
double new_num_frames = num_frames_;
|
||||
double old_prior_scale =
|
||||
scale * std::max(old_num_frames, max_count_) / max_count_,
|
||||
new_prior_scale = std::max(new_num_frames, max_count_) / max_count_;
|
||||
// old_prior_scale is the scale the prior term currently has in the stats,
|
||||
// i.e. the previous scale times "scale" as we just scaled the stats.
|
||||
// new_prior_scale is the scale we want the prior term to have.
|
||||
linear_term_(0) += prior_offset_ * (new_prior_scale - old_prior_scale);
|
||||
quadratic_term_.AddToDiag(new_prior_scale - old_prior_scale);
|
||||
}
|
||||
}
|
||||
|
||||
void OnlineIvectorEstimationStats::Write(std::ostream &os, bool binary) const {
|
||||
WriteToken(os, binary, "<OnlineIvectorEstimationStats>"); // magic string.
|
||||
WriteToken(os, binary, "<OnlineIvectorEstimationStats>");
|
||||
WriteToken(os, binary, "<PriorOffset>");
|
||||
WriteBasicType(os, binary, prior_offset_);
|
||||
WriteToken(os, binary, "<MaxCount>");
|
||||
WriteBasicType(os, binary, max_count_);
|
||||
WriteToken(os, binary, "<NumFrames>");
|
||||
WriteBasicType(os, binary, num_frames_);
|
||||
WriteToken(os, binary, "<QuadraticTerm>");
|
||||
|
@ -571,11 +602,20 @@ void OnlineIvectorEstimationStats::Write(std::ostream &os, bool binary) const {
|
|||
}
|
||||
|
||||
void OnlineIvectorEstimationStats::Read(std::istream &is, bool binary) {
|
||||
ExpectToken(is, binary, "<OnlineIvectorEstimationStats>"); // magic string.
|
||||
ExpectToken(is, binary, "<OnlineIvectorEstimationStats>");
|
||||
ExpectToken(is, binary, "<PriorOffset>");
|
||||
ReadBasicType(is, binary, &prior_offset_);
|
||||
ExpectToken(is, binary, "<NumFrames>");
|
||||
ReadBasicType(is, binary, &num_frames_);
|
||||
std::string tok;
|
||||
ReadToken(is, binary, &tok);
|
||||
if (tok == "<MaxCount>") {
|
||||
ReadBasicType(is, binary, &max_count_);
|
||||
ExpectToken(is, binary, "<NumFrames>");
|
||||
ReadBasicType(is, binary, &num_frames_);
|
||||
} else {
|
||||
KALDI_ASSERT(tok == "<NumFrames>");
|
||||
max_count_ = 0.0;
|
||||
ReadBasicType(is, binary, &num_frames_);
|
||||
}
|
||||
ExpectToken(is, binary, "<QuadraticTerm>");
|
||||
quadratic_term_.Read(is, binary);
|
||||
ExpectToken(is, binary, "<LinearTerm>");
|
||||
|
@ -638,8 +678,9 @@ double OnlineIvectorEstimationStats::DefaultObjf() const {
|
|||
}
|
||||
|
||||
OnlineIvectorEstimationStats::OnlineIvectorEstimationStats(int32 ivector_dim,
|
||||
BaseFloat prior_offset):
|
||||
prior_offset_(prior_offset), num_frames_(0.0),
|
||||
BaseFloat prior_offset,
|
||||
BaseFloat max_count):
|
||||
prior_offset_(prior_offset), max_count_(max_count), num_frames_(0.0),
|
||||
quadratic_term_(ivector_dim), linear_term_(ivector_dim) {
|
||||
if (ivector_dim != 0) {
|
||||
linear_term_(0) += prior_offset;
|
||||
|
@ -650,6 +691,7 @@ OnlineIvectorEstimationStats::OnlineIvectorEstimationStats(int32 ivector_dim,
|
|||
OnlineIvectorEstimationStats::OnlineIvectorEstimationStats(
|
||||
const OnlineIvectorEstimationStats &other):
|
||||
prior_offset_(other.prior_offset_),
|
||||
max_count_(other.max_count_),
|
||||
num_frames_(other.num_frames_),
|
||||
quadratic_term_(other.quadratic_term_),
|
||||
linear_term_(other.linear_term_) { }
|
||||
|
@ -733,6 +775,12 @@ void IvectorExtractorUtteranceStats::AccStats(
|
|||
}
|
||||
}
|
||||
|
||||
void IvectorExtractorUtteranceStats::Scale(double scale) {
|
||||
gamma_.Scale(scale);
|
||||
X_.Scale(scale);
|
||||
for (size_t i = 0; i < S_.size(); i++)
|
||||
S_[i].Scale(scale);
|
||||
}
|
||||
|
||||
IvectorExtractorStats::IvectorExtractorStats(
|
||||
const IvectorExtractor &extractor,
|
||||
|
@ -1534,6 +1582,7 @@ double EstimateIvectorsOnline(
|
|||
const IvectorExtractor &extractor,
|
||||
int32 ivector_period,
|
||||
int32 num_cg_iters,
|
||||
BaseFloat max_count,
|
||||
Matrix<BaseFloat> *ivectors) {
|
||||
|
||||
KALDI_ASSERT(ivector_period > 0);
|
||||
|
@ -1544,7 +1593,8 @@ double EstimateIvectorsOnline(
|
|||
ivectors->Resize(num_ivectors, extractor.IvectorDim());
|
||||
|
||||
OnlineIvectorEstimationStats online_stats(extractor.IvectorDim(),
|
||||
extractor.PriorOffset());
|
||||
extractor.PriorOffset(),
|
||||
max_count);
|
||||
|
||||
double ans = 0.0;
|
||||
|
||||
|
|
|
@ -45,13 +45,23 @@ namespace kaldi {
|
|||
// "acoustic_weight" is not read by any class declared in this header; it has to
|
||||
// be applied by calling IvectorExtractorUtteranceStats::Scale() before
|
||||
// obtaining the iVector.
|
||||
// The same is true of max_count: it has to be applied by programs themselves
|
||||
// e.g. see ../ivectorbin/ivector-extract.cc.
|
||||
struct IvectorEstimationOptions {
|
||||
double acoustic_weight;
|
||||
IvectorEstimationOptions(): acoustic_weight(1.0) {}
|
||||
double max_count;
|
||||
IvectorEstimationOptions(): acoustic_weight(1.0), max_count(0.0) {}
|
||||
void Register(OptionsItf *po) {
|
||||
po->Register("acoustic-weight", &acoustic_weight,
|
||||
"Weight on part of auxf that involves the data (e.g. 0.2); "
|
||||
"if this weight is small, the prior will have more effect.");
|
||||
po->Register("max-count", &max_count,
|
||||
"Maximum frame count (affects prior scaling): if >0, the prior "
|
||||
"term will be scaled up after the frame count exceeds this "
|
||||
"value. Note that this count is considered after posterior "
|
||||
"scaling (e.g. --acoustic-weight option, or scale argument to "
|
||||
"scale-post), so you would normally use a cutoff 10 times "
|
||||
"smaller than the corresponding number of frames.");
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -301,8 +311,12 @@ class IvectorExtractor {
|
|||
*/
|
||||
class OnlineIvectorEstimationStats {
|
||||
public:
|
||||
// Search above for max_count to see an explanation; if nonzero, it will
|
||||
// put a higher weight on the prior (vs. the stats) once the count passes
|
||||
// that value.
|
||||
OnlineIvectorEstimationStats(int32 ivector_dim,
|
||||
BaseFloat prior_offset);
|
||||
BaseFloat prior_offset,
|
||||
BaseFloat max_count);
|
||||
|
||||
OnlineIvectorEstimationStats(const OnlineIvectorEstimationStats &other);
|
||||
|
||||
|
@ -360,6 +374,7 @@ class OnlineIvectorEstimationStats {
|
|||
|
||||
friend class IvectorExtractor;
|
||||
double prior_offset_;
|
||||
double max_count_;
|
||||
double num_frames_; // num frames (weighted, if applicable).
|
||||
SpMatrix<double> quadratic_term_;
|
||||
Vector<double> linear_term_;
|
||||
|
@ -368,8 +383,10 @@ class OnlineIvectorEstimationStats {
|
|||
|
||||
// This code obtains periodically (for each "ivector_period" frames, e.g. 10
|
||||
// frames), an estimate of the iVector including all frames up to that point.
|
||||
// This emulates what you could do in an online/streaming algorithm; its use
|
||||
// is for neural network training in a way that's matched to online decoding.
|
||||
// This emulates what you could do in an online/streaming algorithm; its use is
|
||||
// for neural network training in a way that's matched to online decoding.
|
||||
// [note: I don't believe we are currently using the program,
|
||||
// ivector-extract-online.cc, that calls this function, in any of the scripts.].
|
||||
// Caution: this program outputs the raw iVectors, where the first component
|
||||
// will generally be very positive. You probably want to subtract PriorOffset()
|
||||
// from the first element of each row of the output before writing it out.
|
||||
|
@ -384,6 +401,7 @@ double EstimateIvectorsOnline(
|
|||
const IvectorExtractor &extractor,
|
||||
int32 ivector_period,
|
||||
int32 num_cg_iters,
|
||||
BaseFloat max_count,
|
||||
Matrix<BaseFloat> *ivectors);
|
||||
|
||||
|
||||
|
|
|
@ -49,6 +49,7 @@ int main(int argc, char *argv[]) {
|
|||
ParseOptions po(usage);
|
||||
int32 num_cg_iters = 15;
|
||||
int32 ivector_period = 10;
|
||||
BaseFloat max_count = 0.0;
|
||||
g_num_threads = 8;
|
||||
|
||||
po.Register("num-cg-iters", &num_cg_iters,
|
||||
|
@ -60,6 +61,12 @@ int main(int argc, char *argv[]) {
|
|||
po.Register("num-threads", &g_num_threads,
|
||||
"Number of threads to use for computing derived variables "
|
||||
"of iVector extractor, at process start-up.");
|
||||
po.Register("max-count", &max_count,
|
||||
"If >0, when the count of posteriors exceeds max-count we will "
|
||||
"start using a stronger prior term. Can make iVectors from "
|
||||
"longer than normal utterances look more 'typical'. Interpret "
|
||||
"this value as a number of frames multiplied by your "
|
||||
"posterior scale (so typically 0.1 times a number of frames).");
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 4) {
|
||||
|
@ -107,7 +114,7 @@ int main(int argc, char *argv[]) {
|
|||
double objf_impr_per_frame;
|
||||
objf_impr_per_frame = EstimateIvectorsOnline(feats, posterior, extractor,
|
||||
ivector_period, num_cg_iters,
|
||||
&ivectors);
|
||||
max_count, &ivectors);
|
||||
|
||||
BaseFloat offset = extractor.PriorOffset();
|
||||
for (int32 i = 0 ; i < ivectors.NumRows(); i++)
|
||||
|
|
|
@ -63,13 +63,13 @@ class IvectorExtractTask {
|
|||
}
|
||||
~IvectorExtractTask() {
|
||||
if (tot_auxf_change_ != NULL) {
|
||||
int32 T = posterior_.size();
|
||||
double T = TotalPosterior(posterior_);
|
||||
*tot_auxf_change_ += auxf_change_;
|
||||
KALDI_VLOG(2) << "Auxf change for utterance " << utt_ << " was "
|
||||
<< (auxf_change_ / T) << " per frame over " << T
|
||||
<< " frames.";
|
||||
<< " frames (weighted)";
|
||||
}
|
||||
// We actually write out the offset of the iVector's from the mean of the
|
||||
// We actually write out the offset of the iVectors from the mean of the
|
||||
// prior distribution; this is the form we'll need it in for scoring. (most
|
||||
// formulations of iVectors have zero-mean priors so this is not normally an
|
||||
// issue).
|
||||
|
@ -89,11 +89,124 @@ class IvectorExtractTask {
|
|||
double auxf_change_;
|
||||
};
|
||||
|
||||
int32 RunPerSpeaker(const std::string &ivector_extractor_rxfilename,
|
||||
const IvectorEstimationOptions &opts,
|
||||
bool compute_objf_change,
|
||||
const std::string &spk2utt_rspecifier,
|
||||
const std::string &feature_rspecifier,
|
||||
const std::string &posterior_rspecifier,
|
||||
const std::string &ivector_wspecifier) {
|
||||
IvectorExtractor extractor;
|
||||
ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
|
||||
SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
|
||||
RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
|
||||
RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
|
||||
BaseFloatVectorWriter ivector_writer(ivector_wspecifier);
|
||||
|
||||
double tot_auxf_change = 0.0, tot_post = 0.0, tot_norm = 0.0;
|
||||
int32 num_utt_done = 0, num_utt_err = 0,
|
||||
num_spk_done = 0, num_spk_err = 0;
|
||||
|
||||
for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
|
||||
std::string spk = spk2utt_reader.Key();
|
||||
const std::vector<std::string> &utts = spk2utt_reader.Value();
|
||||
|
||||
bool need_2nd_order_stats = false;
|
||||
|
||||
IvectorExtractorUtteranceStats utt_stats(extractor.NumGauss(),
|
||||
extractor.FeatDim(),
|
||||
need_2nd_order_stats);
|
||||
|
||||
for (size_t i = 0; i < utts.size(); i++) {
|
||||
const std::string &utt = utts[i];
|
||||
if (!feature_reader.HasKey(utt)) {
|
||||
KALDI_WARN << "No features present for utterance " << utt;
|
||||
num_utt_err++;
|
||||
continue;
|
||||
}
|
||||
const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
|
||||
if (!posterior_reader.HasKey(utt)) {
|
||||
KALDI_WARN << "No posteriors present for utterance " << utt;
|
||||
num_utt_err++;
|
||||
continue;
|
||||
}
|
||||
Posterior posterior = posterior_reader.Value(utt);
|
||||
if (feats.NumRows() != posterior.size()) {
|
||||
KALDI_WARN << "Posterior has wrong size " << posterior.size()
|
||||
<< " vs. feats " << feats.NumRows() << " for "
|
||||
<< utt;
|
||||
num_utt_err++;
|
||||
continue;
|
||||
}
|
||||
ScalePosterior(opts.acoustic_weight, &posterior);
|
||||
num_utt_done++;
|
||||
utt_stats.AccStats(feats, posterior);
|
||||
}
|
||||
|
||||
if (utt_stats.NumFrames() == 0.0) {
|
||||
KALDI_WARN << "No stats accumulated for speaker " << spk;
|
||||
num_spk_err++;
|
||||
continue;
|
||||
} else {
|
||||
if (opts.max_count > 0 && utt_stats.NumFrames() > opts.max_count) {
|
||||
double scale = opts.max_count / utt_stats.NumFrames();
|
||||
utt_stats.Scale(scale);
|
||||
KALDI_LOG << "Scaling stats for speaker " << spk << " by scale "
|
||||
<< scale << " due to --max-count=" << opts.max_count;
|
||||
}
|
||||
|
||||
Vector<double> ivector(extractor.IvectorDim());
|
||||
ivector(0) = extractor.PriorOffset();
|
||||
|
||||
if (compute_objf_change) {
|
||||
double old_auxf = extractor.GetAuxf(utt_stats, ivector);
|
||||
extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
|
||||
double new_auxf = extractor.GetAuxf(utt_stats, ivector);
|
||||
double auxf_change = new_auxf - old_auxf;
|
||||
|
||||
KALDI_LOG << "Auxf change for speaker " << spk << " was "
|
||||
<< (auxf_change / utt_stats.NumFrames()) << " per frame, over "
|
||||
<< utt_stats.NumFrames() << " frames (weighted).";
|
||||
tot_auxf_change += auxf_change;
|
||||
} else {
|
||||
extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
|
||||
}
|
||||
// We actually write out the offset of the iVectors from the mean of the
|
||||
// prior distribution; this is the form we'll need it in for scoring and
|
||||
// as a feature for neural nets. (most formulations of iVectors have
|
||||
// zero-mean priors so this is not normally an issue).
|
||||
ivector(0) -= extractor.PriorOffset();
|
||||
KALDI_LOG << "Ivector norm for speaker " << spk
|
||||
<< " was " << ivector.Norm(2.0);
|
||||
|
||||
tot_norm += ivector.Norm(2.0) * utt_stats.NumFrames();
|
||||
tot_post += utt_stats.NumFrames();
|
||||
num_spk_done++;
|
||||
Vector<BaseFloat> ivector_flt(ivector);
|
||||
ivector_writer.Write(spk, ivector_flt);
|
||||
}
|
||||
}
|
||||
|
||||
KALDI_LOG << "Done " << num_spk_done << " speakers; " << num_spk_err
|
||||
<< " with errors. " << num_utt_done << " utterances "
|
||||
<< "were processed, " << num_utt_err << " with errors.";
|
||||
if (tot_post != 0.0) {
|
||||
if (compute_objf_change) {
|
||||
KALDI_LOG << "Overall weighted-average objective function improvement was "
|
||||
<< (tot_auxf_change / tot_post) << " over " << tot_post
|
||||
<< " frames (weighted)";
|
||||
}
|
||||
KALDI_LOG << "Average iVector norm (weighted by frames) was "
|
||||
<< (tot_norm / tot_post) << " over " << tot_post
|
||||
<< " frames (weighted)";
|
||||
}
|
||||
return (num_spk_done != 0 ? 0 : 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
using namespace kaldi;
|
||||
typedef kaldi::int32 int32;
|
||||
|
@ -102,7 +215,7 @@ int main(int argc, char *argv[]) {
|
|||
const char *usage =
|
||||
"Extract iVectors for utterances, using a trained iVector extractor,\n"
|
||||
"and features and Gaussian-level posteriors\n"
|
||||
"Usage: ivector-extract [options] <model-in> <feature-rspecifier>"
|
||||
"Usage: ivector-extract [options] <model-in> <feature-rspecifier> "
|
||||
"<posteriors-rspecifier> <ivector-wspecifier>\n"
|
||||
"e.g.: \n"
|
||||
" fgmm-global-gselect-to-post 1.ubm '$feats' 'ark:gunzip -c gselect.1.gz|' ark:- | \\\n"
|
||||
|
@ -110,13 +223,21 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
ParseOptions po(usage);
|
||||
bool compute_objf_change = true;
|
||||
IvectorExtractorStatsOptions stats_opts;
|
||||
IvectorEstimationOptions opts;
|
||||
std::string spk2utt_rspecifier;
|
||||
TaskSequencerConfig sequencer_config;
|
||||
po.Register("compute-objf-change", &compute_objf_change,
|
||||
"If true, compute the change in objective function from using "
|
||||
"nonzero iVector (a potentially useful diagnostic). Combine "
|
||||
"with --verbose=2 for per-utterance information");
|
||||
stats_opts.Register(&po);
|
||||
po.Register("spk2utt", &spk2utt_rspecifier, "Supply this option if you "
|
||||
"want iVectors to be output at the per-speaker level, estimated "
|
||||
"using stats accumulated from multiple utterances. Note: this "
|
||||
"is not the normal way iVectors are obtained for speaker-id. "
|
||||
"This option will cause the program to ignore the --num-threads "
|
||||
"option.");
|
||||
|
||||
opts.Register(&po);
|
||||
sequencer_config.Register(&po);
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
@ -128,63 +249,87 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
std::string ivector_extractor_rxfilename = po.GetArg(1),
|
||||
feature_rspecifier = po.GetArg(2),
|
||||
posteriors_rspecifier = po.GetArg(3),
|
||||
posterior_rspecifier = po.GetArg(3),
|
||||
ivectors_wspecifier = po.GetArg(4);
|
||||
|
||||
// g_num_threads affects how ComputeDerivedVars is called when we read the
|
||||
// extractor.
|
||||
g_num_threads = sequencer_config.num_threads;
|
||||
IvectorExtractor extractor;
|
||||
ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
|
||||
|
||||
double tot_auxf_change = 0.0;
|
||||
int64 tot_t = 0;
|
||||
int32 num_done = 0, num_err = 0;
|
||||
if (spk2utt_rspecifier.empty()) {
|
||||
// g_num_threads affects how ComputeDerivedVars is called when we read the
|
||||
// extractor.
|
||||
g_num_threads = sequencer_config.num_threads;
|
||||
IvectorExtractor extractor;
|
||||
ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
|
||||
|
||||
double tot_auxf_change = 0.0, tot_t = 0.0;
|
||||
int32 num_done = 0, num_err = 0;
|
||||
|
||||
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
|
||||
RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
|
||||
BaseFloatVectorWriter ivector_writer(ivectors_wspecifier);
|
||||
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
|
||||
RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
|
||||
BaseFloatVectorWriter ivector_writer(ivectors_wspecifier);
|
||||
|
||||
{
|
||||
TaskSequencer<IvectorExtractTask> sequencer(sequencer_config);
|
||||
for (; !feature_reader.Done(); feature_reader.Next()) {
|
||||
std::string utt = feature_reader.Key();
|
||||
if (!posterior_reader.HasKey(utt)) {
|
||||
KALDI_WARN << "No posteriors for utterance " << utt;
|
||||
num_err++;
|
||||
continue;
|
||||
}
|
||||
const Matrix<BaseFloat> &mat = feature_reader.Value();
|
||||
Posterior posterior = posterior_reader.Value(utt);
|
||||
|
||||
if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
|
||||
KALDI_WARN << "Size mismatch between posterior " << posterior.size()
|
||||
<< " and features " << mat.NumRows() << " for utterance "
|
||||
<< utt;
|
||||
num_err++;
|
||||
continue;
|
||||
}
|
||||
|
||||
{
|
||||
TaskSequencer<IvectorExtractTask> sequencer(sequencer_config);
|
||||
for (; !feature_reader.Done(); feature_reader.Next()) {
|
||||
std::string key = feature_reader.Key();
|
||||
if (!posteriors_reader.HasKey(key)) {
|
||||
KALDI_WARN << "No posteriors for utterance " << key;
|
||||
num_err++;
|
||||
continue;
|
||||
double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL );
|
||||
|
||||
double this_t = opts.acoustic_weight * TotalPosterior(posterior),
|
||||
max_count_scale = 1.0;
|
||||
if (opts.max_count > 0 && this_t > opts.max_count) {
|
||||
max_count_scale = opts.max_count / this_t;
|
||||
KALDI_LOG << "Scaling stats for utterance " << utt << " by scale "
|
||||
<< max_count_scale << " due to --max-count="
|
||||
<< opts.max_count;
|
||||
this_t = opts.max_count;
|
||||
}
|
||||
ScalePosterior(opts.acoustic_weight * max_count_scale,
|
||||
&posterior);
|
||||
// note: now, this_t == sum of posteriors.
|
||||
|
||||
sequencer.Run(new IvectorExtractTask(extractor, utt, mat, posterior,
|
||||
&ivector_writer, auxf_ptr));
|
||||
|
||||
tot_t += this_t;
|
||||
num_done++;
|
||||
}
|
||||
const Matrix<BaseFloat> &mat = feature_reader.Value();
|
||||
const Posterior &posterior = posteriors_reader.Value(key);
|
||||
|
||||
if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
|
||||
KALDI_WARN << "Size mismatch between posterior " << posterior.size()
|
||||
<< " and features " << mat.NumRows() << " for utterance "
|
||||
<< key;
|
||||
num_err++;
|
||||
continue;
|
||||
}
|
||||
|
||||
double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL );
|
||||
|
||||
sequencer.Run(new IvectorExtractTask(extractor, key, mat, posterior,
|
||||
&ivector_writer, auxf_ptr));
|
||||
|
||||
tot_t += posterior.size();
|
||||
num_done++;
|
||||
// Destructor of "sequencer" will wait for any remaining tasks.
|
||||
}
|
||||
// Destructor of "sequencer" will wait for any remaining tasks.
|
||||
|
||||
KALDI_LOG << "Done " << num_done << " files, " << num_err
|
||||
<< " with errors. Total (weighted) frames " << tot_t;
|
||||
if (compute_objf_change)
|
||||
KALDI_LOG << "Overall average objective-function change from estimating "
|
||||
<< "ivector was " << (tot_auxf_change / tot_t) << " per frame "
|
||||
<< " over " << tot_t << " (weighted) frames.";
|
||||
|
||||
return (num_done != 0 ? 0 : 1);
|
||||
} else {
|
||||
KALDI_ASSERT(sequencer_config.num_threads == 1 &&
|
||||
"--spk2utt option is incompatible with --num-threads option");
|
||||
return RunPerSpeaker(ivector_extractor_rxfilename,
|
||||
opts,
|
||||
compute_objf_change,
|
||||
spk2utt_rspecifier,
|
||||
feature_rspecifier,
|
||||
posterior_rspecifier,
|
||||
ivectors_wspecifier);
|
||||
}
|
||||
|
||||
KALDI_LOG << "Done " << num_done << " files, " << num_err
|
||||
<< " with errors. Total frames " << tot_t;
|
||||
|
||||
if (compute_objf_change)
|
||||
KALDI_LOG << "Overall average objective-function change from estimating "
|
||||
<< "ivector was " << (tot_auxf_change / tot_t) << " per frame "
|
||||
<< " over " << tot_t << " frames.";
|
||||
|
||||
return (num_done != 0 ? 0 : 1);
|
||||
} catch(const std::exception &e) {
|
||||
std::cerr << e.what();
|
||||
return -1;
|
||||
|
|
|
@ -1010,7 +1010,7 @@ void VectorBase<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v)
|
|||
MatrixIndexT dim = dim_;
|
||||
if (alpha != 1.0)
|
||||
for (MatrixIndexT i = 0; i < dim; i++)
|
||||
data[i] += alpha*other_data[i];
|
||||
data[i] += alpha * other_data[i];
|
||||
else
|
||||
for (MatrixIndexT i = 0; i < dim; i++)
|
||||
data[i] += other_data[i];
|
||||
|
|
|
@ -32,6 +32,7 @@ void OnlineIvectorExtractionInfo::Init(
|
|||
num_gselect = config.num_gselect;
|
||||
min_post = config.min_post;
|
||||
posterior_scale = config.posterior_scale;
|
||||
max_count = config.max_count;
|
||||
use_most_recent_ivector = config.use_most_recent_ivector;
|
||||
greedy_ivector_extractor = config.greedy_ivector_extractor;
|
||||
if (greedy_ivector_extractor && !use_most_recent_ivector) {
|
||||
|
@ -161,7 +162,7 @@ void OnlineIvectorFeature::UpdateStatsUntilFrame(int32 frame) {
|
|||
|
||||
Vector<BaseFloat> feat(feat_dim), // features given to iVector extractor
|
||||
log_likes(info_.diag_ubm.NumGauss());
|
||||
|
||||
|
||||
for (; num_frames_stats_ <= frame; num_frames_stats_++) {
|
||||
int32 t = num_frames_stats_; // Frame whose stats we want to get.
|
||||
lda_normalized_->GetFrame(t, &feat);
|
||||
|
@ -262,8 +263,10 @@ OnlineIvectorFeature::OnlineIvectorFeature(
|
|||
const OnlineIvectorExtractionInfo &info,
|
||||
OnlineFeatureInterface *base_feature):
|
||||
info_(info), base_(base_feature),
|
||||
ivector_stats_(info_.extractor.IvectorDim(), info_.extractor.PriorOffset()),
|
||||
num_frames_stats_(0) {
|
||||
ivector_stats_(info_.extractor.IvectorDim(),
|
||||
info_.extractor.PriorOffset(),
|
||||
info_.max_count),
|
||||
num_frames_stats_(0), tot_ubm_loglike_(0.0) {
|
||||
info.Check();
|
||||
KALDI_ASSERT(base_feature != NULL);
|
||||
splice_ = new OnlineSpliceFrames(info_.splice_opts, base_);
|
||||
|
|
|
@ -70,6 +70,12 @@ struct OnlineIvectorExtractionConfig {
|
|||
BaseFloat posterior_scale; // Scale on posteriors used for iVector
|
||||
// extraction; can be interpreted as the inverse
|
||||
// of a scale on the log-prior.
|
||||
BaseFloat max_count; // Maximum stats count we allow before we start scaling
|
||||
// down stats (if nonzero).. this prevents us getting
|
||||
// atypical-looking iVectors for very long utterances.
|
||||
// Interpret this as a number of frames times
|
||||
// posterior_scale, typically 1/10 of a frame count.
|
||||
|
||||
|
||||
// If use_most_recent_ivector is true, we always return the most recent
|
||||
// available iVector rather than the one for the current frame. This means
|
||||
|
@ -91,6 +97,7 @@ struct OnlineIvectorExtractionConfig {
|
|||
|
||||
OnlineIvectorExtractionConfig(): ivector_period(10), num_gselect(5),
|
||||
min_post(0.025), posterior_scale(0.1),
|
||||
max_count(0.0),
|
||||
use_most_recent_ivector(true),
|
||||
greedy_ivector_extractor(false),
|
||||
max_remembered_frames(1000) { }
|
||||
|
@ -122,6 +129,11 @@ struct OnlineIvectorExtractionConfig {
|
|||
"iVector extraction");
|
||||
po->Register("posterior-scale", &posterior_scale, "Scale for posteriors in "
|
||||
"iVector extraction (may be viewed as inverse of prior scale)");
|
||||
po->Register("max-count", &max_count, "Maximum data count we allow before "
|
||||
"we start scaling the stats down (if nonzero)... helps to make "
|
||||
"iVectors from long utterances look more typical. Interpret "
|
||||
"as a frame-count times --posterior-scale, typically 1/10 of "
|
||||
"a number of frames. Suggest 100.");
|
||||
po->Register("use-most-recent-ivector", &use_most_recent_ivector, "If true, "
|
||||
"always use most recent available iVector, rather than the "
|
||||
"one for the designated frame.");
|
||||
|
@ -156,6 +168,7 @@ struct OnlineIvectorExtractionInfo {
|
|||
int32 num_gselect;
|
||||
BaseFloat min_post;
|
||||
BaseFloat posterior_scale;
|
||||
BaseFloat max_count;
|
||||
bool use_most_recent_ivector;
|
||||
bool greedy_ivector_extractor;
|
||||
BaseFloat max_remembered_frames;
|
||||
|
@ -191,7 +204,8 @@ struct OnlineIvectorExtractorAdaptationState {
|
|||
OnlineIvectorExtractorAdaptationState(const OnlineIvectorExtractionInfo &info):
|
||||
cmvn_state(info.global_cmvn_stats),
|
||||
ivector_stats(info.extractor.IvectorDim(),
|
||||
info.extractor.PriorOffset()) { }
|
||||
info.extractor.PriorOffset(),
|
||||
info.max_count) { }
|
||||
|
||||
/// Copy constructor
|
||||
OnlineIvectorExtractorAdaptationState(
|
||||
|
|
|
@ -31,7 +31,7 @@ int main(int argc, char *argv[]) {
|
|||
"Apply online cepstral mean (and possibly variance) computation online,\n"
|
||||
"using the same code as used for online decoding in the 'new' setup in\n"
|
||||
"online2/ and online2bin/. If the --spk2utt option is used, it uses\n"
|
||||
"prior utterances from the same speaker to back off two at the utterance\n"
|
||||
"prior utterances from the same speaker to back off to at the utterance\n"
|
||||
"beginning. See also apply-cmvn-sliding.\n"
|
||||
"\n"
|
||||
"Usage: apply-cmvn-online [options] <global-cmvn-stats> <feature-rspecifier> "
|
||||
|
|
|
@ -84,7 +84,7 @@ int main(int argc, char *argv[]) {
|
|||
RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
|
||||
BaseFloatMatrixWriter ivector_writer(ivectors_wspecifier);
|
||||
|
||||
|
||||
|
||||
for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
|
||||
std::string spk = spk2utt_reader.Key();
|
||||
const std::vector<std::string> &uttlist = spk2utt_reader.Value();
|
||||
|
@ -98,12 +98,12 @@ int main(int argc, char *argv[]) {
|
|||
continue;
|
||||
}
|
||||
const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
|
||||
|
||||
|
||||
OnlineMatrixFeature matrix_feature(feats);
|
||||
|
||||
OnlineIvectorFeature ivector_feature(ivector_info,
|
||||
&matrix_feature);
|
||||
|
||||
|
||||
ivector_feature.SetAdaptationState(adaptation_state);
|
||||
|
||||
int32 T = feats.NumRows(),
|
||||
|
@ -130,9 +130,9 @@ int main(int argc, char *argv[]) {
|
|||
<< ", UBM loglike/frame was "
|
||||
<< ivector_feature.UbmLogLikePerFrame()
|
||||
<< ", iVector length (at utterance end) was "
|
||||
<< ivectors.Row(n-1).Norm(2.0)
|
||||
<< ", objf improvement from iVector estimation was "
|
||||
<< tot_objf_impr;
|
||||
<< ivectors.Row(num_ivectors-1).Norm(2.0)
|
||||
<< ", objf improvement/frame from iVector estimation was "
|
||||
<< ivector_feature.ObjfImprPerFrame();
|
||||
|
||||
ivector_feature.GetAdaptationState(&adaptation_state);
|
||||
ivector_writer.Write(utt, ivectors);
|
||||
|
|
Загрузка…
Ссылка в новой задаче