diff --git a/egs/librispeech/s5/local/online/run_nnet2.sh b/egs/librispeech/s5/local/online/run_nnet2.sh index 9ba7c6eaf..647d8095b 100755 --- a/egs/librispeech/s5/local/online/run_nnet2.sh +++ b/egs/librispeech/s5/local/online/run_nnet2.sh @@ -147,7 +147,7 @@ if [ $stage -le 13 ]; then done fi -exit 0; +#exit 0; ###### Comment out the "exit 0" above to run the multi-threaded decoding. ##### if [ $stage -le 14 ]; then @@ -166,8 +166,8 @@ if [ $stage -le 15 ]; then test=dev_clean steps/online/nnet2/decode.sh --threaded true --do-endpointing true \ --config conf/decode.config --cmd "$decode_cmd" --nj 30 \ - --per-utt true exp/tri6b/graph_pp_tgsmall data/$test \ - ${dir}_online/decode_pp_${test}_tgsmall_utt_threaded_ep || exit 1; + --per-utt true exp/tri6b/graph_tgsmall data/$test \ + ${dir}_online/decode_${test}_tgsmall_utt_threaded_ep || exit 1; fi exit 0; diff --git a/egs/wsj/s5/local/online/run_nnet2.sh b/egs/wsj/s5/local/online/run_nnet2.sh index 433377bff..a7fac1bec 100755 --- a/egs/wsj/s5/local/online/run_nnet2.sh +++ b/egs/wsj/s5/local/online/run_nnet2.sh @@ -134,6 +134,34 @@ if [ $stage -le 13 ]; then done fi +if [ $stage -le 14 ]; then + # this does offline decoding, as stage 10, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a GMM decoding), with a + # different script. This is just to demonstrate that script. + + rm exp/nnet2_online/.error 2>/dev/null + for year in eval92 dev93; do + steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj 8 \ + data/test_${year}_hires data/lang exp/nnet2_online/extractor \ + exp/tri4b/decode_tgpr_$year exp/nnet2_online/ivectors_spk_test_${year} || touch exp/nnet2_online/.error & + done + wait + [ -f exp/nnet2_online/.error ] && echo "$0: Error getting iVectors" && exit 1; + + for lm_suffix in bd_tgpr; do # just use the bd decoding, to avoid wasting time. + graph_dir=exp/tri4b/graph_${lm_suffix} + # use already-built graphs. + for year in eval92 dev93; do + steps/nnet2/decode.sh --nj 8 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_spk_test_$year \ + $graph_dir data/test_${year}_hires $dir/decode_${lm_suffix}_${year}_spk || touch exp/nnet2_online/.error & + done + done + wait + [ -f exp/nnet2_online/.error ] && echo "$0: Error decoding" && exit 1; +fi + + exit 0; diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh new file mode 100755 index 000000000..5009716cb --- /dev/null +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# Apache 2.0. + + +# This script computes iVectors in the same format as extract_ivectors_online.sh, +# except that they are actually not really computed online, they are first computed +# per speaker and just duplicated many times. +# +# This setup also makes it possible to use a previous decoding or alignment, to +# down-weight silence in the stats (default is --silence-weight 0.0). +# +# This is for when you use the "online-decoding" setup in an offline task, and +# you want the best possible results. + + +# Begin configuration section. +nj=30 +cmd="run.pl" +stage=0 +num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) +ivector_period=10 +posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for + # inter-frame correlations. Making this small during iVector + # extraction is equivalent to scaling up the prior, and will + # will tend to produce smaller iVectors where data-counts are + # small. It's not so important that this match the value + # used when training the iVector extractor, but more important + # that this match the value used when you do real online decoding + # with the neural nets trained with these iVectors. +max_count=100 # Interpret this as a number of frames times posterior scale... + # this config ensures that once the count exceeds this (i.e. + # 1000 frames, or 10 seconds, by default), we start to scale + # down the stats, accentuating the prior term. This seems quite + # important for some reason. +compress=true # If true, compress the iVectors stored on disk (it's lossy + # compression, as used for feature matrices). +silence_weight=0.0 +acwt=0.1 # used if input is a decode dir, to get best path from lattices. +mdl=final # change this if decode directory did not have ../final.mdl present. + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ] && [ $# != 5 ]; then + echo "Usage: $0 [options] [|] " + echo " e.g.: $0 data/test exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " # Ignored if or supplied." + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + echo " --min-post # Pruning threshold for posteriors" + echo " --ivector-period # How often to extract an iVector (frames)" + echo " --utts-per-spk-max # Controls splitting into 'fake speakers'." + echo " # Set to 1 if compatibility with utterance-by-utterance" + echo " # decoding is the only factor, and to larger if you care " + echo " # also about adaptation over several utterances." + exit 1; +fi + +if [ $# -eq 4 ]; then + data=$1 + lang=$2 + srcdir=$3 + dir=$4 +else # 5 arguments + data=$1 + lang=$2 + srcdir=$3 + ali_or_decode_dir=$4 + dir=$5 +fi + +for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \ + $lang/phones.txt $srcdir/online_cmvn.conf $srcdir/final.mat; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +mkdir -p $dir/log +silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + +if [ ! -z "$ali_or_decode_dir" ]; then + + nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1; + + if [ -f $ali_or_decode_dir/ali.1.gz ]; then + if [ ! -f $ali_or_decode_dir/${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir/${mdl}.mdl to exist." + exit 1; + fi + + if [ $stage -le 0 ]; then + rm $dir/weights.*.gz 2>/dev/null + + $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \ + gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/final.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; + + # put all the weights in one archive. + for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1; + rm $dir/weights.*.gz || exit 1; + fi + + elif [ -f $ali_or_decode_dir/lat.1.gz ]; then + if [ ! -f $ali_or_decode_dir/../${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir/../${mdl}.mdl to exist." + exit 1; + fi + + + if [ $stage -le 0 ]; then + rm $dir/weights.*.gz 2>/dev/null + + $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \ + lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/../${mdl}.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; + + # put all the weights in one archive. + for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1; + rm $dir/weights.*.gz || exit 1; + fi + else + echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir"; + exit 1; + fi + +fi + +# Now work out the per-speaker iVectors. + +sdata=$data/split$nj; +utils/split_data.sh $data $nj || exit 1; + +echo $ivector_period > $dir/ivector_period || exit 1; +splice_opts=$(cat $srcdir/splice_opts) + + +gmm_feats="ark,s,cs:apply-cmvn-online --spk2utt=ark:$sdata/JOB/spk2utt --config=$srcdir/online_cmvn.conf $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" +feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + + +if [ $stage -le 1 ]; then + if [ ! -z "$ali_or_decode_dir" ]; then + $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1; + else + $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1; + fi +fi + +# get an utterance-level set of iVectors (just duplicate the speaker-level ones). +if [ $stage -le 2 ]; then + for j in $(seq $nj); do + utils/apply_map.pl -f 2 $dir/ivectors_spk.$j.ark <$sdata/$j/utt2spk >$dir/ivectors_utt.$j.ark || exit 1; + done +fi + +ivector_dim=$[$(head -n 1 $dir/ivectors_spk.1.ark | wc -w) - 3] || exit 1; +echo "$0: iVector dim is $ivector_dim" + +base_feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1; + +start_dim=$base_feat_dim +end_dim=$[$base_feat_dim+$ivector_dim-1] + + +if [ $stage -le 3 ]; then + # here, we are just using the original features in $sdata/JOB/feats.scp for + # their number of rows; we use the select-feats command to remove those + # features and retain only the iVector features. + $cmd JOB=1:$nj $dir/log/duplicate_feats.JOB.log \ + append-vector-to-feats scp:$sdata/JOB/feats.scp ark:$dir/ivectors_utt.JOB.ark ark:- \| \ + select-feats "$start_dim-$end_dim" ark:- ark:- \| \ + subsample-feats --n=$ivector_period ark:- ark:- \| \ + copy-feats --compress=$compress ark:- \ + ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: combining iVectors across jobs" + for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1; +fi + +echo "$0: done extracting (pseudo-online) iVectors" diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh index 1a75eb1f2..6f2269f25 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh @@ -32,9 +32,6 @@ posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for # used when training the iVector extractor, but more important # that this match the value used when you do real online decoding # with the neural nets trained with these iVectors. -#utts_per_spk_max=-1 # This option is no longer supported, you should use - # steps/online/nnet2/copy_data_dir.sh with the --utts-per-spk-max - # option to make a copy of the data dir. compress=true # If true, compress the iVectors stored on disk (it's lossy # compression, as used for feature matrices). @@ -58,10 +55,6 @@ if [ $# != 3 ]; then echo " # diagonal model." echo " --min-post # Pruning threshold for posteriors" echo " --ivector-period # How often to extract an iVector (frames)" - echo " --utts-per-spk-max # Controls splitting into 'fake speakers'." - echo " # Set to 1 if compatibility with utterance-by-utterance" - echo " # decoding is the only factor, and to larger if you care " - echo " # also about adaptation over several utterances." exit 1; fi @@ -71,7 +64,7 @@ dir=$3 for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \ $srcdir/online_cmvn.conf $srcdir/final.mat; do - [ ! -f $f ] && echo "No such file $f" && exit 1; + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; done # Set various variables. @@ -86,7 +79,7 @@ splice_opts=$(cat $srcdir/splice_opts) # the program ivector-extract-online2 does a bunch of stuff in memory and is # config-driven... this was easier in this case because the same code is # involved in online decoding. We need to create a config file for iVector -# extration. +# extraction. ieconf=$dir/conf/ivector_extractor.conf echo -n >$ieconf @@ -104,15 +97,6 @@ echo "--posterior-scale=$posterior_scale" >>$ieconf echo "--max-remembered-frames=1000" >>$ieconf # the default -ns=$(wc -l <$data/spk2utt) -if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then - echo "$0: you seem to have just one speaker in your database. This is probably not a good idea." - echo " see http://kaldi.sourceforge.net/data_prep.html (search for 'bold') for why" - echo " Setting --utts-per-spk-max to 1." - utts_per_spk_max=1 -fi - - for n in $(seq $nj); do # This will do nothing unless the directory $dir/storage exists; diff --git a/src/bin/ali-to-post.cc b/src/bin/ali-to-post.cc index 7449da805..589d9d64a 100644 --- a/src/bin/ali-to-post.cc +++ b/src/bin/ali-to-post.cc @@ -34,7 +34,7 @@ int main(int argc, char *argv[]) { try { const char *usage = "Convert alignments to posteriors\n" - "Usage: ali-to-post [options] alignments-rspecifier posteriors-wspecifier\n" + "Usage: ali-to-post [options] \n" "e.g.:\n" " ali-to-post ark:1.ali ark:1.post\n"; diff --git a/src/bin/copy-matrix.cc b/src/bin/copy-matrix.cc index 0738d8dde..d7b8181c6 100644 --- a/src/bin/copy-matrix.cc +++ b/src/bin/copy-matrix.cc @@ -38,10 +38,14 @@ int main(int argc, char *argv[]) { "See also: copy-feats\n"; bool binary = true; + BaseFloat scale = 1.0; ParseOptions po(usage); - po.Register("binary", &binary, "Write in binary mode (only relevant if output is a wxfilename)"); - + po.Register("binary", &binary, + "Write in binary mode (only relevant if output is a wxfilename)"); + po.Register("scale", &scale, + "This option can be used to scale the matrices being copied."); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -68,6 +72,7 @@ int main(int argc, char *argv[]) { if (!in_is_rspecifier) { Matrix mat; ReadKaldiObject(matrix_in_fn, &mat); + if (scale != 1.0) mat.Scale(scale); Output ko(matrix_out_fn, binary); mat.Write(ko.Stream(), binary); KALDI_LOG << "Copied matrix to " << matrix_out_fn; @@ -76,8 +81,15 @@ int main(int argc, char *argv[]) { int num_done = 0; BaseFloatMatrixWriter writer(matrix_out_fn); SequentialBaseFloatMatrixReader reader(matrix_in_fn); - for (; !reader.Done(); reader.Next(), num_done++) - writer.Write(reader.Key(), reader.Value()); + for (; !reader.Done(); reader.Next(), num_done++) { + if (scale != 1.0) { + Matrix mat(reader.Value()); + mat.Scale(scale); + writer.Write(reader.Key(), mat); + } else { + writer.Write(reader.Key(), reader.Value()); + } + } KALDI_LOG << "Copied " << num_done << " matrices."; return (num_done != 0 ? 0 : 1); } diff --git a/src/featbin/append-feats.cc b/src/featbin/append-feats.cc index 9ec34941a..cf373d7a3 100644 --- a/src/featbin/append-feats.cc +++ b/src/featbin/append-feats.cc @@ -50,9 +50,9 @@ int main(int argc, char *argv[]) { exit(1); } - std::string rspecifier1 = po.GetArg(1); - std::string rspecifier2 = po.GetArg(2); - std::string wspecifier = po.GetArg(3); + std::string rspecifier1 = po.GetArg(1), + rspecifier2 = po.GetArg(2), + wspecifier = po.GetArg(3); BaseFloatMatrixWriter feats_writer(wspecifier); SequentialBaseFloatMatrixReader feats_reader1(rspecifier1); diff --git a/src/featbin/paste-feats.cc b/src/featbin/paste-feats.cc index 50b16de04..5eab09d96 100644 --- a/src/featbin/paste-feats.cc +++ b/src/featbin/paste-feats.cc @@ -78,8 +78,8 @@ int main(int argc, char *argv[]) { "Usage: paste-feats [ ...] \n" " or: paste-feats [ ...] \n" " e.g. paste-feats ark:feats1.ark \"ark:select-feats 0-3 ark:feats2.ark ark:- |\" ark:feats-out.ark\n" - " or: paste-feats foo.mat bar.mat baz.mat\n"; - + " or: paste-feats foo.mat bar.mat baz.mat\n" + "See also: copy-feats, copy-matrix, append-vector-to-feats, concat-feats\n"; ParseOptions po(usage); diff --git a/src/ivector/ivector-extractor-test.cc b/src/ivector/ivector-extractor-test.cc index d79abbaf7..3b804da2e 100644 --- a/src/ivector/ivector-extractor-test.cc +++ b/src/ivector/ivector-extractor-test.cc @@ -107,8 +107,9 @@ void TestIvectorExtraction(const IvectorExtractor &extractor, utt_stats.AccStats(feats, post); OnlineIvectorEstimationStats online_stats(extractor.IvectorDim(), - extractor.PriorOffset()); - + extractor.PriorOffset(), + 0.0); + for (int32 t = 0; t < num_frames; t++) { online_stats.AccStats(extractor, feats.Row(t), post[t]); } diff --git a/src/ivector/ivector-extractor.cc b/src/ivector/ivector-extractor.cc index b1f4624ec..7c26e73a4 100644 --- a/src/ivector/ivector-extractor.cc +++ b/src/ivector/ivector-extractor.cc @@ -259,13 +259,13 @@ void IvectorExtractor::GetIvectorDistMean( for (int32 i = 0; i < I; i++) { double gamma = utt_stats.gamma_(i); if (gamma != 0.0) { - Vector x(utt_stats.X_.Row(i)); // == \gamma(i) \m_i + SubVector x(utt_stats.X_, i); // == \gamma(i) \m_i // next line: a += \gamma_i \M_i^T \Sigma_i^{-1} \m_i linear->AddMatVec(1.0, Sigma_inv_M_[i], kTrans, x, 1.0); } } SubVector q_vec(quadratic->Data(), IvectorDim()*(IvectorDim()+1)/2); - q_vec.AddMatVec(1.0, U_, kTrans, Vector(utt_stats.gamma_), 1.0); + q_vec.AddMatVec(1.0, U_, kTrans, utt_stats.gamma_, 1.0); } void IvectorExtractor::GetIvectorDistPrior( @@ -543,24 +543,55 @@ void OnlineIvectorEstimationStats::AccStats( quadratic_term_vec.AddVec(weight, U_g); tot_weight += weight; } + if (max_count_ != 0.0) { + // see comments in header RE max_count for explanation. + double old_num_frames = num_frames_, + new_num_frames = num_frames_ + tot_weight; + double old_prior_scale = std::max(old_num_frames, max_count_) / max_count_, + new_prior_scale = std::max(new_num_frames, max_count_) / max_count_; + // The prior_scales are the inverses of the scales we would put on the stats + // if we were implementing this by scaling the stats. Instead we + // scale the prior term. + double prior_scale_change = new_prior_scale - old_prior_scale; + if (prior_scale_change != 0.0) { + linear_term_(0) += prior_offset_ * prior_scale_change; + quadratic_term_.AddToDiag(prior_scale_change); + } + } + num_frames_ += tot_weight; } void OnlineIvectorEstimationStats::Scale(double scale) { KALDI_ASSERT(scale >= 0.0 && scale <= 1.0); + double old_num_frames = num_frames_; num_frames_ *= scale; quadratic_term_.Scale(scale); linear_term_.Scale(scale); // Scale back up the prior term, by adding in whatever we scaled down. - linear_term_(0) += prior_offset_ * (1.0 - scale); - quadratic_term_.AddToDiag(1.0 - scale); + if (max_count_ == 0.0) { + linear_term_(0) += prior_offset_ * (1.0 - scale); + quadratic_term_.AddToDiag(1.0 - scale); + } else { + double new_num_frames = num_frames_; + double old_prior_scale = + scale * std::max(old_num_frames, max_count_) / max_count_, + new_prior_scale = std::max(new_num_frames, max_count_) / max_count_; + // old_prior_scale is the scale the prior term currently has in the stats, + // i.e. the previous scale times "scale" as we just scaled the stats. + // new_prior_scale is the scale we want the prior term to have. + linear_term_(0) += prior_offset_ * (new_prior_scale - old_prior_scale); + quadratic_term_.AddToDiag(new_prior_scale - old_prior_scale); + } } void OnlineIvectorEstimationStats::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); // magic string. + WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, prior_offset_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, max_count_); WriteToken(os, binary, ""); WriteBasicType(os, binary, num_frames_); WriteToken(os, binary, ""); @@ -571,11 +602,20 @@ void OnlineIvectorEstimationStats::Write(std::ostream &os, bool binary) const { } void OnlineIvectorEstimationStats::Read(std::istream &is, bool binary) { - ExpectToken(is, binary, ""); // magic string. + ExpectToken(is, binary, ""); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &prior_offset_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &num_frames_); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &max_count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_frames_); + } else { + KALDI_ASSERT(tok == ""); + max_count_ = 0.0; + ReadBasicType(is, binary, &num_frames_); + } ExpectToken(is, binary, ""); quadratic_term_.Read(is, binary); ExpectToken(is, binary, ""); @@ -638,8 +678,9 @@ double OnlineIvectorEstimationStats::DefaultObjf() const { } OnlineIvectorEstimationStats::OnlineIvectorEstimationStats(int32 ivector_dim, - BaseFloat prior_offset): - prior_offset_(prior_offset), num_frames_(0.0), + BaseFloat prior_offset, + BaseFloat max_count): + prior_offset_(prior_offset), max_count_(max_count), num_frames_(0.0), quadratic_term_(ivector_dim), linear_term_(ivector_dim) { if (ivector_dim != 0) { linear_term_(0) += prior_offset; @@ -650,6 +691,7 @@ OnlineIvectorEstimationStats::OnlineIvectorEstimationStats(int32 ivector_dim, OnlineIvectorEstimationStats::OnlineIvectorEstimationStats( const OnlineIvectorEstimationStats &other): prior_offset_(other.prior_offset_), + max_count_(other.max_count_), num_frames_(other.num_frames_), quadratic_term_(other.quadratic_term_), linear_term_(other.linear_term_) { } @@ -733,6 +775,12 @@ void IvectorExtractorUtteranceStats::AccStats( } } +void IvectorExtractorUtteranceStats::Scale(double scale) { + gamma_.Scale(scale); + X_.Scale(scale); + for (size_t i = 0; i < S_.size(); i++) + S_[i].Scale(scale); +} IvectorExtractorStats::IvectorExtractorStats( const IvectorExtractor &extractor, @@ -1534,6 +1582,7 @@ double EstimateIvectorsOnline( const IvectorExtractor &extractor, int32 ivector_period, int32 num_cg_iters, + BaseFloat max_count, Matrix *ivectors) { KALDI_ASSERT(ivector_period > 0); @@ -1544,7 +1593,8 @@ double EstimateIvectorsOnline( ivectors->Resize(num_ivectors, extractor.IvectorDim()); OnlineIvectorEstimationStats online_stats(extractor.IvectorDim(), - extractor.PriorOffset()); + extractor.PriorOffset(), + max_count); double ans = 0.0; diff --git a/src/ivector/ivector-extractor.h b/src/ivector/ivector-extractor.h index 46c5b2322..602e636ce 100644 --- a/src/ivector/ivector-extractor.h +++ b/src/ivector/ivector-extractor.h @@ -45,13 +45,23 @@ namespace kaldi { // "acoustic_weight" is not read by any class declared in this header; it has to // be applied by calling IvectorExtractorUtteranceStats::Scale() before // obtaining the iVector. +// The same is true of max_count: it has to be applied by programs themselves +// e.g. see ../ivectorbin/ivector-extract.cc. struct IvectorEstimationOptions { double acoustic_weight; - IvectorEstimationOptions(): acoustic_weight(1.0) {} + double max_count; + IvectorEstimationOptions(): acoustic_weight(1.0), max_count(0.0) {} void Register(OptionsItf *po) { po->Register("acoustic-weight", &acoustic_weight, "Weight on part of auxf that involves the data (e.g. 0.2); " "if this weight is small, the prior will have more effect."); + po->Register("max-count", &max_count, + "Maximum frame count (affects prior scaling): if >0, the prior " + "term will be scaled up after the frame count exceeds this " + "value. Note that this count is considered after posterior " + "scaling (e.g. --acoustic-weight option, or scale argument to " + "scale-post), so you would normally use a cutoff 10 times " + "smaller than the corresponding number of frames."); } }; @@ -301,8 +311,12 @@ class IvectorExtractor { */ class OnlineIvectorEstimationStats { public: + // Search above for max_count to see an explanation; if nonzero, it will + // put a higher weight on the prior (vs. the stats) once the count passes + // that value. OnlineIvectorEstimationStats(int32 ivector_dim, - BaseFloat prior_offset); + BaseFloat prior_offset, + BaseFloat max_count); OnlineIvectorEstimationStats(const OnlineIvectorEstimationStats &other); @@ -360,6 +374,7 @@ class OnlineIvectorEstimationStats { friend class IvectorExtractor; double prior_offset_; + double max_count_; double num_frames_; // num frames (weighted, if applicable). SpMatrix quadratic_term_; Vector linear_term_; @@ -368,8 +383,10 @@ class OnlineIvectorEstimationStats { // This code obtains periodically (for each "ivector_period" frames, e.g. 10 // frames), an estimate of the iVector including all frames up to that point. -// This emulates what you could do in an online/streaming algorithm; its use -// is for neural network training in a way that's matched to online decoding. +// This emulates what you could do in an online/streaming algorithm; its use is +// for neural network training in a way that's matched to online decoding. +// [note: I don't believe we are currently using the program, +// ivector-extract-online.cc, that calls this function, in any of the scripts.]. // Caution: this program outputs the raw iVectors, where the first component // will generally be very positive. You probably want to subtract PriorOffset() // from the first element of each row of the output before writing it out. @@ -384,6 +401,7 @@ double EstimateIvectorsOnline( const IvectorExtractor &extractor, int32 ivector_period, int32 num_cg_iters, + BaseFloat max_count, Matrix *ivectors); diff --git a/src/ivectorbin/ivector-extract-online.cc b/src/ivectorbin/ivector-extract-online.cc index 519671367..3c1795d6b 100644 --- a/src/ivectorbin/ivector-extract-online.cc +++ b/src/ivectorbin/ivector-extract-online.cc @@ -49,6 +49,7 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); int32 num_cg_iters = 15; int32 ivector_period = 10; + BaseFloat max_count = 0.0; g_num_threads = 8; po.Register("num-cg-iters", &num_cg_iters, @@ -60,6 +61,12 @@ int main(int argc, char *argv[]) { po.Register("num-threads", &g_num_threads, "Number of threads to use for computing derived variables " "of iVector extractor, at process start-up."); + po.Register("max-count", &max_count, + "If >0, when the count of posteriors exceeds max-count we will " + "start using a stronger prior term. Can make iVectors from " + "longer than normal utterances look more 'typical'. Interpret " + "this value as a number of frames multiplied by your " + "posterior scale (so typically 0.1 times a number of frames)."); po.Read(argc, argv); if (po.NumArgs() != 4) { @@ -107,7 +114,7 @@ int main(int argc, char *argv[]) { double objf_impr_per_frame; objf_impr_per_frame = EstimateIvectorsOnline(feats, posterior, extractor, ivector_period, num_cg_iters, - &ivectors); + max_count, &ivectors); BaseFloat offset = extractor.PriorOffset(); for (int32 i = 0 ; i < ivectors.NumRows(); i++) diff --git a/src/ivectorbin/ivector-extract.cc b/src/ivectorbin/ivector-extract.cc index 69c034e95..220677d9a 100644 --- a/src/ivectorbin/ivector-extract.cc +++ b/src/ivectorbin/ivector-extract.cc @@ -63,13 +63,13 @@ class IvectorExtractTask { } ~IvectorExtractTask() { if (tot_auxf_change_ != NULL) { - int32 T = posterior_.size(); + double T = TotalPosterior(posterior_); *tot_auxf_change_ += auxf_change_; KALDI_VLOG(2) << "Auxf change for utterance " << utt_ << " was " << (auxf_change_ / T) << " per frame over " << T - << " frames."; + << " frames (weighted)"; } - // We actually write out the offset of the iVector's from the mean of the + // We actually write out the offset of the iVectors from the mean of the // prior distribution; this is the form we'll need it in for scoring. (most // formulations of iVectors have zero-mean priors so this is not normally an // issue). @@ -89,11 +89,124 @@ class IvectorExtractTask { double auxf_change_; }; +int32 RunPerSpeaker(const std::string &ivector_extractor_rxfilename, + const IvectorEstimationOptions &opts, + bool compute_objf_change, + const std::string &spk2utt_rspecifier, + const std::string &feature_rspecifier, + const std::string &posterior_rspecifier, + const std::string &ivector_wspecifier) { + IvectorExtractor extractor; + ReadKaldiObject(ivector_extractor_rxfilename, &extractor); + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); + RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); + RandomAccessPosteriorReader posterior_reader(posterior_rspecifier); + BaseFloatVectorWriter ivector_writer(ivector_wspecifier); + + double tot_auxf_change = 0.0, tot_post = 0.0, tot_norm = 0.0; + int32 num_utt_done = 0, num_utt_err = 0, + num_spk_done = 0, num_spk_err = 0; + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { + std::string spk = spk2utt_reader.Key(); + const std::vector &utts = spk2utt_reader.Value(); + + bool need_2nd_order_stats = false; + + IvectorExtractorUtteranceStats utt_stats(extractor.NumGauss(), + extractor.FeatDim(), + need_2nd_order_stats); + + for (size_t i = 0; i < utts.size(); i++) { + const std::string &utt = utts[i]; + if (!feature_reader.HasKey(utt)) { + KALDI_WARN << "No features present for utterance " << utt; + num_utt_err++; + continue; + } + const Matrix &feats = feature_reader.Value(utt); + if (!posterior_reader.HasKey(utt)) { + KALDI_WARN << "No posteriors present for utterance " << utt; + num_utt_err++; + continue; + } + Posterior posterior = posterior_reader.Value(utt); + if (feats.NumRows() != posterior.size()) { + KALDI_WARN << "Posterior has wrong size " << posterior.size() + << " vs. feats " << feats.NumRows() << " for " + << utt; + num_utt_err++; + continue; + } + ScalePosterior(opts.acoustic_weight, &posterior); + num_utt_done++; + utt_stats.AccStats(feats, posterior); + } + + if (utt_stats.NumFrames() == 0.0) { + KALDI_WARN << "No stats accumulated for speaker " << spk; + num_spk_err++; + continue; + } else { + if (opts.max_count > 0 && utt_stats.NumFrames() > opts.max_count) { + double scale = opts.max_count / utt_stats.NumFrames(); + utt_stats.Scale(scale); + KALDI_LOG << "Scaling stats for speaker " << spk << " by scale " + << scale << " due to --max-count=" << opts.max_count; + } + + Vector ivector(extractor.IvectorDim()); + ivector(0) = extractor.PriorOffset(); + + if (compute_objf_change) { + double old_auxf = extractor.GetAuxf(utt_stats, ivector); + extractor.GetIvectorDistribution(utt_stats, &ivector, NULL); + double new_auxf = extractor.GetAuxf(utt_stats, ivector); + double auxf_change = new_auxf - old_auxf; + + KALDI_LOG << "Auxf change for speaker " << spk << " was " + << (auxf_change / utt_stats.NumFrames()) << " per frame, over " + << utt_stats.NumFrames() << " frames (weighted)."; + tot_auxf_change += auxf_change; + } else { + extractor.GetIvectorDistribution(utt_stats, &ivector, NULL); + } + // We actually write out the offset of the iVectors from the mean of the + // prior distribution; this is the form we'll need it in for scoring and + // as a feature for neural nets. (most formulations of iVectors have + // zero-mean priors so this is not normally an issue). + ivector(0) -= extractor.PriorOffset(); + KALDI_LOG << "Ivector norm for speaker " << spk + << " was " << ivector.Norm(2.0); + + tot_norm += ivector.Norm(2.0) * utt_stats.NumFrames(); + tot_post += utt_stats.NumFrames(); + num_spk_done++; + Vector ivector_flt(ivector); + ivector_writer.Write(spk, ivector_flt); + } + } + + KALDI_LOG << "Done " << num_spk_done << " speakers; " << num_spk_err + << " with errors. " << num_utt_done << " utterances " + << "were processed, " << num_utt_err << " with errors."; + if (tot_post != 0.0) { + if (compute_objf_change) { + KALDI_LOG << "Overall weighted-average objective function improvement was " + << (tot_auxf_change / tot_post) << " over " << tot_post + << " frames (weighted)"; + } + KALDI_LOG << "Average iVector norm (weighted by frames) was " + << (tot_norm / tot_post) << " over " << tot_post + << " frames (weighted)"; + } + return (num_spk_done != 0 ? 0 : 1); +} } + int main(int argc, char *argv[]) { using namespace kaldi; typedef kaldi::int32 int32; @@ -102,7 +215,7 @@ int main(int argc, char *argv[]) { const char *usage = "Extract iVectors for utterances, using a trained iVector extractor,\n" "and features and Gaussian-level posteriors\n" - "Usage: ivector-extract [options] " + "Usage: ivector-extract [options] " " \n" "e.g.: \n" " fgmm-global-gselect-to-post 1.ubm '$feats' 'ark:gunzip -c gselect.1.gz|' ark:- | \\\n" @@ -110,13 +223,21 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); bool compute_objf_change = true; - IvectorExtractorStatsOptions stats_opts; + IvectorEstimationOptions opts; + std::string spk2utt_rspecifier; TaskSequencerConfig sequencer_config; po.Register("compute-objf-change", &compute_objf_change, "If true, compute the change in objective function from using " "nonzero iVector (a potentially useful diagnostic). Combine " "with --verbose=2 for per-utterance information"); - stats_opts.Register(&po); + po.Register("spk2utt", &spk2utt_rspecifier, "Supply this option if you " + "want iVectors to be output at the per-speaker level, estimated " + "using stats accumulated from multiple utterances. Note: this " + "is not the normal way iVectors are obtained for speaker-id. " + "This option will cause the program to ignore the --num-threads " + "option."); + + opts.Register(&po); sequencer_config.Register(&po); po.Read(argc, argv); @@ -128,63 +249,87 @@ int main(int argc, char *argv[]) { std::string ivector_extractor_rxfilename = po.GetArg(1), feature_rspecifier = po.GetArg(2), - posteriors_rspecifier = po.GetArg(3), + posterior_rspecifier = po.GetArg(3), ivectors_wspecifier = po.GetArg(4); - // g_num_threads affects how ComputeDerivedVars is called when we read the - // extractor. - g_num_threads = sequencer_config.num_threads; - IvectorExtractor extractor; - ReadKaldiObject(ivector_extractor_rxfilename, &extractor); - double tot_auxf_change = 0.0; - int64 tot_t = 0; - int32 num_done = 0, num_err = 0; + if (spk2utt_rspecifier.empty()) { + // g_num_threads affects how ComputeDerivedVars is called when we read the + // extractor. + g_num_threads = sequencer_config.num_threads; + IvectorExtractor extractor; + ReadKaldiObject(ivector_extractor_rxfilename, &extractor); + + double tot_auxf_change = 0.0, tot_t = 0.0; + int32 num_done = 0, num_err = 0; - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier); - BaseFloatVectorWriter ivector_writer(ivectors_wspecifier); + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + RandomAccessPosteriorReader posterior_reader(posterior_rspecifier); + BaseFloatVectorWriter ivector_writer(ivectors_wspecifier); + + { + TaskSequencer sequencer(sequencer_config); + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + if (!posterior_reader.HasKey(utt)) { + KALDI_WARN << "No posteriors for utterance " << utt; + num_err++; + continue; + } + const Matrix &mat = feature_reader.Value(); + Posterior posterior = posterior_reader.Value(utt); + + if (static_cast(posterior.size()) != mat.NumRows()) { + KALDI_WARN << "Size mismatch between posterior " << posterior.size() + << " and features " << mat.NumRows() << " for utterance " + << utt; + num_err++; + continue; + } - { - TaskSequencer sequencer(sequencer_config); - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string key = feature_reader.Key(); - if (!posteriors_reader.HasKey(key)) { - KALDI_WARN << "No posteriors for utterance " << key; - num_err++; - continue; + double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL ); + + double this_t = opts.acoustic_weight * TotalPosterior(posterior), + max_count_scale = 1.0; + if (opts.max_count > 0 && this_t > opts.max_count) { + max_count_scale = opts.max_count / this_t; + KALDI_LOG << "Scaling stats for utterance " << utt << " by scale " + << max_count_scale << " due to --max-count=" + << opts.max_count; + this_t = opts.max_count; + } + ScalePosterior(opts.acoustic_weight * max_count_scale, + &posterior); + // note: now, this_t == sum of posteriors. + + sequencer.Run(new IvectorExtractTask(extractor, utt, mat, posterior, + &ivector_writer, auxf_ptr)); + + tot_t += this_t; + num_done++; } - const Matrix &mat = feature_reader.Value(); - const Posterior &posterior = posteriors_reader.Value(key); - - if (static_cast(posterior.size()) != mat.NumRows()) { - KALDI_WARN << "Size mismatch between posterior " << posterior.size() - << " and features " << mat.NumRows() << " for utterance " - << key; - num_err++; - continue; - } - - double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL ); - - sequencer.Run(new IvectorExtractTask(extractor, key, mat, posterior, - &ivector_writer, auxf_ptr)); - - tot_t += posterior.size(); - num_done++; + // Destructor of "sequencer" will wait for any remaining tasks. } - // Destructor of "sequencer" will wait for any remaining tasks. + + KALDI_LOG << "Done " << num_done << " files, " << num_err + << " with errors. Total (weighted) frames " << tot_t; + if (compute_objf_change) + KALDI_LOG << "Overall average objective-function change from estimating " + << "ivector was " << (tot_auxf_change / tot_t) << " per frame " + << " over " << tot_t << " (weighted) frames."; + + return (num_done != 0 ? 0 : 1); + } else { + KALDI_ASSERT(sequencer_config.num_threads == 1 && + "--spk2utt option is incompatible with --num-threads option"); + return RunPerSpeaker(ivector_extractor_rxfilename, + opts, + compute_objf_change, + spk2utt_rspecifier, + feature_rspecifier, + posterior_rspecifier, + ivectors_wspecifier); } - - KALDI_LOG << "Done " << num_done << " files, " << num_err - << " with errors. Total frames " << tot_t; - - if (compute_objf_change) - KALDI_LOG << "Overall average objective-function change from estimating " - << "ivector was " << (tot_auxf_change / tot_t) << " per frame " - << " over " << tot_t << " frames."; - - return (num_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); return -1; diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index 4b101c7ff..d61e4e598 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -1010,7 +1010,7 @@ void VectorBase::AddVec(const Real alpha, const VectorBase &v) MatrixIndexT dim = dim_; if (alpha != 1.0) for (MatrixIndexT i = 0; i < dim; i++) - data[i] += alpha*other_data[i]; + data[i] += alpha * other_data[i]; else for (MatrixIndexT i = 0; i < dim; i++) data[i] += other_data[i]; diff --git a/src/online2/online-ivector-feature.cc b/src/online2/online-ivector-feature.cc index 2f5bce7e8..02c6126b9 100644 --- a/src/online2/online-ivector-feature.cc +++ b/src/online2/online-ivector-feature.cc @@ -32,6 +32,7 @@ void OnlineIvectorExtractionInfo::Init( num_gselect = config.num_gselect; min_post = config.min_post; posterior_scale = config.posterior_scale; + max_count = config.max_count; use_most_recent_ivector = config.use_most_recent_ivector; greedy_ivector_extractor = config.greedy_ivector_extractor; if (greedy_ivector_extractor && !use_most_recent_ivector) { @@ -161,7 +162,7 @@ void OnlineIvectorFeature::UpdateStatsUntilFrame(int32 frame) { Vector feat(feat_dim), // features given to iVector extractor log_likes(info_.diag_ubm.NumGauss()); - + for (; num_frames_stats_ <= frame; num_frames_stats_++) { int32 t = num_frames_stats_; // Frame whose stats we want to get. lda_normalized_->GetFrame(t, &feat); @@ -262,8 +263,10 @@ OnlineIvectorFeature::OnlineIvectorFeature( const OnlineIvectorExtractionInfo &info, OnlineFeatureInterface *base_feature): info_(info), base_(base_feature), - ivector_stats_(info_.extractor.IvectorDim(), info_.extractor.PriorOffset()), - num_frames_stats_(0) { + ivector_stats_(info_.extractor.IvectorDim(), + info_.extractor.PriorOffset(), + info_.max_count), + num_frames_stats_(0), tot_ubm_loglike_(0.0) { info.Check(); KALDI_ASSERT(base_feature != NULL); splice_ = new OnlineSpliceFrames(info_.splice_opts, base_); diff --git a/src/online2/online-ivector-feature.h b/src/online2/online-ivector-feature.h index 06981f91d..209519d88 100644 --- a/src/online2/online-ivector-feature.h +++ b/src/online2/online-ivector-feature.h @@ -70,6 +70,12 @@ struct OnlineIvectorExtractionConfig { BaseFloat posterior_scale; // Scale on posteriors used for iVector // extraction; can be interpreted as the inverse // of a scale on the log-prior. + BaseFloat max_count; // Maximum stats count we allow before we start scaling + // down stats (if nonzero).. this prevents us getting + // atypical-looking iVectors for very long utterances. + // Interpret this as a number of frames times + // posterior_scale, typically 1/10 of a frame count. + // If use_most_recent_ivector is true, we always return the most recent // available iVector rather than the one for the current frame. This means @@ -91,6 +97,7 @@ struct OnlineIvectorExtractionConfig { OnlineIvectorExtractionConfig(): ivector_period(10), num_gselect(5), min_post(0.025), posterior_scale(0.1), + max_count(0.0), use_most_recent_ivector(true), greedy_ivector_extractor(false), max_remembered_frames(1000) { } @@ -122,6 +129,11 @@ struct OnlineIvectorExtractionConfig { "iVector extraction"); po->Register("posterior-scale", &posterior_scale, "Scale for posteriors in " "iVector extraction (may be viewed as inverse of prior scale)"); + po->Register("max-count", &max_count, "Maximum data count we allow before " + "we start scaling the stats down (if nonzero)... helps to make " + "iVectors from long utterances look more typical. Interpret " + "as a frame-count times --posterior-scale, typically 1/10 of " + "a number of frames. Suggest 100."); po->Register("use-most-recent-ivector", &use_most_recent_ivector, "If true, " "always use most recent available iVector, rather than the " "one for the designated frame."); @@ -156,6 +168,7 @@ struct OnlineIvectorExtractionInfo { int32 num_gselect; BaseFloat min_post; BaseFloat posterior_scale; + BaseFloat max_count; bool use_most_recent_ivector; bool greedy_ivector_extractor; BaseFloat max_remembered_frames; @@ -191,7 +204,8 @@ struct OnlineIvectorExtractorAdaptationState { OnlineIvectorExtractorAdaptationState(const OnlineIvectorExtractionInfo &info): cmvn_state(info.global_cmvn_stats), ivector_stats(info.extractor.IvectorDim(), - info.extractor.PriorOffset()) { } + info.extractor.PriorOffset(), + info.max_count) { } /// Copy constructor OnlineIvectorExtractorAdaptationState( diff --git a/src/online2bin/apply-cmvn-online.cc b/src/online2bin/apply-cmvn-online.cc index 7909e0290..2745df5d4 100644 --- a/src/online2bin/apply-cmvn-online.cc +++ b/src/online2bin/apply-cmvn-online.cc @@ -31,7 +31,7 @@ int main(int argc, char *argv[]) { "Apply online cepstral mean (and possibly variance) computation online,\n" "using the same code as used for online decoding in the 'new' setup in\n" "online2/ and online2bin/. If the --spk2utt option is used, it uses\n" - "prior utterances from the same speaker to back off two at the utterance\n" + "prior utterances from the same speaker to back off to at the utterance\n" "beginning. See also apply-cmvn-sliding.\n" "\n" "Usage: apply-cmvn-online [options] " diff --git a/src/online2bin/ivector-extract-online2.cc b/src/online2bin/ivector-extract-online2.cc index bc616ddb9..3251d93b5 100644 --- a/src/online2bin/ivector-extract-online2.cc +++ b/src/online2bin/ivector-extract-online2.cc @@ -84,7 +84,7 @@ int main(int argc, char *argv[]) { RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); BaseFloatMatrixWriter ivector_writer(ivectors_wspecifier); - + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { std::string spk = spk2utt_reader.Key(); const std::vector &uttlist = spk2utt_reader.Value(); @@ -98,12 +98,12 @@ int main(int argc, char *argv[]) { continue; } const Matrix &feats = feature_reader.Value(utt); - + OnlineMatrixFeature matrix_feature(feats); OnlineIvectorFeature ivector_feature(ivector_info, &matrix_feature); - + ivector_feature.SetAdaptationState(adaptation_state); int32 T = feats.NumRows(), @@ -130,9 +130,9 @@ int main(int argc, char *argv[]) { << ", UBM loglike/frame was " << ivector_feature.UbmLogLikePerFrame() << ", iVector length (at utterance end) was " - << ivectors.Row(n-1).Norm(2.0) - << ", objf improvement from iVector estimation was " - << tot_objf_impr; + << ivectors.Row(num_ivectors-1).Norm(2.0) + << ", objf improvement/frame from iVector estimation was " + << ivector_feature.ObjfImprPerFrame(); ivector_feature.GetAdaptationState(&adaptation_state); ivector_writer.Write(utt, ivectors);