trunk: online-nnet2 decoding setup: adding scripts which make it possible to estimate the iVectors per speaker, excluding silence (so not-truly-online decoding). Some code changes for iVector which allow for scaling up the prior term when the data count exceeds a certain value (this seems to be important, for some reason). And misc. code fixes.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4865 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2015-02-08 03:18:18 +00:00
Родитель 7d8ff21d63
Коммит 350d8b4123
18 изменённых файлов: 585 добавлений и 116 удалений

Просмотреть файл

@ -147,7 +147,7 @@ if [ $stage -le 13 ]; then
done
fi
exit 0;
#exit 0;
###### Comment out the "exit 0" above to run the multi-threaded decoding. #####
if [ $stage -le 14 ]; then
@ -166,8 +166,8 @@ if [ $stage -le 15 ]; then
test=dev_clean
steps/online/nnet2/decode.sh --threaded true --do-endpointing true \
--config conf/decode.config --cmd "$decode_cmd" --nj 30 \
--per-utt true exp/tri6b/graph_pp_tgsmall data/$test \
${dir}_online/decode_pp_${test}_tgsmall_utt_threaded_ep || exit 1;
--per-utt true exp/tri6b/graph_tgsmall data/$test \
${dir}_online/decode_${test}_tgsmall_utt_threaded_ep || exit 1;
fi
exit 0;

Просмотреть файл

@ -134,6 +134,34 @@ if [ $stage -le 13 ]; then
done
fi
if [ $stage -le 14 ]; then
  # this does offline decoding, as stage 10, except we estimate the iVectors per
  # speaker, excluding silence (based on alignments from a GMM decoding), with a
  # different script. This is just to demonstrate that script.
  rm exp/nnet2_online/.error 2>/dev/null
  # Run one iVector-extraction job per test set in the background; a failure in
  # any job is recorded via the .error marker file and checked after 'wait'.
  for year in eval92 dev93; do
    steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj 8 \
      data/test_${year}_hires data/lang exp/nnet2_online/extractor \
      exp/tri4b/decode_tgpr_$year exp/nnet2_online/ivectors_spk_test_${year} || touch exp/nnet2_online/.error &
  done
  wait
  [ -f exp/nnet2_online/.error ] && echo "$0: Error getting iVectors" && exit 1;
  for lm_suffix in bd_tgpr; do # just use the bd decoding, to avoid wasting time.
    graph_dir=exp/tri4b/graph_${lm_suffix}
    # use already-built graphs.
    for year in eval92 dev93; do
      # Offline nnet2 decoding using the per-speaker iVectors extracted above,
      # supplied via --online-ivector-dir; jobs again run in the background.
      steps/nnet2/decode.sh --nj 8 --cmd "$decode_cmd" \
        --online-ivector-dir exp/nnet2_online/ivectors_spk_test_$year \
        $graph_dir data/test_${year}_hires $dir/decode_${lm_suffix}_${year}_spk || touch exp/nnet2_online/.error &
    done
  done
  wait
  [ -f exp/nnet2_online/.error ] && echo "$0: Error decoding" && exit 1;
fi
exit 0;

Просмотреть файл

@ -0,0 +1,207 @@
#!/bin/bash

# Copyright 2013  Daniel Povey
# Apache 2.0.

# This script computes iVectors in the same format as extract_ivectors_online.sh,
# except that they are actually not really computed online, they are first computed
# per speaker and just duplicated many times.
#
# This setup also makes it possible to use a previous decoding or alignment, to
# down-weight silence in the stats (default is --silence-weight 0.0).
#
# This is for when you use the "online-decoding" setup in an offline task, and
# you want the best possible results.

# Begin configuration section.
nj=30
cmd="run.pl"
stage=0
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
ivector_period=10
posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
                    # inter-frame correlations. Making this small during iVector
                    # extraction is equivalent to scaling up the prior, and will
                    # tend to produce smaller iVectors where data-counts are
                    # small. It's not so important that this match the value
                    # used when training the iVector extractor, but more important
                    # that this match the value used when you do real online decoding
                    # with the neural nets trained with these iVectors.
max_count=100 # Interpret this as a number of frames times posterior scale...
              # this config ensures that once the count exceeds this (i.e.
              # 1000 frames, or 10 seconds, by default), we start to scale
              # down the stats, accentuating the prior term. This seems quite
              # important for some reason.
compress=true # If true, compress the iVectors stored on disk (it's lossy
              # compression, as used for feature matrices).
silence_weight=0.0
acwt=0.1  # used if input is a decode dir, to get best path from lattices.
mdl=final # change this if decode directory did not have ../final.mdl present.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 4 ] && [ $# != 5 ]; then
  # The 4th argument (<alignment-dir>|<decode-dir>) is optional; when supplied
  # it is used to down-weight silence frames in the iVector stats.
  echo "Usage: $0 [options] <data> <lang> <extractor-dir> [<alignment-dir>|<decode-dir>] <ivector-dir>"
  echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|10>                                      # Number of jobs"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --num-gselect <n|5>                              # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --min-post <float;default=0.025>                 # Pruning threshold for posteriors"
  echo "  --ivector-period <int;default=10>                # How often to extract an iVector (frames)"
  exit 1;
fi

if [ $# -eq 4 ]; then
  data=$1
  lang=$2
  srcdir=$3
  dir=$4
else # 5 arguments
  data=$1
  lang=$2
  srcdir=$3
  ali_or_decode_dir=$4
  dir=$5
fi

# Check that all required inputs from the data dir, lang dir and extractor
# dir exist before doing any work.
for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
    $lang/phones.txt $srcdir/online_cmvn.conf $srcdir/final.mat; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

mkdir -p $dir/log

silphonelist=$(cat $lang/phones/silence.csl) || exit 1;

# If an alignment or decode dir was supplied, convert it to per-frame weights
# (silence frames get weight $silence_weight) stored in $dir/weights.gz.
if [ ! -z "$ali_or_decode_dir" ]; then
  nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1;

  if [ -f $ali_or_decode_dir/ali.1.gz ]; then
    # Alignment directory: the model lives in the directory itself.
    if [ ! -f $ali_or_decode_dir/${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir/${mdl}.mdl to exist."
      exit 1;
    fi
    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null

      $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \
        gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/${mdl}.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi
  elif [ -f $ali_or_decode_dir/lat.1.gz ]; then
    # Decode directory: get a best path from the lattices first; the model is
    # expected one level up (../${mdl}.mdl).
    if [ ! -f $ali_or_decode_dir/../${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir/../${mdl}.mdl to exist."
      exit 1;
    fi
    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null

      $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \
        lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/../${mdl}.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi
  else
    echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir";
    exit 1;
  fi
fi

# Now work out the per-speaker iVectors.

sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;

echo $ivector_period > $dir/ivector_period || exit 1;
splice_opts=$(cat $srcdir/splice_opts)

# $gmm_feats (online-CMVN + splice + LDA) matches what the diagonal UBM was
# trained on and is used only for Gaussian selection / posterior computation;
# $feats (no CMVN) is what the iVector extractor itself consumes.
gmm_feats="ark,s,cs:apply-cmvn-online --spk2utt=ark:$sdata/JOB/spk2utt --config=$srcdir/online_cmvn.conf $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"

if [ $stage -le 1 ]; then
  if [ ! -z "$ali_or_decode_dir" ]; then
    # Weight the Gaussian-level posteriors by the silence weights before
    # accumulating per-speaker stats (--spk2utt makes it per speaker).
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
      --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \
      $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  else
    # Same as above but without silence down-weighting.
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
      --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \
      $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  fi
fi

# get an utterance-level set of iVectors (just duplicate the speaker-level ones).
if [ $stage -le 2 ]; then
  for j in $(seq $nj); do
    utils/apply_map.pl -f 2 $dir/ivectors_spk.$j.ark <$sdata/$j/utt2spk >$dir/ivectors_utt.$j.ark || exit 1;
  done
fi

# The text-form ark lines look like "<key>  [ v1 v2 ... vN ]", so subtracting
# 3 words (key, '[', ']') from the word count gives the iVector dim.
ivector_dim=$[$(head -n 1 $dir/ivectors_spk.1.ark | wc -w) - 3] || exit 1;
echo  "$0: iVector dim is $ivector_dim"

base_feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1;

start_dim=$base_feat_dim
end_dim=$[$base_feat_dim+$ivector_dim-1]

if [ $stage -le 3 ]; then
  # here, we are just using the original features in $sdata/JOB/feats.scp for
  # their number of rows; we use the select-feats command to remove those
  # features and retain only the iVector features.
  $cmd JOB=1:$nj $dir/log/duplicate_feats.JOB.log \
    append-vector-to-feats scp:$sdata/JOB/feats.scp ark:$dir/ivectors_utt.JOB.ark ark:- \| \
    select-feats "$start_dim-$end_dim" ark:- ark:- \| \
    subsample-feats --n=$ivector_period ark:- ark:- \| \
    copy-feats --compress=$compress ark:- \
    ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
fi

if [ $stage -le 4 ]; then
  echo "$0: combining iVectors across jobs"
  for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1;
fi

echo "$0: done extracting (pseudo-online) iVectors"

Просмотреть файл

@ -32,9 +32,6 @@ posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
# used when training the iVector extractor, but more important
# that this match the value used when you do real online decoding
# with the neural nets trained with these iVectors.
#utts_per_spk_max=-1 # This option is no longer supported, you should use
# steps/online/nnet2/copy_data_dir.sh with the --utts-per-spk-max
# option to make a copy of the data dir.
compress=true # If true, compress the iVectors stored on disk (it's lossy
# compression, as used for feature matrices).
@ -58,10 +55,6 @@ if [ $# != 3 ]; then
echo " # diagonal model."
echo " --min-post <float;default=0.025> # Pruning threshold for posteriors"
echo " --ivector-period <int;default=10> # How often to extract an iVector (frames)"
echo " --utts-per-spk-max <int;default=-1> # Controls splitting into 'fake speakers'."
echo " # Set to 1 if compatibility with utterance-by-utterance"
echo " # decoding is the only factor, and to larger if you care "
echo " # also about adaptation over several utterances."
exit 1;
fi
@ -71,7 +64,7 @@ dir=$3
for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
$srcdir/online_cmvn.conf $srcdir/final.mat; do
[ ! -f $f ] && echo "No such file $f" && exit 1;
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# Set various variables.
@ -86,7 +79,7 @@ splice_opts=$(cat $srcdir/splice_opts)
# the program ivector-extract-online2 does a bunch of stuff in memory and is
# config-driven... this was easier in this case because the same code is
# involved in online decoding. We need to create a config file for iVector
# extration.
# extraction.
ieconf=$dir/conf/ivector_extractor.conf
echo -n >$ieconf
@ -104,15 +97,6 @@ echo "--posterior-scale=$posterior_scale" >>$ieconf
echo "--max-remembered-frames=1000" >>$ieconf # the default
ns=$(wc -l <$data/spk2utt)
if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
echo "$0: you seem to have just one speaker in your database. This is probably not a good idea."
echo " see http://kaldi.sourceforge.net/data_prep.html (search for 'bold') for why"
echo " Setting --utts-per-spk-max to 1."
utts_per_spk_max=1
fi
for n in $(seq $nj); do
# This will do nothing unless the directory $dir/storage exists;

Просмотреть файл

@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
try {
const char *usage =
"Convert alignments to posteriors\n"
"Usage: ali-to-post [options] alignments-rspecifier posteriors-wspecifier\n"
"Usage: ali-to-post [options] <alignments-rspecifier> <posteriors-wspecifier>\n"
"e.g.:\n"
" ali-to-post ark:1.ali ark:1.post\n";

Просмотреть файл

@ -38,10 +38,14 @@ int main(int argc, char *argv[]) {
"See also: copy-feats\n";
bool binary = true;
BaseFloat scale = 1.0;
ParseOptions po(usage);
po.Register("binary", &binary, "Write in binary mode (only relevant if output is a wxfilename)");
po.Register("binary", &binary,
"Write in binary mode (only relevant if output is a wxfilename)");
po.Register("scale", &scale,
"This option can be used to scale the matrices being copied.");
po.Read(argc, argv);
if (po.NumArgs() != 2) {
@ -68,6 +72,7 @@ int main(int argc, char *argv[]) {
if (!in_is_rspecifier) {
Matrix<BaseFloat> mat;
ReadKaldiObject(matrix_in_fn, &mat);
if (scale != 1.0) mat.Scale(scale);
Output ko(matrix_out_fn, binary);
mat.Write(ko.Stream(), binary);
KALDI_LOG << "Copied matrix to " << matrix_out_fn;
@ -76,8 +81,15 @@ int main(int argc, char *argv[]) {
int num_done = 0;
BaseFloatMatrixWriter writer(matrix_out_fn);
SequentialBaseFloatMatrixReader reader(matrix_in_fn);
for (; !reader.Done(); reader.Next(), num_done++)
writer.Write(reader.Key(), reader.Value());
for (; !reader.Done(); reader.Next(), num_done++) {
if (scale != 1.0) {
Matrix<BaseFloat> mat(reader.Value());
mat.Scale(scale);
writer.Write(reader.Key(), mat);
} else {
writer.Write(reader.Key(), reader.Value());
}
}
KALDI_LOG << "Copied " << num_done << " matrices.";
return (num_done != 0 ? 0 : 1);
}

Просмотреть файл

@ -50,9 +50,9 @@ int main(int argc, char *argv[]) {
exit(1);
}
std::string rspecifier1 = po.GetArg(1);
std::string rspecifier2 = po.GetArg(2);
std::string wspecifier = po.GetArg(3);
std::string rspecifier1 = po.GetArg(1),
rspecifier2 = po.GetArg(2),
wspecifier = po.GetArg(3);
BaseFloatMatrixWriter feats_writer(wspecifier);
SequentialBaseFloatMatrixReader feats_reader1(rspecifier1);

Просмотреть файл

@ -78,8 +78,8 @@ int main(int argc, char *argv[]) {
"Usage: paste-feats <in-rspecifier1> <in-rspecifier2> [<in-rspecifier3> ...] <out-wspecifier>\n"
" or: paste-feats <in-rxfilename1> <in-rxfilename2> [<in-rxfilename3> ...] <out-wxfilename>\n"
" e.g. paste-feats ark:feats1.ark \"ark:select-feats 0-3 ark:feats2.ark ark:- |\" ark:feats-out.ark\n"
" or: paste-feats foo.mat bar.mat baz.mat\n";
" or: paste-feats foo.mat bar.mat baz.mat\n"
"See also: copy-feats, copy-matrix, append-vector-to-feats, concat-feats\n";
ParseOptions po(usage);

Просмотреть файл

@ -107,8 +107,9 @@ void TestIvectorExtraction(const IvectorExtractor &extractor,
utt_stats.AccStats(feats, post);
OnlineIvectorEstimationStats online_stats(extractor.IvectorDim(),
extractor.PriorOffset());
extractor.PriorOffset(),
0.0);
for (int32 t = 0; t < num_frames; t++) {
online_stats.AccStats(extractor, feats.Row(t), post[t]);
}

Просмотреть файл

@ -259,13 +259,13 @@ void IvectorExtractor::GetIvectorDistMean(
for (int32 i = 0; i < I; i++) {
double gamma = utt_stats.gamma_(i);
if (gamma != 0.0) {
Vector<double> x(utt_stats.X_.Row(i)); // == \gamma(i) \m_i
SubVector<double> x(utt_stats.X_, i); // == \gamma(i) \m_i
// next line: a += \gamma_i \M_i^T \Sigma_i^{-1} \m_i
linear->AddMatVec(1.0, Sigma_inv_M_[i], kTrans, x, 1.0);
}
}
SubVector<double> q_vec(quadratic->Data(), IvectorDim()*(IvectorDim()+1)/2);
q_vec.AddMatVec(1.0, U_, kTrans, Vector<double>(utt_stats.gamma_), 1.0);
q_vec.AddMatVec(1.0, U_, kTrans, utt_stats.gamma_, 1.0);
}
void IvectorExtractor::GetIvectorDistPrior(
@ -543,24 +543,55 @@ void OnlineIvectorEstimationStats::AccStats(
quadratic_term_vec.AddVec(weight, U_g);
tot_weight += weight;
}
if (max_count_ != 0.0) {
// see comments in header RE max_count for explanation.
double old_num_frames = num_frames_,
new_num_frames = num_frames_ + tot_weight;
double old_prior_scale = std::max(old_num_frames, max_count_) / max_count_,
new_prior_scale = std::max(new_num_frames, max_count_) / max_count_;
// The prior_scales are the inverses of the scales we would put on the stats
// if we were implementing this by scaling the stats. Instead we
// scale the prior term.
double prior_scale_change = new_prior_scale - old_prior_scale;
if (prior_scale_change != 0.0) {
linear_term_(0) += prior_offset_ * prior_scale_change;
quadratic_term_.AddToDiag(prior_scale_change);
}
}
num_frames_ += tot_weight;
}
void OnlineIvectorEstimationStats::Scale(double scale) {
KALDI_ASSERT(scale >= 0.0 && scale <= 1.0);
double old_num_frames = num_frames_;
num_frames_ *= scale;
quadratic_term_.Scale(scale);
linear_term_.Scale(scale);
// Scale back up the prior term, by adding in whatever we scaled down.
linear_term_(0) += prior_offset_ * (1.0 - scale);
quadratic_term_.AddToDiag(1.0 - scale);
if (max_count_ == 0.0) {
linear_term_(0) += prior_offset_ * (1.0 - scale);
quadratic_term_.AddToDiag(1.0 - scale);
} else {
double new_num_frames = num_frames_;
double old_prior_scale =
scale * std::max(old_num_frames, max_count_) / max_count_,
new_prior_scale = std::max(new_num_frames, max_count_) / max_count_;
// old_prior_scale is the scale the prior term currently has in the stats,
// i.e. the previous scale times "scale" as we just scaled the stats.
// new_prior_scale is the scale we want the prior term to have.
linear_term_(0) += prior_offset_ * (new_prior_scale - old_prior_scale);
quadratic_term_.AddToDiag(new_prior_scale - old_prior_scale);
}
}
void OnlineIvectorEstimationStats::Write(std::ostream &os, bool binary) const {
WriteToken(os, binary, "<OnlineIvectorEstimationStats>"); // magic string.
WriteToken(os, binary, "<OnlineIvectorEstimationStats>");
WriteToken(os, binary, "<PriorOffset>");
WriteBasicType(os, binary, prior_offset_);
WriteToken(os, binary, "<MaxCount>");
WriteBasicType(os, binary, max_count_);
WriteToken(os, binary, "<NumFrames>");
WriteBasicType(os, binary, num_frames_);
WriteToken(os, binary, "<QuadraticTerm>");
@ -571,11 +602,20 @@ void OnlineIvectorEstimationStats::Write(std::ostream &os, bool binary) const {
}
void OnlineIvectorEstimationStats::Read(std::istream &is, bool binary) {
ExpectToken(is, binary, "<OnlineIvectorEstimationStats>"); // magic string.
ExpectToken(is, binary, "<OnlineIvectorEstimationStats>");
ExpectToken(is, binary, "<PriorOffset>");
ReadBasicType(is, binary, &prior_offset_);
ExpectToken(is, binary, "<NumFrames>");
ReadBasicType(is, binary, &num_frames_);
std::string tok;
ReadToken(is, binary, &tok);
if (tok == "<MaxCount>") {
ReadBasicType(is, binary, &max_count_);
ExpectToken(is, binary, "<NumFrames>");
ReadBasicType(is, binary, &num_frames_);
} else {
KALDI_ASSERT(tok == "<NumFrames>");
max_count_ = 0.0;
ReadBasicType(is, binary, &num_frames_);
}
ExpectToken(is, binary, "<QuadraticTerm>");
quadratic_term_.Read(is, binary);
ExpectToken(is, binary, "<LinearTerm>");
@ -638,8 +678,9 @@ double OnlineIvectorEstimationStats::DefaultObjf() const {
}
OnlineIvectorEstimationStats::OnlineIvectorEstimationStats(int32 ivector_dim,
BaseFloat prior_offset):
prior_offset_(prior_offset), num_frames_(0.0),
BaseFloat prior_offset,
BaseFloat max_count):
prior_offset_(prior_offset), max_count_(max_count), num_frames_(0.0),
quadratic_term_(ivector_dim), linear_term_(ivector_dim) {
if (ivector_dim != 0) {
linear_term_(0) += prior_offset;
@ -650,6 +691,7 @@ OnlineIvectorEstimationStats::OnlineIvectorEstimationStats(int32 ivector_dim,
OnlineIvectorEstimationStats::OnlineIvectorEstimationStats(
const OnlineIvectorEstimationStats &other):
prior_offset_(other.prior_offset_),
max_count_(other.max_count_),
num_frames_(other.num_frames_),
quadratic_term_(other.quadratic_term_),
linear_term_(other.linear_term_) { }
@ -733,6 +775,12 @@ void IvectorExtractorUtteranceStats::AccStats(
}
}
void IvectorExtractorUtteranceStats::Scale(double scale) {
gamma_.Scale(scale);
X_.Scale(scale);
for (size_t i = 0; i < S_.size(); i++)
S_[i].Scale(scale);
}
IvectorExtractorStats::IvectorExtractorStats(
const IvectorExtractor &extractor,
@ -1534,6 +1582,7 @@ double EstimateIvectorsOnline(
const IvectorExtractor &extractor,
int32 ivector_period,
int32 num_cg_iters,
BaseFloat max_count,
Matrix<BaseFloat> *ivectors) {
KALDI_ASSERT(ivector_period > 0);
@ -1544,7 +1593,8 @@ double EstimateIvectorsOnline(
ivectors->Resize(num_ivectors, extractor.IvectorDim());
OnlineIvectorEstimationStats online_stats(extractor.IvectorDim(),
extractor.PriorOffset());
extractor.PriorOffset(),
max_count);
double ans = 0.0;

Просмотреть файл

@ -45,13 +45,23 @@ namespace kaldi {
// "acoustic_weight" is not read by any class declared in this header; it has to
// be applied by calling IvectorExtractorUtteranceStats::Scale() before
// obtaining the iVector.
// The same is true of max_count: it has to be applied by programs themselves
// e.g. see ../ivectorbin/ivector-extract.cc.
struct IvectorEstimationOptions {
double acoustic_weight;
IvectorEstimationOptions(): acoustic_weight(1.0) {}
double max_count;
IvectorEstimationOptions(): acoustic_weight(1.0), max_count(0.0) {}
void Register(OptionsItf *po) {
po->Register("acoustic-weight", &acoustic_weight,
"Weight on part of auxf that involves the data (e.g. 0.2); "
"if this weight is small, the prior will have more effect.");
po->Register("max-count", &max_count,
"Maximum frame count (affects prior scaling): if >0, the prior "
"term will be scaled up after the frame count exceeds this "
"value. Note that this count is considered after posterior "
"scaling (e.g. --acoustic-weight option, or scale argument to "
"scale-post), so you would normally use a cutoff 10 times "
"smaller than the corresponding number of frames.");
}
};
@ -301,8 +311,12 @@ class IvectorExtractor {
*/
class OnlineIvectorEstimationStats {
public:
// Search above for max_count to see an explanation; if nonzero, it will
// put a higher weight on the prior (vs. the stats) once the count passes
// that value.
OnlineIvectorEstimationStats(int32 ivector_dim,
BaseFloat prior_offset);
BaseFloat prior_offset,
BaseFloat max_count);
OnlineIvectorEstimationStats(const OnlineIvectorEstimationStats &other);
@ -360,6 +374,7 @@ class OnlineIvectorEstimationStats {
friend class IvectorExtractor;
double prior_offset_;
double max_count_;
double num_frames_; // num frames (weighted, if applicable).
SpMatrix<double> quadratic_term_;
Vector<double> linear_term_;
@ -368,8 +383,10 @@ class OnlineIvectorEstimationStats {
// This code obtains periodically (for each "ivector_period" frames, e.g. 10
// frames), an estimate of the iVector including all frames up to that point.
// This emulates what you could do in an online/streaming algorithm; its use
// is for neural network training in a way that's matched to online decoding.
// This emulates what you could do in an online/streaming algorithm; its use is
// for neural network training in a way that's matched to online decoding.
// [note: I don't believe we are currently using the program,
// ivector-extract-online.cc, that calls this function, in any of the scripts.].
// Caution: this program outputs the raw iVectors, where the first component
// will generally be very positive. You probably want to subtract PriorOffset()
// from the first element of each row of the output before writing it out.
@ -384,6 +401,7 @@ double EstimateIvectorsOnline(
const IvectorExtractor &extractor,
int32 ivector_period,
int32 num_cg_iters,
BaseFloat max_count,
Matrix<BaseFloat> *ivectors);

Просмотреть файл

@ -49,6 +49,7 @@ int main(int argc, char *argv[]) {
ParseOptions po(usage);
int32 num_cg_iters = 15;
int32 ivector_period = 10;
BaseFloat max_count = 0.0;
g_num_threads = 8;
po.Register("num-cg-iters", &num_cg_iters,
@ -60,6 +61,12 @@ int main(int argc, char *argv[]) {
po.Register("num-threads", &g_num_threads,
"Number of threads to use for computing derived variables "
"of iVector extractor, at process start-up.");
po.Register("max-count", &max_count,
"If >0, when the count of posteriors exceeds max-count we will "
"start using a stronger prior term. Can make iVectors from "
"longer than normal utterances look more 'typical'. Interpret "
"this value as a number of frames multiplied by your "
"posterior scale (so typically 0.1 times a number of frames).");
po.Read(argc, argv);
if (po.NumArgs() != 4) {
@ -107,7 +114,7 @@ int main(int argc, char *argv[]) {
double objf_impr_per_frame;
objf_impr_per_frame = EstimateIvectorsOnline(feats, posterior, extractor,
ivector_period, num_cg_iters,
&ivectors);
max_count, &ivectors);
BaseFloat offset = extractor.PriorOffset();
for (int32 i = 0 ; i < ivectors.NumRows(); i++)

Просмотреть файл

@ -63,13 +63,13 @@ class IvectorExtractTask {
}
~IvectorExtractTask() {
if (tot_auxf_change_ != NULL) {
int32 T = posterior_.size();
double T = TotalPosterior(posterior_);
*tot_auxf_change_ += auxf_change_;
KALDI_VLOG(2) << "Auxf change for utterance " << utt_ << " was "
<< (auxf_change_ / T) << " per frame over " << T
<< " frames.";
<< " frames (weighted)";
}
// We actually write out the offset of the iVector's from the mean of the
// We actually write out the offset of the iVectors from the mean of the
// prior distribution; this is the form we'll need it in for scoring. (most
// formulations of iVectors have zero-mean priors so this is not normally an
// issue).
@ -89,11 +89,124 @@ class IvectorExtractTask {
double auxf_change_;
};
// Implements the per-speaker mode of ivector-extract (selected via the
// --spk2utt option in main()): stats from all of a speaker's utterances are
// accumulated into one IvectorExtractorUtteranceStats, and a single iVector
// is written per speaker, keyed by the speaker-id from the spk2utt map.
// Per the --spk2utt help text, this path ignores --num-threads (it runs
// sequentially, unlike the per-utterance TaskSequencer path).
// Returns 0 if at least one speaker was processed successfully, else 1;
// the return value is used directly as the program's exit status.
int32 RunPerSpeaker(const std::string &ivector_extractor_rxfilename,
                    const IvectorEstimationOptions &opts,
                    bool compute_objf_change,
                    const std::string &spk2utt_rspecifier,
                    const std::string &feature_rspecifier,
                    const std::string &posterior_rspecifier,
                    const std::string &ivector_wspecifier) {
  IvectorExtractor extractor;
  ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
  SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
  RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
  RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
  BaseFloatVectorWriter ivector_writer(ivector_wspecifier);

  // Diagnostics accumulated over all speakers.  tot_post is the total
  // posterior weight (after acoustic-weight scaling), used as the
  // normalizer for the weighted-average diagnostics printed at the end.
  double tot_auxf_change = 0.0, tot_post = 0.0, tot_norm = 0.0;
  int32 num_utt_done = 0, num_utt_err = 0,
      num_spk_done = 0, num_spk_err = 0;

  for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
    std::string spk = spk2utt_reader.Key();
    const std::vector<std::string> &utts = spk2utt_reader.Value();

    // Second-order stats are not needed for iVector estimation (only for
    // extractor training), so we skip accumulating them.
    bool need_2nd_order_stats = false;

    IvectorExtractorUtteranceStats utt_stats(extractor.NumGauss(),
                                             extractor.FeatDim(),
                                             need_2nd_order_stats);
    // Accumulate stats from every utterance of this speaker; utterances with
    // missing or mismatched features/posteriors are skipped (with a warning)
    // rather than failing the whole speaker.
    for (size_t i = 0; i < utts.size(); i++) {
      const std::string &utt = utts[i];
      if (!feature_reader.HasKey(utt)) {
        KALDI_WARN << "No features present for utterance " << utt;
        num_utt_err++;
        continue;
      }
      const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
      if (!posterior_reader.HasKey(utt)) {
        KALDI_WARN << "No posteriors present for utterance " << utt;
        num_utt_err++;
        continue;
      }
      Posterior posterior = posterior_reader.Value(utt);
      if (feats.NumRows() != posterior.size()) {
        KALDI_WARN << "Posterior has wrong size " << posterior.size()
                   << " vs. feats " << feats.NumRows() << " for "
                   << utt;
        num_utt_err++;
        continue;
      }
      // Apply the acoustic weight (--acoustic-weight, e.g. 0.1) before
      // accumulation; smaller weights give the prior more effect.
      ScalePosterior(opts.acoustic_weight, &posterior);
      num_utt_done++;
      utt_stats.AccStats(feats, posterior);
    }

    if (utt_stats.NumFrames() == 0.0) {
      KALDI_WARN << "No stats accumulated for speaker " << spk;
      num_spk_err++;
      continue;
    } else {
      // --max-count: if the (weighted) count exceeds the cutoff, scale the
      // whole stats down so the effective count equals max_count, which
      // strengthens the prior term for data-rich speakers.
      if (opts.max_count > 0 && utt_stats.NumFrames() > opts.max_count) {
        double scale = opts.max_count / utt_stats.NumFrames();
        utt_stats.Scale(scale);
        KALDI_LOG << "Scaling stats for speaker " << spk << " by scale "
                  << scale << " due to --max-count=" << opts.max_count;
      }

      Vector<double> ivector(extractor.IvectorDim());
      // Start from the prior mean, which is nonzero only in dimension 0.
      ivector(0) = extractor.PriorOffset();

      if (compute_objf_change) {
        // Measure the auxiliary-function improvement of the estimated
        // iVector over the prior mean (a useful diagnostic).
        double old_auxf = extractor.GetAuxf(utt_stats, ivector);
        extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
        double new_auxf = extractor.GetAuxf(utt_stats, ivector);
        double auxf_change = new_auxf - old_auxf;

        KALDI_LOG << "Auxf change for speaker " << spk << " was "
                  << (auxf_change / utt_stats.NumFrames()) << " per frame, over "
                  << utt_stats.NumFrames() << " frames (weighted).";
        tot_auxf_change += auxf_change;
      } else {
        extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
      }
      // We actually write out the offset of the iVectors from the mean of the
      // prior distribution; this is the form we'll need it in for scoring and
      // as a feature for neural nets. (most formulations of iVectors have
      // zero-mean priors so this is not normally an issue).
      ivector(0) -= extractor.PriorOffset();
      KALDI_LOG << "Ivector norm for speaker " << spk
                << " was " << ivector.Norm(2.0);

      // Weight the norm diagnostic by this speaker's frame count.
      tot_norm += ivector.Norm(2.0) * utt_stats.NumFrames();
      tot_post += utt_stats.NumFrames();
      num_spk_done++;
      // Convert to single precision for output.
      Vector<BaseFloat> ivector_flt(ivector);
      ivector_writer.Write(spk, ivector_flt);
    }
  }

  KALDI_LOG << "Done " << num_spk_done << " speakers; " << num_spk_err
            << " with errors. " << num_utt_done << " utterances "
            << "were processed, " << num_utt_err << " with errors.";

  if (tot_post != 0.0) {
    if (compute_objf_change) {
      KALDI_LOG << "Overall weighted-average objective function improvement was "
                << (tot_auxf_change / tot_post) << " over " << tot_post
                << " frames (weighted)";
    }
    KALDI_LOG << "Average iVector norm (weighted by frames) was "
              << (tot_norm / tot_post) << " over " << tot_post
              << " frames (weighted)";
  }

  return (num_spk_done != 0 ? 0 : 1);
}
}
int main(int argc, char *argv[]) {
using namespace kaldi;
typedef kaldi::int32 int32;
@ -102,7 +215,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Extract iVectors for utterances, using a trained iVector extractor,\n"
"and features and Gaussian-level posteriors\n"
"Usage: ivector-extract [options] <model-in> <feature-rspecifier>"
"Usage: ivector-extract [options] <model-in> <feature-rspecifier> "
"<posteriors-rspecifier> <ivector-wspecifier>\n"
"e.g.: \n"
" fgmm-global-gselect-to-post 1.ubm '$feats' 'ark:gunzip -c gselect.1.gz|' ark:- | \\\n"
@ -110,13 +223,21 @@ int main(int argc, char *argv[]) {
ParseOptions po(usage);
bool compute_objf_change = true;
IvectorExtractorStatsOptions stats_opts;
IvectorEstimationOptions opts;
std::string spk2utt_rspecifier;
TaskSequencerConfig sequencer_config;
po.Register("compute-objf-change", &compute_objf_change,
"If true, compute the change in objective function from using "
"nonzero iVector (a potentially useful diagnostic). Combine "
"with --verbose=2 for per-utterance information");
stats_opts.Register(&po);
po.Register("spk2utt", &spk2utt_rspecifier, "Supply this option if you "
"want iVectors to be output at the per-speaker level, estimated "
"using stats accumulated from multiple utterances. Note: this "
"is not the normal way iVectors are obtained for speaker-id. "
"This option will cause the program to ignore the --num-threads "
"option.");
opts.Register(&po);
sequencer_config.Register(&po);
po.Read(argc, argv);
@ -128,63 +249,87 @@ int main(int argc, char *argv[]) {
std::string ivector_extractor_rxfilename = po.GetArg(1),
feature_rspecifier = po.GetArg(2),
posteriors_rspecifier = po.GetArg(3),
posterior_rspecifier = po.GetArg(3),
ivectors_wspecifier = po.GetArg(4);
// g_num_threads affects how ComputeDerivedVars is called when we read the
// extractor.
g_num_threads = sequencer_config.num_threads;
IvectorExtractor extractor;
ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
double tot_auxf_change = 0.0;
int64 tot_t = 0;
int32 num_done = 0, num_err = 0;
if (spk2utt_rspecifier.empty()) {
// g_num_threads affects how ComputeDerivedVars is called when we read the
// extractor.
g_num_threads = sequencer_config.num_threads;
IvectorExtractor extractor;
ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
double tot_auxf_change = 0.0, tot_t = 0.0;
int32 num_done = 0, num_err = 0;
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
BaseFloatVectorWriter ivector_writer(ivectors_wspecifier);
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
BaseFloatVectorWriter ivector_writer(ivectors_wspecifier);
{
TaskSequencer<IvectorExtractTask> sequencer(sequencer_config);
for (; !feature_reader.Done(); feature_reader.Next()) {
std::string utt = feature_reader.Key();
if (!posterior_reader.HasKey(utt)) {
KALDI_WARN << "No posteriors for utterance " << utt;
num_err++;
continue;
}
const Matrix<BaseFloat> &mat = feature_reader.Value();
Posterior posterior = posterior_reader.Value(utt);
if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
KALDI_WARN << "Size mismatch between posterior " << posterior.size()
<< " and features " << mat.NumRows() << " for utterance "
<< utt;
num_err++;
continue;
}
{
TaskSequencer<IvectorExtractTask> sequencer(sequencer_config);
for (; !feature_reader.Done(); feature_reader.Next()) {
std::string key = feature_reader.Key();
if (!posteriors_reader.HasKey(key)) {
KALDI_WARN << "No posteriors for utterance " << key;
num_err++;
continue;
double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL );
double this_t = opts.acoustic_weight * TotalPosterior(posterior),
max_count_scale = 1.0;
if (opts.max_count > 0 && this_t > opts.max_count) {
max_count_scale = opts.max_count / this_t;
KALDI_LOG << "Scaling stats for utterance " << utt << " by scale "
<< max_count_scale << " due to --max-count="
<< opts.max_count;
this_t = opts.max_count;
}
ScalePosterior(opts.acoustic_weight * max_count_scale,
&posterior);
// note: now, this_t == sum of posteriors.
sequencer.Run(new IvectorExtractTask(extractor, utt, mat, posterior,
&ivector_writer, auxf_ptr));
tot_t += this_t;
num_done++;
}
const Matrix<BaseFloat> &mat = feature_reader.Value();
const Posterior &posterior = posteriors_reader.Value(key);
if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
KALDI_WARN << "Size mismatch between posterior " << posterior.size()
<< " and features " << mat.NumRows() << " for utterance "
<< key;
num_err++;
continue;
}
double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL );
sequencer.Run(new IvectorExtractTask(extractor, key, mat, posterior,
&ivector_writer, auxf_ptr));
tot_t += posterior.size();
num_done++;
// Destructor of "sequencer" will wait for any remaining tasks.
}
// Destructor of "sequencer" will wait for any remaining tasks.
KALDI_LOG << "Done " << num_done << " files, " << num_err
<< " with errors. Total (weighted) frames " << tot_t;
if (compute_objf_change)
KALDI_LOG << "Overall average objective-function change from estimating "
<< "ivector was " << (tot_auxf_change / tot_t) << " per frame "
<< " over " << tot_t << " (weighted) frames.";
return (num_done != 0 ? 0 : 1);
} else {
KALDI_ASSERT(sequencer_config.num_threads == 1 &&
"--spk2utt option is incompatible with --num-threads option");
return RunPerSpeaker(ivector_extractor_rxfilename,
opts,
compute_objf_change,
spk2utt_rspecifier,
feature_rspecifier,
posterior_rspecifier,
ivectors_wspecifier);
}
KALDI_LOG << "Done " << num_done << " files, " << num_err
<< " with errors. Total frames " << tot_t;
if (compute_objf_change)
KALDI_LOG << "Overall average objective-function change from estimating "
<< "ivector was " << (tot_auxf_change / tot_t) << " per frame "
<< " over " << tot_t << " frames.";
return (num_done != 0 ? 0 : 1);
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;

Просмотреть файл

@ -1010,7 +1010,7 @@ void VectorBase<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v)
MatrixIndexT dim = dim_;
if (alpha != 1.0)
for (MatrixIndexT i = 0; i < dim; i++)
data[i] += alpha*other_data[i];
data[i] += alpha * other_data[i];
else
for (MatrixIndexT i = 0; i < dim; i++)
data[i] += other_data[i];

Просмотреть файл

@ -32,6 +32,7 @@ void OnlineIvectorExtractionInfo::Init(
num_gselect = config.num_gselect;
min_post = config.min_post;
posterior_scale = config.posterior_scale;
max_count = config.max_count;
use_most_recent_ivector = config.use_most_recent_ivector;
greedy_ivector_extractor = config.greedy_ivector_extractor;
if (greedy_ivector_extractor && !use_most_recent_ivector) {
@ -161,7 +162,7 @@ void OnlineIvectorFeature::UpdateStatsUntilFrame(int32 frame) {
Vector<BaseFloat> feat(feat_dim), // features given to iVector extractor
log_likes(info_.diag_ubm.NumGauss());
for (; num_frames_stats_ <= frame; num_frames_stats_++) {
int32 t = num_frames_stats_; // Frame whose stats we want to get.
lda_normalized_->GetFrame(t, &feat);
@ -262,8 +263,10 @@ OnlineIvectorFeature::OnlineIvectorFeature(
const OnlineIvectorExtractionInfo &info,
OnlineFeatureInterface *base_feature):
info_(info), base_(base_feature),
ivector_stats_(info_.extractor.IvectorDim(), info_.extractor.PriorOffset()),
num_frames_stats_(0) {
ivector_stats_(info_.extractor.IvectorDim(),
info_.extractor.PriorOffset(),
info_.max_count),
num_frames_stats_(0), tot_ubm_loglike_(0.0) {
info.Check();
KALDI_ASSERT(base_feature != NULL);
splice_ = new OnlineSpliceFrames(info_.splice_opts, base_);

Просмотреть файл

@ -70,6 +70,12 @@ struct OnlineIvectorExtractionConfig {
BaseFloat posterior_scale; // Scale on posteriors used for iVector
// extraction; can be interpreted as the inverse
// of a scale on the log-prior.
BaseFloat max_count; // Maximum stats count we allow before we start scaling
// down stats (if nonzero).. this prevents us getting
// atypical-looking iVectors for very long utterances.
// Interpret this as a number of frames times
// posterior_scale, typically 1/10 of a frame count.
// If use_most_recent_ivector is true, we always return the most recent
// available iVector rather than the one for the current frame. This means
@ -91,6 +97,7 @@ struct OnlineIvectorExtractionConfig {
OnlineIvectorExtractionConfig(): ivector_period(10), num_gselect(5),
min_post(0.025), posterior_scale(0.1),
max_count(0.0),
use_most_recent_ivector(true),
greedy_ivector_extractor(false),
max_remembered_frames(1000) { }
@ -122,6 +129,11 @@ struct OnlineIvectorExtractionConfig {
"iVector extraction");
po->Register("posterior-scale", &posterior_scale, "Scale for posteriors in "
"iVector extraction (may be viewed as inverse of prior scale)");
po->Register("max-count", &max_count, "Maximum data count we allow before "
"we start scaling the stats down (if nonzero)... helps to make "
"iVectors from long utterances look more typical. Interpret "
"as a frame-count times --posterior-scale, typically 1/10 of "
"a number of frames. Suggest 100.");
po->Register("use-most-recent-ivector", &use_most_recent_ivector, "If true, "
"always use most recent available iVector, rather than the "
"one for the designated frame.");
@ -156,6 +168,7 @@ struct OnlineIvectorExtractionInfo {
int32 num_gselect;
BaseFloat min_post;
BaseFloat posterior_scale;
BaseFloat max_count;
bool use_most_recent_ivector;
bool greedy_ivector_extractor;
BaseFloat max_remembered_frames;
@ -191,7 +204,8 @@ struct OnlineIvectorExtractorAdaptationState {
OnlineIvectorExtractorAdaptationState(const OnlineIvectorExtractionInfo &info):
cmvn_state(info.global_cmvn_stats),
ivector_stats(info.extractor.IvectorDim(),
info.extractor.PriorOffset()) { }
info.extractor.PriorOffset(),
info.max_count) { }
/// Copy constructor
OnlineIvectorExtractorAdaptationState(

Просмотреть файл

@ -31,7 +31,7 @@ int main(int argc, char *argv[]) {
"Apply online cepstral mean (and possibly variance) computation online,\n"
"using the same code as used for online decoding in the 'new' setup in\n"
"online2/ and online2bin/. If the --spk2utt option is used, it uses\n"
"prior utterances from the same speaker to back off two at the utterance\n"
"prior utterances from the same speaker to back off to at the utterance\n"
"beginning. See also apply-cmvn-sliding.\n"
"\n"
"Usage: apply-cmvn-online [options] <global-cmvn-stats> <feature-rspecifier> "

Просмотреть файл

@ -84,7 +84,7 @@ int main(int argc, char *argv[]) {
RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
BaseFloatMatrixWriter ivector_writer(ivectors_wspecifier);
for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
std::string spk = spk2utt_reader.Key();
const std::vector<std::string> &uttlist = spk2utt_reader.Value();
@ -98,12 +98,12 @@ int main(int argc, char *argv[]) {
continue;
}
const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
OnlineMatrixFeature matrix_feature(feats);
OnlineIvectorFeature ivector_feature(ivector_info,
&matrix_feature);
ivector_feature.SetAdaptationState(adaptation_state);
int32 T = feats.NumRows(),
@ -130,9 +130,9 @@ int main(int argc, char *argv[]) {
<< ", UBM loglike/frame was "
<< ivector_feature.UbmLogLikePerFrame()
<< ", iVector length (at utterance end) was "
<< ivectors.Row(n-1).Norm(2.0)
<< ", objf improvement from iVector estimation was "
<< tot_objf_impr;
<< ivectors.Row(num_ivectors-1).Norm(2.0)
<< ", objf improvement/frame from iVector estimation was "
<< ivector_feature.ObjfImprPerFrame();
ivector_feature.GetAdaptationState(&adaptation_state);
ivector_writer.Write(utt, ivectors);