sandbox/lid: various script fixes and updates; improving speed of iVector-extractor model loading by parallelizing derived-variables computation.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3759 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-03-12 04:31:52 +00:00
Parent a1a368dc83
Commit 5c5a4e2f5a
11 changed files with 90 additions and 57 deletions

View File

@@ -75,6 +75,7 @@ if [ $stage -le 1 ]; then
if ($len <= $max_utt_len) {
print SEGMENTS "${utt}-1 ${utt} 0 -1\n";
print UTT2SPK "${utt}-1 $speaker\n";
print UTT2LANG "${utt}-1 $language\n";
} else {
# We will now allow split length to exceed max_utt_len.
$num_split = int(($len + 0.999*$max_utt_len) / $max_utt_len);
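
The rounding here is easy to misread. A minimal standalone sketch (plain C++, with max_utt_len = 60 chosen purely for illustration, not taken from the recipe) shows that the formula behaves like a ceiling division, except that an utterance only marginally longer than max_utt_len stays in one slightly over-length piece instead of spawning a tiny extra segment, which is what the comment above refers to:

#include <cstdio>

// Hypothetical check of the splitting formula above:
//   num_split = int((len + 0.999 * max_utt_len) / max_utt_len)
int main() {
  double max_utt_len = 60.0;                    // illustrative value only
  double lens[] = { 59.0, 60.02, 61.0, 150.0 }; // made-up utterance lengths
  for (double len : lens) {
    int num_split = static_cast<int>((len + 0.999 * max_utt_len) / max_utt_len);
    std::printf("len = %6.2f -> num_split = %d, piece length = %.2f\n",
                len, num_split, len / num_split);
  }
  // len = 60.02 gives num_split = 1, so the single piece exceeds
  // max_utt_len slightly rather than creating a ~0.02-second fragment.
  return 0;
}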

View File

@@ -63,12 +63,6 @@ lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/train \
lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/lre07 \
exp/make_vad $vaddir
# Use 4k of the 14k utterances for testing, but make sure the speakers do not
# overlap with the rest of the data, which will be used for training.
#utils/subset_data_dir.sh --speakers data/all 4000 data/lre07
#utils/filter_scp.pl --exclude data/lre07/spk2utt < data/all/spk2utt | awk '{print $1}' > foo
#utils/subset_data_dir.sh --spk-list foo data/all data/train
utils/subset_data_dir.sh data/train 5000 data/train_5k
utils/subset_data_dir.sh data/train 10000 data/train_10k
@@ -82,7 +76,6 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_10k \
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \
exp/full_ubm_2048_10k exp/full_ubm_2048
lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \
--num-iters 5 exp/full_ubm_2048/final.ubm data/train \
exp/extractor_2048
@@ -91,6 +84,4 @@ lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \
exp/extractor_2048 data/train exp/ivectors_train
lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \
exp/extractor_2048 data/lre07 exp/ivectors_test
exp/extractor_2048 data/lre07 exp/ivectors_lre07

View File

@@ -8,15 +8,13 @@
. cmd.sh
. path.sh
set -e
mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc
config=conf/logistic-regression.conf
awk '{print $2}' <(utils/remove_dialect.pl data/train/utt2lang) | sort -u | \
awk '{print $1, NR-1}' > exp/ivectors_train/languages.txt
log=exp/ivectors_train/log/logistic_regression.log
model=exp/ivectors_train/logistic_regression
model_rebalanced=exp/ivectors_train/logistic_regression_rebalanced
@@ -40,12 +38,12 @@ utils/balance_priors_to_test.pl \
exp/ivectors_train/priors.vec
logistic-regression-train --config=$config scp:$train_ivectors \
"$classes" $model 2>$log
"$classes" $model \
2>exp/ivectors_train/log/logistic_regression.log
( logistic-regression-train --config=$config scp:$train_ivectors \
"$classes" - | \
logistic-regression-copy --scale-priors=exp/ivectors_train/priors.vec - \
$model_rebalanced ) 2>$log
logistic-regression-copy --scale-priors=exp/ivectors_train/priors.vec \
$model $model_rebalanced
trials="utils/remove_dialect.pl data/train/utt2lang \
| utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt -|"
@@ -57,7 +55,7 @@ logistic-regression-eval $model scp:$train_ivectors \
logistic-regression-eval $model "ark:$trials" scp:$train_ivectors "$scores"
logistic-regression-eval $model scp:$train_ivectors ark,t:- | \
cat exp/ivectors_train/posteriors | \
awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
{ max=$f; argmax=f; }}
print $1, (argmax - 3); }' | \
@@ -66,29 +64,28 @@ logistic-regression-eval $model scp:$train_ivectors ark,t:- | \
# note: we treat the language as a sentence; it happens that the WER/SER
# corresponds to the recognition error rate.
compute-wer --text ark:<(utils/remove_dialect.pl data/train/utt2lang) \
compute-wer --mode=present --text ark:<(utils/remove_dialect.pl data/train/utt2lang) \
ark:exp/ivectors_train/output
# It perfectly classifies the training data:
#%WER 0.00 [ 0 / 10173, 0 ins, 0 del, 0 sub ]
#%SER 0.00 [ 0 / 10173 ]
#Scored 10173 sentences, 0 not present in hyp.
#%WER 4.68 [ 3355 / 71668, 0 ins, 0 del, 3355 sub ] [PARTIAL]
#%SER 4.68 [ 3355 / 71668 ]
#Scored 71668 sentences, 16 not present in hyp.
logistic-regression-eval $model_rebalanced \
scp:exp/ivectors_test/ivector.scp ark,t:- | \
scp:exp/ivectors_lre07/ivector.scp ark,t:- | \
awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
{ max=$f; argmax=f; }}
print $1, (argmax - 3); }' | \
utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/ivectors_test/output
utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/ivectors_lre07/output
# someone needs to extend this to run on the dev data.
compute-wer --text ark:<(utils/remove_dialect.pl data/lre07/utt2lang)\
ark:exp/ivectors_test/output
# compute-wer --text ark:/dev/fd/63 ark:exp/lre07/output
# %WER 58.83 [ 3958 / 7527, 0 ins, 0 del, 3958 sub ]
# %SER 58.83 [ 3958 / 7527 ]
compute-wer --text ark:<(utils/remove_dialect.pl data/lre07/utt2lang) \
ark:exp/ivectors_lre07/output
> compute-wer --text ark:/dev/fd/63 ark:exp/ivectors_lre07/output
# %WER 34.34 [ 2585 / 7527, 0 ins, 0 del, 2585 sub ]
# %SER 34.34 [ 2585 / 7527 ]
# Scored 7527 sentences, 0 not present in hyp.

View File

@@ -77,13 +77,13 @@ else
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
split_scps=""
for ((n=1; n<=nj; n++)); do
split_scps="$split_scps $logdir/wav.$n.scp"
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
done
utils/split_scp.pl $scp $split_scps || exit 1;
$cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp ark:- \| \
compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav_${name}.JOB.scp ark:- \| \
copy-feats --compress=$compress ark:- \
ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
|| exit 1;
@@ -102,7 +102,7 @@ for ((n=1; n<=nj; n++)); do
cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1;
done > $data/feats.scp
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`

View File

@@ -228,12 +228,16 @@ if [ -f $data/spk2gender ]; then
fi
fi
if [ -f $data/vad.scp ]; then
check_sorted_and_uniq $data/vad.scp
if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
<( awk '{print $1}' $data/vad.scp ); then
echo "$0: error: in $data, vad.scp and utt2spk do not have identical utterance-id list"
# check some optionally-required things
for f in vad.scp utt2lang; do
if [ -f $data/$f ]; then
check_sorted_and_uniq $data/$f
if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
<( awk '{print $1}' $data/$f ); then
echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
exit 1;
fi
fi
fi
done
echo "Successfully validated data-directory $data"

View File

@@ -197,6 +197,17 @@ IvectorExtractor::IvectorExtractor(
ComputeDerivedVars();
}
class IvectorExtractorComputeDerivedVarsClass {
public:
IvectorExtractorComputeDerivedVarsClass(IvectorExtractor *extractor,
int32 i):
extractor_(extractor), i_(i) { }
void operator () () { extractor_->ComputeDerivedVars(i_); }
private:
IvectorExtractor *extractor_;
int32 i_;
};
void IvectorExtractor::ComputeDerivedVars() {
KALDI_LOG << "Computing derived variables for iVector extractor";
gconsts_.Resize(NumGauss());
@@ -206,17 +217,32 @@ void IvectorExtractor::ComputeDerivedVars() {
// the gconsts don't contain any weight-related terms.
}
U_.Resize(NumGauss(), IvectorDim() * (IvectorDim() + 1) / 2);
SpMatrix<double> temp_U(IvectorDim());
for (int32 i = 0; i < NumGauss(); i++) {
// temp_U = M_i^T Sigma_i^{-1} M_i
temp_U.AddMat2Sp(1.0, M_[i], kTrans, Sigma_inv_[i], 0.0);
SubVector<double> temp_U_vec(temp_U.Data(),
IvectorDim() * (IvectorDim() + 1) / 2);
U_.Row(i).CopyFromVec(temp_U_vec);
// Note, we could have used RunMultiThreaded for this and similar tasks we
// have here, but we found that we don't get as complete CPU utilization as we
// could because some tasks finish before others.
{
TaskSequencerConfig sequencer_opts;
sequencer_opts.num_threads = g_num_threads;
TaskSequencer<IvectorExtractorComputeDerivedVarsClass> sequencer(
sequencer_opts);
for (int32 i = 0; i < NumGauss(); i++)
sequencer.Run(new IvectorExtractorComputeDerivedVarsClass(this, i));
}
KALDI_LOG << "Done.";
}
void IvectorExtractor::ComputeDerivedVars(int32 i) {
SpMatrix<double> temp_U(IvectorDim());
// temp_U = M_i^T Sigma_i^{-1} M_i
temp_U.AddMat2Sp(1.0, M_[i], kTrans, Sigma_inv_[i], 0.0);
SubVector<double> temp_U_vec(temp_U.Data(),
IvectorDim() * (IvectorDim() + 1) / 2);
U_.Row(i).CopyFromVec(temp_U_vec);
}
void IvectorExtractor::GetIvectorDistWeight(
const IvectorExtractorUtteranceStats &utt_stats,
@@ -986,7 +1012,7 @@ double IvectorStats::UpdateProjections(
double tot_impr = 0.0;
{
TaskSequencerConfig sequencer_opts;
sequencer_opts.num_threads = opts.num_threads;
sequencer_opts.num_threads = g_num_threads;
TaskSequencer<IvectorExtractorUpdateProjectionClass> sequencer(
sequencer_opts);
for (int32 i = 0; i < I; i++)
@@ -1149,7 +1175,7 @@ double IvectorStats::UpdateWeights(
double tot_impr = 0.0;
{
TaskSequencerConfig sequencer_opts;
sequencer_opts.num_threads = opts.num_threads;
sequencer_opts.num_threads = g_num_threads;
TaskSequencer<IvectorExtractorUpdateWeightClass> sequencer(
sequencer_opts);
for (int32 i = 0; i < I; i++)
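
The comment above about RunMultiThreaded not achieving full CPU utilization comes down to static versus dynamic work assignment. Below is a self-contained sketch in plain C++11 threads (it does not use Kaldi's TaskSequencer interface, and FakeTask is a made-up stand-in for ComputeDerivedVars(i)) illustrating the dynamic-dispatch idea: each thread pulls the next unfinished index, so no thread goes idle until the per-Gaussian tasks run out, whereas a static split into equal blocks finishes only when the slowest block does.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Stand-in for per-Gaussian work such as ComputeDerivedVars(i); the cost is
// made deliberately uneven across indices.
static void FakeTask(int i) {
  volatile double x = 0.0;
  for (int k = 0; k < (i % 7 + 1) * 100000; k++) x += k;
}

int main() {
  const int num_tasks = 2048, num_threads = 4;
  std::atomic<int> next(0);          // shared counter handing out task indices
  std::vector<std::thread> pool;
  for (int t = 0; t < num_threads; t++)
    pool.emplace_back([&next, num_tasks]() {
      // Grab indices one at a time until none are left.
      for (int i = next++; i < num_tasks; i = next++) FakeTask(i);
    });
  for (std::thread &th : pool) th.join();
  std::printf("processed %d tasks on %d threads\n", num_tasks, num_threads);
  return 0;
}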

View File

@@ -57,6 +57,7 @@ struct IvectorEstimationOptions {
class IvectorExtractor;
class IvectorExtractorComputeDerivedVarsClass;
/// These are the stats for a particular utterance, i.e. the sufficient stats
/// for estimating an iVector (if need_2nd_order_stats == true, we can also
@@ -229,7 +230,9 @@ class IvectorExtractor {
// because they do what we want.
protected:
void ComputeDerivedVars();
void ComputeDerivedVars(int32 i);
friend class IvectorExtractorComputeDerivedVarsClass;
// Imagine we'll project the iVectors with transformation T, so apply T^{-1}
// where necessary to keep the model equivalent. Used to keep unit variance
// (like prior re-estimation).
@@ -311,8 +314,7 @@ struct IvectorExtractorEstimationOptions {
double gaussian_min_count;
int32 num_threads;
IvectorExtractorEstimationOptions(): variance_floor_factor(0.1),
gaussian_min_count(100.0),
num_threads(1) { }
gaussian_min_count(100.0) { }
void Register(OptionsItf *po) {
po->Register("variance-floor-factor", &variance_floor_factor,
"Factor that determines variance flooring (we floor each covar "
@@ -320,8 +322,6 @@
po->Register("gaussian-min-count", &gaussian_min_count,
"Minimum total count per Gaussian, below which we refuse to "
"update any associated parameters.");
po->Register("num-threads", &num_threads,
"Number of threads used in iVector estimation program");
}
};

View File

@@ -131,6 +131,9 @@ int main(int argc, char *argv[]) {
posteriors_rspecifier = po.GetArg(3),
ivectors_wspecifier = po.GetArg(4);
// g_num_threads affects how ComputeDerivedVars is called when we read the
// extractor.
g_num_threads = sequencer_config.num_threads;
IvectorExtractor extractor;
ReadKaldiObject(ivector_extractor_rxfilename, &extractor);

View File

@@ -102,6 +102,12 @@ int main(int argc, char *argv[]) {
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
// This is a bit of a mess... the code that reads in the extractor calls
// ComputeDerivedVars, and it can do this multi-threaded, controlled by
// g_num_threads. So if the user specified the --num-threads option, which
// goes to sequencer_opts in this case, copy it to g_num_threads.
g_num_threads = sequencer_opts.num_threads;
IvectorExtractor extractor;
ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
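
Both binaries above follow the same pattern: make sure g_num_threads reflects the requested thread count before the extractor is read, since reading is what triggers the now multi-threaded ComputeDerivedVars(). A condensed sketch of that pattern follows; the option wiring is assembled from the diffs above rather than copied from any single binary, and the usage text and program structure are illustrative only.

#include "util/common-utils.h"
#include "ivector/ivector-extractor.h"
#include "thread/kaldi-thread.h"

int main(int argc, char *argv[]) {
  using namespace kaldi;
  const char *usage = "Illustrative sketch: load an iVector extractor.\n"
      "Usage: sketch [options] <extractor-in>\n";
  ParseOptions po(usage);
  // Either register the option onto g_num_threads directly, or (as in the
  // binaries above) copy sequencer_opts.num_threads into it after po.Read().
  po.Register("num-threads", &g_num_threads,
              "Number of threads used when computing derived variables");
  po.Read(argc, argv);
  if (po.NumArgs() != 1) {
    po.PrintUsage();
    return 1;
  }
  // ReadKaldiObject() reads the extractor, and the read code calls
  // ComputeDerivedVars(), which now spreads the per-Gaussian work over
  // g_num_threads threads via a TaskSequencer.
  IvectorExtractor extractor;
  ReadKaldiObject(po.GetArg(1), &extractor);
  return 0;
}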

View File

@@ -19,7 +19,7 @@
#include "util/common-utils.h"
#include "ivector/ivector-extractor.h"
#include "thread/kaldi-thread.h"
int main(int argc, char *argv[]) {
try {
@@ -36,6 +36,9 @@ int main(int argc, char *argv[]) {
kaldi::ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");
po.Register("num-threads", &g_num_threads,
"Number of threads used in update");
update_opts.Register(&po);
po.Read(argc, argv);

View File

@@ -27,13 +27,15 @@
// that you have some range of integers, e.g. A ... B-1 (with B > A), and some
// function call that takes a range of integers, and you partition these up into
// a number of blocks.
// Also see kaldi-task-sequence.h which is suitable for parallelizing the processing
// of tasks coming in sequentially from somewhere.
// TODO: if needed, provide a workaround for Windows and other
// non-POSIX-compliant systems, possibly one that does not actually do
// multi-threading.
// Description of MultiThreadPool and it's usage:
// Description of MultiThreadPool and its usage:
//
// Usage of the RunMultiThreadedPersistent is the same as the usage of
// RunMultiThreaded, except that the object provided must inherit MultiThreadable
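
For reference, the block-partitioning idea described at the top of this header (split the integer range A .. B-1 into roughly equal blocks and issue one function call per block) is just index arithmetic. The standalone sketch below is not this header's actual interface; ProcessRange is a hypothetical stand-in for the per-block function call.

#include <cstdio>

// Hypothetical stand-in for the function that handles one contiguous block.
static void ProcessRange(int begin, int end) {
  std::printf("block [%d, %d)\n", begin, end);
}

int main() {
  const int A = 0, B = 103, num_blocks = 4;   // illustrative range and count
  for (int b = 0; b < num_blocks; b++) {
    // Spread the remainder so block sizes differ by at most one.
    int begin = A + static_cast<int>(static_cast<long long>(B - A) * b / num_blocks);
    int end   = A + static_cast<int>(static_cast<long long>(B - A) * (b + 1) / num_blocks);
    ProcessRange(begin, end);
  }
  return 0;
}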