sandbox/lid: various script fixes and updates; improving speed of iVector-extractor model loading by parallelizing derived-variables computation.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3759 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-03-12 04:31:52 +00:00
Parent a1a368dc83
Commit 5c5a4e2f5a
11 changed files with 90 additions and 57 deletions

View File

@@ -75,6 +75,7 @@ if [ $stage -le 1 ]; then
if ($len <= $max_utt_len) {
print SEGMENTS "${utt}-1 ${utt} 0 -1\n";
print UTT2SPK "${utt}-1 $speaker\n";
print UTT2LANG "${utt}-1 $language\n";
} else {
# We will now allow split length to exceed max_utt_len.
$num_split = int(($len + 0.999*$max_utt_len) / $max_utt_len);
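
The rounding here is easy to misread. A minimal standalone sketch (plain C++, with max_utt_len = 60 chosen purely for illustration, not taken from the recipe) shows that the formula behaves like a ceiling division, except that an utterance only marginally longer than max_utt_len stays in one slightly over-length piece instead of spawning a tiny extra segment, which is what the comment above refers to:

#include <cstdio>

// Hypothetical check of the splitting formula above:
//   num_split = int((len + 0.999 * max_utt_len) / max_utt_len)
int main() {
  double max_utt_len = 60.0;                    // illustrative value only
  double lens[] = { 59.0, 60.02, 61.0, 150.0 }; // made-up utterance lengths
  for (double len : lens) {
    int num_split = static_cast<int>((len + 0.999 * max_utt_len) / max_utt_len);
    std::printf("len = %6.2f -> num_split = %d, piece length = %.2f\n",
                len, num_split, len / num_split);
  }
  // len = 60.02 gives num_split = 1, so the single piece exceeds
  // max_utt_len slightly rather than creating a ~0.02-second fragment.
  return 0;
}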

View File

@@ -63,12 +63,6 @@ lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/train \
lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/lre07 \
exp/make_vad $vaddir
# Use 4k of the 14k utterances for testing, but make sure the speakers do not
# overlap with the rest of the data, which will be used for training.
#utils/subset_data_dir.sh --speakers data/all 4000 data/lre07
#utils/filter_scp.pl --exclude data/lre07/spk2utt < data/all/spk2utt | awk '{print $1}' > foo
#utils/subset_data_dir.sh --spk-list foo data/all data/train
utils/subset_data_dir.sh data/train 5000 data/train_5k
utils/subset_data_dir.sh data/train 10000 data/train_10k
@@ -82,7 +76,6 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_10k \
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \
exp/full_ubm_2048_10k exp/full_ubm_2048
lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \
--num-iters 5 exp/full_ubm_2048/final.ubm data/train \
exp/extractor_2048
@@ -91,6 +84,4 @@ lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \
exp/extractor_2048 data/train exp/ivectors_train
lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \
exp/extractor_2048 data/lre07 exp/ivectors_test
exp/extractor_2048 data/lre07 exp/ivectors_lre07

View File

@@ -8,15 +8,13 @@
. cmd.sh
. path.sh
set -e
mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc
config=conf/logistic-regression.conf
awk '{print $2}' <(utils/remove_dialect.pl data/train/utt2lang) | sort -u | \
awk '{print $1, NR-1}' > exp/ivectors_train/languages.txt
log=exp/ivectors_train/log/logistic_regression.log
model=exp/ivectors_train/logistic_regression
model_rebalanced=exp/ivectors_train/logistic_regression_rebalanced
@@ -40,12 +38,12 @@ utils/balance_priors_to_test.pl \
exp/ivectors_train/priors.vec
logistic-regression-train --config=$config scp:$train_ivectors \
"$classes" $model 2>$log
"$classes" $model \
2>exp/ivectors_train/log/logistic_regression.log
( logistic-regression-train --config=$config scp:$train_ivectors \
"$classes" - | \
logistic-regression-copy --scale-priors=exp/ivectors_train/priors.vec - \
$model_rebalanced ) 2>$log
logistic-regression-copy --scale-priors=exp/ivectors_train/priors.vec \
$model $model_rebalanced
trials="utils/remove_dialect.pl data/train/utt2lang \
| utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt -|"
@@ -57,7 +55,7 @@ logistic-regression-eval $model scp:$train_ivectors \
logistic-regression-eval $model "ark:$trials" scp:$train_ivectors "$scores"
logistic-regression-eval $model scp:$train_ivectors ark,t:- | \
cat exp/ivectors_train/posteriors | \
awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
{ max=$f; argmax=f; }}
print $1, (argmax - 3); }' | \
@@ -66,29 +64,28 @@ logistic-regression-eval $model scp:$train_ivectors ark,t:- | \
# note: we treat the language as a sentence; it happens that the WER/SER
# corresponds to the recognition error rate.
compute-wer --text ark:<(utils/remove_dialect.pl data/train/utt2lang) \
compute-wer --mode=present --text ark:<(utils/remove_dialect.pl data/train/utt2lang) \
ark:exp/ivectors_train/output
# It perfectly classifies the training data:
#%WER 0.00 [ 0 / 10173, 0 ins, 0 del, 0 sub ]
#%SER 0.00 [ 0 / 10173 ]
#Scored 10173 sentences, 0 not present in hyp.
#%WER 4.68 [ 3355 / 71668, 0 ins, 0 del, 3355 sub ] [PARTIAL]
#%SER 4.68 [ 3355 / 71668 ]
#Scored 71668 sentences, 16 not present in hyp.
logistic-regression-eval $model_rebalanced \
scp:exp/ivectors_test/ivector.scp ark,t:- | \
scp:exp/ivectors_lre07/ivector.scp ark,t:- | \
awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
{ max=$f; argmax=f; }}
print $1, (argmax - 3); }' | \
utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/ivectors_test/output
utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/ivectors_lre07/output
# someone needs to extend this to run on the dev data.
compute-wer --text ark:<(utils/remove_dialect.pl data/lre07/utt2lang)\
ark:exp/ivectors_test/output
# compute-wer --text ark:/dev/fd/63 ark:exp/lre07/output
# %WER 58.83 [ 3958 / 7527, 0 ins, 0 del, 3958 sub ]
# %SER 58.83 [ 3958 / 7527 ]
compute-wer --text ark:<(utils/remove_dialect.pl data/lre07/utt2lang) \
ark:exp/ivectors_lre07/output
> compute-wer --text ark:/dev/fd/63 ark:exp/ivectors_lre07/output
# %WER 34.34 [ 2585 / 7527, 0 ins, 0 del, 2585 sub ]
# %SER 34.34 [ 2585 / 7527 ]
# Scored 7527 sentences, 0 not present in hyp.

View File

@@ -77,13 +77,13 @@ else
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
split_scps=""
for ((n=1; n<=nj; n++)); do
split_scps="$split_scps $logdir/wav.$n.scp"
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
done
utils/split_scp.pl $scp $split_scps || exit 1;
$cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp ark:- \| \
compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav_${name}.JOB.scp ark:- \| \
copy-feats --compress=$compress ark:- \
ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
|| exit 1;
@@ -102,7 +102,7 @@ for ((n=1; n<=nj; n++)); do
cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1;
done > $data/feats.scp
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`

View File

@@ -228,12 +228,16 @@ if [ -f $data/spk2gender ]; then
fi
fi
if [ -f $data/vad.scp ]; then
check_sorted_and_uniq $data/vad.scp
if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
<( awk '{print $1}' $data/vad.scp ); then
echo "$0: error: in $data, vad.scp and utt2spk do not have identical utterance-id list"
# check some optionally-required things
for f in vad.scp utt2lang; do
if [ -f $data/$f ]; then
check_sorted_and_uniq $data/$f
if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
<( awk '{print $1}' $data/$f ); then
echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
exit 1;
fi
fi
fi
done
echo "Successfully validated data-directory $data"

View File

@@ -197,6 +197,17 @@ IvectorExtractor::IvectorExtractor(
ComputeDerivedVars();
}
class IvectorExtractorComputeDerivedVarsClass {
public:
IvectorExtractorComputeDerivedVarsClass(IvectorExtractor *extractor,
int32 i):
extractor_(extractor), i_(i) { }
void operator () () { extractor_->ComputeDerivedVars(i_); }
private:
IvectorExtractor *extractor_;
int32 i_;
};
void IvectorExtractor::ComputeDerivedVars() {
KALDI_LOG << "Computing derived variables for iVector extractor";
gconsts_.Resize(NumGauss());
@@ -206,17 +217,32 @@ void IvectorExtractor::ComputeDerivedVars() {
// the gconsts don't contain any weight-related terms.
}
U_.Resize(NumGauss(), IvectorDim() * (IvectorDim() + 1) / 2);
SpMatrix<double> temp_U(IvectorDim());
for (int32 i = 0; i < NumGauss(); i++) {
// temp_U = M_i^T Sigma_i^{-1} M_i
temp_U.AddMat2Sp(1.0, M_[i], kTrans, Sigma_inv_[i], 0.0);
SubVector<double> temp_U_vec(temp_U.Data(),
IvectorDim() * (IvectorDim() + 1) / 2);
U_.Row(i).CopyFromVec(temp_U_vec);
// Note, we could have used RunMultiThreaded for this and similar tasks we
// have here, but we found that we don't get as complete CPU utilization as we
// could because some tasks finish before others.
{
TaskSequencerConfig sequencer_opts;
sequencer_opts.num_threads = g_num_threads;
TaskSequencer<IvectorExtractorComputeDerivedVarsClass> sequencer(
sequencer_opts);
for (int32 i = 0; i < NumGauss(); i++)
sequencer.Run(new IvectorExtractorComputeDerivedVarsClass(this, i));
}
KALDI_LOG << "Done.";
}
void IvectorExtractor::ComputeDerivedVars(int32 i) {
SpMatrix<double> temp_U(IvectorDim());
// temp_U = M_i^T Sigma_i^{-1} M_i
temp_U.AddMat2Sp(1.0, M_[i], kTrans, Sigma_inv_[i], 0.0);
SubVector<double> temp_U_vec(temp_U.Data(),
IvectorDim() * (IvectorDim() + 1) / 2);
U_.Row(i).CopyFromVec(temp_U_vec);
}
void IvectorExtractor::GetIvectorDistWeight(
const IvectorExtractorUtteranceStats &utt_stats,
@@ -986,7 +1012,7 @@ double IvectorStats::UpdateProjections(
double tot_impr = 0.0;
{
TaskSequencerConfig sequencer_opts;
sequencer_opts.num_threads = opts.num_threads;
sequencer_opts.num_threads = g_num_threads;
TaskSequencer<IvectorExtractorUpdateProjectionClass> sequencer(
sequencer_opts);
for (int32 i = 0; i < I; i++)
@@ -1149,7 +1175,7 @@ double IvectorStats::UpdateWeights(
double tot_impr = 0.0;
{
TaskSequencerConfig sequencer_opts;
sequencer_opts.num_threads = opts.num_threads;
sequencer_opts.num_threads = g_num_threads;
TaskSequencer<IvectorExtractorUpdateWeightClass> sequencer(
sequencer_opts);
for (int32 i = 0; i < I; i++)
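
The comment above about RunMultiThreaded not achieving full CPU utilization comes down to static versus dynamic work assignment. Below is a self-contained sketch in plain C++11 threads (it does not use Kaldi's TaskSequencer interface, and FakeTask is a made-up stand-in for ComputeDerivedVars(i)) illustrating the dynamic-dispatch idea: each thread pulls the next unfinished index, so no thread goes idle until the per-Gaussian tasks run out, whereas a static split into equal blocks finishes only when the slowest block does.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Stand-in for per-Gaussian work such as ComputeDerivedVars(i); the cost is
// made deliberately uneven across indices.
static void FakeTask(int i) {
  volatile double x = 0.0;
  for (int k = 0; k < (i % 7 + 1) * 100000; k++) x += k;
}

int main() {
  const int num_tasks = 2048, num_threads = 4;
  std::atomic<int> next(0);          // shared counter handing out task indices
  std::vector<std::thread> pool;
  for (int t = 0; t < num_threads; t++)
    pool.emplace_back([&next, num_tasks]() {
      // Grab indices one at a time until none are left.
      for (int i = next++; i < num_tasks; i = next++) FakeTask(i);
    });
  for (std::thread &th : pool) th.join();
  std::printf("processed %d tasks on %d threads\n", num_tasks, num_threads);
  return 0;
}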

View File

@@ -57,6 +57,7 @@ struct IvectorEstimationOptions {
class IvectorExtractor;
class IvectorExtractorComputeDerivedVarsClass;
/// These are the stats for a particular utterance, i.e. the sufficient stats
/// for estimating an iVector (if need_2nd_order_stats == true, we can also
@@ -229,7 +230,9 @@ class IvectorExtractor {
// because they do what we want.
protected:
void ComputeDerivedVars();
void ComputeDerivedVars(int32 i);
friend class IvectorExtractorComputeDerivedVarsClass;
// Imagine we'll project the iVectors with transformation T, so apply T^{-1}
// where necessary to keep the model equivalent. Used to keep unit variance
// (like prior re-estimation).
@@ -311,8 +314,7 @@ struct IvectorExtractorEstimationOptions {
double gaussian_min_count;
int32 num_threads;
IvectorExtractorEstimationOptions(): variance_floor_factor(0.1),
gaussian_min_count(100.0),
num_threads(1) { }
gaussian_min_count(100.0) { }
void Register(OptionsItf *po) {
po->Register("variance-floor-factor", &variance_floor_factor,
"Factor that determines variance flooring (we floor each covar "
@@ -320,8 +322,6 @@
po->Register("gaussian-min-count", &gaussian_min_count,
"Minimum total count per Gaussian, below which we refuse to "
"update any associated parameters.");
po->Register("num-threads", &num_threads,
"Number of threads used in iVector estimation program");
}
};

View File

@@ -131,6 +131,9 @@ int main(int argc, char *argv[]) {
posteriors_rspecifier = po.GetArg(3),
ivectors_wspecifier = po.GetArg(4);
// g_num_threads affects how ComputeDerivedVars is called when we read the
// extractor.
g_num_threads = sequencer_config.num_threads;
IvectorExtractor extractor;
ReadKaldiObject(ivector_extractor_rxfilename, &extractor);

View File

@@ -102,6 +102,12 @@ int main(int argc, char *argv[]) {
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
// This is a bit of a mess... the code that reads in the extractor calls
// ComputeDerivedVars, and it can do this multi-threaded, controlled by
// g_num_threads. So if the user specified the --num-threads option, which
// goes to sequencer_opts in this case, copy it to g_num_threads.
g_num_threads = sequencer_opts.num_threads;
IvectorExtractor extractor;
ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
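
Both binaries above follow the same pattern: make sure g_num_threads reflects the requested thread count before the extractor is read, since reading is what triggers the now multi-threaded ComputeDerivedVars(). A condensed sketch of that pattern follows; the option wiring is assembled from the diffs above rather than copied from any single binary, and the usage text and program structure are illustrative only.

#include "util/common-utils.h"
#include "ivector/ivector-extractor.h"
#include "thread/kaldi-thread.h"

int main(int argc, char *argv[]) {
  using namespace kaldi;
  const char *usage = "Illustrative sketch: load an iVector extractor.\n"
      "Usage: sketch [options] <extractor-in>\n";
  ParseOptions po(usage);
  // Either register the option onto g_num_threads directly, or (as in the
  // binaries above) copy sequencer_opts.num_threads into it after po.Read().
  po.Register("num-threads", &g_num_threads,
              "Number of threads used when computing derived variables");
  po.Read(argc, argv);
  if (po.NumArgs() != 1) {
    po.PrintUsage();
    return 1;
  }
  // ReadKaldiObject() reads the extractor, and the read code calls
  // ComputeDerivedVars(), which now spreads the per-Gaussian work over
  // g_num_threads threads via a TaskSequencer.
  IvectorExtractor extractor;
  ReadKaldiObject(po.GetArg(1), &extractor);
  return 0;
}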

View File

@@ -19,7 +19,7 @@
#include "util/common-utils.h"
#include "ivector/ivector-extractor.h"
#include "thread/kaldi-thread.h"
int main(int argc, char *argv[]) {
try {
@@ -36,6 +36,9 @@ int main(int argc, char *argv[]) {
kaldi::ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");
po.Register("num-threads", &g_num_threads,
"Number of threads used in update");
update_opts.Register(&po);
po.Read(argc, argv);

View File

@@ -27,13 +27,15 @@
// that you have some range of integers, e.g. A ... B-1 (with B > A), and some
// function call that takes a range of integers, and you partition these up into
// a number of blocks.
// Also see kaldi-task-sequence.h which is suitable for parallelizing the processing
// of tasks coming in sequentially from somewhere.
// TODO: if needed, provide a workaround for Windows and other
// non-POSIX-compliant systems, possibly one that does not actually do
// multi-threading.
// Description of MultiThreadPool and it's usage:
// Description of MultiThreadPool and its usage:
//
// Usage of the RunMultiThreadedPersistent is the same as the usage of
// RunMultiThreaded, except that the object provided must inherit MultiThreadable
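
For reference, the block-partitioning idea described at the top of this header (split the integer range A .. B-1 into roughly equal blocks and issue one function call per block) is just index arithmetic. The standalone sketch below is not this header's actual interface; ProcessRange is a hypothetical stand-in for the per-block function call.

#include <cstdio>

// Hypothetical stand-in for the function that handles one contiguous block.
static void ProcessRange(int begin, int end) {
  std::printf("block [%d, %d)\n", begin, end);
}

int main() {
  const int A = 0, B = 103, num_blocks = 4;   // illustrative range and count
  for (int b = 0; b < num_blocks; b++) {
    // Spread the remainder so block sizes differ by at most one.
    int begin = A + static_cast<int>(static_cast<long long>(B - A) * b / num_blocks);
    int end   = A + static_cast<int>(static_cast<long long>(B - A) * (b + 1) / num_blocks);
    ProcessRange(begin, end);
  }
  return 0;
}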