sandbox/language_id: script changes for applying VTLN in language-id; not yet tested.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@4174 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-07-19 21:15:24 +00:00
Родитель 4fd9c20c6a
Коммит 6a91edb723
4 изменённых файлов: 172 добавлений и 13 удалений

141
egs/lre/v1/lid/get_vtln_warps.sh Executable file
Просмотреть файл

@ -0,0 +1,141 @@
#!/bin/bash
# Copyright 2014  Daniel Povey
# Apache 2.0
#
# This script takes a data directory and a directory computed by
# ./train_lvtln_model.sh, and it computes speaker warp-factors spk2warp. It
# expects vad.scp to exist in the data directory. Note: like
# train_lvtln_model.sh, it uses features of the speaker-id type, i.e. double
# delta features with sliding window cepstral mean normalization.

# Begin configuration.
stage=0
config=
cmd=run.pl
logdet_scale=0.0
subsample=5 # We use every 5th frame by default; this is more
            # CPU-efficient.
nj=4
cleanup=true
num_gselect=15
refine_transforms=true # if true, do a second pass of transform estimation.
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <data-dir> <vtln-dir> <exp-dir>"
  echo "e.g.: $0 data/train_novtln exp/vtln exp/train_warps"
  echo "where <vtln-dir> is produced by train_lvtln_model.sh"
  echo "Output is <exp-dir>/spk2warp"
  echo "main options (for others, see top of script file)"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --nj <num-jobs>                                  # number of jobs to use (default 4)"
  echo " --config <config-file>                           # config containing options"
  echo " --stage <stage>                                  # stage to do partial re-run from."
  exit 1;
fi

data=$1
vtlndir=$2
dir=$3

# Check for the inputs we need: features and speaker map in the data dir,
# and the LVTLN transform matrices plus the two diagonal UBMs (adapted and
# alignment versions) from train_lvtln_model.sh.
for f in $data/feats.scp $data/spk2utt $vtlndir/final.lvtln $vtlndir/final.dubm $vtlndir/final.ali_dubm; do
  # Fixed: the message used to say "train_deltas.sh" (copy-paste from
  # another script); report this script's own name instead.
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

if [ -f $data/utt2warp ]; then
  echo "$0: source data directory $data appears to already have VTLN.";
  exit 1;
fi

mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;
# Options for sliding-window cepstral mean normalization.
cmvn_sliding_opts="--norm-vars=false --center=true --cmn-window=300"
# don't change $cmvn_sliding_opts, it should probably match the
# options used in ../sid/train_diag_ubm.sh and ./train_lvtln_model.sh
# Speaker-independent features: deltas + sliding CMVN, restricted to voiced
# frames via vad.scp, then subsampled by $subsample for CPU efficiency.
sifeats="ark,s,cs:add-deltas scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding $cmvn_sliding_opts ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
# Adapted features: the same pipeline with the first-pass per-speaker LVTLN
# transforms (trans.0.JOB, created below) applied on top.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk $dir/trans.0.JOB ark:- ark:- |"
# Stage 0a: pre-compute per-frame Gaussian selection against the
# alignment-model UBM; the indices are reused by both transform-estimation
# passes below, saving computation.
if [ $stage -le 0 ]; then
echo "$0: computing Gaussian selection info."
$cmd JOB=1:$nj $dir/log/gselect.JOB.log \
gmm-gselect --n=$num_gselect $vtlndir/final.ali_dubm "$sifeats" \
"ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi
# Stage 0b: first pass of LVTLN estimation.  Posteriors are computed with
# the alignment-model UBM over the speaker-independent features, then
# per-speaker transforms (trans.0.*) and warp factors (warp.0.*) are
# estimated.
if [ $stage -le 0 ]; then
  echo "$0: computing initial LVTLN transforms"
  $cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \
    gmm-global-gselect-to-post $vtlndir/final.ali_dubm "$sifeats" \
      "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
    gmm-global-est-lvtln-trans --spk2utt=$sdata/JOB/spk2utt \
      --logdet-scale=$logdet_scale --verbose=1 \
      $vtlndir/final.dubm $vtlndir/final.lvtln "$sifeats" ark,s,cs:- \
      ark:$dir/trans.0.JOB ark,t:$dir/warp.0.JOB || exit 1
  # consolidate the warps into one file.
  for j in $(seq $nj); do cat $dir/warp.0.$j; done > $dir/warp.0
  rm $dir/warp.0.*
fi

# If no refinement pass was requested, the first-pass warps are the final
# output.  Note: the condition used to be "if $refine_transforms", which was
# inverted -- it exited early exactly when a second pass WAS requested.
if ! $refine_transforms; then
  ln -sf warp.0 $dir/spk2warp
  $cleanup && rm $dir/gselect.*.gz $dir/trans.0.*
  echo "$0: --refine-transforms=false so exiting with current warps."
  echo "$0: Distribution of classes for one job is below."
  grep 'Distribution of classes' $dir/log/lvtln.0.1.log
  exit 0;
fi
# Stage 1: refined LVTLN estimation.  Posteriors are now computed over the
# first-pass-adapted features ("$feats", which applies trans.0.*); the
# refined transforms themselves are discarded (ark:/dev/null) and only the
# warp factors (warp.1.*) are kept.
if [ $stage -le 1 ]; then
  echo "$0: computing refined LVTLN transforms"
  # Fixed: the UBM is read from $vtlndir (where train_lvtln_model.sh put
  # it), not from the output directory $dir.
  $cmd JOB=1:$nj $dir/log/lvtln.1.JOB.log \
    gmm-global-gselect-to-post $vtlndir/final.dubm "$feats" \
      "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
    gmm-global-est-lvtln-trans --spk2utt=$sdata/JOB/spk2utt \
      --logdet-scale=$logdet_scale --verbose=1 \
      $vtlndir/final.dubm $vtlndir/final.lvtln "$sifeats" ark,s,cs:- \
      ark:/dev/null ark,t:$dir/warp.1.JOB || exit 1
  # consolidate the warps into one file.
  for j in $(seq $nj); do cat $dir/warp.1.$j; done > $dir/warp.1
  rm $dir/warp.1.*
  # Compare pass-1 vs pass-2 warps.  Fixed: the files are named warp.0 and
  # warp.1 (as written above), not 0.warp / 1.warp.
  ns1=$(cat $dir/warp.0 | wc -l)
  ns2=$(cat $dir/warp.1 | wc -l)
  ! [ "$ns1" == "$ns2" ] && echo "$0: Number of speakers differ pass1 vs pass2, $ns1 != $ns2" && exit 1;
  # Each warp file has lines "<spk> <warp>"; after paste, $2/$4 are the two
  # warp values.  Report speakers whose warp moved by more than 0.01.
  paste $dir/warp.0 $dir/warp.1 | awk '{x=$2 - $4; if ((x>0?x:-x) > 0.010001) { print $1, $2, $4; }}' > $dir/warp_changed
  nc=$(cat $dir/warp_changed | wc -l)
  echo "$0: For $nc speakers out of $ns1, warp changed pass1 vs pass2 by >0.01, see $dir/warp_changed for details"
fi

$cleanup && rm $dir/gselect.*.gz $dir/trans.0.*

ln -sf warp.1 $dir/spk2warp
echo "$0: created warp factors in $dir/spk2warp"
echo "$0: Distribution of classes for one job is below."
grep 'Distribution of classes' $dir/log/lvtln.1.1.log

# Summarize warning messages...
utils/summarize_warnings.pl $dir/log

echo "$0: Done training LVTLN model in $dir"

Просмотреть файл

@ -172,8 +172,6 @@ if [ $stage -le -3 ]; then
exit 1;
fi
done
rm $dir/final.lvtln 2>/dev/null
ln -s 0.lvtln $dir/final.lvtln
fi
cp $ubmdir/final.dubm $dir/0.dubm

Просмотреть файл

@ -54,16 +54,25 @@ local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
# max_voiced=3000
# local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train
# Vtln-related things:
# We'll use a subset of utterances to train the GMM we'll use for VTLN
# warping.
utils/subset_data_dir.sh data/train 5000 data/train_5k_novtln
use_vtln=true
if $use_vtln; then
for t in train lre07; do
cp -rt data/${t} data/${t}_novtln
rm -r data/${t}_novtln/{split,.backup} 2>/dev/null
steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 100 --cmd "$train_cmd" \
data/${t}_novtln exp/make_mfcc $mfccdir
lid/compute_vad_decision.sh data/${t}_novtln exp/make_mfcc $mfccdir
done
# Vtln-related things:
# We'll use a subset of utterances to train the GMM we'll use for VTLN
# warping.
utils/subset_data_dir.sh data/train_novtln 5000 data/train_novtln_5k
# for the features we use to estimate VTLN warp factors, we use more cepstra
# (13 instead of just 7); this needs to be tuned.
steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 50 --cmd "$train_cmd" \
data/train_5k_novtln exp/make_mfcc $mfccdir
lid/compute_vad_decision.sh data/train_5k_novtln exp/make_mfcc $mfccdir
# note, we're using the speaker-id version of the train_diag_ubm.sh script, which
# uses double-delta instead of SDC features. We train a 256-Gaussian UBM; this
# has to be tuned.
@ -72,7 +81,14 @@ local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
lid/train_lvtln_model.sh --mfcc-config conf/mfcc_vtln.conf --nj 30 --cmd "$train_cmd" \
data/train_5k_novtln exp/diag_ubm_vtln exp/vtln
)
# Estimate per-speaker VTLN warp factors for each dataset from the
# un-warped (novtln) features, then install them in the corresponding
# data directory so the MFCC extraction below applies the warps.
for t in train lre07; do
  lid/get_vtln_warps.sh --nj 30 --cmd "$train_cmd" \
    data/${t}_novtln exp/vtln exp/${t}_warps
  # Fixed: destination was "${t}/" (a nonexistent cwd-relative directory);
  # the warps belong in the data directory data/${t}.
  cp exp/${t}_warps/spk2warp data/${t}/
done
lid/get_vtln_warps.sh --nj 30 --cmd "$train_cmd" \
data/lre07 exp/vtln exp/train_warps
fi
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 100 --cmd "$train_cmd" \
data/train exp/make_mfcc $mfccdir

Просмотреть файл

@ -71,14 +71,18 @@ else
postprocess_config_opt=
fi
# note: in general, the double-parenthesis construct in bash "((" is "C-style
# syntax" where we can get rid of the $ for variable names, and omit spaces.
# The "for" loop in this style is a special construct.
# Build the VTLN options for compute-mfcc-feats.  Initialize to empty so a
# stray $vtln_opts inherited from the environment cannot leak into the
# feature-extraction command when no spk2warp file exists.
vtln_opts=
if [ -f $data/spk2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
fi
if [ -f $data/segments ]; then
echo "$0 [info]: segments file exists: using that."
split_segments=""
# note: in general, the double-parenthesis construct in bash "((" is "C-style
# syntax" where we can get rid of the $ for variable names, and omit spaces.
# The "for" loop in this style is a special construct.
for ((n=1; n<=nj; n++)); do
split_segments="$split_segments $logdir/segments.$n"
done
@ -86,7 +90,7 @@ if [ -f $data/segments ]; then
utils/split_scp.pl $data/segments $split_segments || exit 1;
rm $logdir/.error 2>/dev/null
mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-mfcc-feats --verbose=2 --config=$mfcc_config ark:- ark:- |"
mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- |"
pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
$cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
@ -104,7 +108,7 @@ else
utils/split_scp.pl $scp $split_scps || exit 1;
mfcc_feats="ark:compute-mfcc-feats --verbose=2 --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
mfcc_feats="ark:compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
$cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \