Multilingual SGMM training scripts for GlobalPhone

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1070 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2012-06-20 08:26:32 +00:00 · 2012-06-20 08:26:32 +00:00 · bf91124841
--- a/egs/gp/s1/local/gp_train_multi_sgmm_deltas.sh
+++ b/egs/gp/s1/local/gp_train_multi_sgmm_deltas.sh
@ -0,0 +1,359 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011 Microsoft Corporation  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This is Subspace Gaussian Mixture Model (SGMM) training--
+# see "The subspace Gaussian mixture model--A structured model for speech recognition"
+# by D. Povey et al, Computer Speech and Language, 2011.
+
+# Print the given message on stderr (backslash escapes are interpreted by
+# 'echo -e') and terminate the current shell with exit status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Validate an integer option argument and print it on stdout.
+# Arguments: $1 - either a bare value or a '--switch=VALUE' word.
+# An optional leading '-' is accepted; leading zeros are kept unchanged.
+# Calls error_exit when the value is not an integer.
+function readint () {
+  local value=${1##*=}  # Keep only what follows the last '=' (--switch=ARG)
+  if ! [[ "$value" =~ ^-?[0-9]+$ ]]; then
+    error_exit "Argument \"$value\" not an integer."
+  fi
+  echo $value
+}
+
+function est_alimodel () {
+# If we have speaker vectors, we need an alignment model. This function gets 
+# the Gaussian-level alignments with the speaker vectors but accumulates stats 
+# without any speaker vectors; we re-estimate M, w, c and S to get a model
+# that's compatible with not having speaker vectors. Note that the transitions
+# are not updated since the decoding graph will be shared with the normal model.
+  local lx=$1
+  for L in $LANGUAGES; do
+    wdir=$dir/$L
+    local lspkdim=`sgmm-info $wdir/$lx.mdl | grep speaker | awk '{print $NF}'`
+    if [ "$lspkdim" -le 0 ]; then
+      echo "est_alimodel: No speaker space in model '$wdir/$lx.mdl'. Returning."
+      return
+    fi
+  done
+
+  local y=0;
+  local lflags=MwcS  # First time don't update v
+  while [ $y -lt $numiters_alimdl ]; do
+    [ $y -gt 0 ] && lflags=vMwcS
+    echo "Pass $y of building alignment model, flags = '$lflags'"
+    local lmulti_est_opts=''  # model, acc, model-out, occs-out tuples
+    for L in $LANGUAGES; do 
+    (
+      data=data/$L/train
+      lang=data/$L/lang
+      wdir=$dir/$L
+      local cur_alimdl=$wdir/tmp$y.alimdl
+      [ $y -eq 0 ] && cur_alimdl=$wdir/$lx.mdl
+      feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$wdir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
+      gselect_opt="--gselect=ark,s,cs:gunzip -c $wdir/TASK_ID.gselect.gz|"
+      spkvecs_opt="--spk-vecs=ark:$wdir/TASK_ID.vecs"
+
+      submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/acc_ali${lx}_$y.TASK_ID.log \
+	$sjopts ali-to-post "ark:gunzip -c $wdir/TASK_ID.ali.gz|" ark:- \| \
+          sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \
+          --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk $wdir/$lx.mdl \
+	  "$feats" ark,s,cs:- ark:- \| \
+          sgmm-acc-stats-gpost --update-flags=$lflags $cur_alimdl "$feats" \
+          ark,s,cs:- $wdir/$y.TASK_ID.aliacc \
+	|| { touch $dir/err; \
+	  error_exit "$L; Align model iter $y: Error accumulating stats"; }
+
+      # Summing accs is quite fast; run locally
+      sgmm-sum-accs $wdir/sum.aliacc $wdir/$y.*.aliacc || \
+	{ touch $dir/err; \
+	  error_exit "$L; Align model iter $y: Error summing stats"; }
+    )&  # Accumulate in parallel for different languages
+      wdir=$dir/$L
+      local cur_alimdl=$wdir/tmp$y.alimdl
+      [ $y -eq 0 ] && cur_alimdl=$wdir/$lx.mdl
+      lmulti_est_opts="$lmulti_est_opts $cur_alimdl $wdir/sum.aliacc $wdir/tmp$[$y+1].alimdl $wdir/tmp$[$y+1].occs"
+    done
+    wait
+
+    submit_jobs.sh "$qcmd" --log=$dir/log/update_ali.$y.log $sjopts \
+      sgmm-est-multi --update-flags=$lflags --remove-speaker-space=true \
+	$lmulti_est_opts \
+      || error_exit "Error estimating alignment models on iter $y";
+
+    rm -f $dir/??/$y.*.aliacc $dir/??/sum.aliacc || exit 1;
+    [ $y -gt 0 ]  && rm $dir/??/tmp$y.{alimdl,occs} 
+    y=$[$y+1]
+  done
+
+  for L in $LANGUAGES; do
+    mv $dir/$L/tmp$y.alimdl $dir/$L/$lx.alimdl
+  done
+}
+
+# Defaults for the command-line options parsed below.
+nj=4       # Default number of jobs
+stage=-5   # Default starting stage (start with SGMM initialization)
+qcmd=""    # '--qcmd=...' switch handed to the submit_jobs.sh script
+sjopts=""  # Options for the submit_jobs.sh script
+LANGUAGES='GE PO SP SW'  # Languages processed
+
+PROG=`basename $0`;
+usage="Usage: $PROG [options] <phone-dim> <spk-dim> <ubm> <out-dir>\n
+e.g.: $PROG 40 39 exp/ubm3c/final.ubm exp/sgmm3c\n\n
+Options:\n
+  --help\t\tPrint this message and exit\n
+  --lang STR\tList of languages to process (default = '$LANGUAGES')\n
+  --num-jobs INT\tNumber of parallel jobs to run (default=$nj).\n
+  --qcmd STR\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
+  --sjopts STR\tOptions for the 'submit_jobs.sh' script\n
+  --stage INT\tStarting stage (e.g. -5 for SGMM init; 2 for iter 2; default=$stage)\n
+";
+
+echo "$PROG $@"
+while [ $# -gt 0 ]; do
+  case "${1# *}" in  # ${1# *} strips a single leading space from the argument
+    --help) echo -e $usage; exit 0 ;;
+    --lang) LANGUAGES="$2"; shift 2 ;;
+    --num-jobs)
+      # readint runs in a command substitution, so its error_exit only ends
+      # that subshell; stop here explicitly when it reports a failure.
+      shift; nj=`readint $1` || exit 1;
+      [ $nj -lt 1 ] && error_exit "--num-jobs arg '$nj' not positive.";
+      shift ;;
+    --qcmd)
+      shift; qcmd=" --qcmd=${1}"; shift ;;
+    --sjopts)
+      shift; sjopts="$1"; shift ;;
+    --stage)
+      shift; stage=`readint $1` || exit 1; shift ;;
+    -*)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+    *)   break ;;   # end of options: interpreted as <phone-dim>
+  esac
+done
+
+# Exactly four positional arguments must remain after the options.
+if [ $# != 4 ]; then
+  error_exit $usage;
+fi
+
+[ -f path.sh ] && . path.sh
+
+# This is SGMM with speaker vectors, on top of LDA+[something] features.
+# Any speaker-specific transforms are obtained from the alignment directory.
+# To be run from ..
+
+phndim=$1
+spkdim=$2
+ubm=$3
+dir=$4
+
+[ -f $ubm ] || error_exit "UBM file '$ubm' does not exist"
+mkdir -p $dir/log || error_exit "Cannot create '$dir/log'"
+
+# (1): Model initialization; training graph and initial alignment generation.
+for L in $LANGUAGES; do
+(
+  data=data/$L/train
+  lang=data/$L/lang
+  alidir=exp/$L/tri2a_ali
+  wdir=$dir/$L
+  oov_sym=`cat $lang/oov.txt`
+  mkdir -p $wdir/log || error_exit "Cannot create working directory '$wdir'"
+
+  # Initialize the model (removed the --spk-space-dim option)
+  if [ $stage -le -5 ]; then
+    echo "$L: Initializing model"
+    submit_jobs.sh "$qcmd" --log=$wdir/log/init_sgmm.log $sjopts \
+      sgmm-init --phn-space-dim=$phndim $lang/topo $wdir/tree $ubm \
+	$wdir/0.mdl || { touch $dir/err; error_exit "$L: SGMM init failed."; }
+  fi
+
+  # Make training graphs
+  if [ $stage -le -4 ]; then
+    echo "$L: Compiling training graphs"
+    submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/mkgraphs.TASK_ID.log \
+      $sjopts compile-train-graphs $wdir/tree $wdir/0.mdl $lang/L.fst \
+	"ark:sym2int.pl --map-oov '$oov_sym' --ignore-first-field $lang/words.txt < $data/split$nj/TASK_ID/text |" \
+	"ark:|gzip -c >$wdir/TASK_ID.fsts.gz" \
+      || { touch $dir/err; error_exit "$L: Error compiling training graphs"; }
+  fi
+
+  if [ $stage -le -3 ]; then
+    echo "$L: Converting alignments"
+    submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/convert.TASK_ID.log \
+      $sjopts convert-ali $alidir/final.mdl $wdir/0.mdl $wdir/tree \
+	"ark:gunzip -c $alidir/TASK_ID.ali.gz|" \
+	"ark:|gzip -c >$wdir/TASK_ID.ali.gz" \
+      || { touch $dir/err; error_exit "$L: Convert alignment failed."; }
+  fi
+
+  if [ $stage -le -2 ]; then
+    echo "$L: Computing cepstral mean and variance statistics"
+    submit_jobs.sh "$qcmd" --njobs=$nj $sjopts --log=$wdir/log/cmvn.TASK_ID.log \
+      compute-cmvn-stats --spk2utt=ark:$data/split$nj/TASK_ID/spk2utt \
+	scp:$data/split$nj/TASK_ID/feats.scp ark:$wdir/TASK_ID.cmvn \
+      || { touch $dir/err; error_exit "$L: Computing CMN/CVN stats failed."; }
+  fi
+
+  feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$wdir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+  if [ $stage -le -1 ]; then
+    echo "$L: Doing Gaussian selection"
+    submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/gselectTASK_ID.log \
+      $sjopts sgmm-gselect $wdir/0.mdl "$feats" "ark,t:|gzip -c > $wdir/TASK_ID.gselect.gz" \
+      || { touch $dir/err; error_exit "$L: Error doing Gaussian selection"; }
+  fi
+)&  # Run the language-specific initializations in parallel
+done
+wait
+[ -f $dir/err ] && { rm $dir/err; error_exit "Error initializing models."; }
+
+# Language independent constants
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+numiters_alimdl=3 # Number of iterations for estimating alignment model.
+incsub_interval=8   # increase substates every 8 iterations
+# total substates after each such increment
+total_substates=( 5000 7000 9000 12000 16000 20000 25000 30000 35000 40000 )
+# For a given number of substates, iterate for $incsub_interval iterations
+numiters=$[(${#total_substates[@]}+1)*$incsub_interval]
+realign_interval=4  # realign every 4 iterations
+spkvec_start=8      # use speaker subspace *after* 8 iterations
+spkvec_interval=2   # reestimate the speaker vectors every 2 iterations
+randprune=0.1
+
+# Initially don't have speaker vectors, but change this after we estimate them.
+spkvecs_gen=0
+
+# Main training loop: pass $x reads $dir/<lang>/$x.mdl for every language and
+# writes $dir/<lang>/$[$x+1].mdl through a single joint sgmm-est-multi update.
+x=0
+while [ $x -lt $numiters ]; do
+  if [ $x -eq 0 ]; then
+    flags=v  # On first iter, don't update M or N.
+  elif [ $spkdim -gt 0 -a $[$x%2] -eq 0 -a $x -gt $spkvec_start ]; then
+    # Update N on even iterations after the speaker space has been added.
+    # NOTE(review): this comment used to say "odd iterations" while the test
+    # has always selected even ones; confirm which was intended.
+    flags=NwSvct
+  else  # Else update M but not N.
+    flags=MwSvct
+  fi
+
+  # Does this pass (re-)estimate the speaker vectors? Decided here, in the
+  # main shell, because the per-language work below runs in subshells whose
+  # variable assignments are lost when they finish.
+  est_spkvecs=0
+  if [ $spkdim -gt 0 -a $x -gt $spkvec_start \
+      -a $[$x%$spkvec_interval] -eq 0 ]; then
+    est_spkvecs=1
+  fi
+
+  if [ $stage -le $x ]; then
+    echo "Pass $x: update flags = '$flags' "
+    multi_est_opts=''  # Will contain model, acc, model-out, occs-out tuples
+    for L in $LANGUAGES; do
+    (
+      data=data/$L/train
+      lang=data/$L/lang
+      wdir=$dir/$L
+      feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$wdir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
+      gselect_opt="--gselect=ark,s,cs:gunzip -c $wdir/TASK_ID.gselect.gz|"
+      if [ $spkdim -gt 0 -a $spkvecs_gen -eq 1 ]; then
+        spkvecs_opt="--spk-vecs=ark:$wdir/TASK_ID.vecs"
+      else
+        spkvecs_opt=''
+      fi
+      silphonelist=`cat $lang/silphones.csl`
+
+      if [ $[$x%$realign_interval] -eq 0 -a $x -gt 0 ]; then
+        echo "$L; iter $x: Aligning data"
+        submit_jobs.sh "$qcmd" $sjopts --log=$wdir/log/align.$x.TASK_ID.log \
+          --njobs=$nj sgmm-align-compiled $spkvecs_opt $scale_opts \
+            "$gselect_opt" --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk \
+            --beam=8 --retry-beam=40 $wdir/$x.mdl \
+            "ark:gunzip -c $wdir/TASK_ID.fsts.gz|" "$feats" \
+            "ark:|gzip -c >$wdir/TASK_ID.ali.gz" || \
+            { touch $dir/err; error_exit "$L, it $x: Error realigning data"; }
+      fi
+
+      if [ $est_spkvecs -eq 1 ]; then
+        echo "$L; iter $x: Computing speaker vectors"
+        # Written to tmp*.vecs first: the previous vectors (if any) are still
+        # being read through $spkvecs_opt while the new ones are estimated.
+        submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/spkvecs.$x.TASK_ID.log \
+          $sjopts ali-to-post "ark:gunzip -c $wdir/TASK_ID.ali.gz|" ark:- \| \
+          weight-silence-post 0.01 $silphonelist $wdir/$x.mdl ark:- ark:- \| \
+          sgmm-est-spkvecs --spk2utt=ark:$data/split$nj/TASK_ID/spk2utt \
+          $spkvecs_opt "$gselect_opt" --rand-prune=$randprune $wdir/$x.mdl \
+          "$feats" ark,s,cs:- ark:$wdir/tmpTASK_ID.vecs || \
+          { touch $dir/err; error_exit "$L, it $x: Error computing spkvecs"; }
+        for n in `seq 1 $nj`; do
+          mv $wdir/tmp${n}.vecs $wdir/${n}.vecs;
+        done
+        # Accumulate with the vectors just estimated. Without this the first
+        # speaker-vector pass would gather statistics for N (see $flags)
+        # while ignoring the speaker vectors.
+        spkvecs_opt="--spk-vecs=ark:$wdir/TASK_ID.vecs"
+      fi
+
+      submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/acc.$x.TASK_ID.log \
+        $sjopts sgmm-acc-stats --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk \
+          --update-flags=$flags --rand-prune=$randprune $spkvecs_opt \
+          "$gselect_opt" $wdir/$x.mdl "$feats" \
+          "ark,s,cs:ali-to-post 'ark:gunzip -c $wdir/TASK_ID.ali.gz|' ark:-|" \
+          $wdir/$x.TASK_ID.acc || \
+          { touch $dir/err; error_exit "$L, it $x: Error accumulating stats"; }
+
+      # Summing accs is quite fast; run locally
+      sgmm-sum-accs $wdir/sum.acc $wdir/$x.*.acc || \
+          { touch $dir/err; error_exit "$L, it $x: Error summing stats"; }
+    ) &  # Accumulate in parallel for different languages
+      wdir=$dir/$L
+      multi_est_opts="$multi_est_opts $wdir/$x.mdl $wdir/sum.acc $wdir/$[$x+1].mdl $wdir/$[$x+1].occs"
+    done
+    wait
+    [ -f $dir/err ] && \
+      { rm $dir/err; error_exit "Iter $x: Error in accumulation"; }
+
+    # The speaker and phone dimensions are (re)set on two passes: when the
+    # speaker space starts and again at twice that iteration.
+    add_dim_opts=''
+    if [ $x -eq $spkvec_start -o $x -eq $[$spkvec_start*2] ]; then
+      add_dim_opts="--increase-spk-dim=$spkdim --increase-phn-dim=$phndim"
+    fi
+    # On the first pass of each block after the first, split to the next
+    # substate target.
+    split_opts=''
+    if [ $[$x%$incsub_interval] -eq 1 -a $x -gt 1 ]; then
+      index=$[($x/$incsub_interval)-1]
+      numsubstates=${total_substates[$index]}
+      split_opts="--split-substates=$numsubstates"
+    fi
+
+    submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log $sjopts \
+      sgmm-est-multi --update-flags=$flags $split_opts $add_dim_opts \
+        $multi_est_opts || error_exit "Error in pass $x estimation."
+
+    # Per-language bookkeeping is done by looping over $LANGUAGES (not a
+    # '??' glob) so that language names of any length are handled.
+    # If using speaker vectors, estimate alignment model without spkvecs
+    if [ $[$x%$incsub_interval] -eq 0 -a $x -gt 0 ]; then
+      for L in $LANGUAGES; do
+        chmod -w $dir/$L/$x.mdl $dir/$L/$x.occs  # Preserve for scoring
+      done
+      [ $spkdim -gt 0 ] && est_alimodel $x;
+    else
+      for L in $LANGUAGES; do
+        rm -f $dir/$L/$x.mdl $dir/$L/$x.occs
+      done
+    fi
+    for L in $LANGUAGES; do
+      rm -f $dir/$L/$x.*.acc $dir/$L/sum.acc
+    done
+  fi  # End of current stage
+
+  # From the first estimation pass onwards speaker vectors exist on disk.
+  # This is recorded outside the stage check so that a run resumed with
+  # --stage keeps using the vectors written by the earlier run.
+  if [ $est_spkvecs -eq 1 ]; then
+    spkvecs_gen=1
+  fi
+  x=$[$x+1];
+done
+
+for L in $LANGUAGES; do
+  ( 
+    wdir=$dir/$L
+    rm -f $wdir/final.mdl $wdir/final.occs;
+    chmod -w $wdir/$x.mdl $wdir/$x.occs  # Preserve for scoring
+    ln -s $wdir/$x.mdl $wdir/final.mdl; 
+    ln -s $wdir/$x.occs $wdir/final.occs;
+    # If using speaker vectors, estimate alignment model without spkvecs
+    [ $spkdim -gt 0 ] && est_alimodel $wdir/$x.mdl;
+    rm -f $wdir/final.alimdl;
+    ln -sf $wdir/$x.alimdl $wdir/final.alimdl;
+
+    # Print out summary of the warning messages.
+    for x in $wdir/log/*.log; do 
+      n=`grep WARNING $x | wc -l`; 
+      if [ $n -ne 0 ]; then echo "$n warnings in $x"; fi;
+    done
+  )
+done
+
+echo Done
--- a/egs/gp/s1/local/gp_train_multi_ubm.sh
+++ b/egs/gp/s1/local/gp_train_multi_ubm.sh
@ -0,0 +1,148 @@
+#!/bin/bash
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# Train UBM from a trained HMM/GMM system using (e.g. MFCC) + delta + 
+# acceleration features and cepstral mean normalization. 
+# Alignment directory is used for the CMN and transforms.
+# A UBM is just a single mixture of Gaussians (full-covariance, in our case), 
+# that's trained on all the data.  This will later be used in SGMM training.
+
+# Print the given message on stderr (backslash escapes are interpreted by
+# 'echo -e') and terminate the current shell with exit status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Validate an integer option argument and print it, normalized, on stdout.
+# Arguments: $1 - either a bare value or a '--switch=VALUE' word.
+# An optional leading '-' is accepted. All leading zeros are removed while at
+# least one digit is kept, so "007" prints 7 and "0" prints 0.
+# Calls error_exit when the value is not an integer.
+function readint () {
+  local retval=${1/#*=/};  # In case --switch=ARG format was used
+  [[ "$retval" =~ ^-?[0-9]+$ ]] \
+    || error_exit "Argument \"$retval\" not an integer."
+  local sign=''
+  if [[ "$retval" == -* ]]; then
+    sign='-'
+    retval=${retval#-}
+  fi
+  # Strip any leading 0's, one at a time, but never the last digit.
+  while [[ "$retval" == 0?* ]]; do
+    retval=${retval#0}
+  done
+  [ "$retval" = 0 ] && sign=''  # avoid printing "-0"
+  echo "$sign$retval"
+}
+
+# Defaults for the command-line options parsed below.
+nj=4      # Default number of jobs
+qcmd=""   # '--qcmd=...' switch handed to the submit_jobs.sh script
+sjopts="" # Options for the submit_jobs.sh script
+
+PROG=`basename $0`;
+usage="Usage: $PROG [options] <num-comp> <out-dir>\n
+e.g.: $PROG 400 exp/ubm3a\n\n
+Options:\n
+  --help\t\tPrint this message and exit\n
+  --num-jobs INT\tNumber of parallel jobs to run (default=$nj).\n
+  --qcmd STR\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
+  --sjopts STR\tOptions for the 'submit_jobs.sh' script\n
+";
+
+while [ $# -gt 0 ]; do
+  case "${1# *}" in  # ${1# *} strips a single leading space from the argument
+    --help) echo -e $usage; exit 0 ;;
+    --num-jobs)
+      # readint runs in a command substitution, so its error_exit only ends
+      # that subshell; stop here explicitly when it reports a failure.
+      shift; nj=`readint $1` || exit 1;
+      [ $nj -lt 1 ] && error_exit "--num-jobs arg '$nj' not positive.";
+      shift ;;
+    --qcmd)
+      shift; qcmd=" --qcmd=${1}"; shift ;;
+    --sjopts)
+      shift; sjopts="$1"; shift ;;
+    -*)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+    *)   break ;;   # end of options: interpreted as number of components
+  esac
+done
+
+# Exactly two positional arguments must remain after the options.
+if [ $# != 2 ]; then
+  error_exit $usage;
+fi
+
+# Positional arguments: number of UBM components and output directory.
+numcomps=$1
+dir=$2
+
+LANGUAGES='GE PO SP SW'  # Languages processed
+[ -f path.sh ] && . path.sh
+mkdir -p $dir/{data,log} \
+  || error_exit "Cannot create '$dir/data' and '$dir/log'"
+
+# Pool the training data of all languages into one data directory, each
+# file sorted on its first field.
+for f in feats.scp spk2utt utt2spk text wav.scp; do
+  for L in $LANGUAGES; do
+    cat data/$L/train/$f
+  done \
+    | sort -k1,1 > $dir/data/$f
+done
+data=$dir/data
+split_data.sh $data $nj
+
+# typically: --intermediate-numcomps=2000 --ubm-numcomps=400
+intermediate=$[$numcomps*5]
+
+# Cluster each language's GMM system into a language-specific UBM. The
+# clusterings run in parallel; a failure is signalled through the marker
+# file $dir/.error, so clear any marker left behind by an aborted run first.
+rm -f $dir/.error
+merge_ubms=
+for L in $LANGUAGES; do
+  alidir=exp/$L/tri2a_ali
+  merge_ubms=$merge_ubms" $dir/${L}.ubm"
+  echo "Language '$L': Clustering model $alidir/final.mdl to get initial UBM"
+  (
+    submit_jobs.sh "$qcmd" --log=$dir/log/cluster_$L.log $sjopts \
+      init-ubm --intermediate-numcomps=$intermediate --ubm-numcomps=$numcomps \
+        --verbose=2 --fullcov-ubm=true $alidir/final.mdl $alidir/final.occs \
+        $dir/${L}.ubm || touch $dir/.error
+  ) &  # Run the language-specific clusterings in parallel
+done
+wait
+[ -f $dir/.error ] && \
+  { rm $dir/.error; error_exit "UBM initialization failed."; }
+
+echo "Merging language-specific UBMs to a global UBM."
+# Everything below needs $dir/0.ubm, so stop right away if merging fails.
+fgmm-global-merge $dir/0.ubm $dir/ubm_sizes $merge_ubms \
+  || error_exit "Error merging language-specific UBMs."
+
+echo "Computing cepstral mean and variance statistics"
+submit_jobs.sh "$qcmd" --njobs=$nj $sjopts --log=$dir/log/cmvn.TASK_ID.log \
+  compute-cmvn-stats --spk2utt=ark:$data/split$nj/TASK_ID/spk2utt \
+    scp:$data/split$nj/TASK_ID/feats.scp ark:$dir/TASK_ID.cmvn \
+    || error_exit "Computing CMN/CVN stats failed.";
+
+# Feature pipeline: mean-normalized features plus deltas.
+feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+# First do Gaussian selection to 100 components, which will be used
+# as the initial screen for all further passes.
+ngselect=100
+submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/gselect_diag.TASK_ID.log \
+  $sjopts gmm-gselect --n=$ngselect "fgmm-global-to-gmm $dir/0.ubm - |" \
+    "$feats" "ark:|gzip -c >$dir/gselect_diag.TASK_ID.gz" \
+    || error_exit "Error doing GMM selection";
+gs_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect_diag.TASK_ID.gz|"
+
+ngselect=50  # During iterations select 50 components
+# Four passes; pass $x reads $dir/$x.ubm and writes $dir/$[$x+1].ubm.
+for x in 0 1 2 3; do
+  echo "Pass $x"
+  submit_jobs.sh "$qcmd" --njobs=$nj $sjopts --log=$dir/log/acc.$x.TASK_ID.log \
+    gmm-gselect --n=$ngselect "$gs_opt" "fgmm-global-to-gmm $dir/$x.ubm - |" \
+      "$feats" ark:- \| \
+    fgmm-global-acc-stats --gselect=ark,s,cs:- $dir/$x.ubm "$feats" \
+      $dir/$x.TASK_ID.acc \
+    || error_exit "Error accumulating stats for UBM estimation on pass $x."
+
+  # Only remove low-count Gaussians on last iter-- keeps gselect info valid.
+  lowcount_opt="--remove-low-count-gaussians=false"
+  [ $x -eq 3 ] && lowcount_opt=
+
+  submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log $sjopts \
+    fgmm-global-est $lowcount_opt --verbose=2 $dir/$x.ubm \
+      "fgmm-global-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].ubm \
+      || error_exit "Error estimating UBM on pass $x.";
+  rm $dir/$x.*.acc $dir/$x.ubm
+done
+
+# The model written by the last pass (4.ubm) becomes the final UBM.
+rm $dir/gselect_diag.*.gz
+rm -f $dir/final.ubm
+mv $dir/4.ubm $dir/final.ubm || exit 1;