git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@540 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2011-09-29 21:13:41 +00:00
Parent b30395ad00
Commit 669479471e
9 changed files with 537 additions and 9 deletions

View File

@@ -111,7 +111,6 @@ exp/decode_sgmme_fmllr/wer:Average WER is 2.266018 (284 / 12533)
#### Note: stuff below this line may be out of date / not computed
# with most recent version of toolkit.
# note: when changing (phn,spk) dimensions from (40,39) -> (30,30),

View File

@@ -205,8 +205,8 @@ defaultfeats="$basefeats transform-feats $dir/B.mat ark:- ark:- |"
rm $dir/$x.acc2
# The following files may be useful for display purposes.
for n in 1 2 3 4 5 6 7 8 9 10 11 12 13 14; do
cat $dir/warps/$n.warp | scripts/process_warps.pl $data/spk2gender > $dir/warps/$n.warp_info
for y in 1 2 3 4 5 6 7 8 9 10 11 12 13 14; do
cat $dir/warps/$y.warp | scripts/process_warps.pl $data/spk2gender > $dir/warps/$y.warp_info
done
( cd $dir; rm final.mdl 2>/dev/null;

View File

@@ -87,6 +87,14 @@ scripts/decode.sh steps/decode_deltas.sh exp/tri2a/graph_tgpr data/dev_nov93 exp
# Train tri2b, which is LDA+MLLT, on si84 data.
steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b
scripts/mkgraph.sh data/lang_test_tgpr exp/tri2b exp/tri2b/graph_tgpr
scripts/decode.sh steps/decode_lda_mllt.sh exp/tri2b/graph_tgpr data/eval_nov92 exp/tri2b/decode_tgpr_eval92
scripts/decode.sh steps/decode_lda_mllt.sh exp/tri2b/graph_tgpr data/dev_nov93 exp/tri2b/decode_tgpr_dev93
# Align tri2b system with si84 data.
steps/align_lda_mllt.sh data/train_si84 data/lang exp/tri2b exp/tri2b_ali_si84
steps/train_lda_et.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2c
# exp/decode_mono_tgpr_eval92 exp/graph_mono_tg_pruned/HCLG.fst steps/decode_mono.sh data/eval_nov92.scp
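Once tri2c (LDA+ET) is trained, it could be graphed and decoded following the same pattern as tri2b above. A hedged sketch, not part of the committed run.sh, assuming the new ET decoding script is installed as steps/decode_lda_et.sh and using illustrative tri2c output paths:
scripts/mkgraph.sh data/lang_test_tgpr exp/tri2c exp/tri2c/graph_tgpr
scripts/decode.sh steps/decode_lda_et.sh exp/tri2c/graph_tgpr data/eval_nov92 exp/tri2c/decode_tgpr_eval92
scripts/decode.sh steps/decode_lda_et.sh exp/tri2c/graph_tgpr data/dev_nov93 exp/tri2c/decode_tgpr_dev93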

View File

@@ -0,0 +1,101 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# To be run from ..
# This script does training-data alignment given a model built using
# CMVN + splicing + LDA (+ MLLT or a similar transform) features.
# It splits the data into four chunks and does everything in parallel
# on the same machine. Its output, all in its own experimental
# directory, is {0,1,2,3}.cmvn, {0,1,2,3}.ali.gz, tree, final.mdl,
# final.mat and final.occs (the last four are just copied from the
# source directory).
# There is an option to use precompiled training graphs from the
# previous phase, if these are available (i.e. if they were built
# with the same data); they must be split into four pieces.
oldgraphs=false
if [ "$1" == --use-graphs ]; then
shift;
oldgraphs=true
fi
if [ $# != 4 ]; then
echo "Usage: steps/align_lda_mllt.sh <data-dir> <lang-dir> <src-dir> <exp-dir>"
echo " e.g.: steps/align_lda_mllt.sh data/train data/lang exp/tri1 exp/tri1_ali"
exit 1;
fi
if [ -f path.sh ]; then . path.sh; fi
data=$1
lang=$2
srcdir=$3
dir=$4
oov_sym="<SPOKEN_NOISE>" # Map OOVs to this in training.
grep SPOKEN_NOISE $lang/words.txt >/dev/null || echo "Warning: SPOKEN_NOISE not in dictionary"
mkdir -p $dir
cp $srcdir/{tree,final.mdl,final.mat,final.occs} $dir || exit 1; # Create copy of the tree and model and occs...
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
if [ ! -d $data/split4 -o $data/split4 -ot $data/feats.scp ]; then
scripts/split_data.sh $data 4
fi
echo "Computing cepstral mean and variance statistics"
for n in 0 1 2 3; do
compute-cmvn-stats --spk2utt=ark:$data/split4/$n/spk2utt scp:$data/split4/$n/feats.scp \
ark:$dir/$n.cmvn 2>$dir/cmvn$n.log || exit 1;
done
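# Note: each $dir/$n.cmvn archive holds per-speaker cepstral mean/variance
# statistics for one split of the data; apply-cmvn below uses them with
# --norm-vars=false (mean subtraction only), mapping each utterance to its
# speaker's stats via the utt2spk map.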
# Align all training data using the supplied model.
rm $dir/.error 2>/dev/null
echo "Aligning data from $data"
if $oldgraphs; then
for n in 0 1 2 3; do
feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/$n.cmvn scp:$data/split4/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $dir/final.mat |"
if [ ! -f $srcdir/$n.fsts.gz ]; then
echo You specified --use-graphs but no such file $srcdir/$n.fsts.gz
exit 1;
fi
gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/final.mdl \
"ark:gunzip -c $srcdir/$n.fsts.gz|" "$feats" "ark:|gzip -c >$dir/$n.ali.gz" \
2> $dir/align$n.log || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo error doing alignment && exit 1;
else
for n in 0 1 2 3; do
feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/$n.cmvn scp:$data/split4/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $dir/final.mat |"
# compute integer form of transcripts.
tra="ark:scripts/sym2int.pl --map-oov \"$oov_sym\" --ignore-first-field $lang/words.txt $data/split4/$n/text|";
gmm-align $scale_opts --beam=10 --retry-beam=40 $dir/tree $dir/final.mdl $lang/L.fst \
"$feats" "$tra" "ark:|gzip -c >$dir/$n.ali.gz" 2> $dir/align$n.log || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo error doing alignment && exit 1;
fi
echo "Done aligning data."

View File

@@ -21,10 +21,6 @@
# This script just generates lattices for a single broken-up
# piece of the data.
# and rescores them with different
# acoustic weights, in order to explore a range of different
# weights.
if [ -f ./path.sh ]; then . ./path.sh; fi
numjobs=1

View File

@@ -0,0 +1,79 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Decoding script that works with a GMM model and the baseline
# [e.g. MFCC] features plus cepstral mean subtraction plus
# LDA + ET (exponential transform) features. This script first
# generates a pruned state-level lattice without adaptation,
# then does acoustic rescoring on this lattice to generate
# a new lattice; it determinizes and prunes this ready for
# further rescoring (e.g. with new LMs, or varying the acoustic
# scale).
if [ -f ./path.sh ]; then . ./path.sh; fi
numjobs=1
jobid=0
if [ "$1" == "-j" ]; then
shift;
numjobs=$1;
jobid=$2;
shift; shift;
if [ $jobid -ge $numjobs ]; then
echo "Invalid job number, $jobid >= $numjobs";
exit 1;
fi
fi
if [ $# != 3 ]; then
echo "Usage: steps/decode_lda_mllt.sh [-j num-jobs job-number] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_lda_mllt.sh -j 8 0 exp/mono/graph_tgpr data/dev_nov93 exp/mono/decode_dev93_tgpr"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
mkdir -p $dir
if [ $numjobs -gt 1 ]; then
mydata=$data/split$numjobs/$jobid
else
mydata=$data
fi
requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.mat $graphdir/HCLG.fst"
for f in $requirements; do
if [ ! -f $f ]; then
echo "decode_lda_mllt.sh: no such file $f";
exit 1;
fi
done
# We only do one decoding pass, so there is no point caching the
# CMVN stats-- we make them part of a pipe.
feats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
gmm-latgen-faster --max-active=7000 --beam=13.0 --lattice-beam=6.0 --acoustic-scale=0.083333 \
--allow-partial=true --word-symbol-table=$graphdir/words.txt \
$srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
2> $dir/decode$jobid.log || exit 1;
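In the recipe this decoder would normally be driven through scripts/decode.sh, as run.sh does for the other decode scripts; run by hand over four parallel jobs it would look roughly like the sketch below (the script name steps/decode_lda_et.sh and the tri2c directories are assumptions, not shown in this diff):
scripts/split_data.sh data/dev_nov93 4   # creates data/dev_nov93/split4/{0,1,2,3}
for jobid in 0 1 2 3; do
  steps/decode_lda_et.sh -j 4 $jobid exp/tri2c/graph_tgpr data/dev_nov93 exp/tri2c/decode_tgpr_dev93 &
done
wait   # each job writes lat.$jobid.gz and decode$jobid.log into the decode dir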

View File

@@ -0,0 +1,76 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Decoding script that works with a GMM model and the baseline
# [e.g. MFCC] features plus cepstral mean subtraction plus
# LDA+MLLT or similar transform.
# This script just generates lattices for a single broken-up
# piece of the data.
if [ -f ./path.sh ]; then . ./path.sh; fi
numjobs=1
jobid=0
if [ "$1" == "-j" ]; then
shift;
numjobs=$1;
jobid=$2;
shift; shift;
if [ $jobid -ge $numjobs ]; then
echo "Invalid job number, $jobid >= $numjobs";
exit 1;
fi
fi
if [ $# != 3 ]; then
echo "Usage: steps/decode_lda_mllt.sh [-j num-jobs job-number] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_lda_mllt.sh -j 8 0 exp/mono/graph_tgpr data/dev_nov93 exp/mono/decode_dev93_tgpr"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
mkdir -p $dir
if [ $numjobs -gt 1 ]; then
mydata=$data/split$numjobs/$jobid
else
mydata=$data
fi
requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.mat $graphdir/HCLG.fst"
for f in $requirements; do
if [ ! -f $f ]; then
echo "decode_lda_mllt.sh: no such file $f";
exit 1;
fi
done
# We only do one decoding pass, so there is no point caching the
# CMVN stats-- we make them part of a pipe.
feats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
gmm-latgen-faster --max-active=7000 --beam=13.0 --lattice-beam=6.0 --acoustic-scale=0.083333 \
--allow-partial=true --word-symbol-table=$graphdir/words.txt \
$srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
2> $dir/decode$jobid.log || exit 1;
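For clarity, the single feature pipe above does the same work as first caching the CMVN stats to a file and then reading them back; a hedged two-step equivalent (cmvn.ark is an illustrative temporary path):
compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:$dir/cmvn.ark
feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:$dir/cmvn.ark scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
The in-line version is used because the stats are only needed for this one decoding pass, so there is nothing gained by keeping them on disk.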

egs/wsj/s3/steps/train_lda_et.sh (executable file, 269 additions)
View File

@@ -0,0 +1,269 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# To be run from ..
# Triphone model training with LDA + ET (exponential transform)
# features, on top of cepstral mean normalization and frame splicing.
# It starts from an existing alignment directory (e.g.
# exp/tri1_ali_si84), supplied as an argument, which provides the
# initial alignments and the per-split CMVN statistics.
if [ $# != 6 ]; then
echo "Usage: steps/train_lda_et.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <ali-dir> <exp-dir>"
echo " e.g.: steps/train_lda_et.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2c"
exit 1;
fi
if [ -f path.sh ]; then . path.sh; fi
numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6
if [ ! -f $alidir/final.mdl -o ! -f $alidir/0.ali.gz -o ! -f $alidir/3.ali.gz ]; then
echo "Error: alignment dir $alidir does not contain final.mdl and {0,1,2,3}.ali.gz"
exit 1;
fi
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
numiters_et=15
normtype=offset # et option; could be offset [recommended], or none
oov_sym="<SPOKEN_NOISE>" # Map OOVs to this in training.
grep SPOKEN_NOISE $lang/words.txt >/dev/null || echo "Warning: SPOKEN_NOISE not in dictionary"
silphonelist=`cat $lang/silphones.csl`
numiters=35 # Number of iterations of training
maxiterinc=25 # Last iter to increase #Gauss on.
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
randprune=4.0
mkdir -p $dir/log $dir/warps
if [ ! -d $data/split4 -o $data/split4 -ot $data/feats.scp ]; then
scripts/split_data.sh $data 4
fi
# basefeats is all the feats, transformed with lda.mat-- just needed for tree accumulation.
basefeats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk \"ark:cat $alidir/*.cmvn|\" scp:$data/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
for n in 0 1 2 3; do
splicedfeatspart[$n]="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split4/$n/utt2spk ark:$alidir/$n.cmvn scp:$data/split4/$n/feats.scp ark:- | splice-feats ark:- ark:- |"
basefeatspart[$n]="${splicedfeatspart[$n]} transform-feats $dir/lda.mat ark:- ark:- |"
featspart[$n]="${basefeatspart[$n]}" # This gets overwritten later in the script.
done
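# Note: these pipeline strings are only evaluated when a program later
# reads a "$feats"-style argument, so it is fine that $dir/lda.mat does
# not exist yet at this point (it is written by est-lda below).
#   splicedfeatspart[n] = CMVN + frame splicing
#   basefeatspart[n]    = the above + the LDA transform (lda.mat)
#   featspart[n]        = the above, later extended with the per-speaker
#                         ET transforms ($n.trans) once they are estimated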
echo "Accumulating LDA statistics."
rm $dir/.error 2>/dev/null
for n in 0 1 2 3; do
( ali-to-post "ark:gunzip -c $alidir/$n.ali.gz|" ark:- | \
weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- | \
acc-lda --rand-prune=$randprune $alidir/final.mdl "${splicedfeatspart[$n]}" ark,s,cs:- \
$dir/lda.$n.acc ) 2>$dir/log/lda_acc.$n.log || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "Error accumulating LDA stats" && exit 1;
est-lda $dir/lda.mat $dir/lda.*.acc 2>$dir/log/lda_est.log || exit 1; # defaults to dim=40
rm $dir/lda.*.acc
cur_lda=$dir/0.mat
# The next stage assumes we won't need the context of silence, which
# assumes something about $lang/roots.txt, but it seems pretty safe.
echo "Accumulating tree stats"
acc-tree-stats --ci-phones=$silphonelist $alidir/final.mdl "$basefeats" \
"ark:gunzip -c $alidir/?.ali.gz|" $dir/treeacc 2> $dir/log/acc_tree.log || exit 1;
echo "Computing questions for tree clustering"
# preparing questions, roots file...
scripts/sym2int.pl $lang/phones.txt $lang/phonesets_cluster.txt > $dir/phonesets.txt || exit 1;
cluster-phones $dir/treeacc $dir/phonesets.txt $dir/questions.txt 2> $dir/log/questions.log || exit 1;
scripts/sym2int.pl $lang/phones.txt $lang/extra_questions.txt >> $dir/questions.txt
compile-questions $lang/topo $dir/questions.txt $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
scripts/sym2int.pl --ignore-oov $lang/phones.txt $lang/roots.txt > $dir/roots.txt
echo "Building tree"
build-tree --verbose=1 --max-leaves=$numleaves \
$dir/treeacc $dir/roots.txt \
$dir/questions.qst $lang/topo $dir/tree 2> $dir/log/train_tree.log || exit 1;
gmm-init-model --write-occs=$dir/1.occs \
$dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl \
2>$dir/log/mixup.log || exit 1;
gmm-init-et --normalize-type=$normtype --binary=false --dim=40 $dir/1.et 2>$dir/init_et.log || exit 1
rm $dir/treeacc
# Convert alignments in $alidir, to use as initial alignments.
# This assumes that $alidir was split in 4 pieces, just like the
# current dir.
echo "Converting old alignments"
for n in 0 1 2 3; do
convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
"ark:gunzip -c $alidir/$n.ali.gz|" "ark:|gzip -c >$dir/$n.ali.gz" \
2>$dir/log/convert$n.log || exit 1;
done
# Make training graphs (this is split in 4 parts).
echo "Compiling training graphs"
rm $dir/.error 2>/dev/null
for n in 0 1 2 3; do
compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst \
"ark:scripts/sym2int.pl --map-oov \"$oov_sym\" --ignore-first-field $lang/words.txt < $data/split4/$n/text |" \
"ark:|gzip -c >$dir/$n.fsts.gz" 2>$dir/log/compile_graphs$n.log || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error compiling training graphs" && exit 1;
x=1
while [ $x -lt $numiters ]; do
echo Pass $x
if echo $realign_iters | grep -w $x >/dev/null; then
echo "Aligning data"
for n in 0 1 2 3; do
gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/$x.mdl \
"ark:gunzip -c $dir/$n.fsts.gz|" "${featspart[$n]}" \
"ark:|gzip -c >$dir/$n.ali.gz" 2> $dir/log/align.$x.$n.log || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error aligning data on iteration $x" && exit 1;
fi
if [ $x -lt $numiters_et ]; then
echo "Re-estimating ET transforms"
for n in 0 1 2 3; do
( ali-to-post "ark:gunzip -c $dir/$n.ali.gz|" ark:- | \
weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- | \
rand-prune-post $randprune ark:- ark:- | \
gmm-post-to-gpost $dir/$x.mdl "${featspart[$n]}" ark:- ark:- | \
gmm-est-et --spk2utt=ark:$data/split4/$n/spk2utt $dir/$x.mdl $dir/$x.et "${basefeatspart[$n]}" \
ark,s,cs:- ark:$dir/$n.trans.tmp ark,t:$dir/warps/$x.$n.warp ) \
2> $dir/log/trans.$x.$n.log || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "Error computing ET transforms on iteration $x" && exit 1;
for n in 0 1 2 3; do
mv $dir/$n.trans.tmp $dir/$n.trans || exit 1;
featspart[$n]="${basefeatspart[$n]} transform-feats --utt2spk=ark:$data/split4/$n/utt2spk ark:$dir/$n.trans ark:- ark:- |"
done
fi
for n in 0 1 2 3; do
gmm-acc-stats-ali --binary=false $dir/$x.mdl "${featspart[$n]}" \
"ark:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc \
2>$dir/log/acc.$x.$n.log || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;
gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
"gmm-sum-accs - $dir/$x.{0,1,2,3}.acc |" $dir/$[$x+1].mdl 2> $dir/log/update.$x.log || exit 1;
rm $dir/$x.mdl $dir/$x.{0,1,2,3}.acc
rm $dir/$x.occs
x1=$[$x+1];
if [ $x -lt $numiters_et ]; then
# Alternately estimate either A or B.
if [ $[$x%2] == 0 ]; then # Estimate A:
for n in 0 1 2 3; do
( ali-to-post "ark:gunzip -c $dir/$n.ali.gz|" ark:- | \
weight-silence-post 0.0 $silphonelist $dir/$x1.mdl ark:- ark:- | \
rand-prune-post $randprune ark:- ark:- | \
gmm-post-to-gpost $dir/$x1.mdl "${featspart[$n]}" ark:- ark:- | \
gmm-et-acc-a --spk2utt=ark:$data/split4/$n/spk2utt --verbose=1 $dir/$x1.mdl $dir/$x.et "${basefeatspart[$n]}" \
ark,s,cs:- $dir/$x.$n.et_acc_a ) 2> $dir/log/acc_a.$x.$n.log || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "Error accumulating ET stats for A on iter $x" && exit 1;
gmm-et-est-a --verbose=1 $dir/$x.et $dir/$x1.et $dir/$x.*.et_acc_a 2> $dir/log/update_a.$x.log || exit 1;
rm $dir/$x.*.et_acc_a
else
for n in 0 1 2 3; do
( ali-to-post "ark:gunzip -c $dir/$n.ali.gz|" ark:- | \
weight-silence-post 0.0 $silphonelist $dir/$x1.mdl ark:- ark:- | \
gmm-acc-mllt --rand-prune=$randprune $dir/$x1.mdl "${featspart[$n]}" ark:- \
$dir/$x.$n.mllt_acc ) 2> $dir/log/acc_b.$x.$n.log || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "Error accumulating ET stats for A on iter $x" && exit 1;
est-mllt $dir/$x.mat $dir/$x.*.mllt_acc 2> $dir/log/update_b.$x.log || exit 1;
gmm-et-apply-c $dir/$x.et $dir/$x.mat $dir/$x1.et 2>>$dir/log/update_b.$x.log || exit 1;
gmm-transform-means $dir/$x.mat $dir/$x1.mdl $dir/$x1.mdl 2>> $dir/log/update_b.$x.log || exit 1;
# Modify current transforms by premultiplying by C.
for n in 0 1 2 3; do
compose-transforms $dir/$x.mat ark:$dir/$n.trans ark:$dir/tmp.trans 2>> $dir/log/update_b.$x.log || exit 1;
mv $dir/tmp.trans $dir/$n.trans
done
rm $dir/$x.mat
rm $dir/$x.*.mllt_acc
fi
fi
if [[ $x -le $maxiterinc ]]; then
numgauss=$[$numgauss+$incgauss];
fi
x=$[$x+1];
done
# Write out the B matrix which we will combine with LDA to get
# final.mat; and write out final.et which is the current final et
# but with B set to unity (since it's now part of final.mat).
# This is just more convenient going forward, since the "default features"
# (i.e. when speaker factor equals zero) are now the same as the
# features that the ET acts on.
gmm-et-get-b $dir/$numiters_et.et $dir/B.mat $dir/final.et 2>$dir/get_b.log || exit 1
compose-transforms $dir/B.mat $dir/lda.mat $dir/final.mat 2>>$dir/get_b.log || exit 1
for n in 0 1 2 3; do
defaultfeatspart[$n]="${basefeatspart[$n]} transform-feats $dir/B.mat ark:- ark:- |"
done
# Accumulate stats for "alignment model" which is as the model but with
# the default features (shares Gaussian-level alignments).
for n in 0 1 2 3; do
( ali-to-post "ark:gunzip -c $dir/$n.ali.gz|" ark:- | \
gmm-acc-stats-twofeats $dir/$x.mdl "${featspart[$n]}" "${defaultfeatspart[$n]}" \
ark:- $dir/$x.$n.acc2 ) 2>$dir/acc_alimdl.$n.log || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating alignment statistics." && exit 1;
# Update model.
gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \
"gmm-sum-accs - $dir/$x.*.acc2|" $dir/$x.alimdl \
2>$dir/est_alimdl.log || exit 1;
rm $dir/$x.*.acc2
# The following files may be useful for display purposes.
for y in 2 3 4 5 6 7 8 9 10 11 12 13 14; do
cat $dir/warps/$y.*.warp | scripts/process_warps.pl $data/spk2gender > $dir/warps/$y.warp_info
done
( cd $dir;
ln -s $x.mdl final.mdl;
ln -s $x.occs final.occs;
ln -s $x.alimdl final.alimdl
ln -s $[$numiters_et-1].trans final.trans )
echo Done
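For reference, and as we understand the exponential transform (this is not stated in the script itself): the per-speaker transform estimated above has the form W_s = D_s exp(t_s A) B, where t_s is the scalar warp factor written under $dir/warps, A and B are global parameters re-estimated on alternating iterations (gmm-et-est-a, and est-mllt followed by gmm-et-apply-c, respectively), and D_s is the per-speaker offset component selected by --normalize-type=offset. The final gmm-et-get-b / compose-transforms step factors B out of the ET object and folds it into final.mat together with lda.mat, which is why final.et is described above as having B set to unity.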

View File

@@ -21,8 +21,8 @@
# the same type of features.
if [ $# != 6 ]; then
echo "Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <ali-dir> <exp-dir>"
echo " e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
echo "Usage: steps/train_lda_mllt.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <ali-dir> <exp-dir>"
echo " e.g.: steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b"
exit 1;
fi