git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3291 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Chao Weng 2013-12-10 19:29:00 +00:00
Parent 57fe6cc8ee
Commit 94c4646aba
236 changed files with 33644 additions and 0 deletions

egs/chime_wsj0/s5/cmd.sh Normal file
@@ -0,0 +1,29 @@
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine.)
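# Example of how these variables are used by the steps/ and local/ scripts
# (a sketch; the log path and command below are illustrative only):
#   $train_cmd JOB=1:4 exp/mono/log/acc.JOB.log some-kaldi-command ...
# Both run.pl and queue.pl follow this calling convention: an optional JOB range,
# then the log file, then the command to run.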
#a) JHU cluster options
export train_cmd="queue.pl -l arch=*64"
export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"
#export cuda_cmd="..."
#b) BUT cluster options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"
#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"
#c) run it locally...
#export train_cmd=run.pl
#export decode_cmd=run.pl
export cuda_cmd=run.pl
#export mkgraph_cmd=run.pl

@@ -0,0 +1,2 @@
beam=18.0 # beam for decoding. Was 13.0 in the scripts.
latbeam=10.0 # this has most effect on size of the lattices.

@@ -0,0 +1,11 @@
# Filterbank feature options (the non-default settings are below).
--window-type=hamming # disable Dan's window, use the standard Hamming window
--use-energy=false # only fbank outputs
--sample-frequency=16000 # WSJ data is sampled at 16kHz
--low-freq=64 # typical setup from Frantisek Grezl
--high-freq=8000
--dither=1
--num-mel-bins=40 # use 40 mel bins
--htk-compat=true # try to make it compatible with HTK

@@ -0,0 +1 @@
--use-energy=false # only non-default option.

@@ -0,0 +1,117 @@
#!/bin/bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level WSJ corpus directory."
echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
CORPUS=$1
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# binmask file list for SI-84
find $1/si_tr_s -name '*.wav' | sort -u > train_si84_binmask.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find $1/si_dt_05 -name '*.wav' | sort -u > dev_dt_05_binmask.flist
find $1/si_et_05 -name '*.wav' | sort -u > test_eval92_5k_binmask.flist
# Finding the transcript files:
#find -L $CORPUS -iname '*.dot' > dot_files.flist
if [ ! -e $dir/dot_files.flist ]; then
echo "Could not find $dir/dot_files.flist files, first run clean_data_prep.sh";
exit 1;
fi
# Convert the transcripts into our format (no normalization yet)
# adding a suffix to utt_id
# (8, 9, a, b, c or d, depending on the SNR condition; see below)
for x in train_si84_binmask dev_dt_05_binmask test_eval92_5k_binmask; do
cat $x.flist | perl -e '
while(<>) {
m:^\S+/(\w+)\.wav$: || die "Bad line $_";
$id = $1;
$id =~ tr/A-Z/a-z/;
print "$id $_";
}
' | sort > ${x}_wav_tmp.scp
#cat ${x}_wav_tmp.scp | awk '{print $1}' \
# | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
cat ${x}_wav_tmp.scp | perl -e '
while(<STDIN>) {
@A=split(" ", $_);
@B=split("/", $_);
$abs_path_len=@B;
$condition=$B[$abs_path_len-3];
if ($condition eq "9dB") {$key_suffix="8";}
elsif ($condition eq "6dB") {$key_suffix="9";}
elsif ($condition eq "3dB") {$key_suffix="a";}
elsif ($condition eq "0dB") {$key_suffix="b";}
elsif ($condition eq "m3dB") {$key_suffix="c";}
elsif ($condition eq "m6dB") {$key_suffix="d";}
else {print STDERR "error: unknown condition $condition\n";}
print $A[0].$key_suffix." ".$A[1]."\n";
}
' | sort -k1 > ${x}_wav.scp
cat ${x}_wav.scp | awk '{print $1}' \
| $local/find_noisy_transcripts.pl dot_files.flist > ${x}.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84_binmask dev_dt_05_binmask test_eval92_5k_binmask; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
#for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do
# awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \
# > ${x}_wav.scp
#done
# Make the utt2spk and spk2utt files.
for x in train_si84_binmask dev_dt_05_binmask test_eval92_5k_binmask; do
cat ${x}_wav.scp | awk '{print $1}' \
| perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
echo "Data preparation succeeded"

@@ -0,0 +1,86 @@
#!/bin/bash
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si284, data/train_si84, etc.
# Don't bother doing train_si84 separately (although we have the file lists
# in data/local/) because it's just the first 7138 utterances in train_si284.
# We'll create train_si84 after doing the feature extraction.
. ./path.sh || exit 1;
echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir
for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
done
# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.
echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at begin/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostics.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done
echo "Succeeded in formatting data."
rm -r $tmpdir

@@ -0,0 +1,190 @@
#!/bin/bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level WSJ corpus directory."
echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
CORPUS=$1
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# This version for SI-84
cat $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \
| $local/cstr_ndx2flist.pl $CORPUS | sort -u > train_si84_clean.flist
# This version for SI-284
#cat $CORPUS/wsj1/doc/indices/si_tr_s.ndx \
# $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \
# | $local/cstr_ndx2flist.pl $CORPUS | sort \
# | grep -v wsj0/si_tr_s/401 > train_si284.flist
# Now for the test sets.
# $CORPUS/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.
# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1, which is done in cstr_ndx2flist.pl
cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
$local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92_clean.flist
# Nov'92 (330 utts, 5k vocab)
cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
$local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92_5k_clean.flist
# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
#cat $CORPUS/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
# $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93.flist
# Nov'93: (215 utts, 5k)
#cat $CORPUS/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
# $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93_5k.flist
# Dev-set for Nov'93 (503 utts)
#cat $CORPUS/wsj1/doc/indices/h1_p0.ndx | \
# $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93.flist
# Dev-set for Nov'93 (513 utts, 5k vocab)
#cat $CORPUS/wsj1/doc/indices/h2_p0.ndx | \
# $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93_5k.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find $CORPUS/wsj0/si_dt_20 -print | grep -i ".wv1" | sort > dev_dt_20_clean.flist
find $CORPUS/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dev_dt_05_clean.flist
# Finding the transcript files:
find -L $CORPUS -iname '*.dot' > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
# adding suffix to utt_id
# 0 for clean condition
for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp
cat ${x}_sph_tmp.scp | awk '{print $1}' \
| $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
cat ${x}_sph_tmp.scp | awk '{printf("%s0 %s\n", $1, $2);}' > ${x}_sph.scp
cat ${x}_tmp.trans1 | awk '{printf("%s0 ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \
> ${x}_wav.scp
done
# Make the utt2spk and spk2utt files.
for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do
cat ${x}_sph.scp | awk '{print $1}' \
| perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
# In case we want to limit the LMs to the most frequent words, copy the LM training word-frequency list.
cp $CORPUS/wsj0/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
cp $CORPUS/wsj0/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz
# trigram would be:
cat $CORPUS/wsj0/doc/lng_modl/base_lm/tcb20onp.z | \
perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' \
| gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;
prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;
# repeat for 5k language models
cp $CORPUS/wsj0/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz
# trigram would be: !only closed vocabulary here!
cp $CORPUS/wsj0/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa
prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
rm -f wsj0-train-spkrinfo.txt
wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
|| ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi
if [ ! -f wsj0-train-spkrinfo.txt ]; then
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
echo "This is possibly omitted from the training disks; couldn't find it."
echo "Everything else may have worked; we just may be missing gender info"
echo "which is only needed for VTLN-related diagnostics anyway."
exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.
cat $CORPUS/wsj0/doc/spkrinfo.txt \
./wsj0-train-spkrinfo.txt | \
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
echo "Data preparation succeeded"

@@ -0,0 +1,13 @@
#!/bin/bash
. path.sh
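# Usage sketch (argument names inferred from the variables below):
#   <this-script> <data-dir> <old-ali-dir> <mix-ali-dir>
# i.e. a data directory containing feats.scp, an existing alignment directory,
# and an output directory for the copied alignments.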
data=$1
old_ali_dir=$2
mix_ali_dir=$3
mkdir -p $mix_ali_dir
cp $old_ali_dir/{final.mdl,num_jobs,tree} $mix_ali_dir/
gunzip -c $old_ali_dir/ali.*.gz | gzip -c > $old_ali_dir/ali.gz
feats="ark,s,cs:copy-feats scp:$data/feats.scp ark:- |"
copy-clean-ali "$feats" "ark:gunzip -c $old_ali_dir/ali.gz |" "ark:| gzip -c > $mix_ali_dir/ali.1.gz"

@@ -0,0 +1,54 @@
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1
# and as command-line argument it takes the names of the WSJ disk locations, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.
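# For example (assuming the corpus root given above), the index line
#   11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
# would be printed as
#   /group/corpora/public/wsjcam0/data/wsj0/si_tr_s/01i/01ic0201.wv1
# provided that file exists on disk.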
$wsj_dir = $ARGV[0];
while(<STDIN>){
if(m/^;/){ next; } # Comment. Ignore it.
else {
m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
$filename = $2; # as a subdirectory of the distributed disk.
if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }
$filename = "$wsj_dir/$filename";
if (-e $filename) {
print "$filename\n";
} else {
print STDERR "File $filename found in the index but not on disk\n";
}
}
}

@@ -0,0 +1,187 @@
#!/bin/bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level WSJ corpus directory."
echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
CORPUS=$1
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# This version for SI-84
cat $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \
| $local/cstr_ndx2flist.pl $CORPUS | sort \
| grep -v wsj0/si_tr_s/401 > train_si84.flist
# This version for SI-284
cat $CORPUS/wsj1/doc/indices/si_tr_s.ndx \
$CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \
| $local/cstr_ndx2flist.pl $CORPUS | sort \
| grep -v wsj0/si_tr_s/401 > train_si284.flist
# Now for the test sets.
# $CORPUS/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.
# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1, which is done in cstr_ndx2flist.pl
cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
$local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92.flist
# Nov'92 (330 utts, 5k vocab)
cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
$local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92_5k.flist
# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat $CORPUS/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
$local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93.flist
# Nov'93: (215 utts, 5k)
cat $CORPUS/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
$local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93_5k.flist
# Dev-set for Nov'93 (503 utts)
cat $CORPUS/wsj1/doc/indices/h1_p0.ndx | \
$local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93.flist
# Dev-set for Nov'93 (513 utts, 5k vocab)
cat $CORPUS/wsj1/doc/indices/h2_p0.ndx | \
$local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93_5k.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find $CORPUS/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
find $CORPUS/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist
# Finding the transcript files:
find -L $CORPUS -iname '*.dot' > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
cat ${x}_sph.scp | awk '{print $1}' \
| $local/find_transcripts.pl dot_files.flist > $x.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \
> ${x}_wav.scp
done
# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat ${x}_sph.scp | awk '{print $1}' \
| perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
# In case we want to limit the LMs to the most frequent words, copy the LM training word-frequency list.
cp $CORPUS/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
cp $CORPUS/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz
# trigram would be:
cat $CORPUS/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' \
| gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;
prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;
# repeat for 5k language models
cp $CORPUS/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz
# trigram would be: !only closed vocabulary here!
cp $CORPUS/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa
prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
rm -f wsj0-train-spkrinfo.txt
wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
|| ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi
if [ ! -f wsj0-train-spkrinfo.txt ]; then
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
echo "This is possibly omitted from the training disks; couldn't find it."
echo "Everything else may have worked; we just may be missing gender info"
echo "which is only needed for VTLN-related diagnostics anyway."
exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.
cat $CORPUS/wsj0/doc/spkrinfo.txt \
$CORPUS/wsj1/doc/evl_spok/spkrinfo.txt \
$CORPUS/wsj1/doc/dev_spok/spkrinfo.txt \
$CORPUS/wsj1/doc/train/spkrinfo.txt \
./wsj0-train-spkrinfo.txt | \
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
echo "Data preparation succeeded"

@@ -0,0 +1,172 @@
#!/bin/bash
# This script builds a larger word-list and dictionary
# than used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to fill in prons for words that are in
# the LM training data but not in CMUdict. One is
# to generate special prons for possible acronyms, which
# just consist of the prons of the constituent letters. The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/
if [ $# -ne 1 ]; then
echo "Usage: local/cstr_wsj_train_lms.sh WSJ1_doc_dir"
exit 1
fi
export PATH=$PATH:`pwd`/local/dict/
srcdir=$1
if [ ! -d $srcdir/lng_modl ]; then
echo "Expecting 'lng_modl' under WSJ doc directory '$srcdir'"
exit 1
fi
mkdir -p data/local/dict_larger
dir=data/local/dict_larger
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
# are there; we just want to copy them as the phoneset is the same.
rm data/local/dict_larger/lexicon.txt # we don't want this.
mincount=2 # Minimum count of an OOV we will try to generate a pron for.
[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
# Remove comments from cmudict; print first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
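# For example, a hypothetical cmudict line "FOO(1)  F UW1" would appear in
# dict.cmu as "FOO  F UW1" (the alternate-pron marker "(1)" is stripped).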
cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n". Note: we found this by looking at
# oov.counts below (before adding this rule).
touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
gunzip -c $srcdir/lng_modl/lm_train/np_data/{87,88,89}/*.z \
| awk '/^</{next}{print toupper($0)}' | perl -e '
open(F, "<$ARGV[0]")||die;
while(<F>){ chop; $isword{$_} = 1; }
while(<STDIN>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
# and have no other "." in them: treat as period.
print "$a";
if ($n+1 < @A) { print "\n"; }
} else { print "$a "; }
}
print "\n";
}
' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
fi
# get unigram counts
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
> $dir/oov.counts
echo "Most frequent unseen unigrams are: "
head $dir/oov.counts
# Prune away singleton counts, and remove things with numbers in
# (which should have been normalized) and with no letters at all.
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
| awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
# Automatic rule-finding...
# First make some prons for possible acronyms.
# Note: we don't do this for things like U.K or U.N,
# or A.B. (which doesn't exist anyway),
# as we consider this normalization/spelling errors.
cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms
mkdir $dir/f $dir/b # forward, backward directions of rules...
# forward is normal suffix
# rules, backward is reversed (prefix rules). These
# dirs contain stuff we create while making the rule-based
# extensions to the dictionary.
# Remove ; and , from words, if they are present; these
# might crash our scripts, as they are used as separators there.
filter_dict.pl $dir/dict.cmu > $dir/f/dict
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
reverse_dict.pl $dir/f/dict > $dir/b/dict
reverse_dict.pl $dir/f/oovs > $dir/b/oovs
# The next stage takes a few minutes.
# Note: the forward stage takes longer, as English is
# mostly a suffix-based language, and there are more rules
# that it finds.
for d in $dir/f $dir/b; do
(
cd $d
cat dict | get_rules.pl 2>get_rules.log >rules
get_rule_hierarchy.pl rules >hierarchy
awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
limit_candidate_prons.pl hierarchy | \
score_prons.pl dict | \
count_rules.pl >rule.counts
# the sort command below is just for convenience of reading.
score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
get_candidate_prons.pl rules.with_scores dict oovs | \
limit_candidate_prons.pl hierarchy > oovs.candidates
) &
done
wait
# Merge the candidates.
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
> $dir/dict.oovs
cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
# add_counts.pl attaches the original counts to the list of handled/not-handled OOVs
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
echo "**Top OOVs we handled are:**";
head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
head $dir/oovlist.not_handled.counts
echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
`awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.
cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF
echo "Created $dir/lexicon.txt"

@@ -0,0 +1,31 @@
#!/usr/bin/perl
# Add counts to an oovlist.
# Reads in counts as output by uniq -c, and
# an oovlist, and prints out the counts of the oovlist.
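# Hypothetical example: if the counts file contains "13 FOO" and "2 BAR",
# and the oovlist contains "FOO", the output is the line "\t13\tFOO".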
(@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n";
$counts = shift @ARGV;
open(C, "<$counts") || die "Opening counts file $counts";
while(<C>) {
@A = split(" ", $_);
@A == 2 || die "Bad line in counts file: $_";
($count, $word) = @A;
$count =~ m:^\d+$: || die "Bad count $A[0]\n";
$counts{$word} = $count;
}
while(<>) {
chop;
$w = $_;
$w =~ m:\S+: || die "Bad word $w";
defined $counts{$w} || die "Word $w not present in counts file";
print "\t$counts{$w}\t$w\n";
}

@@ -0,0 +1,44 @@
#!/usr/bin/perl
# This program takes the output of score_prons.pl and collates
# it for each (rule, destress) pair so that we get the
# counts of right/partial/wrong for each pair.
# The input is a 7-tuple on each line, like:
# word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong
#
# The output format is a 5-tuple like:
#
# rule;destress;right-count;partial-count;wrong-count
#
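# Hypothetical example: input lines such as
#   WASTED;W EY1 S T AH0 D;WASTING;W EY1 S T IH0 NG;STED,STING,D,NG;no;right
# are tallied into one output line per (rule, destress) pair, e.g.
#   STED,STING,D,NG;no;25;3;12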
if (@ARGV != 0 && @ARGV != 1) {
die "Usage: count_rules.pl < scored_candidate_prons > rule_counts";
}
while(<>) {
chop;
$line = $_;
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line);
my $key = $rulename . ";" . $destress;
if (!defined $counts{$key}) {
$counts{$key} = [ 0, 0, 0 ]; # new anonymous array.
}
$ref = $counts{$key};
if ($score eq "right") {
$$ref[0]++;
} elsif ($score eq "partial") {
$$ref[1]++;
} elsif ($score eq "wrong") {
$$ref[2]++;
} else {
die "Bad score $score\n";
}
}
while ( my ($key, $value) = each(%counts)) {
print $key . ";" . join(";", @$value) . "\n";
}

@@ -0,0 +1,19 @@
#!/usr/bin/perl
# This program reads and writes either a dictionary or just a list
# of words, and it removes any words containing ";" or "," as these
# are used in these programs. It will warn about these.
# It will die if the pronunciations have these symbols in.
while(<>) {
chop;
@A = split(" ", $_);
$word = shift @A;
if ($word =~ m:[;,]:) {
print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n" ;
} else {
$_ =~ m:[;,]: && die "Phones cannot have ; or , in them.";
print $_ . "\n";
}
}

@@ -0,0 +1,95 @@
#!/usr/bin/perl
# Reads a dictionary, and prints out a list of words that seem to be pronounced
# as acronyms (not including plurals of acronyms, just acronyms). Uses
# the prons of the individual letters (A., B. and so on) to judge this.
# Note: this is somewhat dependent on the convention used in CMUdict, that
# the individual letters are spelled this way (e.g. "A.").
$max_length = 6; # Max length of words that might be
# acronyms.
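# Hypothetical example: if the dict contains "IBM  AY1 B IY1 EH1 M" as well as the
# letter entries "I.", "B." and "M.", then IBM is judged to be an acronym and the
# line "IBM AY1 B IY1 EH1 M" is printed.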
while(<>) { # Read the dict.
chop;
@A = split(" ", $_);
$word = shift @A;
$pron = join(" ", @A);
if ($word =~ m/^([A-Z])\.$/ ) {
chop $word; # Remove trailing "." to get just the letter
$letter = $1;
if (!defined $letter_prons{$letter} ) {
$letter_prons{$letter} = [ ]; # new anonymous array
}
$arrayref = $letter_prons{$letter};
push @$arrayref, $pron;
} elsif( length($word) <= $max_length ) {
$pronof{$word . "," . $pron} = 1;
$isword{$word} = 1;
#if (!defined $prons{$word} ) {
# $prons{$word} = [ ];
#}
# push @{$prons{$word}}, $pron;
}
}
sub get_letter_prons;
foreach $word (keys %isword) {
my @letter_prons = get_letter_prons($word);
foreach $pron (@letter_prons) {
if (defined $pronof{$word.",".$pron}) {
print "$word $pron\n";
}
}
}
sub get_letter_prons {
@acronym = split("", shift); # The letters in the word.
my @prons = ( "" );
while (@acronym > 0) {
$l = shift @acronym;
$n = 1; # num-repeats of letter $l.
while (@acronym > 0 && $acronym[0] eq $l) {
$n++;
shift @acronym;
}
my $arrayref = $letter_prons{$l};
my @prons_of_block = ();
if ($n == 1) { # Just one repeat.
foreach $lpron ( @$arrayref ) {
push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
}
} elsif ($n == 2) { # Two repeats. Can be "double a" or "a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "D AH1 B AH0 L " . $lpron;
push @prons_of_block, $lpron . " " . $lpron; # join with a space so it matches dict prons
}
} elsif ($n == 3) { # can be "triple a" or "a a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
push @prons_of_block, $lpron . " " . $lpron . " " . $lpron;
}
} elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
# not sure really.
foreach $lpron ( @$arrayref ) {
$nlpron = "";
for ($m = 0; $m < $n; $m++) { $nlpron = $nlpron . $lpron; }
push @prons_of_block, $nlpron;
}
}
my @new_prons = ();
foreach $pron (@prons) {
foreach $pron_of_block(@prons_of_block) {
if ($pron eq "") {
push @new_prons, $pron_of_block;
} else {
push @new_prons, $pron . " " . $pron_of_block;
}
}
}
@prons = @new_prons;
}
return @prons;
}

@@ -0,0 +1,123 @@
#!/usr/bin/perl
# Reads a dictionary (for prons of letters), and an OOV list,
# and puts out candidate pronunciations of words in that list
# that could plausibly be acronyms.
# We judge that a word can plausibly be an acronym if it is
# a sequence of just letters (no non-letter characters such
# as "'"), or something like U.K.,
# and the number of letters is four or less.
#
# If the text were not already pre-normalized, there would
# be other hints such as capitalization.
# This program appends
# the prons of the individual letters (A., B. and so on) to work out
# the pron of the acronym.
# Note: this is somewhat dependent on the convention used in CMUdict, that
# the individual letters are spelled this way (e.g. "A."). [it seems
# to also have the separated versions.]
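# Hypothetical example: assuming the dict has "U.  Y UW1" and "K.  K EY1",
# an OOV entry "U.K." would produce the candidate line:
#   U.K. Y UW1 K EY1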
if (!(@ARGV == 1 || @ARGV == 2)) {
print "Usage: get_acronym_prons.pl dict [oovlist]";
}
$max_length = 4; # Max #letters in an acronym. (Longer
# acronyms tend to have "pseudo-pronunciations", e.g. think about UNICEF.)
$dict = shift @ARGV;
open(D, "<$dict") || die "Opening dictionary $dict";
while(<D>) { # Read the dict, to get the prons of the letters.
chop;
@A = split(" ", $_);
$word = shift @A;
$pron = join(" ", @A);
if ($word =~ m/^([A-Z])\.$/ ) {
chop $word; # Remove trailing "." to get just the letter
$letter = $1;
if (!defined $letter_prons{$letter} ) {
$letter_prons{$letter} = [ ]; # new anonymous array
}
$arrayref = $letter_prons{$letter};
push @$arrayref, $pron;
} elsif( length($word) <= $max_length ) {
$pronof{$word . "," . $pron} = 1;
$isword{$word} = 1;
#if (!defined $prons{$word} ) {
# $prons{$word} = [ ];
#}
# push @{$prons{$word}}, $pron;
}
}
sub get_letter_prons;
while(<>) { # Read OOVs.
# For now, just do the simple cases without "." in
# between... things with "." in the OOV list seem to
# be mostly errors.
chop;
$word = $_;
if ($word =~ m/^[A-Z]{1,5}$/) {
foreach $pron ( get_letter_prons($word) ) { # E.g. UNPO
print "$word $pron\n";
}
} elsif ($word =~ m:^(\w\.){1,4}\w\.?$:) { # E.g. U.K. Make the final "." optional.
$letters = $word;
$letters =~ s:\.::g;
foreach $pron ( get_letter_prons($letters) ) {
print "$word $pron\n";
}
}
}
sub get_letter_prons {
@acronym = split("", shift); # The letters in the word.
my @prons = ( "" );
while (@acronym > 0) {
$l = shift @acronym;
$n = 1; # num-repeats of letter $l.
while (@acronym > 0 && $acronym[0] eq $l) {
$n++;
shift @acronym;
}
my $arrayref = $letter_prons{$l};
my @prons_of_block = ();
if ($n == 1) { # Just one repeat.
foreach $lpron ( @$arrayref ) {
push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
}
} elsif ($n == 2) { # Two repeats. Can be "double a" or "a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "D AH1 B AH0 L " . $lpron;
push @prons_of_block, $lpron . " " . $lpron;
}
} elsif ($n == 3) { # can be "triple a" or "a a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
push @prons_of_block, "$lpron $lpron $lpron";
}
} elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
# not sure really.
foreach $lpron ( @$arrayref ) {
$nlpron = $lpron;
for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; }
push @prons_of_block, $nlpron;
}
}
my @new_prons = ();
foreach $pron (@prons) {
foreach $pron_of_block(@prons_of_block) {
if ($pron eq "") {
push @new_prons, $pron_of_block;
} else {
push @new_prons, $pron . " " . $pron_of_block;
}
}
}
@prons = @new_prons;
}
return @prons;
}

@@ -0,0 +1,187 @@
#!/usr/bin/perl
# This script takes three command-line arguments (typically files, or "-"):
# the suffix rules (as output by get_rules.pl), the rule-hierarchy
# (from get_rule_hierarchy.pl), and the words that we want prons to be
# generated for (one per line).
# The output consists of candidate generated pronunciations for those words,
# together with information about how we generated those pronunciations.
# This does not do pruning of the candidates using the restriction
# "you can't use a more general rule when a more specific one is applicable".
# That is done by limit_candidate_prons.pl.
# Each line of the output consists of a 6-tuple (7-tuple if rule scores are supplied), separated by ";", of the
# form:
# word;pron;base-word;base-pron;rule-name;destress[;rule-score]
# [the last field is only present if you supplied rules with score information].
# where:
# - "word" is the input word that we queried for, e.g. WASTED
# - "pron" is the generated pronunciation, e.g. "W EY1 S T AH0 D"
# - rule-name is a 4-tuple separated by commas that describes the rule, e.g.
# "STED,STING,D,NG",
# - "base-word" is the base-word we're getting the pron from,
# e.g. WASTING
# - "base-pron" is the pron of the base-word, e.g. "W EY1 S T IH0 NG"
# - "destress" is either "yes" or "no" and corresponds to whether we destressed the
# base-word or not [de-stressing just corresponds to just taking any 2's down to 1's,
# although we may extend this in future]...
# - "rule-score" is a numeric score of the rule (this field is only present
# if there was score information in your rules).
(@ARGV == 2 || @ARGV == 3) || die "Usage: get_candidate_prons.pl rules base-dict [ words ]";
$min_prefix_len = 3; # this should probably match with get_rules.pl
$rules = shift @ARGV; # Note: rules may be with destress "yes/no" indicators or without...
# if without, it's treated as if both "yes" and "no" are present.
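# So a rules line is either just e.g. "STED,STING,D,NG", or (hypothetically, with a
# destress marking and a score) "STED,STING,D,NG;yes;0.9".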
$dict = shift @ARGV;
open(R, "<$rules") || die "Opening rules file: $rules";
sub process_word;
while(<R>) {
chop $_;
my ($rule, $destress, $rule_score) = split(";", $_); # We may have "destress" markings (yes|no),
# and scores, or we may have just rule, in which case
# $destress and $rule_score will be undefined.
my @R = split(",", $rule, 4); # "my" means new instance of @R each
# time we do this loop -> important because we'll be creating
# a reference to @R below.
# Note: the last arg to SPLIT tells it how many fields max to get.
# This stops it from omitting empty trailing fields.
@R == 4 || die "Bad rule $_";
$suffix = $R[0]; # Suffix of word we want pron for.
if (!defined $isrule{$rule}) {
$isrule{$rule} = 1; # make sure we do this only once for each rule
# (don't repeat for different stresses).
if (!defined $suffix2rule{$suffix}) {
# The syntax [ $x, $y, ... ] means a reference to a newly created array
# containing $x, $y, etc. \@R creates an array reference to R.
# so suffix2rule is a hash from suffix to ref to array of refs to
# 4-dimensional arrays.
$suffix2rule{$suffix} = [ \@R ];
} else {
# Below, the syntax @{$suffix2rule{$suffix}} dereferences the array
# reference inside the hash; \@R pushes onto that array a new array
# reference pointing to @R.
push @{$suffix2rule{$suffix}}, \@R;
}
}
if (!defined $rule_score) { $rule_score = -1; } # -1 means we don't have the score info.
# Now store information on which destress markings (yes|no) this rule
# is valid for, and the associated scores (if supplied)
# If just the rule is given (i.e. no destress marking specified),
# assume valid for both.
if (!defined $destress) { # treat as if both "yes" and "no" are valid.
$rule_and_destress_to_rule_score{$rule.";yes"} = $rule_score;
$rule_and_destress_to_rule_score{$rule.";no"} = $rule_score;
} else {
$rule_and_destress_to_rule_score{$rule.";".$destress} = $rule_score;
}
}
open(D, "<$dict") || die "Opening base dictionary: $dict";
while(<D>) {
@A = split(" ", $_);
$word = shift @A;
$pron = join(" ", @A);
if (!defined $word2prons{$word}) {
$word2prons{$word} = [ $pron ]; # Ref to new anonymous array containing just "pron".
} else {
push @{$word2prons{$word}}, $pron; # Push $pron onto array referred to (@$ref derefs array).
}
}
foreach $word (keys %word2prons) { # iterate over the words (hash keys) only.
# Set up the hash "prefixcount", which says how many times a char-sequence
# is a prefix (not necessarily a strict prefix) of a word in the dict.
$len = length($word);
for ($l = 0; $l <= $len; $l++) {
$prefixcount{substr($word, 0, $l)}++;
}
}
open(R, "<$rules") || die "Opening rules file: $rules";
while(<>) {
chop;
m/^\S+$/ || die;
process_word($_);
}
sub process_word {
my $word = shift @_;
$len = length($word);
# $owncount is used in evaluating whether a particular prefix is a prefix
# of some other word in the dict... if a word itself may be in the dict
# (usually because we're running this on the dict itself), we need to
# correct for this.
if (defined $word2prons{$word}) { $owncount = 1; } else { $owncount = 0; }
for ($prefix_len = $min_prefix_len; $prefix_len <= $len; $prefix_len++) {
my $prefix = substr($word, 0, $prefix_len);
my $suffix = substr($word, $prefix_len);
if ($prefixcount{$prefix} - $owncount == 0) {
# This prefix is not a prefix of any word in the dict, so no point
# checking the rules below-- none of them can match.
next;
}
$rules_array_ref = $suffix2rule{$suffix};
if (defined $rules_array_ref) {
foreach $R (@$rules_array_ref) { # @$rules_array_ref dereferences the array.
# $R is a reference to a 4-dimensional array, whose elements we access with
# $$R[0], etc.
my $base_suffix = $$R[1];
my $base_word = $prefix . $base_suffix;
my $base_prons_ref = $word2prons{$base_word};
if (defined $base_prons_ref) {
my $psuffix = $$R[2];
my $base_psuffix = $$R[3];
if ($base_psuffix ne "") {
$base_psuffix = " " . $base_psuffix;
# Include " ", the space between phones, to prevent
# matching partial phones below.
}
my $base_psuffix_len = length($base_psuffix);
foreach $base_pron (@$base_prons_ref) { # @$base_prons_ref derefs
# that reference to an array.
my $base_pron_prefix_len = length($base_pron) - $base_psuffix_len;
# Note: these lengths are in characters, not phones.
if ($base_pron_prefix_len >= 0 &&
substr($base_pron, $base_pron_prefix_len) eq $base_psuffix) {
# The suffix of the base_pron is what it should be.
my $pron_prefix = substr($base_pron, 0, $base_pron_prefix_len);
my $rule = join(",", @$R); # we'll output this..
my $len = @R;
for ($destress = 0; $destress <= 1; $destress++) { # Two versions
# of each rule: with destressing and without.
# pron is the generated pron.
if ($destress) { $pron_prefix =~ s/2/1/g; }
my $pron;
if ($psuffix ne "") { $pron = $pron_prefix . " " . $psuffix; }
else { $pron = $pron_prefix; }
# Now print out the info about the generated pron.
my $destress_mark = ($destress ? "yes" : "no");
my $rule_score = $rule_and_destress_to_rule_score{$rule.";".$destress_mark};
if (defined $rule_score) { # Means that the (rule,destress) combination was
# seen [note: this if-statement may be pointless, as currently we don't
# do any pruning of rules].
my @output = ($word, $pron, $base_word, $base_pron, $rule, $destress_mark);
if ($rule_score != -1) { push @output, $rule_score; } # If scores were supplied,
# we also output the score info.
print join(";", @output) . "\n";
}
}
}
}
}
}
}
}
}

@@ -0,0 +1,73 @@
#!/usr/bin/perl
#This reads in rules, of the form put out by get_rules.pl, e.g.:
# ERT,,ER0 T,
# MENT,ING,M AH0 N T,IH0 NG
# S,TON,Z,T AH0 N
# ,ER,IH0 NG,IH0 NG ER0
# ,'S,M AH0 N,M AH0 N Z
#TIONS,TIVE,SH AH0 N Z,T IH0 V
# and it works out a hierarchy that says which rules are sub-cases
# of which rules: it outputs on each line a pair separated by ";", where
# each member of the pair is a rule, first one is the specialization, the
# second one being more general.
# E.g.:
# RED,RE,D,;ED,E,D,
# RED,RE,D,;D,,D,
# GING,GE,IH0 NG,;ING,I,IH0 NG,
# TOR,TING,T ER0,T IH0 NG;OR,OR,T ER0,T ER0
# ERED,ER,D,;RED,R,D,
# ERED,ER,D,;ED,,D,
while(<>) {
chop;
$rule = $_;
$isrule{$rule} = 1;
push @rules, $rule;
}
foreach my $rule (@rules) {
# Truncate the letters and phones in the rule, while we
# can, to get more general rules; if the more general rule
# exists, put out the pair.
@A = split(",", $rule);
@suffixa = split("", $A[0]);
@suffixb = split("", $A[1]);
@psuffixa = split(" ", $A[2]);
@psuffixb = split(" ", $A[3]);
for ($common_suffix_len = 0; $common_suffix_len < @suffixa && $common_suffix_len < @suffixb;) {
if ($suffixa[$common_suffix_len] eq $suffixb[$common_suffix_len]) {
$common_suffix_len++;
} else {
last;
}
}
for ($common_psuffix_len = 0; $common_psuffix_len < @psuffixa && $common_psuffix_len < @psuffixb;) {
if ($psuffixa[$common_psuffix_len] eq $psuffixb[$common_psuffix_len]) {
$common_psuffix_len++;
} else {
last;
}
}
# Get all combinations of pairs of integers <= (common_suffix_len, common_psuffix_len),
# except (0,0), and print out this rule together with the corresponding rule (if it exists).
for ($m = 0; $m <= $common_suffix_len; $m++) {
$sa = join("", @suffixa[$m...$#suffixa]); # @x[a..b] is array slice notation.
$sb = join("", @suffixb[$m...$#suffixb]);
for ($n = 0; $n <= $common_psuffix_len; $n++) {
if (!($m == 0 && $n == 0)) {
$psa = join(" ", @psuffixa[$n...$#psuffixa]);
$psb = join(" ", @psuffixb[$n...$#psuffixb]);
$more_general_rule = join(",", ($sa, $sb, $psa, $psb));
if (defined $isrule{$more_general_rule}) {
print $rule . ";" . $more_general_rule . "\n";
}
}
}
}
}

@@ -0,0 +1,204 @@
#!/usr/bin/perl
# This program creates suggested suffix rules from a dictionary.
# It outputs quadruples of the form:
# suffix,base-suffix,psuffix,base-psuffix
# where "suffix" is the suffix of the letters of a word, "base-suffix" is
# the suffix of the letters of the base-word, "psuffix" is the suffix of the
# pronunciation of the word (a space-separated list of phonemes), and
# "base-psuffix" is the suffix of the pronunciation of the baseword.
# As far as this program is concerned, there is no distinction between
# "word" and "base-word". To simplify things slightly, what it does
# is return all tuples (a,b,c,d) [with a != b] such that there are
# at least $min_suffix_count instances in the dictionary of
# a (word-prefix, pron-prefix) pair where there exists (word,pron)
# pairs of the form
# ( word-prefix . a, pron-prefix . c)
# and
# ( word-prefix . b, pron-prefix . d)
# For example if (a,b,c,d) equals (USLY,US,S L IY0,S)
# then this quadruple will be output as long as there at least
# e.g. 30 instances of prefixes like (FAM, F EY1 M AH0)
# where there exist (word, pron) pairs like:
# FAMOUS, F EY1 M AH0 S
# FAMOUSLY F EY1 M AH0 S L IY0
#
# There are some modifications to the picture above, for efficiency.
# If $disallow_empty_suffix != 0, this program will not output 4-tuples where
# the first element (the own-word suffix) is empty, as this would cause
# efficiency problems in get_candidate_prons.pl. If
# $ignore_prefix_stress != 0, this program will ignore stress markings
# while evaluating whether prefixes are the same.
# The minimum count for a quadruple to be output is $min_suffix_count
# (e.g. 30).
#
# The function of this program is not to evaluate the accuracy of these rules;
# it is mostly a pruning step, where we suggest rules that have large enough
# counts to be suitable for our later procedure where we evaluate their
# accuracy in predicting prons.
$disallow_empty_suffix = 1; # Disallow rules where the suffix of the "own-word" is
# empty. This is for efficiency in later stages (e.g. get_candidate_prons.pl).
$min_prefix_len = 3; # this must match with get_candidate_prons.pl
$ignore_prefix_stress = 1; # or 0 to take account of stress in prefix.
$min_suffix_count = 20;
# Takes in dictionary.
print STDERR "Reading dict\n";
while(<>) {
@A = split(" ", $_);
my $word = shift @A;
my $pron = join(" ", @A);
if (!defined $prons{$word}) {
$prons{$word} = $pron;
push @words, $word;
} else {
$prons{$word} = $prons{$word} . ";" . $pron;
}
}
# Get common suffixes (e.g., count >100). Include empty suffix.
print STDERR "Getting common suffix counts.\n";
{
foreach $word (@words) {
$len = length($word);
for ($x = $min_prefix_len; $x <= $len; $x++) {
$suffix_count{substr($word, $x)}++;
}
}
foreach $suffix (keys %suffix_count) {
if ($suffix_count{$suffix} >= $min_suffix_count) {
$newsuffix_count{$suffix} = $suffix_count{$suffix};
}
}
%suffix_count = %newsuffix_count;
undef %newsuffix_count;
foreach $suffix ( sort { $suffix_count{$b} <=> $suffix_count{$a} } keys %suffix_count ) {
print STDERR "$suffix_count{$suffix} $suffix\n";
}
}
print STDERR "Getting common suffix pairs.\n";
{
print STDERR " Getting map from prefix -> suffix-set.\n";
# Create map from prefix -> suffix-set.
foreach $word (@words) {
$len = length($word);
for ($x = $min_prefix_len; $x <= $len; $x++) {
$prefix = substr($word, 0, $x);
$suffix = substr($word, $x);
if (defined $suffix_count{$suffix}) { # Suffix is common...
if (!defined $suffixes_of{$prefix}) {
$suffixes_of{$prefix} = [ $suffix ]; # Create a reference to a new array with
# one element.
} else {
push @{$suffixes_of{$prefix}}, $suffix; # Push $suffix onto array that the
# hash member is a reference to.
}
}
}
}
my %suffix_set_count;
print STDERR " Getting map from suffix-set -> count.\n";
while ( my ($key, $value) = each(%suffixes_of) ) {
my @suffixes = sort ( @$value );
$suffix_set_count{join(";", @suffixes)}++;
}
print STDERR " Getting counts for suffix pairs.\n";
while ( my ($suffix_set, $count) = each (%suffix_set_count) ) {
my @suffixes = split(";", $suffix_set);
# Consider pairs to be ordered. This is more convenient
# later on.
foreach $suffix_a (@suffixes) {
foreach $suffix_b (@suffixes) {
if ($suffix_a ne $suffix_b) {
$suffix_pair = $suffix_a . "," . $suffix_b;
$suffix_pair_count{$suffix_pair} += $count;
}
}
}
}
# To save memory, only keep pairs above threshold in the hash.
while ( my ($suffix_pair, $count) = each (%suffix_pair_count) ) {
if ($count >= $min_suffix_count) {
$new_hash{$suffix_pair} = $count;
}
}
%suffix_pair_count = %new_hash;
undef %new_hash;
# Print out the suffix pairs so the user can see.
foreach $suffix_pair (
sort { $suffix_pair_count{$b} <=> $suffix_pair_count{$a} } keys %suffix_pair_count ) {
print STDERR "$suffix_pair_count{$suffix_pair} $suffix_pair\n";
}
}
print STDERR "Getting common suffix/suffix/psuffix/psuffix quadruples\n";
{
while ( my ($prefix, $suffixes_ref) = each(%suffixes_of) ) {
# Note: suffixes_ref is a reference to an array. We dereference with
# @$suffixes_ref.
# Consider each pair of suffixes (in each order).
foreach my $suffix_a ( @$suffixes_ref ) {
foreach my $suffix_b ( @$suffixes_ref ) {
# could have just used "defined" in the next line, but this is for clarity.
$suffix_pair = $suffix_a.",".$suffix_b;
if ( $suffix_pair_count{$suffix_pair} >= $min_suffix_count ) {
foreach $pron_a_str (split(";", $prons{$prefix.$suffix_a})) {
@pron_a = split(" ", $pron_a_str);
foreach $pron_b_str (split(";", $prons{$prefix.$suffix_b})) {
@pron_b = split(" ", $pron_b_str);
$len_a = @pron_a; # evaluating array as scalar automatically gives length.
$len_b = @pron_b;
for (my $pos = 0; $pos <= $len_a && $pos <= $len_b; $pos++) {
# $pos is starting-pos of psuffix-pair.
$psuffix_a = join(" ", @pron_a[$pos...$#pron_a]);
$psuffix_b = join(" ", @pron_b[$pos...$#pron_b]);
$quadruple = $suffix_pair . "," . $psuffix_a . "," . $psuffix_b;
$quadruple_count{$quadruple}++;
my $pron_a_pos = $pron_a[$pos]; my $pron_b_pos = $pron_b[$pos];
if ($ignore_prefix_stress) {
$pron_a_pos =~ s/\d//; # e.g convert IH0 to IH. Only affects
$pron_b_pos =~ s/\d//; # whether we exit the loop below.
}
if ($pron_a_pos ne $pron_b_pos) {
# This is important: we don't consider a pron suffix-pair to be
# valid unless the pron prefix is the same.
last;
}
}
}
}
}
}
}
}
# To save memory, only keep pairs above threshold in the hash.
while ( my ($quadruple, $count) = each (%quadruple_count) ) {
if ($count >= $min_suffix_count) {
$new_hash{$quadruple} = $count;
}
}
%quadruple_count = %new_hash;
undef %new_hash;
# Print out the quadruples for diagnostics.
foreach $quadruple (
sort { $quadruple_count{$b} <=> $quadruple_count{$a} } keys %quadruple_count ) {
print STDERR "$quadruple_count{$quadruple} $quadruple\n";
}
}
# Now print out the quadruples; these are the output of this program.
foreach $quadruple (keys %quadruple_count) {
print $quadruple."\n";
}


@ -0,0 +1,103 @@
#!/usr/bin/perl
# This program enforces the rule that
# if a "more specific" rule applies, we cannot use the more general rule.
# It takes in tuples generated by get_candidate_prons (one per line, separated
# by ";"), of the form:
# word;pron;base-word;base-pron;rule-name;de-stress[;rule-score]
# [note: we mean that the last element, the numeric score of the rule, is optional]
# and it outputs a (generally shorter) list
# of the same form.
# For each word:
# For each (base-word,base-pron):
# Eliminate "more-general" rules as follows:
# For each pair of rules applying to this (base-word, base-pron):
# If pair is in more-general hash, disallow more general one.
# Let the output be: for each (base-word, base-pron, rule):
# for (destress-prefix) in [yes, no], do:
# print out the word input, the rule-name, [destressed:yes|no], and the new pron.
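# A purely hypothetical illustration: if the rule-hierarchy file contains the line
#   TION,T,SH AH0 N,T;ION,,AH0 N,
# (meaning "TION,T,SH AH0 N,T" is a more specific form of "ION,,AH0 N,"), and both rules
# produced candidate lines for some word from the same (base-word, base-pron) with the
# same de-stress value, then only the lines from the more specific rule are printed.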
if (@ARGV != 1 && @ARGV != 2) {
die "Usage: limit_candidate_prons.pl rule_hierarchy [candidate_prons] > limited_candidate_prons";
}
$hierarchy = shift @ARGV;
open(H, "<$hierarchy") || die "Opening rule hierarchy $hierarchy";
while(<H>) {
chop;
m:.+;.+: || die "Bad rule-hierarchy line $_";
$hierarchy{$_} = 1; # Format is: if $rule1 is the string form of the more specific rule
# and $rule2 is the string form of the more general rule, then $hierarchy{$rule1.";".$rule2}
# is defined, else undefined.
}
sub process_word;
undef $cur_word;
@cur_lines = ();
while(<>) {
# input, output is:
# word;pron;base-word;base-pron;rule-name;destress;score
chop;
m:^([^;]+);: || die "Unexpected input: $_";
$word = $1;
if (!defined $cur_word || $word eq $cur_word) {
if (!defined $cur_word) { $cur_word = $word; }
push @cur_lines, $_;
} else {
process_word(@cur_lines); # Process a series of suggested prons
# for a particular word.
$cur_word = $word;
@cur_lines = ( $_ );
}
}
process_word(@cur_lines);
sub process_word {
my %pair2rule_list; # hash from $baseword.";".$baseword to ref
# to array of [ line1, line2, ... ].
my @cur_lines = @_;
foreach my $line (@cur_lines) {
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
my $key = $baseword.";".$basepron;
if (defined $pair2rule_list{$key}) {
push @{$pair2rule_list{$key}}, $line; # @{...} derefs the array pointed to
# by the array ref inside {}.
} else {
$pair2rule_list{$key} = [ $line ]; # [ $x ] is new anonymous array with 1 elem ($x)
}
}
while ( my ($key, $value) = each(%pair2rule_list) ) {
my @lines = @$value; # array of lines that are for this (baseword,basepron).
my (@stress, @rules); # Arrays of stress markers and rule names, indexed by
# same index that indexes @lines.
for (my $n = 0; $n < @lines; $n++) {
my $line = $lines[$n];
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
$stress[$n] = $destress;
$rules[$n] = $rulename;
}
for (my $m = 0; $m < @lines; $m++) {
my $ok = 1; # if stays 1, this line is OK.
for (my $n = 0; $n < @lines; $n++) {
if ($m != $n && $stress[$m] eq $stress[$n]) {
if (defined $hierarchy{$rules[$n].";".$rules[$m]}) {
# Note: this "hierarchy" variable is defined if $rules[$n] is a more
# specific instance of $rules[$m], thus invalidating $rules[$m].
$ok = 0;
last; # no point iterating further.
}
}
}
if ($ok != 0) {
print $lines[$m] . "\n";
}
}
}
}


@ -0,0 +1,50 @@
#!/usr/bin/perl
# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl,
# which is 7-tuples, one per line, of the form:
# word;pron;base-word;base-pron;rule-name;de-stress;rule-score
# (where rule-score is sometimes listed as optional, but this
# program does expect it, since we don't anticipate it being used
# without it).
# This program assumes that all the words and prons and rules have
# come from a reversed dictionary (reverse_dict.pl) where the order
# of the characters in the words, and the phones in the prons, have
# been reversed, and it un-reverses them. That is, the characters
# in "word" and "base-word", and the phones in "pron" and "base-pron"
# are reversed; and the rule ("rule-name") is parsed as a 4-tuple,
# like:
# suffix,base-suffix,psuffix,base-psuffix
# so this program reverses the characters in "suffix" and "base-suffix"
# and the phones (separated by spaces) in "psuffix" and "base-psuffix".
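# A hypothetical illustration (made-up entry, CMU-style phones): the reversed input line
#   GNIKLAW;NG IH0 K AO1 W;KLAW;K AO1 W;GNI,,NG IH0,;no;0.9
# would be written out as
#   WALKING;W AO1 K IH0 NG;WALK;W AO1 K;ING,,IH0 NG,;no;0.9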
sub reverse_str {
$str = shift;
return join("", reverse(split("", $str)));
}
sub reverse_pron {
$str = shift;
return join(" ", reverse(split(" ", $str)));
}
while(<>){
chop;
@A = split(";", $_);
@A == 7 || die "Bad input line $_: found " . scalar(@A) . " fields, expected 7.";
($word,$pron,$baseword,$basepron,$rule,$destress,$score) = @A;
$word = reverse_str($word);
$pron = reverse_pron($pron);
$baseword = reverse_str($baseword);
$basepron = reverse_pron($basepron);
@R = split(",", $rule, 4);
@R == 4 || die "Bad rule $rule";
$R[0] = reverse_str($R[0]); # suffix.
$R[1] = reverse_str($R[1]); # base-suffix.
$R[2] = reverse_pron($R[2]); # pron.
$R[3] = reverse_pron($R[3]); # base-pron.
$rule = join(",", @R);
@A = ($word,$pron,$baseword,$basepron,$rule,$destress,$score);
print join(";", @A) . "\n";
}


@ -0,0 +1,14 @@
#!/usr/bin/perl
# Used in conjunction with get_rules.pl
# example input line: XANTHE Z AE1 N DH
# example output line: EHTNAX DH N AE1 Z
while(<>){
@A = split(" ", $_);
$word = shift @A;
$word = join("", reverse(split("", $word))); # Reverse letters of word.
@A = reverse(@A); # Reverse phones in pron.
unshift @A, $word;
print join(" ", @A) . "\n";
}


@ -0,0 +1,50 @@
#!/usr/bin/perl
# This program takes candidate prons from "get_candidate_prons.pl" or
# "limit_candidate_prons.pl", and a reference dictionary covering those words,
# and outputs the same format but with scoring information added (so we go from
# 6 to 7 fields). The scoring information says, for each generated pron,
# whether we have a match, a partial match, or no match, to some word in the
# dictionary. A partial match means it's correct except for stress.
# The input is a 6-tuple on each line, like:
# word;pron;base-word;base-pron;rule-name;de-stress
#
# The output is the same except with one more field, the score,
# which may be "right", "wrong", "partial".
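# A hypothetical illustration: if the reference dictionary contains the entry
#   WALKING  W AO1 K IH0 NG
# then a candidate line whose pron field is "W AO1 K IH0 NG" is scored "right",
# one whose pron is "W AO0 K IH0 NG" (same phones, different stress) is scored
# "partial", and anything else is scored "wrong".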
if (@ARGV != 1 && @ARGV != 2) {
die "Usage: score_prons.pl reference_dict [candidate_prons] > scored_candidate_prons";
}
$dict = shift @ARGV;
open(D, "<$dict") || die "Opening dictionary $dict";
while(<D>) { # Set up some hashes that tell us when
# a (word,pron) pair is correct (and the same for
# prons with stress information removed).
chop;
@A = split(" ", $_);
$word = shift @A;
$pron = join(" ", @A);
$pron_nostress = $pron;
$pron_nostress =~ s:\d::g;
$word_and_pron{$word.";".$pron} = 1;
$word_and_pron_nostress{$word.";".$pron_nostress} = 1;
}
while(<>) {
chop;
$line = $_;
my ($word, $pron, $baseword, $basepron, $rulename, $destress) = split(";", $line);
$pron_nostress = $pron;
$pron_nostress =~ s:\d::g;
if (defined $word_and_pron{$word.";".$pron}) {
$score = "right";
} elsif (defined $word_and_pron_nostress{$word.";".$pron_nostress}) {
$score = "partial";
} else {
$score = "wrong";
}
print $line.";".$score."\n";
}


@ -0,0 +1,52 @@
#!/usr/bin/perl
# This program takes the output of count_rules.pl, which is tuples
# of the form
#
# rule;destress;right-count;partial-count;wrong-count
#
# and outputs lines of the form
#
# rule;de-stress;score
#
# where the score, between 0 and 1 (higher is better), is computed as:
#
# ((#correct) + $partial_score * (#partial)) / (#correct + #partial + #wrong + $ballast)
#
# where $partial_score (e.g. 0.8) is the score we assign to a "partial" match,
# and $ballast is a small number, e.g. 1, that is treated like "extra" wrong scores, to penalize
# rules with few observations.
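# For instance (illustration only), with the default ballast=1 and partial_score=0.8,
# a rule observed with 8 "right", 2 "partial" and 0 "wrong" instances would score
# (8 + 0.8*2) / (8 + 2 + 0 + 1) = 9.6 / 11, i.e. about 0.873.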
#
# It outputs one scored line for every input rule.
$ballast = 1;
$partial_score = 0.8;
$destress_penalty = 1.0e-05; # Give destressed rules a small
# penalty vs. their no-destress counterparts, so if we
# have to choose arbitrarily we won't destress (seems safer).
for ($n = 1; $n <= 4; $n++) {
if ($ARGV[0] eq "--ballast") {
shift @ARGV;
$ballast = shift @ARGV;
}
if ($ARGV[0] eq "--partial-score") {
shift @ARGV;
$partial_score = shift @ARGV;
($partial_score >= 0.0 && $partial_score <= 1.0) || die "Invalid partial_score: $partial_score";
}
}
(@ARGV == 0 || @ARGV == 1) || die "Usage: score_rules.pl [--ballast ballast-count] [--partial-score partial-score] [input from count_rules.pl]";
while(<>) {
@A = split(";", $_);
@A == 5 || die "Bad input line; $_";
($rule,$destress,$right_count,$partial_count,$wrong_count) = @A;
$rule_score = ($right_count + $partial_score*$partial_count) /
($right_count+$partial_count+$wrong_count+$ballast);
if ($destress eq "yes") { $rule_score -= $destress_penalty; }
print join(";", $rule, $destress, sprintf("%.5f", $rule_score)) . "\n";
}


@ -0,0 +1,84 @@
#!/usr/bin/perl
# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl
# or reverse_candidates.pl, which is 7-tuples, one per line, of the form:
#
# word;pron;base-word;base-pron;rule-name;de-stress;rule-score
#
# and selects the most likely prons for the words based on rule
# score. It outputs in the same format as the input (thus, it is
# similar to limit_candidates.pl in its input and output format,
# except it has a different way of selecting the prons to put out).
#
# This script will select the $max_prons best pronunciations for
# each candidate word, subject to the constraint that no pron should
# have a rule score worse than $min_rule_score.
# It first merges the candidates by, if there are multiple candidates
# generating the same pron, selecting the candidate that had the
# best associated score. It then sorts the prons on score and
# selects the n best prons (but doesn't print out candidates with
# score beneath the threshold).
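# A hypothetical illustration with the defaults (max_prons=4, min_rule_score=0.35):
# if a word has candidates with rule scores 0.9, 0.9 (both generating the same pron),
# 0.6 and 0.3, the duplicate pron is merged (keeping the 0.9 line) and only the prons
# scored 0.9 and 0.6 are printed, since 0.3 falls below the threshold.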
$max_prons = 4;
$min_rule_score = 0.35;
for ($n = 1; $n <= 3; $n++) {
if ($ARGV[0] eq "--max-prons") {
shift @ARGV;
$max_prons = shift @ARGV;
}
if ($ARGV[0] eq "--min-rule-score") {
shift @ARGV;
$min_rule_score = shift @ARGV;
}
}
if (@ARGV != 0 && @ARGV != 1) {
die "Usage: select_candidates_prons.pl [candidate_prons] > selected_candidate_prons";
}
sub process_word;
undef $cur_word;
@cur_lines = ();
while(<>) {
# input, output is:
# word;pron;base-word;base-pron;rule-name;destress;score
chop;
m:^([^;]+);: || die "Unexpected input: $_";
$word = $1;
if (!defined $cur_word || $word eq $cur_word) {
if (!defined $cur_word) { $cur_word = $word; }
push @cur_lines, $_;
} else {
process_word(@cur_lines); # Process a series of suggested prons
# for a particular word.
$cur_word = $word;
@cur_lines = ( $_ );
}
}
process_word(@cur_lines);
sub process_word {
my %pron2rule_score; # hash from generated pron to rule score for that pron.
my %pron2line; # hash from generated pron to best line for that pron.
my @cur_lines = @_;
foreach my $line (@cur_lines) {
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
if (!defined $pron2rule_score{$pron} ||
$rule_score > $pron2rule_score{$pron}) {
$pron2rule_score{$pron} = $rule_score;
$pron2line{$pron} = $line;
}
}
my @prons = sort { $pron2rule_score{$b} <=> $pron2rule_score{$a} } keys %pron2rule_score;
for (my $n = 0; $n < @prons && $n < $max_prons &&
$pron2rule_score{$prons[$n]} >= $min_rule_score; $n++) {
print $pron2line{$prons[$n]} . "\n";
}
}


@ -0,0 +1,65 @@
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes on its standard input a list of utterance
# id's, one for each line (e.g. 4k0c030a is an utterance id).
# It takes as its argument a list of "dot" files, and extracts from
# the dot files the transcripts for a given dataset (represented by
# a file list).
#
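# Note: in the noisy/reverberated setups the utterance ids carry an extra
# single-character condition suffix (e.g. a hypothetical id 4k0c030a2); only the
# first 8 characters (4k0c030a) are used to look up the transcript in the dot
# files, while the full id is kept in the printed output.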
@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
$dot_flist = shift @ARGV;
open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
while(<L>){
chop;
m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
$spk = $1;
$spk2dot{$spk} = $_;
}
while(<STDIN>){
chop;
$uttid_orig = $_;
$uttid = substr $uttid_orig, 0, 8;
$uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
$spk = $1;
if($spk ne $curspk) {
%utt2trans = ( ); # Don't keep all the transcripts in memory...
$curspk = $spk;
$dotfile = $spk2dot{$spk};
defined $dotfile || die "No dot file for speaker $spk\n";
open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
while(<F>) {
$_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
$trans = $1;
$utt = $2;
$utt2trans{$utt} = $trans;
}
}
if(!defined $utt2trans{$uttid}) {
print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
} else {
print "$uttid_orig $utt2trans{$uttid}\n";
}
}


@ -0,0 +1,64 @@
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes on its standard input a list of utterance
# id's, one for each line (e.g. 4k0c030a is an utterance id).
# It takes as its argument a list of "dot" files, and extracts from
# the dot files the transcripts for a given dataset (represented by
# a file list).
#
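# For example: given the utterance id 4k0c030a, the speaker id is taken to be
# 4k0c03 (the first six characters), the transcript is looked up in the dot file
# matching 4k0c0300.dot from the file list, and a line "4k0c030a <transcript>"
# is printed.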
@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
$dot_flist = shift @ARGV;
open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
while(<L>){
chop;
m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
$spk = $1;
$spk2dot{$spk} = $_;
}
while(<STDIN>){
chop;
$uttid = $_;
$uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
$spk = $1;
if($spk ne $curspk) {
%utt2trans = ( ); # Don't keep all the transcripts in memory...
$curspk = $spk;
$dotfile = $spk2dot{$spk};
defined $dotfile || die "No dot file for speaker $spk\n";
open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
while(<F>) {
$_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
$trans = $1;
$utt = $2;
$utt2trans{$utt} = $trans;
}
}
if(!defined $utt2trans{$uttid}) {
print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
} else {
print "$uttid $utt2trans{$uttid}\n";
}
}


@ -0,0 +1,31 @@
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# takes in a file list with lines like
# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# and outputs an scp in kaldi format with lines like
# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# (the first thing is the utterance-id, which is the same as the basename of the file.)
while(<>){
m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_";
$id = $1;
$id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames)
print "$id $_";
}


@ -0,0 +1,110 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.
if [ $# -ne 2 ]; then
echo "Usage: local/generate_example_kws.sh <data-dir> <kws-data-dir>"
echo " e.g.: local/generate_example_kws.sh data/test_eval92/ <data/kws>"
exit 1;
fi
datadir=$1;
kwsdatadir=$2;
text=$datadir/text;
mkdir -p $kwsdatadir;
# Generate keywords; we generate 20 unigram keywords with at least 20 counts,
# 20 bigram keywords with at least 4 counts and 10 trigram keywords with at
# least 3 counts.
cat $text | perl -e '
%unigram = ();
%bigram = ();
%trigram = ();
while(<>) {
chomp;
@col=split(" ", $_);
shift @col;
for($i = 0; $i < @col; $i++) {
# unigram case
if (!defined($unigram{$col[$i]})) {
$unigram{$col[$i]} = 0;
}
$unigram{$col[$i]}++;
# bigram case
if ($i < @col-1) {
$word = $col[$i] . " " . $col[$i+1];
if (!defined($bigram{$word})) {
$bigram{$word} = 0;
}
$bigram{$word}++;
}
# trigram case
if ($i < @col-2) {
$word = $col[$i] . " " . $col[$i+1] . " " . $col[$i+2];
if (!defined($trigram{$word})) {
$trigram{$word} = 0;
}
$trigram{$word}++;
}
}
}
$max_count = 100;
$total = 20;
$current = 0;
$min_count = 20;
while ($current < $total && $min_count <= $max_count) {
foreach $x (keys %unigram) {
if ($unigram{$x} == $min_count) {
print "$x\n";
$unigram{$x} = 0;
$current++;
}
if ($current == $total) {
last;
}
}
$min_count++;
}
$total = 20;
$current = 0;
$min_count = 4;
while ($current < $total && $min_count <= $max_count) {
foreach $x (keys %bigram) {
if ($bigram{$x} == $min_count) {
print "$x\n";
$bigram{$x} = 0;
$current++;
}
if ($current == $total) {
last;
}
}
$min_count++;
}
$total = 10;
$current = 0;
$min_count = 3;
while ($current < $total && $min_count <= $max_count) {
foreach $x (keys %trigram) {
if ($trigram{$x} == $min_count) {
print "$x\n";
$trigram{$x} = 0;
$current++;
}
if ($current == $total) {
last;
}
}
$min_count++;
}
' > $kwsdatadir/raw_keywords.txt
echo "Keywords generation succeeded"


@ -0,0 +1,60 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.
if [ $# -ne 3 ]; then
echo "Usage: local/kws_data_prep.sh <lang-dir> <data-dir> <kws-data-dir>"
echo " e.g.: local/kws_data_prep.sh data/lang_test_bd_tgpr/ data/test_eval92/ data/kws/"
exit 1;
fi
langdir=$1;
datadir=$2;
kwsdatadir=$3;
mkdir -p $kwsdatadir;
# Create keyword id for each keyword
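# For illustration (hypothetical keyword list): a first raw keyword line "HOT HAND"
# would come out as "WSJ-0001 HOT HAND", the second keyword as "WSJ-0002 ...", and so on.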
cat $kwsdatadir/raw_keywords.txt | perl -e '
$idx=1;
while(<>) {
chomp;
printf "WSJ-%04d $_\n", $idx;
$idx++;
}' > $kwsdatadir/keywords.txt
# Map the keywords to integers; note that we remove the keywords that
# are not in our $langdir/words.txt, as we won't find them anyway...
cat $kwsdatadir/keywords.txt | \
sym2int.pl --map-oov 0 -f 2- $langdir/words.txt | \
grep -v " 0 " | grep -v " 0$" > $kwsdatadir/keywords.int
# Compile keywords into FSTs
transcripts-to-fsts ark:$kwsdatadir/keywords.int ark:$kwsdatadir/keywords.fsts
# Create an utterance id for each utterance; note that by "utterance" here I mean
# the keys that will appear in the lattice archive. You may have to modify this
# for your own setup.
cat $datadir/wav.scp | \
awk '{print $1}' | \
sort | uniq | perl -e '
$idx=1;
while(<>) {
chomp;
print "$_ $idx\n";
$idx++;
}' > $kwsdatadir/utter_id
# Map utterance to the names that will appear in the rttm file. You have
# to modify the commands below according to your rttm file. In the WSJ case
# since each file is an utterance, we assume that the actual file names will
# be the "names" in the rttm, so the utterance names map to themselves.
cat $datadir/wav.scp | \
awk '{print $1}' | \
sort | uniq | perl -e '
while(<>) {
chomp;
print "$_ $_\n";
}' > $kwsdatadir/utter_map;
echo "Kws data preparation succeeded"


@ -0,0 +1,62 @@
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1
#and as command-line arguments it takes the names of the WSJ disk locations, e.g.:
#/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc.
# It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with
# /mnt/matylda2/data/WSJ0/11-1.1.
# It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with
# uppercase rather than lower case filenames.
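# For example, with /mnt/matylda2/data/WSJ0/11-1.1 given on the command line, the index
# line "11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1" shown above would be printed as
# /mnt/matylda2/data/WSJ0/11-1.1/wsj0/si_tr_s/01i/01ic0201.wv1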
foreach $fn (@ARGV) {
$fn =~ m:.+/([0-9\.\-]+)/?$: || die "Bad command-line argument $fn\n";
$disk_id=$1;
$disk_id =~ tr/-\./__/; # replace - and . with _ so 11-10.1 becomes 11_10_1
$fn =~ s:/$::; # Remove final slash, just in case it is present.
$disk2fn{$disk_id} = $fn;
}
while(<STDIN>){
if(m/^;/){ next; } # Comment. Ignore it.
else {
m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
$disk=$1;
if(!defined $disk2fn{$disk}) {
die "Disk id $disk not found";
}
$filename = $2; # as a subdirectory of the distributed disk.
if($disk eq "13_16_1" && `hostname` =~ m/fit.vutbr.cz/) {
# The disk 13-16.1 has been uppercased for some reason, on the
# BUT system. This is a fix specifically for that case.
$filename =~ tr/a-z/A-Z/; # This disk contains all uppercase filenames. Why?
}
print "$disk2fn{$disk}/$filename\n";
}
}


@ -0,0 +1,69 @@
#!/bin/bash
stage=0
train_stage=-100
# This trains only unadapted (just cepstral mean normalized) features,
# and uses various combinations of VTLN warping factor and time-warping
# factor to artificially expand the amount of data.
. cmd.sh
. utils/parse_options.sh # to parse the --stage option, if given
[ $# != 0 ] && echo "Usage: local/run_4b.sh [--stage <stage> --train-stage <train-stage>]" && exit 1;
set -e
if [ $stage -le 0 ]; then
# Create the training data.
featdir=`pwd`/mfcc/nnet5b; mkdir -p $featdir
fbank_conf=conf/fbank_40.conf
echo "--num-mel-bins=40" > $fbank_conf
steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" \
$fbank_conf $featdir exp/perturbed_fbanks_si284 data/train_si284 data/train_si284_perturbed_fbank &
steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" --feature-type mfcc \
conf/mfcc.conf $featdir exp/perturbed_mfcc_si284 data/train_si284 data/train_si284_perturbed_mfcc &
wait
fi
if [ $stage -le 1 ]; then
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_si284_perturbed_mfcc data/lang exp/tri4b exp/tri4b_ali_si284_perturbed_mfcc
fi
if [ $stage -le 2 ]; then
steps/nnet2/train_block.sh --stage "$train_stage" \
--cleanup false \
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
--num-epochs 10 --num-epochs-extra 5 \
--cmd "$decode_cmd" \
--hidden-layer-dim 1536 \
--num-block-layers 3 --num-normal-layers 3 \
data/train_si284_perturbed_fbank data/lang exp/tri4b_ali_si284_perturbed_mfcc exp/nnet5b || exit 1
fi
if [ $stage -le 3 ]; then # create testing fbank data.
featdir=`pwd`/mfcc
fbank_conf=conf/fbank_40.conf
for x in test_eval92 test_eval93 test_dev93; do
cp -rT data/$x data/${x}_fbank
rm -r data/${x}_fbank/split* || true
steps/make_fbank.sh --fbank-config "$fbank_conf" --nj 8 \
--cmd "$train_cmd" data/${x}_fbank exp/make_fbank/$x $featdir || exit 1;
steps/compute_cmvn_stats.sh data/${x}_fbank exp/make_fbank/$x $featdir || exit 1;
done
fi
if [ $stage -le 4 ]; then
steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
exp/tri4b/graph_bd_tgpr data/test_dev93_fbank exp/nnet5b/decode_bd_tgpr_dev93
steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \
exp/tri4b/graph_bd_tgpr data/test_eval92_fbank exp/nnet5b/decode_bd_tgpr_eval92
fi
exit 0;


@ -0,0 +1,24 @@
#!/bin/bash
# This is neural net training on top of adapted 40-dimensional features.
#
. ./cmd.sh
(
steps/nnet2/train_tanh.sh \
--mix-up 8000 \
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
--num-hidden-layers 4 --hidden-layer-dim 1024 \
--cmd "$decode_cmd" \
data/train_si284 data/lang exp/tri4b_ali_si284 exp/nnet5c || exit 1
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 10 \
--transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
exp/tri4b/graph_bd_tgpr data/test_dev93 exp/nnet5c/decode_bd_tgpr_dev93
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
--transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
exp/tri4b/graph_bd_tgpr data/test_eval92 exp/nnet5c/decode_bd_tgpr_eval92
)


@ -0,0 +1,119 @@
#!/bin/bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level WSJ corpus directory."
echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
CORPUS=$1
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# noisy list for SI-84
find $1/si_tr_s -name '*.wav' | sort -u > train_si84_noisy.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find $1/si_dt_20 -name '*.wav' | sort -u > dev_dt_20_noisy.flist
find $1/si_dt_05 -name '*.wav' | sort -u > dev_dt_05_noisy.flist
find $1/si_et_20 -name '*.wav' | sort -u > test_eval92_noisy.flist
find $1/si_et_05 -name '*.wav' | sort -u > test_eval92_5k_noisy.flist
# Finding the transcript files:
#find -L $CORPUS -iname '*.dot' > dot_files.flist
if [ ! -e $dir/dot_files.flist ]; then
echo "Could not find $dir/dot_files.flist files, first run clean_data_prep.sh";
exit 1;
fi
# Convert the transcripts into our format (no normalization yet)
# adding a suffix to each utt_id:
# 2-7 encode the noise condition (9dB, 6dB, 3dB, 0dB, m3dB, m6dB respectively)
for x in train_si84_noisy dev_dt_05_noisy dev_dt_20_noisy test_eval92_noisy test_eval92_5k_noisy; do
cat $x.flist | perl -e '
while(<>) {
m:^\S+/(\w+)\.wav$: || die "Bad line $_";
$id = $1;
$id =~ tr/A-Z/a-z/;
print "$id $_";
}
' | sort > ${x}_wav_tmp.scp
#cat ${x}_wav_tmp.scp | awk '{print $1}' \
# | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
cat ${x}_wav_tmp.scp | perl -e '
while(<STDIN>) {
@A=split(" ", $_);
@B=split("/", $_);
$abs_path_len=@B;
$condition=$B[$abs_path_len-5];
if ($condition eq "9dB") {$key_suffix=2;}
elsif ($condition eq "6dB") {$key_suffix=3;}
elsif ($condition eq "3dB") {$key_suffix=4;}
elsif ($condition eq "0dB") {$key_suffix=5;}
elsif ($condition eq "m3dB") {$key_suffix=6;}
elsif ($condition eq "m6dB") {$key_suffix=7;}
else {print STDERR "error condition $condition";}
print $A[0].$key_suffix." ".$A[1]."\n";
}
' | sort -k1 > ${x}_wav.scp
cat ${x}_wav.scp | awk '{print $1}' \
| $local/find_noisy_transcripts.pl dot_files.flist > ${x}.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84_noisy dev_dt_05_noisy dev_dt_20_noisy test_eval92_noisy test_eval92_5k_noisy; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
#for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do
# awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \
# > ${x}_wav.scp
#done
# Make the utt2spk and spk2utt files.
for x in train_si84_noisy dev_dt_05_noisy dev_dt_20_noisy test_eval92_noisy test_eval92_5k_noisy; do
cat ${x}_wav.scp | awk '{print $1}' \
| perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
echo "Data preparation succeeded"


@ -0,0 +1,59 @@
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This takes data from the standard input that's unnormalized transcripts in the format
# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise]
# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam]
# and outputs normalized transcripts.
# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc
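# For example, assuming the noise word is "<NOISE>" (as in the calling scripts), the
# first line above would be normalized to:
# 4k2c0308 OF COURSE THERE ISN'T ANY GUARANTEE THE COMPANY WILL KEEP ITS HOT HAND <NOISE>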
@ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2";
$noise_word = shift @ARGV;
while(<STDIN>) {
$_ =~ m:^(\S+) (.+): || die "bad line $_";
$utt = $1;
$trans = $2;
print "$utt";
foreach $w (split (" ",$trans)) {
$w =~ tr:a-z:A-Z:; # Upcase everything to match the CMU dictionary. .
$w =~ s:\\::g; # Remove backslashes. We don't need the quoting.
$w =~ s:^\%PERCENT$:PERCENT:; # Normalization for Nov'93 test transcripts.
$w =~ s:^\.POINT$:POINT:; # Normalization for Nov'93 test transcripts.
if($w =~ m:^\[\<\w+\]$: || # E.g. [<door_slam], this means a door slammed in the preceding word. Delete.
$w =~ m:^\[\w+\>\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete.
$w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon.
$w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon.
$w eq "~" || # This is used to indicate truncation of an utterance. Not a word.
$w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much
# point including this in the transcript.
next; # we won't print this word.
} elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath].
print " $noise_word";
} elsif($w =~ m:^\<([\w\']+)\>$:) {
# e.g. replace <and> with and. (the <> means verbal deletion of a word).. but it's pronounced.
print " $1";
} elsif($w eq "--DASH") {
print " -DASH"; # This is a common issue; the CMU dictionary has it as -DASH.
# } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word
# print " $1 -DASH";
} else {
print " $w";
}
}
print "\n";
}


@ -0,0 +1,100 @@
#!/bin/bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level WSJ corpus directory."
echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
CORPUS=$1
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# reverb list for SI-84
find $1/si_tr_s -name '*.wav' | sort -u > train_si84_reverb.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find $1/si_dt_20 -name '*.wav' | sort -u > dev_dt_20_reverb.flist
find $1/si_dt_05 -name '*.wav' | sort -u > dev_dt_05_reverb.flist
# Finding the transcript files:
#find -L $CORPUS -iname '*.dot' > dot_files.flist
if [ ! -e $dir/dot_files.flist ]; then
echo "Could not find $dir/dot_files.flist files, first run clean_data_prep.sh";
exit 1;
fi
# Convert the transcripts into our format (no normalization yet)
# adding suffix to utt_id
# 1 for reverb condition
for x in train_si84_reverb dev_dt_05_reverb dev_dt_20_reverb; do
cat $x.flist | perl -e '
while(<>) {
m:^\S+/(\w+)\.wav$: || die "Bad line $_";
$id = $1;
$id =~ tr/A-Z/a-z/;
print "$id $_";
}
' | sort > ${x}_wav_tmp.scp
cat ${x}_wav_tmp.scp | awk '{print $1}' \
| $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
cat ${x}_wav_tmp.scp | awk '{printf("%s1 %s\n", $1, $2);}' > ${x}_wav.scp
cat ${x}_tmp.trans1 | awk '{printf("%s1 ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84_reverb dev_dt_05_reverb dev_dt_20_reverb; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
#for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do
# awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \
# > ${x}_wav.scp
#done
# Make the utt2spk and spk2utt files.
for x in train_si84_reverb dev_dt_05_reverb dev_dt_20_reverb; do
cat ${x}_wav.scp | awk '{print $1}' \
| perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
echo "Data preparation succeeded"


@ -0,0 +1,42 @@
#!/bin/bash
. cmd.sh
mfccdir=mfcc
# Make "per-utterance" versions of the test sets where the speaker
# information corresponds to utterances-- to demonstrate adaptation on
# short utterances, particularly for basis fMLLR
for x in test_eval92 test_eval93 test_dev93 ; do
y=${x}_utt
rm -r data/$y
cp -r data/$x data/$y
cat data/$x/utt2spk | awk '{print $1, $1;}' > data/$y/utt2spk;
cp data/$y/utt2spk data/$y/spk2utt;
steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1;
done
# basis fMLLR experiments.
# First a baseline: decode per-utterance with normal fMLLR.
steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \
exp/tri3b/graph_tgpr data/test_dev93_utt exp/tri3b/decode_tgpr_dev93_utt || exit 1;
steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \
exp/tri3b/graph_tgpr data/test_eval92_utt exp/tri3b/decode_tgpr_eval92_utt || exit 1;
# get the fMLLR basis.
steps/get_fmllr_basis.sh --cmd "$train_cmd" data/train_si84 data/lang exp/tri3b
# decoding tri3b with basis fMLLR
steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \
exp/tri3b/graph_tgpr data/test_dev93 exp/tri3b/decode_tgpr_dev93_basis || exit 1;
steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \
exp/tri3b/graph_tgpr data/test_eval92 exp/tri3b/decode_tgpr_eval92_basis || exit 1;
# The same, per-utterance.
steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \
exp/tri3b/graph_tgpr data/test_dev93_utt exp/tri3b/decode_tgpr_dev93_basis_utt || exit 1;
steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \
exp/tri3b/graph_tgpr data/test_eval92_utt exp/tri3b/decode_tgpr_eval92_basis_utt || exit 1;


@ -0,0 +1,181 @@
#!/bin/bash
# Copyright 2012-2013 Brno University of Technology (Author: Karel Vesely)
# Apache 2.0
# In this recipe we build DNN in four stages:
# 1) Data preparations : the fMLLR features are stored to disk
# 2) RBM pre-training : in this unsupervised stage we train a stack of RBMs, a good starting point for Cross-entropy training
# 3) Frame-level cross-entropy training : in this stage the objective is to classify frames correctly.
# 4) Sequence-criterion training : in this stage the objective is to classify the whole sequence correctly,
# the idea is similar to the 'Discriminative training' in the context of GMM-HMMs.
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh ## Source the tools/utils (import the queue.pl)
#false && \
{
gmmdir=exp/tri4b
###
### Generate the alignments of dev93
### (held-out set for Cross-entropy training)
###
steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \
data/test_dev93 data/lang $gmmdir exp/tri4b_ali_dev93 || exit 1
###
### Store the fMLLR features, so we can train on them easily
###
# train si284
# generate the features
dir=data-fmllr-tri4b/train_si284
steps/make_fmllr_feats.sh --nj 20 --cmd "$train_cmd" \
--transform-dir exp/tri4b_ali_si284 \
$dir data/train_si284 $gmmdir $dir/_log $dir/_data || exit 1
# eval92
dir=data-fmllr-tri4b/test_eval92
steps/make_fmllr_feats.sh --nj 8 --cmd "$train_cmd" \
--transform-dir exp/tri4b/decode_tgpr_eval92 \
$dir data/test_eval92 $gmmdir $dir/_log $dir/_data || exit 1
# dev93 (unsupervised fMLLR)
# held-out set of Cross-entropy training
dir=data-fmllr-tri4b/test_dev93
steps/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \
--transform-dir exp/tri4b/decode_tgpr_dev93 \
$dir data/test_dev93 $gmmdir $dir/_log $dir/_data || exit 1
}
###
### Now we can pre-train stack of RBMs
###
#false && \
{ # Pre-train the DBN
dir=exp/tri4b_pretrain-dbn
(tail --pid=$$ -F $dir/_pretrain_dbn.log 2>/dev/null)&
$cuda_cmd $dir/_pretrain_dbn.log \
steps/pretrain_dbn.sh --rbm-iter 3 data-fmllr-tri4b/train_si284 $dir
}
###
### Now we train the DNN optimizing cross-entropy.
### This will take quite some time.
###
#false && \
{ # Train the MLP
dir=exp/tri4b_pretrain-dbn_dnn
ali=exp/tri4b_ali
feature_transform=exp/tri4b_pretrain-dbn/final.feature_transform
dbn=exp/tri4b_pretrain-dbn/6.dbn
(tail --pid=$$ -F $dir/_train_nnet.log 2>/dev/null)&
$cuda_cmd $dir/_train_nnet.log \
steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
data-fmllr-tri4b/train_si284 data-fmllr-tri4b/test_dev93 data/lang ${ali}_si284 ${ali}_dev93 $dir || exit 1;
# decode with 'big-dictionary' (reuse HCLG graph)
steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_dev93 $dir/decode_bd_tgpr_dev93 || exit 1;
steps/decode_nnet.sh --nj 8 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_eval92 $dir/decode_bd_tgpr_eval92 || exit 1;
}
###
### Finally we train using sMBR criterion.
### We do Stochastic-GD with per-utterance updates.
###
### To get faster convergence, we will re-generate
### the lattices after 1st epoch of sMBR.
###
dir=exp/tri4b_pretrain-dbn_dnn_smbr
srcdir=exp/tri4b_pretrain-dbn_dnn
acwt=0.10
# First we need to generate lattices and alignments:
#false && \
{
steps/align_nnet.sh --nj 100 --cmd "$train_cmd" \
data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_ali_si284 || exit 1;
steps/make_denlats_nnet.sh --nj 100 --cmd "$decode_cmd" \
--config conf/decode_dnn.config --acwt $acwt \
data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_denlats_si284 || exit 1;
}
# Now we re-train the hybrid by single iteration of sMBR
#false && \
{
steps/train_nnet_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \
data-fmllr-tri4b/train_si284 data/lang $srcdir \
${srcdir}_ali_si284 ${srcdir}_denlats_si284 $dir || exit 1
}
# Decode
#false && \
{
for ITER in 1; do
# decode dev93 with big dict graph_bd_tgpr
steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_dev93 $dir/decode_dev93_bd_tgpr_it${ITER} || exit 1
# decode eval92 with big dict graph_bd_tgpr
steps/decode_nnet.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_eval92 $dir/decode_eval92_bd_tgpr_it${ITER} || exit 1
done
}
###
### Re-generate lattices and run several more iterations of sMBR
###
dir=exp/tri4b_pretrain-dbn_dnn_smbr_iter1-lats
srcdir=exp/tri4b_pretrain-dbn_dnn_smbr
acwt=0.10
# First we need to generate lattices and alignments:
#false && \
{
steps/align_nnet.sh --nj 100 --cmd "$train_cmd" \
data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_ali_si284 || exit 1;
steps/make_denlats_nnet.sh --nj 100 --cmd "$decode_cmd" \
--config conf/decode_dnn.config --acwt $acwt \
data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_denlats_si284 || exit 1;
}
# Now we re-train the hybrid by several iterations of sMBR
#false && \
{
steps/train_nnet_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \
data-fmllr-tri4b/train_si284 data/lang $srcdir \
${srcdir}_ali_si284 ${srcdir}_denlats_si284 $dir
}
# Decode
#false && \
{
for ITER in 1 2 3 4; do
# decode dev93 with big dict graph_bd_tgpr
steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_dev93 $dir/decode_dev93_bd_tgpr_it${ITER} || exit 1
# decode eval92 with big dict graph_bd_tgpr
steps/decode_nnet.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_eval92 $dir/decode_eval92_bd_tgpr_it${ITER} || exit 1
done
}
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done


@ -0,0 +1,41 @@
# prepare reverse lexicon and language model for backwards decoding
utils/prepare_lang.sh --reverse true data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp.reverse data/lang.reverse || exit 1;
utils/reverse_lm.sh data/local/nist_lm/lm_bg_5k.arpa.gz data/lang.reverse data/lang_test_bg_5k.reverse || exit 1;
utils/reverse_lm_test.sh data/lang_test_bg_5k data/lang_test_bg_5k.reverse || exit 1;
# normal forward decoding
utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_10 || exit 1;
# backward decoding
utils/mkgraph.sh --reverse data/lang_test_bg_5k.reverse exp/tri2a exp/tri2a/graph_bg5k_r
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_reverse10 || exit 1;
# pingpong decoding
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 8 --cmd "$decode_cmd" \
--first_pass exp/tri2a/decode_eval92_bg5k_10 exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_pingpong10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 8 --cmd "$decode_cmd" \
--first_pass exp/tri2a/decode_eval92_bg5k_reverse10 exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_pongping10 || exit 1;
# same for bigger language models (on machine with 8GB RAM, you can run the whole decoding in 3-4 min without SGE)
utils/prepare_lang.sh --reverse true data/local/dict_larger "<SPOKEN_NOISE>" data/local/lang_larger.reverse data/lang_bd.reverse || exit;
utils/reverse_lm.sh --lexicon data/local/dict_larger/lexicon.txt data/local/local_lm/3gram-mincount/lm_pr6.0.gz data/lang_bd.reverse data/lang_test_bd_tgpr.reverse || exit 1;
utils/reverse_lm_test.sh data/lang_test_bd_tgpr data/lang_test_bd_tgpr.reverse || exit 1;
utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri2a exp/tri2a/graph_bd_tgpr
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 4 --cmd run.pl \
exp/tri2a/graph_bd_tgpr data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_10 || exit 1;
utils/mkgraph.sh --reverse data/lang_test_bd_tgpr.reverse exp/tri2a exp/tri2a/graph_bd_tgpr_r
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 4 --cmd run.pl \
exp/tri2a/graph_bd_tgpr_r data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_reverse10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 4 --cmd run.pl \
--first_pass exp/tri2a/decode_eval92_bdtgpr4_10 exp/tri2a/graph_bd_tgpr_r data/test_eval92 \
exp/tri2a/decode_eval92_bdtgpr4_pingpong10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 4 --cmd run.pl \
--first_pass exp/tri2a/decode_eval92_bdtgpr4_reverse10 exp/tri2a/graph_bd_tgpr data/test_eval92 \
exp/tri2a/decode_eval92_bdtgpr4_pongping10 || exit 1;


@ -0,0 +1,60 @@
#!/bin/bash
. ./cmd.sh
# Train and test MMI (and boosted MMI) on tri2b system.
steps/make_denlats.sh --sub-split 20 --nj 10 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b exp/tri2b_denlats_si84 || exit 1;
# train the basic MMI system.
steps/train_mmi.sh --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 \
exp/tri2b_denlats_si84 exp/tri2b_mmi || exit 1;
for iter in 3 4; do
steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_mmi/decode_tgpr_dev93_it$iter &
steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi/decode_tgpr_eval92_it$iter &
done
# MMI with 0.1 boosting factor.
steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 \
exp/tri2b_mmi_b0.1 || exit 1;
for iter in 3 4; do
steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_mmi_b0.1/decode_tgpr_dev93_it$iter &
steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi_b0.1/decode_tgpr_eval92_it$iter &
done
# Train a UBM with 400 components, for fMMI.
steps/train_diag_ubm.sh --silence-weight 0.5 --nj 10 --cmd "$train_cmd" \
400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b
steps/train_mmi_fmmi.sh --boost 0.1 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \
exp/tri2b_fmmi_b0.1
for iter in `seq 3 8`; do
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it$iter &
done
steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \
exp/tri2b_fmmi_b0.1_lr0.005 || exit 1;
for iter in `seq 3 8`; do
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it$iter &
done
steps/train_mmi_fmmi_indirect.sh --boost 0.1 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \
exp/tri2b_fmmi_indirect_b0.1
for iter in `seq 3 8`; do
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it$iter &
done


@ -0,0 +1,50 @@
#!/bin/bash
. ./cmd.sh
steps/make_denlats.sh --nj 30 --sub-split 30 --cmd "$train_cmd" \
--transform-dir exp/tri4b_ali_si284 \
data/train_si284 data/lang exp/tri4b exp/tri4b_denlats_si284 || exit 1;
steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
data/train_si284 data/lang exp/tri4b_ali_si284 exp/tri4b_denlats_si284 \
exp/tri4b_mmi_b0.1 || exit 1;
steps/decode.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri3b/decode_tgpr_dev93 \
exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93
#first, train UBM for fMMI experiments.
steps/train_diag_ubm.sh --silence-weight 0.5 --nj 30 --cmd "$train_cmd" \
600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b
# Next, fMMI+MMI.
steps/train_mmi_fmmi.sh \
--boost 0.1 --cmd "$train_cmd" data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b exp/tri4b_denlats_si284 \
exp/tri4b_fmmi_a || exit 1;
for iter in 3 4 5 6 7 8; do
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3b/decode_tgpr_dev93 exp/tri4b/graph_tgpr data/test_dev93 \
exp/tri4b_fmmi_a/decode_tgpr_dev93_it$iter &
done
# decode the last iter with the bd model.
for iter in 8; do
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3b/decode_bd_tgpr_dev93 exp/tri4b/graph_bd_tgpr data/test_dev93 \
exp/tri4b_fmmi_a/decode_bd_tgpr_dev93_it$iter &
steps/decode_fmmi.sh --nj 8 --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3b/decode_bd_tgpr_eval92 exp/tri4b/graph_bd_tgpr data/test_eval92 \
exp/tri4b_fmmi_a/decode_tgpr_eval92_it$iter &
done
# fMMI + mmi with indirect differential.
steps/train_mmi_fmmi_indirect.sh \
--boost 0.1 --cmd "$train_cmd" data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b exp/tri4b_denlats_si284 \
exp/tri4b_fmmi_indirect || exit 1;
for iter in 3 4 5 6 7 8; do
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3b/decode_tgpr_dev93 exp/tri4b/graph_tgpr data/test_dev93 \
exp/tri4b_fmmi_indirect/decode_tgpr_dev93_it$iter &
done


@ -0,0 +1,9 @@
#!/bin/bash
. ./cmd.sh
# ...
local/nnet2/run_5c.sh


@ -0,0 +1,66 @@
#!/bin/bash
steps/align_raw_fmllr.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
data/train_si84 data/lang exp/tri2b exp/tri2b_ali_si84_raw
steps/train_raw_sat.sh --cmd "$train_cmd" \
2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84_raw exp/tri3c || exit 1;
mfccdir=mfcc
for x in test_eval92 test_eval93 test_dev93 ; do
y=${x}_utt
cp -rT data/$x data/$y
cat data/$x/utt2spk | awk '{print $1, $1;}' > data/$y/utt2spk;
cp data/$y/utt2spk data/$y/spk2utt;
steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1;
done
(
utils/mkgraph.sh data/lang_test_tgpr exp/tri3c exp/tri3c/graph_tgpr || exit 1;
steps/decode_raw_fmllr.sh --nj 10 --cmd "$decode_cmd" \
exp/tri3c/graph_tgpr data/test_dev93 exp/tri3c/decode_tgpr_dev93 || exit 1;
steps/decode_raw_fmllr.sh --nj 8 --cmd "$decode_cmd" \
exp/tri3c/graph_tgpr data/test_eval92 exp/tri3c/decode_tgpr_eval92 || exit 1;
steps/decode_raw_fmllr.sh --nj 30 --cmd "$decode_cmd" \
exp/tri3c/graph_tgpr data/test_dev93_utt exp/tri3c/decode_tgpr_dev93_utt || exit 1;
steps/decode_raw_fmllr.sh --nj 30 --cmd "$decode_cmd" \
exp/tri3c/graph_tgpr data/test_eval92_utt exp/tri3c/decode_tgpr_eval92_utt || exit 1;
steps/decode_raw_fmllr.sh --use-normal-fmllr true --nj 10 --cmd "$decode_cmd" \
exp/tri3c/graph_tgpr data/test_dev93 exp/tri3c/decode_tgpr_dev93_2fmllr || exit 1;
steps/decode_raw_fmllr.sh --use-normal-fmllr true --nj 8 --cmd "$decode_cmd" \
exp/tri3c/graph_tgpr data/test_eval92 exp/tri3c/decode_tgpr_eval92_2fmllr || exit 1;
)&
(
utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri3c exp/tri3c/graph_bd_tgpr || exit 1;
steps/decode_raw_fmllr.sh --cmd "$decode_cmd" --nj 8 exp/tri3c/graph_bd_tgpr \
data/test_eval92 exp/tri3c/decode_bd_tgpr_eval92
steps/decode_raw_fmllr.sh --cmd "$decode_cmd" --nj 10 exp/tri3c/graph_bd_tgpr \
data/test_dev93 exp/tri3c/decode_bd_tgpr_dev93
)&
steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
data/train_si284 data/lang exp/tri3c exp/tri3c_ali_si284 || exit 1;
steps/train_raw_sat.sh --cmd "$train_cmd" \
4200 40000 data/train_si284 data/lang exp/tri3c_ali_si284 exp/tri4d || exit 1;
(
utils/mkgraph.sh data/lang_test_tgpr exp/tri4d exp/tri4d/graph_tgpr || exit 1;
steps/decode_raw_fmllr.sh --nj 10 --cmd "$decode_cmd" \
exp/tri4d/graph_tgpr data/test_dev93 exp/tri4d/decode_tgpr_dev93 || exit 1;
steps/decode_raw_fmllr.sh --nj 8 --cmd "$decode_cmd" \
exp/tri4d/graph_tgpr data/test_eval92 exp/tri4d/decode_tgpr_eval92 || exit 1;
) &
wait
#for x in exp/tri3{b,c}/decode_tgpr*; do grep WER $x/wer_* | utils/best_wer.sh ; done


@ -0,0 +1,42 @@
#!/bin/bash
for test in dev93 eval92; do
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_fg \
data/test_${test} exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 || exit 1;
# Note: for N-best-list generation, choosing the acoustic scale (12) that gave
# the best WER on this test set. Ideally we should do this on a dev set.
# This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM.
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm30_0.25 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm100_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm200_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.75 \
|| exit 1;
done


@ -0,0 +1,64 @@
#!/bin/bash
. cmd.sh
# This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM.
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm30_0.25 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm100_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm200_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 1000 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75_N1000
rm -rf $dir
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 $dir
steps/rnnlmrescore.sh \
--stage 7 --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75
rm -rf $dir
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir
steps/rnnlmrescore.sh \
--stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.25
rm -rf $dir
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir
steps/rnnlmrescore.sh \
--stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.25 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
steps/rnnlmrescore.sh \
--N 10 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N10 \
|| exit 1;

View file

@ -0,0 +1,113 @@
#!/bin/bash
# This script is invoked from ../run.sh
# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity.
. cmd.sh
# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
# training, but this shouldn't have much effect.
(
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1;
steps/train_ubm.sh --cmd "$train_cmd" \
400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1;
steps/train_sgmm.sh --cmd "$train_cmd" \
3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \
exp/ubm5b/final.ubm exp/sgmm5a || exit 1;
(
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr
steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93
) &
steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \
--use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1;
steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \
data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84
steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1
for iter in 1 2 3 4; do
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter &
done
steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
--update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9
for iter in 1 2 3 4; do
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter &
done
) &
(
# The next commands are the same thing on all the si284 data.
# SGMM system on the si284 data [sgmm5b]
steps/train_ubm.sh --cmd "$train_cmd" \
600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1;
steps/train_sgmm.sh --cmd "$train_cmd" \
5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
exp/ubm5b/final.ubm exp/sgmm5b || exit 1;
(
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr
steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93
steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92
utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1;
steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93
steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92
) &
steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \
--use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284
steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284
steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1
for iter in 1 2 3 4; do
for test in dev93 eval92; do
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \
exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter &
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter &
done
done
) &
# Train quinphone SGMM system.
steps/train_sgmm.sh --cmd "$train_cmd" \
--context-opts "--context-width=5 --central-position=2" \
5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
exp/ubm5b/final.ubm exp/sgmm5c || exit 1;
# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93.
steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93
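# A sketch for summarising the WERs of the MMI iterations decoded above, once the
# background jobs finish (assuming utils/best_wer.sh exists, as in the rest of this recipe):
# for d in exp/sgmm5?_mmi_b0.1*/decode_*_it?; do
#   grep WER $d/wer_* | utils/best_wer.sh
# done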

View file

@ -0,0 +1,148 @@
#!/bin/bash
# This script is invoked from ../run.sh
# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity.
. cmd.sh
# Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh;
# this takes out the "symmetric SGMM" part which is not always helpful.
# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
# training, but this shouldn't have much effect.
(
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1;
steps/train_ubm.sh --cmd "$train_cmd" \
400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1;
steps/train_sgmm2.sh --cmd "$train_cmd" \
7000 9000 data/train_si84 data/lang exp/tri4b_ali_si84 \
exp/ubm5a/final.ubm exp/sgmm2_5a || exit 1;
(
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm2_5a exp/sgmm2_5a/graph_tgpr
steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
exp/sgmm2_5a/graph_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93
) &
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \
--use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm2_5a exp/sgmm2_5a_ali_si84 || exit 1;
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \
data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 exp/sgmm2_5a_mmi_b0.1
for iter in 1 2 3 4; do
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 \
exp/sgmm2_5a_mmi_b0.1/decode_tgpr_dev93_it$iter &
done
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
--update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 exp/sgmm2_5a_mmi_b0.1_m0.9
for iter in 1 2 3 4; do
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 \
exp/sgmm2_5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter &
done
) &
(
# The next commands are the same thing on all the si284 data.
# SGMM system on the si284 data [sgmm5b]
steps/train_ubm.sh --cmd "$train_cmd" \
600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1;
steps/train_sgmm2.sh --cmd "$train_cmd" \
11000 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
exp/ubm5b/final.ubm exp/sgmm2_5b || exit 1;
(
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm2_5b exp/sgmm2_5b/graph_tgpr
steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
exp/sgmm2_5b/graph_tgpr data/test_dev93 exp/sgmm2_5b/decode_tgpr_dev93
steps/decode_sgmm2.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
exp/sgmm2_5b/graph_tgpr data/test_eval92 exp/sgmm2_5b/decode_tgpr_eval92
utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm2_5b exp/sgmm2_5b/graph_bd_tgpr || exit 1;
steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
exp/sgmm2_5b/graph_bd_tgpr data/test_dev93 exp/sgmm2_5b/decode_bd_tgpr_dev93
steps/decode_sgmm2.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
exp/sgmm2_5b/graph_bd_tgpr data/test_eval92 exp/sgmm2_5b/decode_bd_tgpr_eval92
) &
# This shows how you would build and test a quinphone SGMM2 system.
(
steps/train_sgmm2.sh --cmd "$train_cmd" \
--context-opts "--context-width=5 --central-position=2" \
11000 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
exp/ubm5b/final.ubm exp/sgmm2_5c || exit 1;
# Decode from lattices in exp/sgmm2_5b
steps/decode_sgmm2_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
data/test_dev93 data/lang_test_tgpr exp/sgmm2_5b/decode_tgpr_dev93 exp/sgmm2_5c/decode_tgpr_dev93
steps/decode_sgmm2_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
data/test_eval92 data/lang_test_tgpr exp/sgmm2_5b/decode_tgpr_eval92 exp/sgmm2_5c/decode_tgpr_eval92
) &
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \
--use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm2_5b exp/sgmm2_5b_ali_si284
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 exp/sgmm2_5b_mmi_b0.1
for iter in 1 2 3 4; do
for test in eval92; do # dev93
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_fg data/test_${test} exp/sgmm2_5b/decode_bd_tgpr_${test} \
exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter &
done
done
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
--zero-if-disjoint true data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 exp/sgmm2_5b_mmi_b0.1_z
for iter in 1 2 3 4; do
for test in eval92 dev93; do
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_fg data/test_${test} exp/sgmm2_5b/decode_bd_tgpr_${test} \
exp/sgmm2_5b_mmi_b0.1_z/decode_bd_tgpr_${test}_it$iter &
done
done
) &
wait
# Examples of combining some of the best decodings: SGMM+MMI with
# MMI+fMMI on a conventional system.
local/score_combine.sh data/test_eval92 \
data/lang_test_bd_tgpr \
exp/tri4b_fmmi_a/decode_tgpr_eval92_it8 \
exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3 \
exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3
# %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11
# %WER 3.85 [ 217 / 5643, 35 ins, 11 del, 171 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3/wer_10
# combined to:
# %WER 3.76 [ 212 / 5643, 32 ins, 12 del, 168 sub ] exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3/wer_12
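# Note: local/score_combine.sh also accepts a --lat-weights option (colon-separated,
# one weight per input decode dir) if you want to weight the systems unequally.
# A sketch (the weights and output directory name are illustrative):
# local/score_combine.sh --lat-weights 0.4:0.6 data/test_eval92 data/lang_test_bd_tgpr \
#   exp/tri4b_fmmi_a/decode_tgpr_eval92_it8 exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3 \
#   exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3_w0.4_0.6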
# Checking MBR decode of baseline:
cp -r -T exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3{,.mbr}
local/score_mbr.sh data/test_eval92 data/lang_test_bd_tgpr exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr
# MBR decoding did not seem to help (baseline was 3.85). I think this is normal at such low WERs.
# %WER 3.86 [ 218 / 5643, 35 ins, 11 del, 172 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr/wer_10

View file

@ -0,0 +1,67 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
[ -f ./path.sh ] && . ./path.sh
# begin configuration section.
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0
min_lmwt=5
max_lmwt=20
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
data=$1
lang_or_graph=$2
dir=$3
symtab=$lang_or_graph/words.txt
for f in $symtab $dir/lat.1.gz $data/text; do
[ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
done
mkdir -p $dir/scoring/log
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \
lattice-best-path --word-symbol-table=$symtab \
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
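# Illustrative effect of the reversal above: a .tra line like 'utt1 45 12 7'
# becomes 'utt1 7 12 45', i.e. the word-ids are reversed but the utterance-id stays first.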
# Note: the sed command below needs a double level of quoting because the whole pipeline is passed through $cmd.
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1;
exit 0;

View file

@ -0,0 +1,95 @@
#!/bin/bash
# Copyright 2013 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Script for system combination using minimum Bayes risk decoding.
# This calls lattice-combine to create a union of lattices that have been
# normalized by removing the total forward cost from them. The resulting lattice
# is used as input to lattice-mbr-decode. This should not be put in steps/ or
# utils/ since the scores on the combined lattice must not be scaled.
# begin configuration section.
cmd=run.pl
min_lmwt=9
max_lmwt=20
lat_weights=
#end configuration section.
help_message="Usage: "$(basename $0)" [options] <data-dir> <graph-dir|lang-dir> <decode-dir1> <decode-dir2> [decode-dir3 ... ] <out-dir>
Options:
--cmd (run.pl|queue.pl...) # specify how to run the sub-processes.
--min-lmwt INT # minimum LM-weight for lattice rescoring
--max-lmwt INT # maximum LM-weight for lattice rescoring
--lat-weights STR # colon-separated string of lattice weights
";
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -lt 5 ]; then
printf "$help_message\n";
exit 1;
fi
data=$1
graphdir=$2
odir=${@: -1} # last argument to the script
shift 2;
decode_dirs=( $@ ) # read the remaining arguments into an array
unset decode_dirs[${#decode_dirs[@]}-1] # 'pop' the last argument which is odir
num_sys=${#decode_dirs[@]} # number of systems to combine
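# For example (illustrative), 'local/score_combine.sh data/test_eval92 data/lang_test_bd_tgpr dirA dirB out'
# gives decode_dirs=(dirA dirB), num_sys=2 and odir=out.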
symtab=$graphdir/words.txt
[ ! -f $symtab ] && echo "$0: missing word symbol table '$symtab'" && exit 1;
[ ! -f $data/text ] && echo "$0: missing reference '$data/text'" && exit 1;
mkdir -p $odir/log
for i in `seq 0 $[num_sys-1]`; do
model=${decode_dirs[$i]}/../final.mdl # model one level up from decode dir
for f in $model ${decode_dirs[$i]}/lat.1.gz ; do
[ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done
lats[$i]="\"ark:gunzip -c ${decode_dirs[$i]}/lat.*.gz |\""
done
mkdir -p $odir/scoring/log
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' \
> $odir/scoring/test_filt.txt
if [ -z "$lat_weights" ]; then
$cmd LMWT=$min_lmwt:$max_lmwt $odir/log/combine_lats.LMWT.log \
lattice-combine --inv-acoustic-scale=LMWT ${lats[@]} ark:- \| \
lattice-mbr-decode --word-symbol-table=$symtab ark:- \
ark,t:$odir/scoring/LMWT.tra || exit 1;
else
$cmd LMWT=$min_lmwt:$max_lmwt $odir/log/combine_lats.LMWT.log \
lattice-combine --inv-acoustic-scale=LMWT --lat-weights=$lat_weights \
${lats[@]} ark:- \| \
lattice-mbr-decode --word-symbol-table=$symtab ark:- \
ark,t:$odir/scoring/LMWT.tra || exit 1;
fi
$cmd LMWT=$min_lmwt:$max_lmwt $odir/scoring/log/score.LMWT.log \
cat $odir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
ark:$odir/scoring/test_filt.txt ark,p:- ">&" $odir/wer_LMWT || exit 1;
exit 0

View file

@ -0,0 +1,58 @@
#!/bin/bash
# Script for minimum bayes risk decoding.
[ -f ./path.sh ] && . ./path.sh;
# begin configuration section.
cmd=run.pl
min_lmwt=9
max_lmwt=20
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score_mbr.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
data=$1
lang_or_graph=$2
dir=$3
symtab=$lang_or_graph/words.txt
for f in $symtab $dir/lat.1.gz $data/text; do
[ ! -f $f ] && echo "score_mbr.sh: no such file $f" && exit 1;
done
mkdir -p $dir/scoring/log
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
# We submit the jobs separately, not as an array, because it's hard
# to get the inverse of the LM scales.
rm $dir/.error 2>/dev/null
for inv_acwt in `seq $min_lmwt $max_lmwt`; do
acwt=`perl -e "print (1.0/$inv_acwt);"`
$cmd $dir/scoring/rescore_mbr.${inv_acwt}.log \
lattice-mbr-decode --acoustic-scale=$acwt --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/${inv_acwt}.tra \
|| touch $dir/.error &
done
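# (Each job above decodes with acoustic scale 1/LMWT, e.g. inv_acwt=12 gives acwt ~ 0.0833.)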
wait;
[ -f $dir/.error ] && echo "score_mbr.sh: error getting MBR output.";
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
ark:$dir/scoring/test_filt.txt ark,p:- ">" $dir/wer_LMWT || exit 1;

View file

@ -0,0 +1,201 @@
#!/bin/bash
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
if [ $# -le 3 ]; then
echo "Arguments should be a list of WSJ directories, see ../run.sh for example."
exit 1;
fi
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command
# line arguments being absolute pathnames.
rm -r links/ 2>/dev/null
mkdir links/
ln -s $* links
# Do some basic checks that we have what we expected.
if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then
echo "wsj_data_prep.sh: Spot check of command line arguments failed"
echo "Command line arguments must be absolute pathnames to WSJ directories"
echo "with names like 11-13.1."
exit 1;
fi
# This version for SI-84
cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist
nl=`cat train_si84.flist | wc -l`
[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl"
# This version for SI-284
cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \
links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist
nl=`cat train_si284.flist | wc -l`
[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl"
# Now for the test sets.
# links/13-34.1/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.
# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92.flist
# Nov'92 (330 utts, 5k vocab)
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92_5k.flist
# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93.flist
# Nov'93: (213 utts, 5k)
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93_5k.flist
# Dev-set for Nov'93 (503 utts)
cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93.flist
# Dev-set for Nov'93 (513 utts, 5k vocab)
cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93_5k.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist
# Finding the transcript files:
for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
done
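# Each resulting ${x}_wav.scp line pipes the sphere file through sph2pipe; an illustrative
# line (utterance-id and paths are hypothetical) looks like:
# 011c0201 /path/to/kaldi/tools/sph2pipe_v2.5/sph2pipe -f wav /path/to/wsj0/si_tr_s/011/011c0201.wv1 |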
# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
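# The speaker-id is taken as the first 3 characters of the utterance-id, so utt2spk lines
# look like '011c0201 011' (illustrative); spk2utt is the inverse mapping.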
#in case we want to limit lm's on most frequent words, copy lm training word frequency list
cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz
# trigram would be:
cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \
gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;
prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;
# repeat for 5k language models
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz
# trigram would be: !only closed vocabulary here!
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa
prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
rm -f wsj0-train-spkrinfo.txt # -f: the file may not exist yet
! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \
echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt
fi
if [ ! -f wsj0-train-spkrinfo.txt ]; then
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
echo "This is possibly omitted from the training disks; couldn't find it."
echo "Everything else may have worked; we just may be missing gender info"
echo "which is only needed for VTLN-related diagnostics anyway."
exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.
cat links/11-13.1/wsj0/doc/spkrinfo.txt \
links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/train/spkrinfo.txt \
./wsj0-train-spkrinfo.txt | \
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
echo "Data preparation succeeded"

Просмотреть файл

@ -0,0 +1,173 @@
#!/bin/bash
# This script builds a larger word-list and dictionary
# than used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to fill-in words in
# the LM training data but not in CMUdict. One is
# to generate special prons for possible acronyms, that
# just consist of the constituent letters. The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/
if [ $# -ne 1 ]; then
echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/"
exit 1
fi
if [ "`basename $1`" != 13-32.1 ]; then
echo "Expecting the argument to this script to end in 13-32.1"
exit 1
fi
# e.g.
#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
export PATH=$PATH:`pwd`/local/dict/
srcdir=$1
mkdir -p data/local/dict_larger
dir=data/local/dict_larger
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
# are there; we just want to copy them as the phoneset is the same.
rm data/local/dict_larger/lexicon.txt # we don't want this.
rm data/local/dict_larger/lexiconp.txt # we don't want this either.
mincount=2 # Minimum count of an OOV we will try to generate a pron for.
[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
# Remove comments from cmudict; print first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
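# Illustrative effect of the cleanup above: an alternate-pronunciation entry of the form
#   SOMEWORD(2)  <phones...>
# is rewritten as
#   SOMEWORD  <phones...>
# i.e. the (N) marker is dropped but the alternate pronunciation itself is kept in dict.cmu.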
echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n". Note: we found this by looking at
# oov.counts below (before adding this rule).
touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
| awk '/^</{next}{print toupper($0)}' | perl -e '
open(F, "<$ARGV[0]")||die;
while(<F>){ chop; $isword{$_} = 1; }
while(<STDIN>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
# and have no other "." in them: treat as period.
print "$a";
if ($n+1 < @A) { print "\n"; }
} else { print "$a "; }
}
print "\n";
}
' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
fi
# get unigram counts
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
> $dir/oov.counts
echo "Most frequent unseen unigrams are: "
head $dir/oov.counts
# Prune away singleton counts, and remove things with numbers in
# (which should have been normalized) and with no letters at all.
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
| awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
# Automatic rule-finding...
# First make some prons for possible acronyms.
# Note: we don't do this for things like U.K or U.N,
# or A.B. (which doesn't exist anyway),
# as we consider this normalization/spelling errors.
cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms
mkdir $dir/f $dir/b # forward, backward directions of rules...
# forward is normal suffix
# rules, backward is reversed (prefix rules). These
# dirs contain stuff we create while making the rule-based
# extensions to the dictionary.
# Remove ; and , from words, if they are present; these
# might crash our scripts, as they are used as separators there.
filter_dict.pl $dir/dict.cmu > $dir/f/dict
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
reverse_dict.pl $dir/f/dict > $dir/b/dict
reverse_dict.pl $dir/f/oovs > $dir/b/oovs
# The next stage takes a few minutes.
# Note: the forward stage takes longer, as English is
# mostly a suffix-based language, and there are more rules
# that it finds.
for d in $dir/f $dir/b; do
(
cd $d
cat dict | get_rules.pl 2>get_rules.log >rules
get_rule_hierarchy.pl rules >hierarchy
awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
limit_candidate_prons.pl hierarchy | \
score_prons.pl dict | \
count_rules.pl >rule.counts
# the sort command below is just for convenience of reading.
score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
get_candidate_prons.pl rules.with_scores dict oovs | \
limit_candidate_prons.pl hierarchy > oovs.candidates
) &
done
wait
# Merge the candidates.
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
> $dir/dict.oovs
cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
# add_counts.pl attaches the original counts to the lists of handled/not-handled OOVs
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
echo "**Top OOVs we handled are:**";
head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
head $dir/oovlist.not_handled.counts
echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
`awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.
cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF
echo "Created $dir/lexicon.txt"

View file

@ -0,0 +1,86 @@
#!/bin/bash
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si284, data/train_si84, etc.
# Don't bother doing train_si84 separately (although we have the file lists
# in data/local/) because it's just the first 7138 utterances in train_si284.
# We'll create train_si84 after doing the feature extraction.
. ./path.sh || exit 1;
echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
done
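# Each data/$x directory now contains wav.scp, text, spk2utt, utt2spk and spk2gender
# (created by the copies above).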
# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.
echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at beginning/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done
echo "Succeeded in formatting data."
rm -r $tmpdir
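# Optional quick check (sketch): print basic statistics of one of the grammar FSTs, e.g.
# fstinfo data/lang_test_tgpr/G.fst | head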

View file

@ -0,0 +1,52 @@
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
. ./path.sh
[ ! -d data/lang_bd ] && echo "Expect data/lang_bd to exist" && exit 1;
lm_srcdir_3g=data/local/local_lm/3gram-mincount
lm_srcdir_4g=data/local/local_lm/4gram-mincount
[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1;
[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1;
for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do
rm -r $d 2>/dev/null
cp -r data/lang_bd $d
done
lang=data/lang_bd
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst
exit 0;

View file

@ -0,0 +1,83 @@
#!/bin/bash
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Call this script from one level above, e.g. from the s3/ directory. It puts
# its output in data/local/.
# The parts of the output of this that will be needed are
# [in data/local/dict/ ]
# lexicon.txt
# extra_questions.txt
# nonsilence_phones.txt
# optional_silence.txt
# silence_phones.txt
# run this from ../
dir=data/local/dict
mkdir -p $dir
# (1) Get the CMU dictionary
svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \
$dir/cmudict || exit 1;
# can add -r 10966 for strict compatibility.
#(2) Dictionary preparation:
# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
# silence phones, one per line.
(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt
echo SIL > $dir/optional_silence.txt
# nonsilence phones; on each line is a list of phones that correspond
# really to the same base phone.
cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
perl -e 'while(<>){
chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
$phones_of{$1} .= "$_ "; }
foreach $list (values %phones_of) {print $list . "\n"; } ' \
> $dir/nonsilence_phones.txt || exit 1;
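# Each line of nonsilence_phones.txt groups the stress-marked variants of one base phone,
# e.g. a line like 'AA AA0 AA1 AA2' (illustrative).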
# A few extra questions that will be added to those obtained by automatically clustering
# the "real" phones. These ask about stress; there's also one for silence.
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
>> $dir/extra_questions.txt || exit 1;
grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
> $dir/lexicon1_raw_nosil.txt || exit 1;
# Add to cmudict the silences, noises etc.
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
# lexicon.txt is without the _B, _E, _S, _I markers.
# This is the input to wsj_format_data.sh
cp $dir/lexicon2_raw.txt $dir/lexicon.txt
echo "Dictionary preparation succeeded"

Просмотреть файл

@ -0,0 +1,202 @@
#!/bin/bash
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes no command-line arguments
dir=data/local/local_lm
srcdir=data/local/dict_larger
mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH
( # First make sure the kaldi_lm toolkit is installed.
cd $KALDI_ROOT/tools || exit 1;
if [ -d kaldi_lm ]; then
echo Not installing the kaldi_lm toolkit since it is already there.
else
echo Downloading and installing the kaldi_lm tools
if [ ! -f kaldi_lm.tar.gz ]; then
wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;
cd kaldi_lm
make || exit 1;
echo Done making the kaldi_lm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
# Get a wordlist-- keep everything but silence, which should not appear in
# the LM.
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/train_nounk.gz
# Get unigram counts (without bos/eos, but this doesn't matter here, it's
# only to get the word-map, which treats them specially & doesn't need their
# counts).
# Add a 1-count for each word in word-list by including that in the data,
# so all words appear.
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
# Get "mapped" words-- a character encoding of the words that makes the common words very short.
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map
gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
{ for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz
# To save disk space, remove the un-mapped training data. We could
# easily generate it again if needed.
rm $dir/train_nounk.gz
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
exit 0
### Below here, this script is showing various commands that
## were run during LM tuning.
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
# 2.5 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 3.0 $dir/4gram-mincount
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
# 2.6 million N-grams.
prune_lm.sh --arpa 4.0 $dir/4gram-mincount
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
# 2.15 million N-grams.
prune_lm.sh --arpa 5.0 $dir/4gram-mincount
# 1.86 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
train_lm.sh --arpa --lmtype 3gram $dir
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
# 20.0 million N-grams
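# Summary of the tuning runs above (approximate N-gram count / heldout perplexity):
#   3gram-mincount:  unpruned 7.8M/141.4   prune 3.0: 2.5M/156.4   prune 6.0: 1.45M/165.4
#   4gram-mincount:  unpruned 10.3M/126.7  prune 3.0: 2.6M/143.2   prune 4.0: 2.15M/146.9
#                    prune 5.0: 1.86M/150.2  prune 7.0: 1.50M/155.7
#   3gram (no mincount): unpruned 20.0M/135.7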
! which ngram-count \
&& echo "SRILM tools not installed so not doing the comparison" && exit 1;
#################
# You could finish the script here if you wanted.
# Below is to show how to do baselines with SRILM.
# You'd have to install the SRILM toolkit first.
heldout_sent=10000 # Don't change this if you want result to be comparable with
# kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s
# 3-gram:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
# Trying 4-gram:
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
#3-gram with pruning:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
# Around 2.25M N-grams.
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
# above, which gave 2.5 million N-grams and a perplexity of 156.
# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
# the kaldi_lm experiments above without "-mincount").
## From here is how to train with
# IRSTLM. This is not really working at the moment.
export IRSTLM=$KALDI_ROOT/tools/irstlm/
idir=$dir/irstlm
mkdir $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
gzip -c > $idir/train.gz
$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
{print $0;}}' > vocab.irstlm.20k
$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
-n 3 -s improved-kneser-ney -b yes
# Testing perplexity with SRILM tools:
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
# Perplexity is very bad (175 here, where we would expect ~141 since we used the -p option),
# but adding -debug 3 to the command line shows that
# the IRSTLM LM does not seem to sum to one properly, so it seems that
# it produces an LM that isn't interpretable in the normal way as an ARPA
# LM.

View file

@ -0,0 +1,153 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes no command-line arguments but takes the --cmd option.
# Begin configuration section.
rand_seed=0
cmd=run.pl
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
hidden=30
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
# End configuration section.
[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh
if [ $# != 1 ]; then
echo "Usage: local/wsj_train_rnnlms.sh [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi
dir=$1
srcdir=data/local/dict_larger
mkdir -p $dir
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH
( # First make sure the rnnlm toolkit is installed.
# Note: this didn't work out of the box for me; I had to
# change the g++ version to just "g++" (no cross-compilation
# was needed for me, as I ran on a machine that had been set up
# as 64-bit by default).
cd $KALDI_ROOT/tools || exit 1;
if [ -d $rnnlm_ver ]; then
echo Not installing the rnnlm toolkit since it is already there.
else
echo Downloading and installing the rnnlm tools
# http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz
if [ ! -f $rnnlm_ver.tgz ]; then
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1;
fi
mkdir $rnnlm_ver
cd $rnnlm_ver
tar -xvzf ../$rnnlm_ver.tgz || exit 1;
make CC=g++ || exit 1;
echo Done making the rnnlm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/all.gz
echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
> $dir/train.in # training data
# The rest will consist of a word-class represented by <RNN_UNK>, that
# maps (with probabilities) to a whole class of words.
# Get unigram counts from our training data, and use this to select word-list
# for RNNLM training; e.g. 10k most frequent words. Rest will go in a class
# that we (manually, at the shell level) assign probabilities for words that
# are in that class. Note: this word-list doesn't need to include </s>; this
# automatically gets added inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn
tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts
tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs
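# unk.probs then holds one '<word> <relative-frequency>' pair per line for the words that
# fall into the <RNN_UNK> class (each count divided by the class total, as computed above).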
for type in train valid; do
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
> $dir/$type
done
rm $dir/train.in # no longer needed-- and big.
# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train
# OK we'll train the RNNLM on this data.
# todo: change 100 to 320.
# using 100 classes as square root of 10k.
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &
$cmd $dir/rnnlm.log \
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \
-direct-order 4 -direct $direct -binary || exit 1;
# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids
utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \
$dir/valid.scores
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes the utterance-ids, which add
# one extra token per sentence to account for the </s> at the end of each sentence; this is
# the correct number to normalize by.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
echo Perplexity is $p | tee $dir/perplexity.log
rm $dir/train $dir/all.gz
# This is a better setup, but takes a long time to train:
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 2000 -binary

3
egs/chime_wsj0/s5/path.sh Executable file

@ -0,0 +1,3 @@
export KALDI_ROOT=`pwd`/../../..
export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet-cpubin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH
export LC_ALL=C

261
egs/chime_wsj0/s5/run.sh Executable file

@ -0,0 +1,261 @@
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.
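# (The case/esac construct below is a crude "goto": anything placed between the "1)" pattern
#  and the ";; #here:" line is never executed, since the pattern 1 does not match 0, so you
#  can move the ";; #here:" marker down past stages you have already run.)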
case 0 in #goto here
1)
;; #here:
esac
#exit 1;
#need wsj0 for the clean version and LMs
wsj0=/mnt/spdb/wall_street_journal
local/clean_wsj0_data_prep.sh $wsj0
reverb=/mnt/spdb/CHiME/chime2-wsj0/reverberated
local/reverb_wsj0_data_prep.sh $reverb
noisy=/mnt/spdb/CHiME/chime2-wsj0/isolated
local/noisy_wsj0_data_prep.sh $noisy
local/wsj_prepare_dict.sh || exit 1;
utils/prepare_lang.sh data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp data/lang || exit 1;
local/chime_format_data.sh || exit 1;
# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
mfccdir=mfcc
for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do
steps/make_mfcc.sh --nj 10 \
data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done
# Note: the --boost-silence option should probably be omitted by default
# for normal setups. It doesn't always help. [it's to discourage non-silence
# models from modeling silence.]
mfccdir=mfcc
for x in test_eval92_5k_noisy dev_dt_05_noisy train_si84_noisy; do
steps/make_mfcc.sh --nj 10 \
data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done
mfccdir=mfcc
for x in dev_dt_05_reverb train_si84_reverb; do
steps/make_mfcc.sh --nj 10 \
data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done
# Begin training GMM systems on multi-condition data:
# train_si84 = clean + reverb + noisy.
for s in train_si84 ; do
mkdir -p data/$s
cp data/${s}_clean/spk2gender data/$s/
for x in text wav.scp; do
cat data/${s}_clean/$x data/${s}_reverb/$x data/${s}_noisy/$x | sort -k1 > data/$s/$x
done
cat data/$s/wav.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > data/$s/utt2spk
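# (The first three characters of each utterance-id are taken as the speaker-id; e.g. a
#  hypothetical utterance "011c0201_xyz" would be assigned to speaker "011".)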
cat data/$s/utt2spk | utils/utt2spk_to_spk2utt.pl > data/$s/spk2utt
done
mfccdir=mfcc
for x in train_si84; do
steps/make_mfcc.sh --nj 10 \
data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done
steps/train_mono.sh --boost-silence 1.25 --nj 10 \
data/train_si84 data/lang exp/mono0a || exit 1;
utils/mkgraph.sh --mono data/lang_test_tgpr_5k exp/mono0a exp/mono0a/graph_tgpr_5k
#steps/decode.sh --nj 8 \
# exp/mono0a/graph_tgpr_5k data/test_eval92_5k_clean exp/mono0a/decode_tgpr_eval92_5k_clean
steps/decode.sh --nj 8 \
exp/mono0a/graph_tgpr_5k data/test_eval92_5k_noisy exp/mono0a/decode_tgpr_eval92_5k_noisy
steps/align_si.sh --boost-silence 1.25 --nj 10 \
data/train_si84 data/lang exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --boost-silence 1.25 \
2000 10000 data/train_si84 data/lang exp/mono0a_ali exp/tri1 || exit 1;
while [ ! -f data/lang_test_tgpr/tmp/LG.fst ] || \
[ ! -s data/lang_test_tgpr/tmp/LG.fst ]; do
sleep 20;
done
sleep 30;
# or the mono mkgraph.sh might be writing
# data/lang_test_tgpr/tmp/LG.fst which will cause this to fail.
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri1 exp/tri1/graph_tgpr_5k || exit 1;
#steps/decode.sh --nj 8 \
# exp/tri1/graph_tgpr data/test_eval92_5k_clean exp/tri1/decode_tgpr_eval92_5k_clean || exit 1;
steps/decode.sh --nj 8 \
exp/tri1/graph_tgpr_5k data/test_eval92_5k_noisy exp/tri1/decode_tgpr_eval92_5k_noisy || exit 1;
# test various modes of LM rescoring (4 is the default one).
# This is just confirming they're equivalent.
#for mode in 1 2 3 4; do
#steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
# data/test_dev93 exp/tri1/decode_tgpr_dev93 exp/tri1/decode_tgpr_dev93_tg$mode || exit 1;
#done
# demonstrate how to get lattices that are "word-aligned" (arcs coincide with
# words, with boundaries in the right place).
#sil_label=`grep '!SIL' data/lang_test_tgpr/words.txt | awk '{print $2}'`
#steps/word_align_lattices.sh --cmd "$train_cmd" --silence-label $sil_label \
# data/lang_test_tgpr exp/tri1/decode_tgpr_dev93 exp/tri1/decode_tgpr_dev93_aligned || exit 1;
steps/align_si.sh --nj 10 \
data/train_si84 data/lang exp/tri1 exp/tri1_ali_si84 || exit 1;
# Train tri2a, which is deltas + delta-deltas, on si84 data.
steps/train_deltas.sh \
2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2a || exit 1;
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri2a exp/tri2a/graph_tgpr_5k || exit 1;
#steps/decode.sh --nj 8 \
# exp/tri2a/graph_tgpr_5k data/test_eval92_5k_clean exp/tri2a/decode_tgpr_eval92_5k_clean || exit 1;
steps/decode.sh --nj 8 \
exp/tri2a/graph_tgpr_5k data/test_eval92_5k_noisy exp/tri2a/decode_tgpr_eval92_5k_noisy|| exit 1;
#utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k
#steps/decode.sh --nj 8 \
# exp/tri2a/graph_bg5k data/test_eval92_5k_clean exp/tri2a/decode_bg_eval92_5k_clean || exit 1;
steps/train_lda_mllt.sh \
--splice-opts "--left-context=3 --right-context=3" \
2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b || exit 1;
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri2b exp/tri2b/graph_tgpr_5k || exit 1;
steps/decode.sh --nj 8 \
exp/tri2b/graph_tgpr_5k data/test_eval92_5k_noisy exp/tri2b/decode_tgpr_eval92_5k_noisy || exit 1;
#steps/decode.sh --nj 8 \
# exp/tri2b/graph_tgpr data/test_eval92_clean exp/tri2b/decode_tgpr_eval92_clean || exit 1;
# Align tri2b system with si84 data.
steps/align_si.sh --nj 10 \
--use-graphs true data/train_si84 data/lang exp/tri2b exp/tri2b_ali_si84 || exit 1;
# From 2b system, train 3b which is LDA + MLLT + SAT.
steps/train_sat.sh \
2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b || exit 1;
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b exp/tri3b/graph_tgpr_5k || exit 1;
steps/decode_fmllr.sh --nj 8 \
exp/tri3b/graph_tgpr_5k data/test_eval92_5k_noisy exp/tri3b/decode_tgpr_eval92_5k_noisy || exit 1;
# From 3b multi-condition system, align noisy si84 data.
steps/align_fmllr.sh --nj 10 \
data/train_si84_noisy data/lang exp/tri3b exp/tri3b_ali_si84_noisy || exit 1;
steps/align_fmllr.sh --nj 10 \
data/dev_dt_05_noisy data/lang exp/tri3b exp/tri3b_ali_dev_dt_05 || exit 1;
#begin training DNN-HMM system
#only on noisy si84
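# Note: the data-fbank/* directories used below are assumed to already contain filterbank
# features (e.g. made with steps/make_fbank.sh using conf/fbank.conf, plus CMVN stats);
# make sure they have been created before this point.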
. ./path.sh
#RBM pretraining
dir=exp/tri4a_dnn_pretrain
$cuda_cmd $dir/_pretrain_dbn.log \
steps/pretrain_dbn.sh --use-gpu-id 0 --nn-depth 7 --rbm-iter 3 data-fbank/train_si84_noisy $dir
#BP: backpropagation fine-tuning of the DBN-initialized network
dir=exp/tri4a_dnn
ali=exp/tri3b_ali_si84_noisy
ali_dev=exp/tri3b_ali_dev_dt_05
feature_transform=exp/tri4a_dnn_pretrain/final.feature_transform
dbn=exp/tri4a_dnn_pretrain/7.dbn
$cuda_cmd $dir/_train_nnet.log \
steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \
data-fbank/train_si84_noisy data-fbank/dev_dt_05_noisy data/lang $ali $ali_dev $dir || exit 1;
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri4a_dnn exp/tri4a_dnn/graph_tgpr_5k || exit 1;
steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
exp/tri4a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1;
#Retrain the system using the new alignments;
#this is essential.
#Repeat this process 3 times (producing tri5a, tri6a and tri7a below).
srcdir=exp/tri4a_dnn
steps/align_nnet.sh --nj 10 \
data-fbank/train_si84_noisy data/lang $srcdir ${srcdir}_ali_si84_noisy || exit 1;
steps/align_nnet.sh --nj 10 \
data-fbank/dev_dt_05_noisy data/lang $srcdir ${srcdir}_ali_dt_05_noisy || exit 1;
#no need to do pretraining again
dir=exp/tri5a_dnn
ali=exp/tri4a_dnn_ali_si84_noisy
ali_dev=exp/tri4a_dnn_ali_dt_05_noisy
feature_transform=exp/tri4a_dnn_pretrain/final.feature_transform
dbn=exp/tri4a_dnn_pretrain/7.dbn
$cuda_cmd $dir/_train_nnet.log \
steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \
data-fbank/train_si84_noisy data-fbank/dev_dt_05_noisy data/lang $ali $ali_dev $dir || exit 1;
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri5a_dnn exp/tri5a_dnn/graph_tgpr_5k || exit 1;
steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
exp/tri5a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1;
srcdir=exp/tri5a_dnn
steps/align_nnet.sh --nj 10 \
data-fbank/train_si84_noisy data/lang $srcdir ${srcdir}_ali_si84_noisy || exit 1;
steps/align_nnet.sh --nj 10 \
data-fbank/dev_dt_05_noisy data/lang $srcdir ${srcdir}_ali_dt_05_noisy || exit 1;
. ./path.sh
dir=exp/tri6a_dnn
ali=exp/tri5a_dnn_ali_si84_noisy
ali_dev=exp/tri5a_dnn_ali_dt_05_noisy
feature_transform=exp/tri4a_dnn_pretrain/final.feature_transform
dbn=exp/tri4a_dnn_pretrain/7.dbn
$cuda_cmd $dir/_train_nnet.log \
steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \
data-fbank/train_si84_noisy data-fbank/dev_dt_05_noisy data/lang $ali $ali_dev $dir || exit 1;
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri6a_dnn exp/tri6a_dnn/graph_tgpr_5k || exit 1;
steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
exp/tri6a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1;
srcdir=exp/tri6a_dnn
steps/align_nnet.sh --nj 10 \
data-fbank/train_si84_noisy data/lang $srcdir ${srcdir}_ali_si84_noisy || exit 1;
steps/align_nnet.sh --nj 10 \
data-fbank/dev_dt_05_noisy data/lang $srcdir ${srcdir}_ali_dt_05_noisy || exit 1;
. ./path.sh
dir=exp/tri7a_dnn
ali=exp/tri6a_dnn_ali_si84_noisy
ali_dev=exp/tri6a_dnn_ali_dt_05_noisy
feature_transform=exp/tri4a_dnn_pretrain/final.feature_transform
dbn=exp/tri4a_dnn_pretrain/7.dbn
$cuda_cmd $dir/_train_nnet.log \
steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \
data-fbank/train_si84_noisy data-fbank/dev_dt_05_noisy data/lang $ali $ali_dev $dir || exit 1;
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri7a_dnn exp/tri7a_dnn/graph_tgpr_5k || exit 1;
steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
exp/tri7a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1;


@ -0,0 +1,150 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Copyright 2013 GoVivace Inc (Author: Nagendra Goel)
# Apache 2.0
# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta)
# + fMLLR (probably with SAT models).
# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl
# is not present), then does 2 iterations of fMLLR estimation.
# If you supply the --use-graphs option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match the source directory.
# Begin configuration section.
stage=0
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.5 # factor by which to boost silence during alignment.
fmllr_update_type=full
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: steps/align_fmllr.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --fmllr-update-type (full|diag|offset|none) # default full."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
graphdir=$dir
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj
mkdir -p $dir/log
echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
## Set up model and alignment model.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
alimdl=$srcdir/final.alimdl
else
alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/boost_phones.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"
## Work out where we're getting the graphs from.
if $use_graphs; then
[ "$nj" != "`cat $srcdir/num_jobs`" ] && \
echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
[ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
graphdir=$srcdir
else
graphdir=$dir
if [ $stage -le 0 ]; then
echo "$0: compiling training graphs"
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
"ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi
fi
if [ $stage -le 1 ]; then
echo "$0: aligning data in $data using $alimdl and speaker-independent features."
$cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi
if [ $stage -le 2 ]; then
echo "$0: computing fMLLR transforms"
if [ "$alimdl" != "$mdl" ]; then
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
gmm-est-basis-fmllr-gpost --fmllr-min-count=22 --num-iters=10 \
--size-scale=0.2 --step-size-iters=3 \
--write-weights=ark:$dir/pre_wgt.JOB \
$mdl $srcdir/fmllr.basis "$sifeats" ark,s,cs:- \
ark:$dir/trans.JOB || exit 1;
# else
# $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
# ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
# weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
# gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
# --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
# ark,s,cs:- ark:$dir/trans.JOB || exit 1;
fi
fi
feats="$sifeats transform-feats ark:$dir/pre_trans.JOB ark:- ark:- |"
if [ $stage -le 3 ]; then
echo "$0: doing final alignment."
$cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
#rm $dir/pre_ali.*.gz
echo "$0: done aligning data."
utils/summarize_warnings.pl $dir/log
exit 0;


@ -0,0 +1,148 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta)
# + fMLLR (probably with SAT models).
# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl
# is not present), then does 2 iterations of fMLLR estimation.
# If you supply the --use-graphs option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match the source directory.
# Begin configuration section.
stage=0
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.0 # factor by which to boost silence during alignment.
fmllr_update_type=full
norm_vars=false
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: steps/align_fmllr.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --fmllr-update-type (full|diag|offset|none) # default full."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj
mkdir -p $dir/log
echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $srcdir/full.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
## Set up model and alignment model.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
alimdl=$srcdir/final.alimdl
else
alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"
## Work out where we're getting the graphs from.
if $use_graphs; then
[ "$nj" != "`cat $srcdir/num_jobs`" ] && \
echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
[ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
graphdir=$srcdir
else
graphdir=$dir
if [ $stage -le 0 ]; then
echo "$0: compiling training graphs"
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
"ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi
fi
if [ $stage -le 1 ]; then
echo "$0: aligning data in $data using $alimdl and speaker-independent features."
$cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi
if [ $stage -le 2 ]; then
echo "$0: computing fMLLR transforms"
if [ "$alimdl" != "$mdl" ]; then
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
--spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
ark,s,cs:- ark:$dir/trans.JOB || exit 1;
else
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
--spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
ark,s,cs:- ark:$dir/trans.JOB || exit 1;
fi
fi
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
if [ $stage -le 3 ]; then
echo "$0: doing final alignment."
$cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
rm $dir/pre_ali.*.gz
echo "$0: done aligning data."
utils/summarize_warnings.pl $dir/log
exit 0;


@ -0,0 +1,99 @@
#!/bin/bash
# Copyright 2012 Brno University of Technology (Author: Karel Vesely)
# Apache 2.0
# Computes training alignments using MLP model
# If you supply the "--use-graphs true" option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match with the source directory.
# Begin configuration section.
nj=4
cmd=run.pl
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
use_gpu_id=-1 # disable gpu
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
#Get the files we will need
nnet=$srcdir/final.nnet;
[ -z "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1;
class_frame_counts=$srcdir/ali_train_pdf.counts
[ -z "$class_frame_counts" ] && echo "Error class_frame_counts '$class_frame_counts' does not exist!" && exit 1;
feature_transform=$srcdir/final.feature_transform
if [ ! -f $feature_transform ]; then
echo "Missing feature_transform '$feature_transform'"
exit 1
fi
model=$dir/final.mdl
[ -z "$model" ] && echo "Error transition model '$model' does not exist!" && exit 1;
###
### Prepare feature pipeline (same as for decoding)
###
# Create the feature stream:
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# Optionally add cmvn
if [ -f $srcdir/norm_vars ]; then
norm_vars=$(cat $srcdir/norm_vars 2>/dev/null)
[ ! -f $sdata/1/cmvn.scp ] && echo "$0: cannot find cmvn stats $sdata/1/cmvn.scp" && exit 1
feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
fi
# Optionally add deltas
if [ -f $srcdir/delta_order ]; then
delta_order=$(cat $srcdir/delta_order)
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
# Finally add feature_transform and the MLP
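# (Roughly speaking, nnet-forward with --no-softmax=true and --class-frame-counts turns the
#  network outputs into pseudo log-likelihoods by dividing out the class priors estimated
#  from ali_train_pdf.counts, so they can be used for alignment like GMM likelihoods.)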
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |"
###
###
###
echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
# We could just use gmm-align-mapped in the next line, but it's less efficient as it compiles the
# training graphs one by one.
$cmd JOB=1:$nj $dir/log/align.JOB.log \
compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \
align-compiled-mapped $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/final.mdl ark:- \
"$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" || exit 1;
echo "$0: done aligning data."


@ -0,0 +1,142 @@
#!/bin/bash
# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta)
# + fMLLR (probably with SAT models).
# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl
# is not present), then does 2 iterations of fMLLR estimation.
# If you supply the --use-graphs option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match the source directory.
# Begin configuration section.
stage=0
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.0 # factor by which to boost silence during alignment.
norm_vars=false
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: steps/align_fmllr.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj
mkdir -p $dir/log
echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
if [[ ! -f $srcdir/final.mat || ! -f $srcdir/full.mat ]]; then
echo "$0: we require final.mat and full.mat in the source directory $srcdir"
fi
full_lda_mat="get-full-lda-mat --print-args=false $srcdir/final.mat $srcdir/full.mat -|"
cp $srcdir/full.mat $srcdir/final.mat $dir
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
sifeats="$splicedfeats transform-feats $srcdir/final.mat ark:- ark:- |"
## Set up model and alignment model.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
alimdl=$srcdir/final.alimdl
else
alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"
## Work out where we're getting the graphs from.
if $use_graphs; then
[ "$nj" != "`cat $srcdir/num_jobs`" ] && \
echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
[ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
graphdir=$srcdir
else
graphdir=$dir
if [ $stage -le 0 ]; then
echo "$0: compiling training graphs"
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
"ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi
fi
if [ $stage -le 1 ]; then
echo "$0: aligning data in $data using $alimdl and speaker-independent features."
$cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi
if [ $stage -le 2 ]; then
echo "$0: computing fMLLR transforms"
if [ "$alimdl" != "$mdl" ]; then
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
gmm-est-fmllr-raw-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
$mdl "$full_lda_mat" "$splicedfeats" ark,s,cs:- ark:$dir/raw_trans.JOB || exit 1;
else
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
gmm-est-fmllr-raw --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$full_lda_mat" \
"$splicedfeats" ark,s,cs:- ark:$dir/raw_trans.JOB || exit 1;
fi
fi
feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
if [ $stage -le 3 ]; then
echo "$0: doing final alignment."
$cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
rm $dir/pre_ali.*.gz
echo "$0: done aligning data."
utils/summarize_warnings.pl $dir/log
exit 0;


@ -0,0 +1,193 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Computes training alignments and (if needed) speaker-vectors, given an
# SGMM system. If the system is built on top of SAT, you should supply
# transforms with the --transform-dir option.
# If you supply the --use-graphs option, it will use the training
# graphs from the source directory.
# Begin configuration section.
stage=0
nj=4
cmd=run.pl
use_graphs=false # use graphs from srcdir
use_gselect=false # use gselect info from srcdir [regardless, we use
# Gaussian-selection info, we might have to compute it though.]
gselect=15 # Number of Gaussian-selection indices for SGMMs.
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
transform_dir= # directory to find fMLLR transforms in.
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: steps/align_sgmm.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\"
echo " exp/sgmm4a exp/sgmm5a_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --transform-dir <transform-dir> # directory to find fMLLR transforms"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
sdata=$data/split$nj
mkdir -p $dir/log
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir
cp $srcdir/final.occs $dir;
## Set up features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
echo " but you are not providing the --transform-dir option during alignment."
fi
##
## Set up model and alignment model.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
alimdl=$srcdir/final.alimdl
else
alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
## Work out where we're getting the graphs from.
if $use_graphs; then
[ "$nj" != "`cat $srcdir/num_jobs`" ] && \
echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
[ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
graphdir=$srcdir
ln.pl $srcdir/fsts.*.gz $dir
else
graphdir=$dir
if [ $stage -le 0 ]; then
echo "$0: compiling training graphs"
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
"ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi
fi
## Work out where we're getting the Gaussian-selection info from
if $use_gselect; then
[ "$nj" != "`cat $srcdir/num_jobs`" ] && \
echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1;
[ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1;
graphdir=$srcdir
gselect_opt="--gselect=ark,s,cs:gunzip -c $srcdir/gselect.JOB.gz|"
ln.pl $srcdir/gselect.*.gz $dir
else
graphdir=$dir
if [ $stage -le 1 ]; then
echo "$0: computing Gaussian-selection info"
# Note: doesn't matter whether we use $alimdl or $mdl, they will
# have the same gselect info.
$cmd JOB=1:$nj $dir/log/gselect.JOB.log \
sgmm-gselect --full-gmm-nbest=$gselect $alimdl \
"$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
fi
if [ $alimdl == $mdl ]; then
# Speaker-independent alignment -- just one pass. Not normal.
T=`sgmm-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1;
[ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1;
if [ $stage -le 2 ]; then
echo "$0: aligning data in $data using model $mdl (no speaker-vectors)"
$cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
sgmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
echo "$0: done aligning data."
exit 0;
fi
# Continue with system with speaker vectors.
if [ $stage -le 2 ]; then
echo "$0: aligning data in $data using model $alimdl"
$cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi
if [ $stage -le 3 ]; then
echo "$0: computing speaker vectors (1st pass)"
$cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
sgmm-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \
sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
$mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1;
fi
if [ $stage -le 4 ]; then
echo "$0: computing speaker vectors (2nd pass)"
$cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
--spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1;
rm $dir/pre_vecs.*
fi
if [ $stage -le 5 ]; then
echo "$0: doing final alignment."
$cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \
--utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
$mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
rm $dir/pre_ali.*.gz
echo "$0: done aligning data."
utils/summarize_warnings.pl $dir/log
exit 0;


@ -0,0 +1,193 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Computes training alignments and (if needed) speaker-vectors, given an
# SGMM system. If the system is built on top of SAT, you should supply
# transforms with the --transform-dir option.
# If you supply the --use-graphs option, it will use the training
# graphs from the source directory.
# Begin configuration section.
stage=0
nj=4
cmd=run.pl
use_graphs=false # use graphs from srcdir
use_gselect=false # use gselect info from srcdir [regardless, we use
# Gaussian-selection info, we might have to compute it though.]
gselect=15 # Number of Gaussian-selection indices for SGMMs.
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
transform_dir= # directory to find fMLLR transforms in.
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: steps/align_sgmm.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\"
echo " exp/sgmm4a exp/sgmm5a_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --transform-dir <transform-dir> # directory to find fMLLR transforms"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
sdata=$data/split$nj
mkdir -p $dir/log
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir
cp $srcdir/final.occs $dir;
## Set up features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
echo " but you are not providing the --transform-dir option during alignment."
fi
##
## Set up model and alignment model.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
alimdl=$srcdir/final.alimdl
else
alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
## Work out where we're getting the graphs from.
if $use_graphs; then
[ "$nj" != "`cat $srcdir/num_jobs`" ] && \
echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
[ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
graphdir=$srcdir
ln.pl $srcdir/fsts.*.gz $dir
else
graphdir=$dir
if [ $stage -le 0 ]; then
echo "$0: compiling training graphs"
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
"ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi
fi
## Work out where we're getting the Gaussian-selection info from
if $use_gselect; then
[ "$nj" != "`cat $srcdir/num_jobs`" ] && \
echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1;
[ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1;
graphdir=$srcdir
gselect_opt="--gselect=ark,s,cs:gunzip -c $srcdir/gselect.JOB.gz|"
ln.pl $srcdir/gselect.*.gz $dir
else
graphdir=$dir
if [ $stage -le 1 ]; then
echo "$0: computing Gaussian-selection info"
# Note: doesn't matter whether we use $alimdl or $mdl, they will
# have the same gselect info.
$cmd JOB=1:$nj $dir/log/gselect.JOB.log \
sgmm2-gselect --full-gmm-nbest=$gselect $alimdl \
"$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
fi
if [ $alimdl == $mdl ]; then
# Speaker-independent alignment -- just one pass. Not normal.
T=`sgmm2-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1;
[ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1;
if [ $stage -le 2 ]; then
echo "$0: aligning data in $data using model $mdl (no speaker-vectors)"
$cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
sgmm2-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
echo "$0: done aligning data."
exit 0;
fi
# Continue with system with speaker vectors.
if [ $stage -le 2 ]; then
echo "$0: aligning data in $data using model $alimdl"
$cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi
if [ $stage -le 3 ]; then
echo "$0: computing speaker vectors (1st pass)"
$cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
sgmm2-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \
sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
$mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1;
fi
if [ $stage -le 4 ]; then
echo "$0: computing speaker vectors (2nd pass)"
$cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
--spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1;
rm $dir/pre_vecs.*
fi
if [ $stage -le 5 ]; then
echo "$0: doing final alignment."
$cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \
--utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
$mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
rm $dir/pre_ali.*.gz
echo "$0: done aligning data."
utils/summarize_warnings.pl $dir/log
exit 0;


@ -0,0 +1,89 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Computes training alignments using a model with delta or
# LDA+MLLT features.
# If you supply the "--use-graphs true" option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match with the source directory.
# Begin configuration section.
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence during alignment.
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: steps/align_si.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_si.sh data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"
mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/final.mdl - |"
if $use_graphs; then
[ $nj != "`cat $srcdir/num_jobs`" ] && echo "$0: mismatch in num-jobs" && exit 1;
[ ! -f $srcdir/fsts.1.gz ] && echo "$0: no such file $srcdir/fsts.1.gz" && exit 1;
$cmd JOB=1:$nj $dir/log/align.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
"ark:gunzip -c $srcdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
else
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
# We could just use gmm-align in the next line, but it's less efficient as it compiles the
# training graphs one by one.
$cmd JOB=1:$nj $dir/log/align.JOB.log \
compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" ark:- \
"$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
echo "$0: done aligning data."


@ -0,0 +1,67 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script appends the features in two data directories.
# To be run from .. (one directory up from here)
# see ../run.sh for example
# Begin configuration section.
cmd=run.pl
nj=4
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
echo "usage: append_feats.sh [options] <src-data-dir1> <src-data-dir2> <dest-data-dir> <log-dir> <path-to-storage-dir>";
echo "options: "
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data_src1=$1
data_src2=$2
data=$3
logdir=$4
mfccdir=$5
# make $mfccdir an absolute pathname.
mfccdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfccdir ${PWD}`
utils/split_data.sh $data_src1 $nj || exit 1;
utils/split_data.sh $data_src2 $nj || exit 1;
mkdir -p $mfccdir $logdir
mkdir -p $data
cp $data_src1/* $data/ 2>/dev/null # so we get the other files, such as utt2spk.
rm $data/cmvn.scp 2>/dev/null
rm -r $data/split* 2>/dev/null
# use "name" as part of name of the archive.
name=`basename $data`
$cmd JOB=1:$nj $logdir/append.JOB.log \
append-feats --truncate-frames=true \
scp:$data_src1/split$nj/JOB/feats.scp scp:$data_src2/split$nj/JOB/feats.scp \
ark,scp:$mfccdir/appended_$name.JOB.ark,$mfccdir/appended_$name.JOB.scp || exit 1;
# concatenate the .scp files together.
for ((n=1; n<=nj; n++)); do
cat $mfccdir/appended_$name.$n.scp || exit 1;
done > $data/feats.scp || exit 1;
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`
if [ $nf -ne $nu ]; then
echo "It seems not all of the feature files were successfully ($nf != $nu);"
echo "consider using utils/fix_data_dir.sh $data"
fi
echo "Succeeded creating MFCC features for $name"


@ -0,0 +1,80 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example
# Compute cepstral mean and variance statistics per speaker.
# We do this in just one job; it's fast.
# This script takes no options.
#
# Note: there is no option to do CMVN per utterance. The idea is
# that if you did it per utterance it would not make sense to do
# per-speaker fMLLR on top of that (since you'd be doing fMLLR on
# top of different offsets). Therefore what would be the use
# of the speaker information? In this case you should probably
# make the speaker-ids identical to the utterance-ids. The
# speaker information does not have to correspond to actual
# speakers, it's just the level you want to adapt at.
echo "$0 $@" # Print the command line for logging
fake=false
if [ "$1" == "--fake" ]; then
fake=true
shift
fi
if [ $# != 3 ]; then
echo "usage: compute_cmvn_stats.sh [--fake] <data-dir> <log-dir> <path-to-cmvn-dir>";
echo "(note: --fake gives you fake cmvn stats that do no normalization.)"
exit 1;
fi
if [ -f path.sh ]; then . ./path.sh; fi
data=$1
logdir=$2
cmvndir=$3
# make $cmvndir an absolute pathname.
cmvndir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $cmvndir ${PWD}`
# use "name" as part of name of the archive.
name=`basename $data`
mkdir -p $cmvndir || exit 1;
mkdir -p $logdir || exit 1;
required="$data/feats.scp $data/spk2utt"
for f in $required; do
if [ ! -f $f ]; then
echo "make_cmvn.sh: no such file $f"
exit 1;
fi
done
if $fake; then
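# Fake stats are a 2 x (dim+1) matrix [ 0 ... 0 1 ; 1 ... 1 0 ]: zero feature sums with a
# count of 1, and unit sums-of-squares, giving zero means and unit variances, so applying
# them performs no actual normalization.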
dim=`feat-to-dim scp:$data/feats.scp -`
! cat $data/spk2utt | awk -v dim=$dim '{print $1, "["; for (n=0; n < dim; n++) { printf("0 "); } print "1";
for (n=0; n < dim; n++) { printf("1 "); } print "0 ]";}' | \
copy-matrix ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp && \
echo "Error creating fake CMVN stats" && exit 1;
else
! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \
2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats" && exit 1;
fi
cp $cmvndir/cmvn_$name.scp $data/cmvn.scp || exit 1;
nc=`cat $data/cmvn.scp | wc -l`
nu=`cat $data/spk2utt | wc -l`
if [ $nc -ne $nu ]; then
echo "Error: it seems not all of the speakers got cmvn stats ($nc != $nu);"
exit 1;
fi
echo "Succeeded creating CMVN stats for $name"

108
egs/chime_wsj0/s5/steps/decode.sh Executable file

@ -0,0 +1,108 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Begin configuration section.
transform_dir=
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
stage=0
nj=4
cmd=run.pl
max_active=7000
max_arcs=-1
beam=13.0
latbeam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts= # If you supply num-threads, you should supply this too.
scoring_opts=
# note: there are no more min-lmwt and max-lmwt options, instead use
# e.g. --scoring-opts "--min-lmwt 1 --max-lmwt 20"
skip_scoring=false
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/decode.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the model is."
echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo ""
echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
echo "what type of features you used (assuming it's one of these two)"
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --iter <iter> # Iteration of model to test."
echo " --model <model> # which model to use (e.g. to"
echo " # specify the final.alimdl)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --transform-dir <trans-dir> # dir to find fMLLR transforms "
echo " --acwt <float> # acoustic scale used for lattice generation "
echo " --scoring-opts <string> # options to local/score.sh"
echo " --num-threads <n> # number of threads to use, default 1."
echo " --parallel-opts <opts> # e.g. '-pe smp 4' if you supply --num-threads 4"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
if [ -z $iter ]; then model=$srcdir/final.mdl;
else model=$srcdir/$iter.mdl; fi
fi
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1;
done
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode.sh: feature type is $feat_type";
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
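# (e.g. with --num-threads 4 the command below becomes
#  "gmm-latgen-faster-parallel --num-threads=4 ...")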
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "Using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
[ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
echo "Mismatch in number of jobs with $transform_dir" && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
if [ $stage -le 0 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-faster$thread_string --max-arcs=$max_arcs --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir
fi
exit 0;

View file

@ -0,0 +1,206 @@
#!/bin/bash
# Copyright 2012 Carnegie Mellon University (Author: Yajie Miao)
# Johns Hopkins University (Author: Daniel Povey)
# Decoding script that does basis fMLLR. This can be on top of delta+delta-delta,
# or LDA+MLLT features.
# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices. The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
# Model Default source:
#
# "alignment model" $srcdir/final.alimdl --alignment-model <model>
# (or $srcdir/final.mdl if alimdl absent)
# "adaptation model" $srcdir/final.mdl --adapt-model <model>
# "final model" $srcdir/final.mdl --final-model <model>
# Begin configuration section
first_beam=10.0 # Beam used in initial, speaker-indep. pass
first_max_active=2000 # max-active used in initial pass.
alignment_model=
adapt_model=
final_model=
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
# lattice generation.
# Parameters in alignment of training data
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
align_beam=10
retry_beam=40
max_active=7000
beam=13.0
lattice_beam=6.0
nj=4
silence_weight=0.01
cmd=run.pl
si_dir=
# End configuration section
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/decode_basis_fmllr.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_basis_fmllr.sh exp/tri2b/graph_tgpr data/train_si84 data/test_dev93 exp/tri2b/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --adapt-model <adapt-mdl> # Model to compute transforms with"
echo " --alignment-model <ali-mdl> # Model to get Gaussian-level alignments for"
echo " # 1st pass of transform computation."
echo " --final-model <finald-mdl> # Model to finally decode with"
echo " --si-dir <speaker-indep-decoding-dir> # use this to skip 1st pass of decoding"
echo " # Caution-- must be with same tree"
echo " --acwt <acoustic-weight> # default 0.08333 ... used to get posteriors"
exit 1;
fi
graphdir=$1
data=$2
dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash.
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
sdata=$data/split$nj;
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;
# Some checks. Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree $srcdir/fmllr.basis; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
##
## Do the speaker-independent decoding, if --si-dir option not present. ##
if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass.
si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si".
if [ $stage -le 0 ]; then
steps/decode.sh --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam --model $alignment_model --max-active $first_max_active $graphdir $data $si_dir || exit 1;
fi
fi
##
## Some checks, and setting of defaults for variables.
[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1;
[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1;
[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl
[ -z "$final_model" ] && final_model=$srcdir/final.mdl
for f in $adapt_model $final_model; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
##
## Set up the unadapted features "$sifeats" for testing set
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";
case $feat_type in
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
##
## Now get the first-pass fMLLR transforms.
## We give all the default parameters in gmm-est-basis-fmllr
if [ $stage -le 1 ]; then
echo "$0: getting first-pass fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
gunzip -c $si_dir/lat.JOB.gz \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
gmm-est-basis-fmllr-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
--fmllr-min-count=200 --num-iters=10 --size-scale=0.2 \
--step-size-iters=3 --write-weights=ark:$dir/pre_wgt.JOB \
$adapt_model $srcdir/fmllr.basis "$sifeats" ark,s,cs:- \
ark:$dir/pre_trans.JOB || exit 1;
fi
##
pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |"
## Do the main lattice generation pass. Note: we don't determinize the lattices at
## this stage, as we're going to use them in acoustic rescoring with the larger
## model, and it's more correct to store the full state-level lattice for this purpose.
if [ $stage -le 2 ]; then
echo "$0: doing main lattice generation phase"
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt \
--determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \
|| exit 1;
fi
##
## Do a second pass of estimating the transform-- this time with the lattices
## generated from the alignment model. Compose the transforms to get
## $dir/trans.1, etc.
if [ $stage -le 3 ]; then
echo "$0: estimating fMLLR transforms a second time."
$cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \
"ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
gmm-est-basis-fmllr --fmllr-min-count=200 \
--spk2utt=ark:$sdata/JOB/spk2utt --write-weights=ark:$dir/trans_tmp_wgt.JOB \
$adapt_model $srcdir/fmllr.basis "$pass1feats" ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \
compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \
ark:$dir/trans.JOB || exit 1;
fi
##
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
# Rescore the state-level lattices with the final adapted features, and the final model
# (which by default is $srcdir/final.mdl, but which may be specified on the command line,
# useful in case of discriminatively trained systems).
# At this point we prune and determinize the lattices and write them out, ready for
# language model rescoring.
if [ $stage -le 4 ]; then
echo "$0: doing a final pass of acoustic rescoring."
$cmd JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1;
fi
[ ! -x local/score.sh ] && \
echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $graphdir $dir
rm $dir/{trans_tmp,pre_trans}.*
exit 0;

View file

@ -0,0 +1,86 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Begin configuration.
nj=4
cmd=run.pl
maxactive=7000
beam=13.0
latbeam=6.0
acwt=0.083333
# End configuration.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
echo "Usage: steps/decode_si_biglm.sh [options] <graph-dir> <old-LM-fst> <new-LM-fst> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the model is."
echo "e.g.: steps/decode_si.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo ""
echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
echo "what type of features you used (assuming it's one of these two)"
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
graphdir=$1
oldlm_fst=$2
newlm_fst=$3
data=$4
dir=$5
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $graphdir/HCLG.fst $oldlm_fst $newlm_fst; do
[ ! -f $f ] && echo "decode_si.sh: no such file $f" && exit 1;
done
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode_si.sh: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
[ -f `dirname $oldlm_fst`/words.txt ] && ! cmp `dirname $oldlm_fst`/words.txt $graphdir/words.txt && \
echo "Warning: old LM words.txt does not match with that in $graphdir .. probably will not work.";
[ -f `dirname $newlm_fst`/words.txt ] && ! cmp `dirname $newlm_fst`/words.txt $graphdir/words.txt && \
echo "Warning: new LM words.txt does not match with that in $graphdir .. probably will not work.";
# fstproject replaces the disambiguation symbol #0, which only appears on the
# input side, with the <eps> that appears in the corresponding arcs on the output side.
oldlm_cmd="fstproject --project_output=true $oldlm_fst | fstarcsort --sort_type=ilabel |"
newlm_cmd="fstproject --project_output=true $newlm_fst | fstarcsort --sort_type=ilabel |"
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-biglm-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$srcdir/final.mdl $graphdir/HCLG.fst "$oldlm_cmd" "$newlm_cmd" "$feats" \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $graphdir $dir
exit 0;

View file

@ -0,0 +1,59 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# Combine two decoding directories by composing the lattices (we
# apply a weight to each of the original weights, by default 0.5 each).
# Begin configuration section.
weight1=0.5 # Weight on 1st set of lattices.
cmd=run.pl
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 5 ]; then
echo "Usage: steps/decode_combine.sh [options] <data> <lang-dir|graph-dir> <decode-dir1> <decode-dir2> <decode-dir-out>"
echo " e.g.: steps/decode_combine.sh data/lang data/test exp/dir1/decode exp/dir2/decode exp/combine_1_2/decode"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --weight1 <weight> # Weight on 1st set of lattices (default 0.5)"
exit 1;
fi
data=$1
lang_or_graphdir=$2
srcdir1=$3
srcdir2=$4
dir=$5
for f in $data/utt2spk $lang_or_graphdir/phones.txt $srcdir1/lat.1.gz $srcdir2/lat.1.gz; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj1=`cat $srcdir1/num_jobs` || exit 1;
nj2=`cat $srcdir2/num_jobs` || exit 1;
[ $nj1 -ne $nj2 ] && echo "$0: mismatch in number of jobs $nj1 versus $nj2" && exit 1;
nj=$nj1
mkdir -p $dir/log
echo $nj > $dir/num_jobs
# The lattice-interp command does the score interpolation (with composition),
# and the lattice-copy-backoff replaces the result with the 1st lattice, in
# cases where the composed result was empty.
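# (The weight on the 2nd set of lattices is implicitly 1 - weight1, i.e. 0.5 by default.)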
$cmd JOB=1:$nj $dir/log/interp.JOB.log \
lattice-interp --alpha=$weight1 "ark:gunzip -c $srcdir1/lat.JOB.gz|" \
"ark,s,cs:gunzip -c $srcdir2/lat.JOB.gz|" ark:- \| \
lattice-copy-backoff "ark,s,cs:gunzip -c $srcdir1/lat.JOB.gz|" ark,s,cs:- \
"ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $lang_or_graphdir $dir
exit 0;

View file

@ -0,0 +1,217 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Decoding script that does fMLLR. This can be on top of delta+delta-delta, or
# LDA+MLLT features.
# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices. The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
# Model Default source:
#
# "alignment model" $srcdir/final.alimdl --alignment-model <model>
# (or $srcdir/final.mdl if alimdl absent)
# "adaptation model" $srcdir/final.mdl --adapt-model <model>
# "final model" $srcdir/final.mdl --final-model <model>
# Begin configuration section
first_beam=10.0 # Beam used in initial, speaker-indep. pass
first_max_active=2000 # max-active used in initial pass.
first_max_arcs=-1
alignment_model=
adapt_model=
final_model=
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
# lattice generation.
max_active=7000
max_arcs=-1
beam=13.0
lattice_beam=6.0
nj=4
silence_weight=0.01
cmd=run.pl
si_dir=
fmllr_update_type=full
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts= # If you supply num-threads, you should supply this too.
skip_scoring=false
scoring_opts=
norm_vars=false
# End configuration section
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Wrong #arguments ($#, expected 3)"
echo "Usage: steps/decode_fmllr.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --adapt-model <adapt-mdl> # Model to compute transforms with"
echo " --alignment-model <ali-mdl> # Model to get Gaussian-level alignments for"
echo " # 1st pass of transform computation."
echo " --final-model <finald-mdl> # Model to finally decode with"
echo " --si-dir <speaker-indep-decoding-dir> # use this to skip 1st pass of decoding"
echo " # Caution-- must be with same tree"
echo " --acwt <acoustic-weight> # default 0.08333 ... used to get posteriors"
echo " --num-threads <n> # number of threads to use, default 1."
echo " --parallel-opts <opts> # e.g. '-pe smp 4' if you supply --num-threads 4"
echo " --scoring-opts <opts> # options to local/score.sh"
exit 1;
fi
graphdir=$1
data=$2
dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash.
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
sdata=$data/split$nj;
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
mkdir -p $dir/log
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;
# Some checks. Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
##
## Do the speaker-independent decoding, if --si-dir option not present. ##
if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass.
si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si".
if [ $stage -le 0 ]; then
steps/decode.sh --parallel-opts "$parallel_opts" --scoring-opts "$scoring_opts" \
--num-threads $num_threads --skip-scoring $skip_scoring \
--acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam \
--model $alignment_model --max-arcs $max_arcs --max-active \
$first_max_active $graphdir $data $si_dir || exit 1;
fi
fi
##
## Some checks, and setting of defaults for variables.
[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1;
[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1;
[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl
[ -z "$final_model" ] && final_model=$srcdir/final.mdl
for f in $adapt_model $final_model; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
##
## Set up the unadapted features "$sifeats"
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";
case $feat_type in
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
##
## Now get the first-pass fMLLR transforms.
if [ $stage -le 1 ]; then
echo "$0: getting first-pass fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
gunzip -c $si_dir/lat.JOB.gz \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
--spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$sifeats" ark,s,cs:- \
ark:$dir/pre_trans.JOB || exit 1;
fi
##
pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |"
## Do the main lattice generation pass. Note: we don't determinize the lattices at
## this stage, as we're going to use them in acoustic rescoring with the larger
## model, and it's more correct to store the full state-level lattice for this purpose.
if [ $stage -le 2 ]; then
echo "$0: doing main lattice generation phase"
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --max-arcs=$max_arcs \
--determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \
|| exit 1;
fi
##
## Do a second pass of estimating the transform-- this time with the lattices
## generated from the alignment model. Compose the transforms to get
## $dir/trans.1, etc.
if [ $stage -le 3 ]; then
echo "$0: estimating fMLLR transforms a second time."
$cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \
"ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
--spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass1feats" \
ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \
compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \
ark:$dir/trans.JOB || exit 1;
fi
##
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
# Rescore the state-level lattices with the final adapted features, and the final model
# (which by default is $srcdir/final.mdl, but which may be specified on the command line,
# useful in case of discriminatively trained systems).
# At this point we prune and determinize the lattices and write them out, ready for
# language model rescoring.
if [ $stage -le 4 ]; then
echo "$0: doing a final pass of acoustic rescoring."
$cmd $parallel_opts JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1;
fi
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi
rm $dir/{trans_tmp,pre_trans}.*
exit 0;

View file

@ -0,0 +1,250 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Decoding script that does fMLLR. This can be on top of delta+delta-delta, or
# LDA+MLLT features.
# This script does an extra pass of lattice generation over and above what the original
# script did-- it's for robustness in the case where your original cepstral mean
# normalization was way off.
# We also added a new option --distribute=true (by default) to
# weight-silence-post. This weights the silence frames in a different way,
# weighting all posteriors on the frame rather than just the silence ones, which
# removes a particular kind of bias that the old approach suffered from.
# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices. The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
# Model Default source:
#
# "alignment model" $srcdir/final.alimdl --alignment-model <model>
# (or $srcdir/final.mdl if alimdl absent)
# "adaptation model" $srcdir/final.mdl --adapt-model <model>
# "final model" $srcdir/final.mdl --final-model <model>
# Begin configuration section
first_beam=10.0 # Beam used in initial, speaker-indep. pass
first_max_active=2000 # max-active used in first two passes.
first_latbeam=4.0 # lattice pruning beam for si decode and first-pass fMLLR decode.
# the different spelling from lattice_beam is unfortunate; these scripts
# have a history.
alignment_model=
adapt_model=
final_model=
cleanup=true
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
# lattice generation.
max_active=7000
beam=13.0
lattice_beam=6.0
nj=4
silence_weight=0.01
distribute=true # option to weight-silence-post.
cmd=run.pl
si_dir=
fmllr_update_type=full
skip_scoring=false
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts= # If you supply num-threads, you should supply this too.
scoring_opts=
# End configuration section
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/decode_fmllr.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --adapt-model <adapt-mdl> # Model to compute transforms with"
echo " --alignment-model <ali-mdl> # Model to get Gaussian-level alignments for"
echo " # 1st pass of transform computation."
echo " --final-model <finald-mdl> # Model to finally decode with"
echo " --si-dir <speaker-indep-decoding-dir> # use this to skip 1st pass of decoding"
echo " # Caution-- must be with same tree"
echo " --acwt <acoustic-weight> # default 0.08333 ... used to get posteriors"
echo " --num-threads <n> # number of threads to use, default 1."
echo " --parallel-opts <opts> # e.g. '-pe smp 4' if you supply --num-threads 4"
echo " --scoring-opts <opts> # options to local/score.sh"
exit 1;
fi
graphdir=$1
data=$2
dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash.
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
sdata=$data/split$nj;
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;
# Some checks. Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
##
## Do the speaker-independent decoding, if --si-dir option not present. ##
if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass.
si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si".
if [ $stage -le 0 ]; then
steps/decode.sh --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam --model $alignment_model \
--max-active $first_max_active --parallel-opts "${parallel_opts}" --num-threads $num_threads \
--skip-scoring true $graphdir $data $si_dir || exit 1;
fi
fi
##
## Some checks, and setting of defaults for variables.
[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1;
[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1;
[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl
[ -z "$final_model" ] && final_model=$srcdir/final.mdl
for f in $adapt_model $final_model; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
##
## Set up the unadapted features "$sifeats"
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";
case $feat_type in
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
##
## Now get the first-pass fMLLR transforms.
if [ $stage -le 1 ]; then
echo "$0: getting first-pass fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
gunzip -c $si_dir/lat.JOB.gz \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post --distribute=$distribute $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
--spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$sifeats" ark,s,cs:- \
ark:$dir/trans1.JOB || exit 1;
fi
##
pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans1.JOB ark:- ark:- |"
## Do the first adapted lattice generation pass.
if [ $stage -le 2 ]; then
echo "$0: doing first adapted lattice generation phase"
$cmd $parallel_opts JOB=1:$nj $dir/log/decode1.JOB.log \
gmm-latgen-faster$thread_string --max-active=$first_max_active --beam=$first_beam --lattice-beam=$first_latbeam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat1.JOB.gz" \
|| exit 1;
fi
## Do a second pass of estimating the transform. Compose the transforms to get
## $dir/trans2.*.
if [ $stage -le 3 ]; then
echo "$0: estimating fMLLR transforms a second time."
$cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
lattice-to-post --acoustic-scale=$acwt "ark:gunzip -c $dir/lat1.JOB.gz|" ark:- \| \
weight-silence-post --distribute=$distribute $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
--spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass1feats" \
ark,s,cs:- ark:$dir/trans1b.JOB '&&' \
compose-transforms --b-is-affine=true ark:$dir/trans1b.JOB ark:$dir/trans1.JOB \
ark:$dir/trans2.JOB || exit 1;
if $cleanup; then
rm $dir/trans1b.* $dir/trans1.* $dir/lat1.*.gz
fi
fi
##
pass2feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans2.JOB ark:- ark:- |"
# Generate a 3rd set of lattices, with the "adaptation model"; we'll use these
# to adapt a 3rd time, and we'll rescore them. Since we should be close to the final
# fMLLR, we don't bother dumping un-determinized lattices to disk.
## Do the final lattice generation pass (but we'll rescore these lattices
## after another stage of adaptation.)
if [ $stage -le 4 ]; then
echo "$0: doing final lattice generation phase"
$cmd $parallel_opts JOB=1:$nj $dir/log/decode2.JOB.log \
gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$adapt_model $graphdir/HCLG.fst "$pass2feats" "ark:|gzip -c > $dir/lat2.JOB.gz" \
|| exit 1;
fi
## Do a third pass of estimating the transform. Compose the transforms to get
## $dir/trans.*.
if [ $stage -le 5 ]; then
echo "$0: estimating fMLLR transforms a third time."
$cmd JOB=1:$nj $dir/log/fmllr_pass3.JOB.log \
lattice-to-post --acoustic-scale=$acwt "ark:gunzip -c $dir/lat2.JOB.gz|" ark:- \| \
weight-silence-post --distribute=$distribute $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
--spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass2feats" \
ark,s,cs:- ark:$dir/trans2b.JOB '&&' \
compose-transforms --b-is-affine=true ark:$dir/trans2b.JOB ark:$dir/trans2.JOB \
ark:$dir/trans.JOB || exit 1;
if $cleanup; then
rm $dir/trans2b.* $dir/trans2.*
fi
fi
##
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
if [ $stage -le 6 ]; then
echo "$0: doing a final pass of acoustic rescoring."
$cmd JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat2.JOB.gz|" "$feats" \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
if $cleanup; then
rm $dir/lat2.*.gz
fi
fi
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi
exit 0;

View file

@ -0,0 +1,111 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Decoding of fMMI or fMPE models (feature-space discriminative training).
# If transform-dir supplied, expects e.g. fMLLR transforms in that dir.
# Begin configuration section.
stage=1
iter=final
nj=4
cmd=run.pl
maxactive=7000
beam=13.0
latbeam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
ngselect=2; # Just use the 2 top Gaussians for fMMI/fMPE. Should match train.
transform_dir=
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts= # If you supply num-threads, you should supply this too.
scoring_opts=
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/decode_fmmi.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the model is."
echo "e.g.: steps/decode_fmmi.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo ""
echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
echo "what type of features you used (assuming it's one of these two)"
echo "You can also use fMLLR features-- you have to supply --transform-dir option."
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --iter <iter> # Iteration of model to test."
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --acwt <float> # acoustic scale used for lattice generation "
echo " --transform-dir <transform-dir> # where to find fMLLR transforms."
echo " --scoring-opts <string> # options to local/score.sh"
echo " # speaker-adapted decoding"
echo " --num-threads <n> # number of threads to use, default 1."
echo " --parallel-opts <opts> # e.g. '-pe smp 4' if you supply --num-threads 4"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
model=$srcdir/$iter.mdl
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "decode_fmmi.sh: no such file $f" && exit 1;
done
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode_fmmi.sh: feature type is $feat_type";
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "Using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
[ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
echo "Mismatch in number of jobs with $transform_dir" && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
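# The fMPE/fMMI features: fmpe-apply-transform adds the learned feature offsets,
# using the per-frame Gaussian-selection info that stage 1 below computes with
# gmm-gselect and caches in $dir/gselect.*.gz.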
fmpefeats="$feats fmpe-apply-transform $srcdir/$iter.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |"
if [ $stage -le 1 ]; then
# Get Gaussian selection info.
$cmd JOB=1:$nj $dir/log/gselect.JOB.log \
gmm-gselect --n=$ngselect $srcdir/$iter.fmpe "$feats" \
"ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi
if [ $stage -le 2 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-faster$thread_string --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst "$fmpefeats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
if [ $stage -le 3 ]; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi
exit 0;

View file

@ -0,0 +1,90 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Decode, limited to the word-sequences that were present in a set
# of lattices on disk. The other lattices do not have to be built
# with the same tree or the same context size-- however, you do
# have to be using the same vocabulary (words.txt)-- if not you'd
# have to map the vocabulary somehow.
# Note: if the trees are identical, you can use gmm-rescore-lattice.
# Mechanism: create an unweighted acceptor (on words) for each utterance,
# compose that with G, determinize, and then use compile-train-graphs-fsts
# to compile a graph for each utterance, to decode with.
# Begin configuration.
cmd=run.pl
maxactive=7000
beam=20.0
latbeam=7.0
acwt=0.083333
batch_size=75 # Limits memory blowup in compile-train-graphs-fsts
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
# End configuration.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/decode_si_fromlats.sh [options] <data-dir> <lang> <old-decode-dir> <decode-dir>"
echo "e.g.: steps/decode_si_fromlats.sh data/test_dev93 data/lang_test_tg exp/tri2b/decode_tgpr_dev93 exp/tri2a/decode_tgpr_dev93_fromlats"
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
olddir=$3
dir=$4
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
mkdir -p $dir/log
nj=`cat $olddir/num_jobs` || exit 1;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
sdata=$data/split$nj
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj >$dir/num_jobs
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $olddir/lat.1.gz \
$srcdir/tree $lang/L_disambig.fst $lang/phones.txt; do
[ ! -f $f ] && echo "decode_si_fromlats.sh: no such file $f" && exit 1;
done
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode_si.sh: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
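# The pipeline below implements the mechanism described at the top of the script:
# lattice-to-fst turns each old lattice into an unweighted word acceptor,
# fsttablecompose composes it with the (projected, arc-sorted) G.fst,
# fstdeterminizestar determinizes the result, compile-train-graphs-fsts compiles
# a per-utterance decoding graph from it, and gmm-latgen-faster decodes against
# those graphs.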
$cmd JOB=1:$nj $dir/log/decode_lats.JOB.log \
lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \
fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \
fstdeterminizestar ark:- ark:- \| \
compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \
--batch-size=$batch_size $scale_opts $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
gmm-latgen-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam --acoustic-scale=$acwt \
--allow-partial=true --word-symbol-table=$lang/words.txt \
$srcdir/final.mdl ark:- "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $lang $dir
exit 0;

View file

@ -0,0 +1,122 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey), BUT (Author: Mirko Hannemann)
# Apache 2.0
# Begin configuration section.
transform_dir=
first_pass=
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
nj=4
reverse=false
cmd=run.pl
max_active=7000
beam=13.0
latbeam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
extra_beam=0.0 # small additional beam over varying beam
max_beam=100.0 # maximum of varying beam
scoring_opts=
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/decode_fwdbwd.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the model is."
echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo ""
echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
echo "what type of features you used (assuming it's one of these two)"
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --first_pass <decode-dir> # decoding dir of first pass"
echo " --nj <nj> # number of parallel jobs"
echo " --iter <iter> # Iteration of model to test."
echo " --model <model> # which model to use (e.g. to"
echo " # specify the final.alimdl)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --transform_dir <trans-dir> # dir to find fMLLR transforms "
echo " # speaker-adapted decoding"
echo " --scoring-opts <string> # options to local/score.sh"
echo " --reverse [true/false] # time reversal of features"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
if [ -z $iter ]; then model=$srcdir/final.mdl;
else model=$srcdir/$iter.mdl; fi
fi
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst $graphdir/words.txt; do
[ ! -f $f ] && echo "decode_fwdbwd.sh: no such file $f" && exit 1;
done
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode_fwdbwd.sh: feature type is $feat_type";
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "Using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
[ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
echo "Mismatch in number of jobs with $transform_dir" && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
if $reverse; then
feats="$feats reverse-feats ark:- ark:- |"
fi
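# If a first-pass decoding directory was supplied, turn its lattices into arc
# graphs over HCLG (lattice-arcgraph) and decode with gmm-latgen-tracking, which
# varies its beam around the first-pass arcs (controlled by --extra-beam and
# --max-beam); otherwise fall back to an ordinary gmm-latgen-faster decode.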
if [ -f $first_pass/lat.1.gz ]; then
echo "converting first pass lattice to graph arc acceptor"
$cmd JOB=1:$nj $dir/log/arc_graph.JOB.log \
time lattice-arcgraph $model $graphdir/HCLG.fst \
"ark:gunzip -c $first_pass/lat.JOB.gz|" ark,t:$dir/lat.JOB.arcs || exit 1;
# --write-lattices=ark,t:$dir/lat.det
# --acoustic-scale=$acwt --lattice-beam=$latbeam --prune=false \
echo "decode with tracking first pass lattice"
$cmd JOB=1:$nj $dir/log/decode_fwdbwd.JOB.log \
gmm-latgen-tracking --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \
--acoustic-scale=$acwt --allow-partial=true \
--extra-beam=$extra_beam --max-beam=$max_beam \
--word-symbol-table=$graphdir/words.txt --verbose=2 \
$model $graphdir/HCLG.fst "$feats" ark:$dir/lat.JOB.arcs "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
else
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \
--acoustic-scale=$acwt --allow-partial=true \
--word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh $scoring_opts --cmd "$cmd" --reverse $reverse $data $graphdir $dir
echo "Decoding done."
exit 0;

View file

@ -0,0 +1,128 @@
#!/bin/bash
# Copyright 2012-2013 Karel Vesely, Daniel Povey
# Apache 2.0
# Begin configuration section.
nnet= # Optionally pre-select network to use for getting state-likelihoods
feature_transform= # Optionally pre-select feature transform (in front of nnet)
model= # Optionally pre-select transition model
class_frame_counts= # Optionally pre-select class-counts used to compute PDF priors
stage=0 # stage=1 skips lattice generation
nj=4
cmd=run.pl
max_active=7000 # maximum of active tokens
max_mem=50000000 # limit the fst-size to 50MB (larger fsts are minimized)
beam=13.0 # GMM:13.0
latbeam=8.0 # GMM:6.0
acwt=0.10 # GMM:0.0833, note: only really affects pruning (scoring is on lattices).
scoring_opts="--min-lmwt 4 --max-lmwt 15"
skip_scoring=false
use_gpu_id=-1 # disable gpu
parallel_opts="-pe smp 2" # use 2 CPUs (1 DNN-forward, 1 decoder)
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the DNN + transition model is."
echo "e.g.: $0 exp/dnn1/graph_tgpr data/test exp/dnn1/decode_tgpr"
echo ""
echo "This script works on plain or modified features (CMN,delta+delta-delta),"
echo "which are then sent through feature-transform. It works out what type"
echo "of features you used from content of srcdir."
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo ""
echo " --nnet <nnet> # which nnet to use (opt.)"
echo " --feature-transform <nnet> # select transform in front of nnet (opt.)"
echo " --class-frame-counts <file> # file with frame counts (used to compute priors) (opt.)"
echo " --model <model> # which transition model to use (opt.)"
echo ""
echo " --acwt <float> # select acoustic scale for decoding"
echo " --scoring-opts <opts> # options forwarded to local/score.sh"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -z "$nnet" ]; then # if --nnet <nnet> was not specified on the command line...
nnet=$srcdir/final.nnet;
fi
[ -z "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1;
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
model=$srcdir/final.mdl;
fi
# find the feature_transform to use
if [ -z "$feature_transform" ]; then
feature_transform=$srcdir/final.feature_transform
fi
if [ ! -f $feature_transform ]; then
echo "Missing feature_transform '$feature_transform'"
exit 1
fi
# check that files exist
for f in $sdata/1/feats.scp $nnet $model $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# PREPARE THE LOG-POSTERIOR COMPUTATION PIPELINE
if [ -z "$class_frame_counts" ]; then
class_frame_counts=$srcdir/ali_train_pdf.counts
else
echo "Overriding class_frame_counts by $class_frame_counts"
fi
# Create the feature stream:
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# Optionally add cmvn
if [ -f $srcdir/norm_vars ]; then
norm_vars=$(cat $srcdir/norm_vars 2>/dev/null)
[ ! -f $sdata/1/cmvn.scp ] && echo "$0: cannot find cmvn stats $sdata/1/cmvn.scp" && exit 1
feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
fi
# Optionally add deltas
if [ -f $srcdir/delta_order ]; then
delta_order=$(cat $srcdir/delta_order)
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
# Run the decoding in the queue
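# nnet-forward outputs pseudo log-likelihoods (the network outputs with the
# softmax omitted and the log-priors derived from class_frame_counts subtracted);
# latgen-faster-mapped then treats these per-pdf scores as acoustic log-likelihoods.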
if [ $stage -le 0 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet "$feats" ark:- \| \
latgen-faster-mapped --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$latbeam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
# Run the scoring
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1;
fi
exit 0;

View file

@ -0,0 +1,127 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.
# This script does decoding with a neural-net. If the neural net was built on
# top of fMLLR transforms from a conventional system, you should provide the
# --transform-dir option.
# Begin configuration section.
stage=1
transform_dir= # dir to find fMLLR transforms.
nj=4 # number of decoding jobs. If --transform-dir set, must match that number!
acwt=0.1 # Just a default value, used for adaptation and beam-pruning..
cmd=run.pl
beam=15.0
max_active=7000
# WARNING: this option has been renamed to lattice_beam, to follow the naming
# used in the other scripts.
lattice_beam=8.0 # Beam we use in lattice generation.
iter=final
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts= # If you supply num-threads, you should supply this too.
scoring_opts=
skip_scoring=false
feat_type=
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: steps/decode_nnet_cpu.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_nnet_cpu.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
echo " exp/tri3b/graph_tgpr data/test_dev93 exp/tri4a_nnet/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
echo " # where we can find transforms for SAT systems."
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --beam <beam> # Decoding beam; default 15.0"
echo " --iter <iter> # Iteration of model to decode; default is final."
echo " --scoring-opts <string> # options to local/score.sh"
echo " --num-threads <n> # number of threads to use, default 1."
echo " --parallel-opts <opts> # e.g. '-pe smp 4' if you supply --num-threads 4"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
model=$srcdir/$iter.mdl
for f in $graphdir/HCLG.fst $data/feats.scp $model; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
sdata=$data/split$nj;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
## Set up features.
if [ -z "$feat_type" ]; then
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
fi
case $feat_type in
raw) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
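# Note: the 'raw' case is only selected if --feat-type raw is given explicitly
# (the automatic check above only picks lda or delta); it is intended for systems
# whose fMLLR transforms were estimated on the raw features (raw_trans.* below).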
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
if [ "$feat_type" == "raw" ]; then
[ ! -f $transform_dir/raw_trans.1 ] && echo "$0: no such file $transform_dir/raw_trans.1" && exit 1;
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/raw_trans.JOB ark:- ark:- |"
else
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
fi
elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then
echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms,"
echo " but you are not providing the --transform-dir option in test time."
fi
##
if [ $stage -le 1 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
nnet-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt "$model" \
$graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.
if [ $stage -le 2 ]; then
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
echo "score best paths"
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
echo "score confidence and timing with sclite"
fi
fi
echo "Decoding done."
exit 0;


@ -0,0 +1,235 @@
#!/bin/bash
# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey)
# This decoding script is like decode_fmllr.sh, but it does the fMLLR on
# the raw cepstra, using the model in the LDA+MLLT space
#
# Decoding script that does fMLLR. This can be on top of delta+delta-delta, or
# LDA+MLLT features.
# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices. The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
# Model Default source:
#
# "alignment model" $srcdir/final.alimdl --alignment-model <model>
# (or $srcdir/final.mdl if alimdl absent)
# "adaptation model" $srcdir/final.mdl --adapt-model <model>
# "final model" $srcdir/final.mdl --final-model <model>
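# For example (hypothetical paths, just illustrating the options above), one
# could decode with a discriminatively trained final model while keeping the
# original model for adaptation, roughly:
#   <this-script> --final-model exp/tri3c_mmi/final.mdl \
#     exp/tri3c/graph_tgpr data/test_dev93 exp/tri3c_mmi/decode_dev93_tgpr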
# Begin configuration section
first_beam=10.0 # Beam used in initial, speaker-indep. pass
first_max_active=2000 # max-active used in initial pass.
first_max_arcs=-1
alignment_model=
adapt_model=
final_model=
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
# lattice generation.
max_active=7000
use_normal_fmllr=false
max_arcs=-1
beam=13.0
lattice_beam=6.0
nj=4
silence_weight=0.01
cmd=run.pl
si_dir=
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts= # If you supply num-threads, you should supply this too.
skip_scoring=false
scoring_opts=
norm_vars=false
# End configuration section
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Wrong #arguments ($#, expected 3)"
echo "Usage: steps/decode_fmllr.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --adapt-model <adapt-mdl> # Model to compute transforms with"
echo " --alignment-model <ali-mdl> # Model to get Gaussian-level alignments for"
echo " # 1st pass of transform computation."
echo " --final-model <final-mdl> # Model to finally decode with"
echo " --si-dir <speaker-indep-decoding-dir> # use this to skip 1st pass of decoding"
echo " # Caution-- must be with same tree"
echo " --acwt <acoustic-weight> # default 0.08333 ... used to get posteriors"
echo " --num-threads <n> # number of threads to use, default 1."
echo " --parallel-opts <opts> # e.g. '-pe smp 4' if you supply --num-threads 4"
echo " --scoring-opts <opts> # options to local/score.sh"
exit 1;
fi
graphdir=$1
data=$2
dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash.
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
sdata=$data/split$nj;
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
mkdir -p $dir/log
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;
# Some checks. Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
##
## Do the speaker-independent decoding, if --si-dir option not present. ##
if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass.
si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si".
if [ $stage -le 0 ]; then
steps/decode.sh --parallel-opts "$parallel_opts" --scoring-opts "$scoring_opts" \
--num-threads $num_threads --skip-scoring $skip_scoring \
--acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam \
--model $alignment_model --max-arcs $max_arcs --max-active \
$first_max_active $graphdir $data $si_dir || exit 1;
fi
fi
##
## Some checks, and setting of defaults for variables.
[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1;
[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1;
[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl
[ -z "$final_model" ] && final_model=$srcdir/final.mdl
for f in $adapt_model $final_model; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
##
if [[ ! -f $srcdir/final.mat || ! -f $srcdir/full.mat ]]; then
echo "$0: we require final.mat and full.mat in the source directory $srcdir"
exit 1;
fi
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
sifeats="$splicedfeats transform-feats $srcdir/final.mat ark:- ark:- |"
full_lda_mat="get-full-lda-mat --print-args=false $srcdir/final.mat $srcdir/full.mat -|"
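# ("$full_lda_mat" above is not a file but a command: get-full-lda-mat
# reconstructs the square spliced-dimension LDA+MLLT matrix, i.e. final.mat
# plus the "rejected" rows kept in full.mat, which gmm-est-fmllr-raw-gpost
# uses to relate the model space back to the raw, spliced features.)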
##
## Now get the first-pass fMLLR transforms.
if [ $stage -le 1 ]; then
echo "$0: getting first-pass raw-fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
gunzip -c $si_dir/lat.JOB.gz \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
gmm-est-fmllr-raw-gpost --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$full_lda_mat" \
"$splicedfeats" ark,s,cs:- ark:$dir/pre_trans.JOB || exit 1;
fi
##
pass1splicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- |"
pass1feats="$pass1splicedfeats transform-feats $srcdir/final.mat ark:- ark:- |"
## Do the main lattice generation pass. Note: we don't determinize the lattices at
## this stage, as we're going to use them in acoustic rescoring with the larger
## model, and it's more correct to store the full state-level lattice for this purpose.
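# (The un-determinized lattices written here are only pruned and determinized
# later, in the final acoustic rescoring stage, once the fully adapted
# features and the final model are in place.)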
if [ $stage -le 2 ]; then
echo "$0: doing main lattice generation phase"
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --max-arcs=$max_arcs \
--determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \
|| exit 1;
fi
##
## Do a second pass of estimating the transform-- this time with the lattices
## generated using the adaptation model. Compose the transforms to get
## $dir/raw_trans.1, etc.
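# Roughly, if pre_trans maps x -> A1 x + b1 and the second-pass transform
# trans_tmp maps y -> A2 y + b2, the composed raw_trans is
#   x -> A2 (A1 x + b1) + b2,
# i.e. pre_trans is applied first and trans_tmp on top of it (the
# --b-is-affine=true flag below tells compose-transforms to treat its second
# argument as an affine transform).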
if [ $stage -le 3 ]; then
echo "$0: estimating raw-fMLLR transforms a second time."
$cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \
"ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
gmm-est-fmllr-raw --spk2utt=ark:$sdata/JOB/spk2utt \
$adapt_model "$full_lda_mat" "$pass1splicedfeats" ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \
compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \
ark:$dir/raw_trans.JOB || exit 1;
fi
##
feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
if [ $stage -le 4 ] && $use_normal_fmllr; then
echo "$0: estimating normal fMLLR transforms"
$cmd JOB=1:$nj $dir/log/fmllr_pass3.JOB.log \
gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
gmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt \
$adapt_model "$feats" ark,s,cs:- ark:$dir/trans.JOB || exit 1;
fi
if $use_normal_fmllr; then
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
fi
# Rescore the state-level lattices with the final adapted features, and the final model
# (which by default is $srcdir/final.mdl, but which may be specified on the command line,
# useful in case of discriminatively trained systems).
# At this point we prune and determinize the lattices and write them out, ready for
# language model rescoring.
if [ $stage -le 5 ]; then
echo "$0: doing a final pass of acoustic rescoring."
$cmd $parallel_opts JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1;
fi
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi
#rm $dir/{trans_tmp,pre_trans}.*
exit 0;


@ -0,0 +1,257 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does decoding with an SGMM system, with speaker vectors.
# If the SGMM system was
# built on top of fMLLR transforms from a conventional system, you should
# provide the --transform-dir option.
# Begin configuration section.
stage=1
alignment_model=
transform_dir= # dir to find fMLLR transforms.
nj=4 # number of decoding jobs.
acwt=0.1 # Just a default value, used for adaptation and beam-pruning..
cmd=run.pl
beam=15.0
gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note:
# the first_pass_gselect variable is used for the 1st pass of
# decoding and can be tighter.]
first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
# the 1st pass of decoding (lattice generation).
max_active=7000
#WARNING: This option was formerly named lat_beam (it was renamed to
# lattice_beam to follow the naming in the other scripts).
lattice_beam=8.0 # Beam we use in lattice generation.
vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
# speaker-vector computation. Can be quite tight (actually we could
# probably just do best-path).
use_fmllr=false
fmllr_iters=10
fmllr_min_count=1000
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: steps/decode_sgmm.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_sgmm.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
echo " # where we can find transforms for SAT systems."
echo " --alignment-model <ali-mdl> # Model for the first-pass decoding."
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --beam <beam> # Decoding beam; default 15.0"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/final.mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
sdata=$data/split$nj;
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
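# Note: the full Gaussian-selection info (top $gselect indices per frame) is
# computed once and written to disk below; for the first decoding pass it is
# pruned on the fly to the top $first_pass_gselect indices by piping it
# through copy-gselect inside the --gselect rspecifier above.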
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
## Set up features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
echo " but you are not providing the --transform-dir option in test time."
fi
##
## Calculate FMLLR pre-transforms if needed. We are doing this here since this
## step is required by models both with and without speaker vectors
if $use_fmllr; then
if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
echo "$0: computing pre-transform for fMLLR computation."
sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
fi
fi
## Save Gaussian-selection info to disk.
# Note: we can use final.mdl regardless of whether there is an alignment model--
# they use the same UBM.
if [ $stage -le 1 ]; then
$cmd JOB=1:$nj $dir/log/gselect.JOB.log \
sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
"$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi
## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
# Generate state-level lattice which we can rescore. This is done with the
# alignment model and no speaker-vectors.
if [ $stage -le 2 ]; then
$cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
--word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $alignment_model \
$graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
fi
## Check if the model has speaker vectors
spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'`
if [ $spkdim -gt 0 ]; then ### For models with speaker vectors:
# Estimate speaker vectors (1st pass). Prune before determinizing
# because determinization can take a while on un-pruned lattices.
# Note: the sgmm-post-to-gpost stage is necessary because we have
# a separate alignment-model and final model, otherwise we'd skip it
# and use sgmm-est-spkvecs.
if [ $stage -le 3 ]; then
$cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \
sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \
sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
$srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
fi
# Estimate speaker vectors (2nd pass). Since we already have spk vectors,
# at this point we need to rescore the lattice to get the correct posteriors.
if [ $stage -le 4 ]; then
$cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
$srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
fi
rm $dir/pre_vecs.*
if $use_fmllr; then
# Estimate fMLLR transforms (note: these may be on top of any
# fMLLR transforms estimated with the baseline GMM system).
if [ $stage -le 5 ]; then # compute fMLLR transforms.
echo "$0: computing fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
--fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
$srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
fi
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi
# Now rescore the state-level lattices with the adapted features and the
# corresponding model. Prune and determinize the lattices to limit
# their size.
if [ $stage -le 6 ]; then
$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
$srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
rm $dir/pre_lat.*.gz
else ### For models without speaker vectors:
if $use_fmllr; then
# Estimate fMLLR transforms (note: these may be on top of any
# fMLLR transforms estimated with the baseline GMM system).
if [ $stage -le 5 ]; then # compute fMLLR transforms.
echo "$0: computing fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
--fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
$srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
fi
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi
# Now rescore the state-level lattices with the adapted features and the
# corresponding model. Prune and determinize the lattices to limit
# their size.
if [ $stage -le 6 ] && $use_fmllr; then
$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \
$srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
rm $dir/pre_lat.*.gz
else # Already done with decoding if no adaptation needed.
for n in `seq 1 $nj`; do
mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz
done
fi
fi
# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.
if [ $stage -le 7 ]; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
echo "score best paths"
local/score.sh --cmd "$cmd" $data $graphdir $dir
echo "score confidence and timing with sclite"
#local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $graphdir $dir
fi
echo "Decoding done."
exit 0;


@ -0,0 +1,211 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does decoding with an SGMM system, with speaker vectors.
# If the SGMM system was
# built on top of fMLLR transforms from a conventional system, you should
# provide the --transform-dir option.
# Begin configuration section.
stage=1
transform_dir= # dir to find fMLLR transforms.
nj=4 # number of decoding jobs.
acwt=0.1 # Just a default value, used for adaptation and beam-pruning..
cmd=run.pl
beam=13.0
gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note:
# the first_pass_gselect variable is used for the 1st pass of
# decoding and can be tighter.]
first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
# the 1st pass of decoding (lattice generation).
max_active=7000
max_arcs=-1
#WARNING: This option was formerly named lat_beam (it was renamed to
# lattice_beam to follow the naming in the other scripts).
lattice_beam=6.0 # Beam we use in lattice generation.
vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
# speaker-vector computation. Can be quite tight (actually we could
# probably just do best-path).
use_fmllr=false
fmllr_iters=10
fmllr_min_count=1000
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts= # If you supply num-threads, you should supply this too.
skip_scoring=false
scoring_opts=
# note: there are no more min-lmwt and max-lmwt options, instead use
# e.g. --scoring-opts "--min-lmwt 1 --max-lmwt 20"
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: steps/decode_sgmm2.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_sgmm2.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
echo " # where we can find transforms for SAT systems."
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --beam <beam> # Decoding beam; default 13.0"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/final.mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
sdata=$data/split$nj;
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
## Set up features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
if [ -f $transform_dir/trans.1 ]; then
echo "$0: using transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif [ -f $transform_dir/raw_trans.1 ]; then
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
else
echo "$0: no such file $transform_dir/trans.1 or $transform_dir/raw_trans.1, invalid --transform-dir option?"
exit 1;
fi
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
echo " but you are not providing the --transform-dir option in test time."
fi
##
## Save Gaussian-selection info to disk.
# Note: we can use final.mdl regardless of whether there is an alignment model--
# they use the same UBM.
if [ $stage -le 1 ]; then
$cmd JOB=1:$nj $dir/log/gselect.JOB.log \
sgmm2-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
"$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi
# Generate state-level lattice which we can rescore. This is done with the alignment
# model and no speaker-vectors.
if [ $stage -le 2 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode_pass1.JOB.log \
sgmm2-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--max-arcs=$max_arcs --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
--word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $srcdir/final.alimdl \
$graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
fi
# Estimate speaker vectors (1st pass). Prune before determinizing
# because determinization can take a while on un-pruned lattices.
# Note: the sgmm2-post-to-gpost stage is necessary because we have
# a separate alignment-model and final model, otherwise we'd skip it
# and use sgmm2-est-spkvecs.
if [ $stage -le 3 ]; then
$cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.alimdl ark:- ark:- \| \
sgmm2-post-to-gpost "$gselect_opt" $srcdir/final.alimdl "$feats" ark:- ark:- \| \
sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
$srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
fi
# Estimate speaker vectors (2nd pass). Since we already have spk vectors,
# at this point we need to rescore the lattice to get the correct posteriors.
if [ $stage -le 4 ]; then
$cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm2-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
$srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
fi
rm $dir/pre_vecs.*
if $use_fmllr; then
# Estimate fMLLR transforms (note: these may be on top of any
# fMLLR transforms estimated with the baseline GMM system).
if [ $stage -le 5 ]; then # compute fMLLR transforms.
echo "$0: computing fMLLR transforms."
if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
echo "$0: computing pre-transform for fMLLR computation."
sgmm2-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
fi
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm2-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
--fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
$srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
fi
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi
# Now rescore the state-level lattices with the adapted features and the
# corresponding model. Prune and determinize the lattices to limit
# their size.
if [ $stage -le 6 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/rescore.JOB.log \
sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
$srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
rm $dir/pre_lat.*.gz
# The output of this script is the files "lat.*.gz"-- we'll rescore this at different
# acoustic scales to get the final output.
if [ $stage -le 7 ]; then
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi
fi
exit 0;


@ -0,0 +1,270 @@
#!/bin/bash
# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does decoding with an SGMM2 system, with speaker vectors. If the
# SGMM2 system was built on top of fMLLR transforms from a conventional system,
# you should provide the --transform-dir option.
# This script does not use a decoding graph, but instead you provide
# a previous decoding directory with lattices in it. This script will only
# make use of the word sequences in the lattices; it limits the decoding
# to those sequences. You should also provide a "lang" directory from
# which this script will use the G.fst and L.fst.
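# In outline, the later lattice-generation stage turns the old lattices into
# per-utterance decoding graphs roughly as follows (see the actual command for
# details): lattice-to-fst keeps just the word sequences as FSTs, these are
# composed with the projected G.fst to restore LM scores, determinized, and
# then expanded into decoding graphs by compile-train-graphs-fsts using
# L_disambig.fst and the tree; decoding is then restricted to those graphs.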
# Begin configuration section.
stage=1
alignment_model=
transform_dir= # dir to find fMLLR transforms.
acwt=0.08333 # Just a default value, used for adaptation and beam-pruning..
batch_size=75 # Limits memory blowup in compile-train-graphs-fsts
cmd=run.pl
beam=20.0
gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note:
# the first_pass_gselect variable is used for the 1st pass of
# decoding and can be tighter.]
first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
# the 1st pass of decoding (lattice generation).
max_active=7000
lattice_beam=8.0 # Beam we use in lattice generation.
vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
# speaker-vector computation. Can be quite tight (actually we could
# probably just do best-path).
use_fmllr=false
fmllr_iters=10
fmllr_min_count=1000
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 4 ]; then
echo "Usage: steps/decode_sgmm_fromlats.sh [options] <data-dir> <lang-dir> <old-decode-dir> <decode-dir>"
echo ""
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
echo " # where we can find transforms for SAT systems."
echo " --alignment-model <ali-mdl> # Model for the first-pass decoding."
echo " --config <config-file> # config containing options"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --beam <beam> # Decoding beam; default 20.0"
exit 1;
fi
data=$1
lang=$2
olddir=$3
dir=$4
srcdir=`dirname $dir`
for f in $data/feats.scp $lang/G.fst $lang/L_disambig.fst $lang/phones/disambig.int \
$srcdir/final.mdl $srcdir/tree $olddir/lat.1.gz; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
silphonelist=`cat $lang/phones/silence.csl` || exit 1
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
## Set up features
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
if [ -z "$transform_dir" ] && [ -f $olddir/trans.1 ]; then
transform_dir=$olddir
fi
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
echo " but you are not providing the --transform-dir option in test time."
fi
## Calculate FMLLR pre-transforms if needed. We are doing this here since this
## step is required by models both with and without speaker vectors
if $use_fmllr; then
if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
echo "$0: computing pre-transform for fMLLR computation."
sgmm2-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
fi
fi
## Save Gaussian-selection info to disk.
# Note: we can use final.mdl regardless of whether there is an alignment model--
# they use the same UBM.
if [ $stage -le 1 ]; then
$cmd JOB=1:$nj $dir/log/gselect.JOB.log \
sgmm2-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
"$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi
## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
# Generate state-level lattice which we can rescore. This is done with the
# alignment model and no speaker-vectors.
if [ $stage -le 2 ]; then
$cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \
fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \
fstdeterminizestar ark:- ark:- \| \
compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \
--batch-size=$batch_size $scale_opts \
$srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
sgmm2-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
--word-symbol-table=$lang/words.txt "$gselect_opt_1stpass" $alignment_model \
"ark:-" "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
fi
## Check if the model has speaker vectors
spkdim=`sgmm2-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'`
if [ $spkdim -gt 0 ]; then ### For models with speaker vectors:
# Estimate speaker vectors (1st pass). Prune before determinizing
# because determinization can take a while on un-pruned lattices.
# Note: the sgmm2-post-to-gpost stage is necessary because we have
# a separate alignment-model and final model, otherwise we'd skip it
# and use sgmm2-est-spkvecs.
if [ $stage -le 3 ]; then
$cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \
sgmm2-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \
sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
$srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
fi
# Estimate speaker vectors (2nd pass). Since we already have spk vectors,
# at this point we need to rescore the lattice to get the correct posteriors.
if [ $stage -le 4 ]; then
$cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm2-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
$srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
fi
rm $dir/pre_vecs.*
if $use_fmllr; then
# Estimate fMLLR transforms (note: these may be on top of any
# fMLLR transforms estimated with the baseline GMM system).
if [ $stage -le 5 ]; then # compute fMLLR transforms.
echo "$0: computing fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm2-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
--fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
$srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
fi
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi
# Now rescore the state-level lattices with the adapted features and the
# corresponding model. Prune and determinize the lattices to limit
# their size.
if [ $stage -le 6 ]; then
$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
$srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
rm $dir/pre_lat.*.gz
else ### For models without speaker vectors:
if $use_fmllr; then
# Estimate fMLLR transforms (note: these may be on top of any
# fMLLR transforms estimated with the baseline GMM system).
if [ $stage -le 5 ]; then # compute fMLLR transforms.
echo "$0: computing fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm2-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
--fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
$srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
fi
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi
# Now rescore the state-level lattices with the adapted features and the
# corresponding model. Prune and determinize the lattices to limit
# their size.
if [ $stage -le 6 ] && $use_fmllr; then
$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \
$srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
rm $dir/pre_lat.*.gz
else # Already done with decoding if no adaptation needed.
for n in `seq 1 $nj`; do
mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz
done
fi
fi
# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.
if [ $stage -le 7 ]; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
echo "score best paths"
local/score.sh --cmd "$cmd" $data $lang $dir
echo "score confidence and timing with sclite"
#local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $lang $dir
fi
echo "Decoding done."
exit 0;


@ -0,0 +1,111 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does decoding with an SGMM system, by rescoring lattices
# generated from a previous SGMM system. The directory with the lattices
# is assumed to contain speaker vectors, if used. Basically it rescores
# the lattices one final time, using the same setup as the final decoding
# pass of the source dir. The assumption is that the model may have
# been discriminatively trained.
# If the system was built on top of fMLLR transforms from a conventional system,
# you should provide the --transform-dir option.
# Begin configuration section.
transform_dir= # dir to find fMLLR transforms.
cmd=run.pl
iter=final
skip_scoring=false
scoring_opts=
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 4 ]; then
echo "Usage: steps/decode_sgmm_rescore.sh [options] <graph-dir|lang-dir> <data-dir> <old-decode-dir> <decode-dir>"
echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
echo " # where we can find transforms for SAT systems."
echo " --config <config-file> # config containing options"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --iter <iter> # iteration of model to use (default: final)"
exit 1;
fi
graphdir=$1
data=$2
olddir=$3
dir=$4
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
for f in $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz $olddir/gselect.1.gz \
$srcdir/$iter.mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
gselect_opt="--gselect=ark,s,cs:gunzip -c $olddir/gselect.JOB.gz|"
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -f $olddir/vecs.1 ]; then
echo "$0: using speaker vectors from $olddir"
spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
else
echo "$0: no speaker vectors found."
spkvecs_opt=
fi
## Set up features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
echo " but you are not providing the --transform-dir option in test time."
fi
if [ -f $olddir/trans.1 ]; then
echo "$0: using (in addition to any previous transforms) transforms from $olddir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$olddir/trans.JOB ark:- ark:- |"
fi
##
# Rescore the state-level lattices with the model provided. Just
# one command in this script.
echo "$0: rescoring lattices with SGMM model in $srcdir/$iter.mdl"
$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
sgmm2-rescore-lattice "$gselect_opt" $spkvecs_opt \
$srcdir/$iter.mdl "ark:gunzip -c $olddir/lat.JOB.gz|" "$feats" \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi
exit 0;


@ -0,0 +1,172 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does decoding with an SGMM system, by rescoring lattices
# generated from a previous SGMM system. This version does the "predictive"
# SGMM, where we subtract some constant times the log-prob of the left
# few spliced frames, and the same for the right few.
# The directory with the lattices
# is assumed to contain any speaker vectors, if used. This script just
# adds into the acoustic scores (some constant, default -0.25) times
# the acoustic score of the left model, and the same for the right model,
# rescoring the lattices one final time using the same setup as the final
# decoding pass of the source dir. The assumption is that the model may have
# been discriminatively trained.
# If the system was built on top of fMLLR transforms from a conventional system,
# you should provide the --transform-dir option.
# Begin configuration section.
stage=0
transform_dir= # dir to find fMLLR transforms.
cmd=run.pl
iter=final
prob_scale=-0.25
dimensions=0:13:104:117
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 5 ]; then
echo "Usage: steps/decode_sgmm_rescore_project.sh [options] <full-lda-mat> <graph-dir|lang-dir> <data-dir> <old-decode-dir> <decode-dir>"
echo " e.g.: steps/decode_sgmm_rescore_project.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
echo " exp/tri2b/full.mat exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a/decode_dev93_tgpr_predict"
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
echo " # where we can find transforms for SAT systems."
echo " --config <config-file> # config containing options"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --prob-scale <scale> # Default -0.25, scale on left and right models."
exit 1;
fi
full_lda_mat=$1
graphdir=$2
data=$3
olddir=$4
dir=$5
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
for f in $full_lda_mat $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz \
$olddir/gselect.1.gz $srcdir/$iter.mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -f $olddir/vecs.1 ]; then
echo "$0: using speaker vectors from $olddir"
spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
else
echo "$0: no speaker vectors found."
spkvecs_opt=
fi
if [ $stage -le 0 ]; then
# Get full LDA+MLLT mat and its inverse. Note: the full LDA+MLLT mat is
# the LDA+MLLT mat, plus the "rejected" rows of the LDA matrix.
$cmd $dir/log/get_full_lda.log \
get-full-lda-mat $srcdir/final.mat $full_lda_mat $dir/full.mat $dir/full_inv.mat || exit 1;
fi
if [ $stage -le 1 ]; then
left_start=`echo $dimensions | cut '-d:' -f 1`;
left_end=`echo $dimensions | cut '-d:' -f 2`;
right_start=`echo $dimensions | cut '-d:' -f 3`;
right_end=`echo $dimensions | cut '-d:' -f 4`;
# Prepare left and right models. For now, the dimensions are hardwired (e.g., 13 MFCCs and splice 9 frames).
# Note: the choice of dividing by the prob of the left 4 and the right 4 frames is a bit arbitrary and
# we could investigate different configurations.
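# As a concrete sketch with the defaults (13-dim MFCCs spliced across 9
# frames, giving 9*13 = 117 spliced dims), dimensions=0:13:104:117 selects
# dims [0,13) -- the left-most spliced frame -- for the "left" model and
# [104,117) -- the right-most frame -- for the "right" model.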
$cmd $dir/log/left.log \
sgmm2-project --start-dim=$left_start --end-dim=$left_end $srcdir/final.mdl $dir/full.mat $dir/left.mdl $dir/left.mat || exit 1;
$cmd $dir/log/right.log \
sgmm2-project --start-dim=$right_start --end-dim=$right_end $srcdir/final.mdl $dir/full.mat $dir/right.mdl $dir/right.mat || exit 1;
fi
# we apply the scaling on the new acoustic probs by adding the inverse
# of that to the old acoustic probs, and then later inverting again.
# this has to do with limitations in sgmm2-rescore-lattice: we can only
# scale the *old* acoustic probs, not the new ones.
inverse_prob_scale=`perl -e "print (1.0 / $prob_scale);"`
cur_lats="ark:gunzip -c $olddir/lat.JOB.gz | lattice-scale --acoustic-scale=$inverse_prob_scale ark:- ark:- |"
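# Worked through with the default prob_scale=-0.25 (so inverse_prob_scale=-4):
# the old acoustic scores are scaled by -4 here, each rescoring pass below adds
# the new model's scores at scale 1.0, and the final lattice-scale by -0.25
# then gives
#   -0.25 * (-4*old + left + right) = old - 0.25*(left + right),
# i.e. old + prob_scale*(left + right), as intended.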
## Set up features. Note: we only support LDA+MLLT features, this
## is inherent in the method, we could not support deltas.
for model_type in left right; do
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" # spliced features.
if [ ! -z "$transform_dir" ]; then # using speaker-specific transforms.
# we want to transform in the sequence: $dir/full.mat, then the result of
# (extend-transform-dim $transform_dir/trans.JOB), then $dir/full_inv.mat to
# get back to the spliced space, then the left.mat or right.mat. But
# note that compose-transforms operates in matrix-multiplication order,
# which is opposite from the "order of applying the transforms" order.
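# As a sketch: if "compose-transforms C B A" writes the product C*B to A, then
# applying A to a spliced frame x gives C*(B*x), i.e. B acts first. Each
# compose-transforms step in the pipeline below therefore puts the transform
# that is applied later as its first argument.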
new_dim=$[`copy-matrix --binary=false $dir/full.mat - | wc -l` - 1]; # 117 in normal case.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk 'ark:extend-transform-dim --new-dimension=$new_dim ark:$transform_dir/trans.JOB ark:- | compose-transforms ark:- $dir/full.mat ark:- | compose-transforms $dir/full_inv.mat ark:- ark:- | compose-transforms $dir/${model_type}.mat ark:- ark:- |' ark:- ark:- |"
else # else, we transform with the "left" or "right" matrix; these transform from the
# spliced space.
feats="$feats transform-feats $dir/${model_type}.mat ark:- ark:- |"
# If we don't have the --transform-dir option, make sure the model was
# trained in the same way.
if grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
echo " but you are not providing the --transform-dir option in test time."
fi
fi
if [ -f $olddir/trans.1 ]; then
echo "$0: warning: not using transforms in $olddir (this is just a "
echo " limitation of the script right now, and could be fixed)."
fi
if [ $stage -le 2 ]; then
echo "Getting gselect info for $model_type model."
$cmd JOB=1:$nj $dir/log/gselect.$model_type.JOB.log \
sgmm2-gselect $dir/$model_type.mdl "$feats" \
"ark,t:|gzip -c >$dir/gselect.$model_type.JOB.gz" || exit 1;
fi
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.$model_type.JOB.gz|"
# Rescore the state-level lattices with the model provided. Just
# one command in this script.
# The --old-acoustic-scale=1.0 option means we just add the scores
# to the old scores.
if [ $stage -le 3 ]; then
echo "$0: rescoring lattices with $model_type model"
$cmd JOB=1:$nj $dir/log/rescore.${model_type}.JOB.log \
sgmm2-rescore-lattice --old-acoustic-scale=1.0 "$gselect_opt" $spkvecs_opt \
$dir/$model_type.mdl "$cur_lats" "$feats" \
"ark:|gzip -c > $dir/lat.${model_type}.JOB.gz" || exit 1;
fi
cur_lats="ark:gunzip -c $dir/lat.${model_type}.JOB.gz |"
done
if [ $stage -le 4 ]; then
echo "$0: getting final lattices."
$cmd JOB=1:$nj $dir/log/scale_lats.JOB.log \
lattice-scale --acoustic-scale=$prob_scale "$cur_lats" "ark:|gzip -c >$dir/lat.JOB.gz" \
|| exit 1;
fi
rm $dir/lat.{left,right}.*.gz 2>/dev/null # note: if these still exist, it will
# confuse the scoring script.
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $graphdir $dir
exit 0;


@ -0,0 +1,273 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does decoding with an SGMM system, with speaker vectors.
# If the SGMM system was
# built on top of fMLLR transforms from a conventional system, you should
# provide the --transform-dir option.
# This script does not use a decoding graph, but instead you provide
# a previous decoding directory with lattices in it. This script will only
# make use of the word sequences in the lattices; it limits the decoding
# to those sequences. You should also provide a "lang" directory from
# which this script will use the G.fst and L.fst.
# Begin configuration section.
stage=1
alignment_model=
transform_dir= # dir to find fMLLR transforms.
acwt=0.08333 # Just a default value, used for adaptation and beam-pruning..
batch_size=75 # Limits memory blowup in compile-train-graphs-fsts
cmd=run.pl
beam=20.0
gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note:
# the first_pass_gselect variable is used for the 1st pass of
# decoding and can be tighter.
first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
# the 1st pass of decoding (lattice generation).
max_active=7000
# WARNING: this option was renamed to lattice_beam (to follow the naming used
# in the other scripts).
lattice_beam=8.0 # Beam we use in lattice generation.
vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
# speaker-vector computation. Can be quite tight (actually we could
# probably just do best-path).
use_fmllr=false
fmllr_iters=10
fmllr_min_count=1000
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 4 ]; then
echo "Usage: steps/decode_sgmm_fromlats.sh [options] <data-dir> <lang-dir> <old-decode-dir> <decode-dir>"
echo ""
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
echo " # where we can find transforms for SAT systems."
echo " --alignment-model <ali-mdl> # Model for the first-pass decoding."
echo " --config <config-file> # config containing options"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --beam <beam> # Decoding beam; default 13.0"
exit 1;
fi
data=$1
lang=$2
olddir=$3
dir=$4
srcdir=`dirname $dir`
for f in $data/feats.scp $lang/G.fst $lang/L_disambig.fst $lang/phones/disambig.int \
$srcdir/final.mdl $srcdir/tree $olddir/lat.1.gz; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
silphonelist=`cat $lang/phones/silence.csl` || exit 1
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
## Set up features
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
if [ -z "$transform_dir" ] && [ -f $olddir/trans.1 ]; then
transform_dir=$olddir
fi
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
echo " but you are not providing the --transform-dir option in test time."
fi
## Calculate FMLLR pre-transforms if needed. We are doing this here since this
## step is required by models both with and without speaker vectors.
if $use_fmllr; then
if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
echo "$0: computing pre-transform for fMLLR computation."
sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
fi
fi
## Save Gaussian-selection info to disk.
# Note: we can use final.mdl regardless of whether there is an alignment model--
# they use the same UBM.
if [ $stage -le 1 ]; then
$cmd JOB=1:$nj $dir/log/gselect.JOB.log \
sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
"$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi
## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
# Generate state-level lattice which we can rescore. This is done with the
# alignment model and no speaker-vectors.
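# In the pipeline below, lattice-to-fst turns each old lattice into a word-sequence
# FST; this is composed with the projected, arc-sorted G.fst and determinized, so
# LM scores are applied to exactly those word sequences; compile-train-graphs-fsts
# then expands the result into utterance-specific decoding graphs using the lexicon
# and tree, and sgmm-latgen-faster decodes against each of these graphs.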
if [ $stage -le 2 ]; then
$cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \
fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \
fstdeterminizestar ark:- ark:- \| \
compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \
--batch-size=$batch_size $scale_opts \
$srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
--word-symbol-table=$lang/words.txt "$gselect_opt_1stpass" $alignment_model \
"ark:-" "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
fi
## Check if the model has speaker vectors
spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'`
if [ $spkdim -gt 0 ]; then ### For models with speaker vectors:
# Estimate speaker vectors (1st pass). Prune before determinizing
# because determinization can take a while on un-pruned lattices.
# Note: the sgmm-post-to-gpost stage is necessary because we have
# a separate alignment-model and final model, otherwise we'd skip it
# and use sgmm-est-spkvecs.
if [ $stage -le 3 ]; then
$cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \
sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \
sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
$srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
fi
# Estimate speaker vectors (2nd pass). Since we already have spk vectors,
# at this point we need to rescore the lattice to get the correct posteriors.
if [ $stage -le 4 ]; then
$cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
$srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
fi
rm $dir/pre_vecs.*
if $use_fmllr; then
# Estimate fMLLR transforms (note: these may be on top of any
# fMLLR transforms estimated with the baseline GMM system).
if [ $stage -le 5 ]; then # compute fMLLR transforms.
echo "$0: computing fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
--fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
$srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
fi
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi
# Now rescore the state-level lattices with the adapted features and the
# corresponding model. Prune and determinize the lattices to limit
# their size.
if [ $stage -le 6 ]; then
$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
$srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
rm $dir/pre_lat.*.gz
else ### For models without speaker vectors:
if $use_fmllr; then
# Estimate fMLLR transforms (note: these may be on top of any
# fMLLR transforms estimated with the baseline GMM system).
if [ $stage -le 5 ]; then # compute fMLLR transforms.
echo "$0: computing fMLLR transforms."
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
gunzip -c $dir/pre_lat.JOB.gz \| \
sgmm-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \
"$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
--fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
$srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
fi
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi
# Now rescore the state-level lattices with the adapted features and the
# corresponding model. Prune and determinize the lattices to limit
# their size.
if [ $stage -le 6 ] && $use_fmllr; then
$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \
$srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
rm $dir/pre_lat.*.gz
else # Already done with decoding if no adaptation needed.
for n in `seq 1 $nj`; do
mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz
done
fi
fi
# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.
if [ $stage -le 7 ]; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
echo "score best paths"
local/score.sh --cmd "$cmd" $data $lang $dir
echo "score confidence and timing with sclite"
#local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $lang $dir
fi
echo "Decoding done."
exit 0;

Просмотреть файл

@ -0,0 +1,107 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does decoding with an SGMM system, by rescoring lattices
# generated from a previous SGMM system. The directory with the lattices
# is assumed to contain speaker vectors, if used. Basically it rescores
# the lattices one final time, using the same setup as the final decoding
# pass of the source dir. The assumption is that the model may have
# been discriminatively trained.
# If the system was built on top of fMLLR transforms from a conventional system,
# you should provide the --transform-dir option.
# Begin configuration section.
transform_dir= # dir to find fMLLR transforms.
cmd=run.pl
iter=final
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 4 ]; then
echo "Usage: steps/decode_sgmm_rescore.sh [options] <graph-dir|lang-dir> <data-dir> <old-decode-dir> <decode-dir>"
echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
echo " # where we can find transforms for SAT systems."
echo " --config <config-file> # config containing options"
echo " --cmd <cmd> # Command to run in parallel with"
echo " --iter <iter> # iteration of model to use (default: final)"
exit 1;
fi
graphdir=$1
data=$2
olddir=$3
dir=$4
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
for f in $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz $olddir/gselect.1.gz \
$srcdir/$iter.mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
gselect_opt="--gselect=ark,s,cs:gunzip -c $olddir/gselect.JOB.gz|"
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -f $olddir/vecs.1 ]; then
echo "$0: using speaker vectors from $olddir"
spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
else
echo "$0: no speaker vectors found."
spkvecs_opt=
fi
## Set up features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
[ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
&& echo "$0: #jobs mismatch with transform-dir." && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
echo " but you are not providing the --transform-dir option in test time."
fi
if [ -f $olddir/trans.1 ]; then
echo "$0: using (in addition to any previous transforms) transforms from $olddir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$olddir/trans.JOB ark:- ark:- |"
fi
##
# Rescore the state-level lattices with the model provided. Just
# one command in this script.
echo "$0: rescoring lattices with SGMM model in $srcdir/$iter.mdl"
$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
sgmm-rescore-lattice "$gselect_opt" $spkvecs_opt \
$srcdir/$iter.mdl "ark:gunzip -c $olddir/lat.JOB.gz|" "$feats" \
"ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $graphdir $dir
exit 0;

Просмотреть файл

@ -0,0 +1,108 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Begin configuration section.
transform_dir=
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
stage=0
nj=4
cmd=run.pl
max_active=7000
max_arcs=-1
beam=13.0
latbeam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts= # If you supply num-threads, you should supply this too.
scoring_opts=
# note: there are no more min-lmwt and max-lmwt options, instead use
# e.g. --scoring-opts "--min-lmwt 1 --max-lmwt 20"
skip_scoring=false
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/decode.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the model is."
echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo ""
echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
echo "what type of features you used (assuming it's one of these two)"
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --iter <iter> # Iteration of model to test."
echo " --model <model> # which model to use (e.g. to"
echo " # specify the final.alimdl)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --transform-dir <trans-dir> # dir to find fMLLR transforms "
echo " --acwt <float> # acoustic scale used for lattice generation "
echo " --scoring-opts <string> # options to local/score.sh"
echo " --num-threads <n> # number of threads to use, default 1."
echo " --parallel-opts <opts> # e.g. '-pe smp 4' if you supply --num-threads 4"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
if [ -z $iter ]; then model=$srcdir/final.mdl;
else model=$srcdir/$iter.mdl; fi
fi
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1;
done
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode.sh: feature type is $feat_type";
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "Using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
[ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
echo "Mismatch in number of jobs with $transform_dir" && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
if [ $stage -le 0 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-faster$thread_string --max-arcs=$max_arcs --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir
fi
exit 0;

Просмотреть файл

@ -0,0 +1,113 @@
#!/bin/bash
# Copyright 2012 Neha Agrawal, Cisco Systems;
# Johns Hopkins University (Author: Daniel Povey);
#
# Apache 2.0
# Begin configuration section.
transform_dir=
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
nj=4
cmd=run.pl
max_active=7000
beam=13.0
latbeam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
mean_tau=20
weight_tau=10
flags=mw # could also contain "v" for variance; the default
# tau for that is 50.
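# (For reference, the MAP mean update is the usual tau-smoothed estimate, roughly
#   mu_hat = (mean_tau * mu_prior + sum_t gamma(t) * x(t)) / (mean_tau + sum_t gamma(t)),
# with an analogous smoothed update for the weights; see gmm-adapt-map for the
# exact formulas used.)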
stage=1
# End configuration section.
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/decode.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the model is."
echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo ""
echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
echo "what type of features you used (assuming it's one of these two)"
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --iter <iter> # Iteration of model to test."
echo " --model <model> # which model to use (e.g. to"
echo " # specify the final.alimdl)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --transform-dir <trans-dir> # dir to find fMLLR transforms "
echo " # speaker-adapted decoding"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
if [ -z $iter ]; then model=$srcdir/final.mdl;
else model=$srcdir/$iter.mdl; fi
fi
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1;
done
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode.sh: feature type is $feat_type";
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "Using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
[ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
echo "Mismatch in number of jobs with $transform_dir" && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
if [ $stage -le 1 ]; then
echo "Doing first-pass decoding before MAP decoding."
$cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
gmm-decode-faster --max-active=$max_active --beam=$beam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst "$feats" ark:$dir/tmp.JOB.tra ark:$dir/pass1_decode.JOB.ali || exit 1;
fi
if [ $stage -le 2 ]; then
echo "Computing MAP stats and doing MAP-adapted decoding"
$cmd JOB=1:$nj $dir/log/decode_pass2.JOB.log \
ali-to-post ark:$dir/pass1_decode.JOB.ali ark:- \| \
gmm-adapt-map --mean-tau=$mean_tau --weight-tau=$weight_tau \
--update-flags=$flags --spk2utt=ark:$sdata/JOB/spk2utt \
$model "$feats" ark:- ark:- \| \
gmm-latgen-map --lattice-beam=$latbeam --acoustic-scale=$acwt \
--utt2spk=ark:$sdata/JOB/utt2spk --max-active=$max_active --beam=$beam \
--allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model ark,s,cs:- $graphdir/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz"
fi
#rm -f $dir/pass1_decode.*.ali
#rm -f $dir/tmp.*.tra
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $graphdir $dir
exit 0;

Просмотреть файл

@ -0,0 +1,66 @@
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0.
# This script produces CTM files from a decoding directory that has lattices
# present.
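# Each line of a CTM file has the form
#   <file-or-utt-id> <channel> <start-time> <duration> <word>
# with times in seconds; this is the format expected by NIST scoring tools such
# as sclite.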
# begin configuration section.
cmd=run.pl
stage=0
use_segments=true # if we have a segments file, use it to convert
# the segments to be relative to the original files.
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/get_ctm.sh [options] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --use-segments (true|false) # use segments and reco2file_and_channel files "
echo " # to produce a ctm relative to the original audio"
echo " # files, with channel information (typically needed"
echo " # for NIST scoring)."
echo "e.g.:"
echo "local/get_ctm.sh data/train data/lang exp/tri4a/decode/"
exit 1;
fi
data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
dir=$3
model=$dir/../final.mdl # assume model one level up from decoding dir.
for f in $lang/words.txt $lang/phones/word_boundary.int \
$model $dir/lat.1.gz; do
[ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done
name=`basename $data`; # e.g. eval2000
mkdir -p $dir/scoring/log
if [ $stage -le 0 ]; then
if [ -f $data/segments ]; then
f=$data/reco2file_and_channel
[ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel"
else
filter_cmd=cat
fi
$cmd LMWT=5:20 $dir/scoring/log/get_ctm.LMWT.log \
mkdir -p $dir/score_LMWT/ '&&' \
lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
nbest-to-ctm ark:- - \| \
utils/int2sym.pl -f 5 $lang/words.txt \| \
$filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1;
fi

Просмотреть файл

@ -0,0 +1,95 @@
#!/bin/bash
# Copyright 2012 Carnegie Mellon University (Author: Yajie Miao)
# Johns Hopkins University (Author: Daniel Povey)
# This script computes the basis for basis-fMLLR decoding (see decode_fmllr_basis.sh).
# This can be on top of delta+delta-delta, or LDA+MLLT features.
stage=0
# Parameters in alignment of training data
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
per_utt=true # If true, then treat each utterance as a separate speaker for purposes of
# basis training... this is recommended if the number of actual speakers in your
# training set is less than (feature-dim) * (feature-dim+1).
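# (For example, with 40-dimensional features this threshold is 40 * 41 = 1640
# speakers, which is the count checked further below.)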
align_beam=10
retry_beam=40
silence_weight=0.01
cmd=run.pl
# End configuration section
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/get_fmllr_basis.sh [options] <data-dir> <lang-dir> <exp-dir>"
echo " e.g.: steps/decode_basis_fmllr.sh data/train_si84 data/lang exp/tri3b/"
echo "Note: we currently assume that this is the same data you trained the model with."
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --cmd <cmd> # Command to run in parallel with"
exit 1;
fi
data=$1
lang=$2
dir=$3
nj=`cat $dir/num_jobs` || exit 1;
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
splice_opts=`cat $dir/splice_opts 2>/dev/null` # frame-splicing options.
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
for f in $data/feats.scp $dir/final.alimdl $dir/final.mdl $dir/ali.1.gz; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# Set up the unadapted features "$sifeats".
if [ -f $dir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";
case $feat_type in
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
# Set up the adapted features "$feats" for training set.
# Use the speaker-adapted (fMLLR) features if transforms are present in $dir.
if [ -f $dir/trans.1 ]; then
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |";
else
feats="$sifeats";
fi
if $per_utt; then
spk2utt_opt= # treat each utterance as separate speaker when computing basis.
echo "Doing per-utterance adaptation for purposes of computing the basis."
else
echo "Doing per-speaker adaptation for purposes of computing the basis."
[ `cat $data/spk2utt | wc -l` -lt $[41*40] ] && \
echo "Warning: number of speakers is small, might be better to use --per-utt=true."
spk2utt_opt="--spk2utt=ark:$sdata/JOB/spk2utt"
fi
# Note: we get Gaussian level alignments with the "final.mdl" and the
# speaker adapted features.
$cmd JOB=1:$nj $dir/log/basis_acc.JOB.log \
ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
weight-silence-post $silence_weight $silphonelist $dir/final.mdl ark:- ark:- \| \
gmm-post-to-gpost $dir/final.mdl "$feats" ark:- ark:- \| \
gmm-basis-fmllr-accs-gpost $spk2utt_opt \
$dir/final.mdl "$sifeats" ark,s,cs:- $dir/basis.acc.JOB || exit 1;
# Compute the basis matrices.
$cmd $dir/log/basis_training.log \
gmm-basis-fmllr-training $dir/final.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;
rm $dir/basis.acc.* 2>/dev/null
exit 0;

Просмотреть файл

@ -0,0 +1,225 @@
#!/bin/bash
# Copyright 2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# From a training or alignment directory, and an original lexicon.txt and lang/
# directory, obtain a new lexicon with pronunciation probabilities.
# Begin configuration section.
stage=0
smooth_count=1.0 # Amount of count to add corresponding to each original lexicon entry;
# this corresponds to add-one smoothing of the pron-probs.
max_one=true # If true, normalize the pron-probs so the maximum value for each word is 1.0,
# rather than summing to one. This is quite standard.
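# (Worked example with hypothetical counts: a word with prons A and B seen 10 and
# 4 times in the alignments gets smoothed counts 11 and 5 with smooth_count=1.0;
# with max_one=false the probs are 11/16 and 5/16, while with max_one=true they
# are divided by the largest count instead, giving 1.0 and 5/11.)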
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 6 ]; then
echo "Usage: steps/get_lexicon_probs.sh <data-dir> <lang-dir> <src-dir|ali-dir> <old-lexicon> <exp-dir> <new-lexicon>"
echo "e.g.: steps/get_lexicon_probs.sh data/train data/lang exp/tri5 data/local/lexicon.txt \\"
echo " exp/tri5_lexprobs data/local_withprob/lexicon.txt"
echo "Note: we assume you ran using word-position-dependent phones but both the old and new lexicon will not have"
echo "these markings. We also assume the new lexicon will have pron-probs but the old one does not; this limitation"
echo "of the script can be removed later."
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --stage <stage> # used to control partial re-running."
echo " --max-one <true|false> # If true, normalize so max prob of each"
echo " # word is one. Default: true"
echo " --smooth <smooth-count> # Amount to smooth each count by (default: 1.0)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
old_lexicon=$4
dir=$5
new_lexicon=$6
oov=`cat $lang/oov.int` || exit 1;
nj=`cat $srcdir/num_jobs` || exit 1;
for f in $data/text $lang/L.fst $lang/phones/word_boundary.int $srcdir/ali.1.gz $old_lexicon; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
mkdir -p $dir/log
utils/split_data.sh $data $nj # Make sure split data-dir exists.
sdata=$data/split$nj
mkdir -p $dir/log
if [ $stage -le 0 ]; then
( ( for n in `seq $nj`; do gunzip -c $srcdir/ali.$n.gz; done ) | \
linear-to-nbest ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $data/text |" '' '' ark:- | \
lattice-align-words $lang/phones/word_boundary.int $srcdir/final.mdl ark:- ark:- | \
lattice-to-phone-lattice --replace-words=false $srcdir/final.mdl ark:- ark,t:- | \
awk '{ if (NF == 4) { word_phones = sprintf("%s %s", $3, $4); count[word_phones]++; } }
END { for(key in count) { print count[key], key; } }' | \
sed s:0,0,:: | awk '{print $2, $1, $3;}' | sed 's/_/ /g' | \
utils/int2sym.pl -f 3- $lang/phones.txt | \
sed -E 's/_I( |$)/ /g' | sed -E 's/_E( |$)/ /g' | sed -E 's/_B( |$)/ /g' | sed -E 's/_S( |$)/ /g' | \
utils/int2sym.pl -f 1 $lang/words.txt > $dir/lexicon_counts.txt
) 2>&1 | tee $dir/log/get_fsts.log
fi
cat $old_lexicon | awk '{if (!($2 > 0.0 && $2 < 1.0)) { exit(1); }}' && \
echo "Error: old lexicon $old_lexicon appears to have pron-probs; we don't expect this." && \
exit 1;
mkdir -p `dirname $new_lexicon` || exit 1;
if [ $stage -le 1 ]; then
grep -v -w '^<eps>' $dir/lexicon_counts.txt | \
perl -e ' ($old_lexicon, $smooth_count, $max_one) = @ARGV;
($smooth_count >= 0) || die "Invalid smooth_count $smooth_count";
($max_one eq "true" || $max_one eq "false") || die "Invalid max_one variable $max_one";
open(O, "<$old_lexicon")||die "Opening old-lexicon file $old_lexicon";
while(<O>) {
$_ =~ m/(\S+)\s+(.+)/ || die "Bad old-lexicon line $_";
$word = $1;
$orig_pron = $2;
# Remember the mapping from canonical prons to original prons: in the case of
# syllable based systems we want to remember the locations of tabs in
# the original lexicon.
$pron = join(" ", split(" ", $orig_pron));
$orig_pron{$word,$pron} = $orig_pron;
$count{$word,$pron} += $smooth_count;
$tot_count{$word} += $smooth_count;
}
while (<STDIN>) {
$_ =~ m/(\S+)\s+(\S+)\s+(.+)/ || die "Bad new-lexicon line $_";
$word = $1;
$this_count = $2;
$pron = join(" ", split(" ", $3));
$count{$word,$pron} += $this_count;
$tot_count{$word} += $this_count;
}
if ($max_one eq "true") { # replace $tot_count{$word} with max count
# of any pron.
%tot_count = (); # set to empty assoc array.
foreach $key (keys %count) {
($word, $pron) = split($; , $key); # $; is separator for strings that index assoc. arrays.
$this_count = $count{$key};
if (!defined $tot_count{$word} || $this_count > $tot_count{$word}) {
$tot_count{$word} = $this_count;
}
}
}
foreach $key (keys %count) {
($word, $pron) = split($; , $key); # $; is separator for strings that index assoc. arrays.
$this_orig_pron = $orig_pron{$key};
if (!defined $this_orig_pron) { die "Word $word and pron $pron did not appear in original lexicon."; }
if (!defined $tot_count{$word}) { die "Tot-count not defined for word $word."; }
$prob = $count{$key} / $tot_count{$word};
print "$word\t$prob\t$this_orig_pron\n"; # Output happens here.
} ' $old_lexicon $smooth_count $max_one > $new_lexicon || exit 1;
fi
exit 0;
# Note: everything below this point is unreachable (it appears to be leftover from
# the fMLLR alignment script this one was adapted from) and is kept only for reference.
echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
## Set up model and alignment model.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
alimdl=$srcdir/final.alimdl
else
alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"
## Work out where we're getting the graphs from.
if $use_graphs; then
[ "$nj" != "`cat $srcdir/num_jobs`" ] && \
echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
[ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
graphdir=$srcdir
else
graphdir=$dir
if [ $stage -le 0 ]; then
echo "$0: compiling training graphs"
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
"ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi
fi
if [ $stage -le 1 ]; then
echo "$0: aligning data in $data using $alimdl and speaker-independent features."
$cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi
if [ $stage -le 2 ]; then
echo "$0: computing fMLLR transforms"
if [ "$alimdl" != "$mdl" ]; then
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
--spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
ark,s,cs:- ark:$dir/trans.JOB || exit 1;
else
$cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
--spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
ark,s,cs:- ark:$dir/trans.JOB || exit 1;
fi
fi
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
if [ $stage -le 3 ]; then
echo "$0: doing final alignment."
$cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
"ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
rm $dir/pre_ali.*.gz
echo "$0: done aligning data."
utils/summarize_warnings.pl $dir/log
exit 0;

Просмотреть файл

@ -0,0 +1,66 @@
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0.
# This script produces CTM files from a training directory that has alignments
# present.
# begin configuration section.
cmd=run.pl
stage=0
use_segments=true # if we have a segments file, use it to convert
# the segments to be relative to the original files.
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/get_train_ctm.sh [options] <data-dir> <lang-dir> <ali-dir|exp-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --use-segments (true|false) # use segments and reco2file_and_channel files "
echo " # to produce a ctm relative to the original audio"
echo " # files, with channel information (typically needed"
echo " # for NIST scoring)."
echo "e.g.:"
echo "local/get_train_ctm.sh data/train data/lang exp/tri3a_ali"
echo "Produces ctm in: exp/tri3a_ali/ctm"
exit 1;
fi
data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
dir=$3
model=$dir/final.mdl # assume model one level up from decoding dir.
for f in $lang/words.txt $lang/phones/word_boundary.int \
$model $dir/ali.1.gz $lang/oov.int; do
[ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/scoring/log
if [ $stage -le 0 ]; then
if [ -f $data/segments ]; then
f=$data/reco2file_and_channel
[ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel"
else
filter_cmd=cat
fi
$cmd $dir/log/get_ctm.log \
linear-to-nbest "ark:gunzip -c $dir/ali.*.gz|" \
"ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/text |" \
'' '' ark:- \| \
lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
nbest-to-ctm ark:- - \| \
utils/int2sym.pl -f 5 $lang/words.txt \| \
$filter_cmd '>' $dir/ctm || exit 1;
fi

Просмотреть файл

@ -0,0 +1,122 @@
#!/bin/bash
# Begin configuration section.
mode=4
cmd=run.pl
skip_scoring=false
# End configuration section.
echo "$0 $@" # Print the command line for logging
for x in `seq 2`; do
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;
[ "$1" == "--mode" ] && mode=$2 && shift 2;
done
if [ $# != 5 ]; then
echo "Do language model rescoring of lattices (remove old LM, add new LM)"
echo "Usage: steps/lmrescore.sh [options] <old-lang-dir> <new-lang-dir> <data-dir> <input-decode-dir> <output-decode-dir>"
echo "options: [--cmd (run.pl|queue.pl [queue opts])] [--mode (1|2|3|4)]"
exit 1;
fi
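# Example (directory names are hypothetical): rescore trigram lattices with a
# 4-gram LM using the exact phi-composition method (mode 3):
#   steps/lmrescore.sh --mode 3 data/lang_test_tg data/lang_test_fg data/test \
#     exp/tri3/decode_test_tg exp/tri3/decode_test_tg_fg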
[ -f path.sh ] && . ./path.sh;
oldlang=$1
newlang=$2
data=$3
indir=$4
outdir=$5
oldlm=$oldlang/G.fst
newlm=$newlang/G.fst
! cmp $oldlang/words.txt $newlang/words.txt && echo "Warning: vocabularies may be incompatible."
[ ! -f $oldlm ] && echo Missing file $oldlm && exit 1;
[ ! -f $newlm ] && echo Missing file $newlm && exit 1;
! ls $indir/lat.*.gz >/dev/null && echo "No lattices in input directory $indir" && exit 1;
oldlmcommand="fstproject --project_output=true $oldlm |"
newlmcommand="fstproject --project_output=true $newlm |"
mkdir -p $outdir/log
phi=`grep -w '#0' $newlang/words.txt | awk '{print $2}'`
if [ "$mode" == 4 ]; then
# we have to prepare $outdir/Ldet.fst in this case: determinized
# lexicon (determinized on phones), with disambig syms removed.
# take L_disambig.fst; get rid of transition with "#0 #0" on it; determinize
# with epsilon removal; remove disambiguation symbols.
fstprint $newlang/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \
fstdeterminizestar | fstrmsymbols $newlang/phones/disambig.int >$outdir/Ldet.fst || exit 1;
fi
nj=`cat $indir/num_jobs` || exit 1;
cp $indir/num_jobs $outdir
#for lat in $indir/lat.*.gz; do
# number=`basename $lat | cut -d. -f2`;
# newlat=$outdir/`basename $lat`
case "$mode" in
1) # 1 is inexact, it's the original way of doing it.
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
lattice-lmrescore --lm-scale=-1.0 "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \
lattice-lmrescore --lm-scale=1.0 ark:- "$newlmcommand" "ark,t:|gzip -c>$outdir/lat.JOB.gz" \
|| exit 1;
;;
2) # 2 is equivalent to 1, but using more basic operations, combined.
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
gunzip -c $indir/lat.JOB.gz \| \
lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \
lattice-determinize ark:- ark:- \| \
lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
lattice-compose ark:- "fstproject --project_output=true $newlm |" ark:- \| \
lattice-determinize ark:- ark:- \| \
gzip -c \>$outdir/lat.JOB.gz || exit 1;
;;
3) # 3 is "exact" in that we remove the old LM scores accepting any path
# through G.fst (which is what we want as that happened in lattice
# generation), but we add the new one with "phi matcher", only taking
# backoff arcs if an explicit arc did not exist.
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
gunzip -c $indir/lat.JOB.gz \| \
lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \
lattice-determinize ark:- ark:- \| \
lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
lattice-determinize ark:- ark:- \| \
gzip -c \>$outdir/lat.JOB.gz || exit 1;
;;
4) # 4 is also exact (like 3), but instead of subtracting the old LM-scores,
# it removes the old graph scores entirely and adds in the lexicon,
# grammar and transition weights.
mdl=`dirname $indir`/final.mdl
[ ! -f $mdl ] && echo No such model $mdl && exit 1;
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
gunzip -c $indir/lat.JOB.gz \| \
lattice-scale --lm-scale=0.0 ark:- ark:- \| \
lattice-to-phone-lattice $mdl ark:- ark:- \| \
lattice-compose ark:- $outdir/Ldet.fst ark:- \| \
lattice-determinize ark:- ark:- \| \
lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 \
$mdl ark:- ark:- \| \
gzip -c \>$outdir/lat.JOB.gz || exit 1;
;;
esac
rm $outdir/Ldet.fst 2>/dev/null
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $newlang $outdir
else
echo "Not scoring because requested so..."
fi
exit 0;

Просмотреть файл

@ -0,0 +1,117 @@
#!/bin/bash
# Copyright 2012 Karel Vesely, Daniel Povey
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example
# Begin configuration section.
nj=4
cmd=run.pl
remove_last_layers=4 # remove the last N components from the nnet
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
echo "usage: $0 [options] <tgt-data-dir> <src-data-dir> <nnet-dir> <log-dir> <abs-path-to-bn-feat-dir>";
echo "options: "
echo " --trim-transforms <N> # number of NNet Components to remove from the end"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
if [ -f path.sh ]; then . path.sh; fi
data=$1
srcdata=$2
nndir=$3
logdir=$4
bnfeadir=$5
######## CONFIGURATION
# copy the dataset metadata from srcdata.
mkdir -p $data || exit 1;
cp $srcdata/* $data 2>/dev/null; rm $data/feats.scp $data/cmvn.scp;
# make $bnfeadir an absolute pathname.
bnfeadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $bnfeadir ${PWD}`
# use "name" as part of name of the archive.
name=`basename $data`
mkdir -p $bnfeadir || exit 1;
mkdir -p $data || exit 1;
mkdir -p $logdir || exit 1;
srcscp=$srcdata/feats.scp
scp=$data/feats.scp
required="$srcscp $nndir/final.nnet"
for f in $required; do
if [ ! -f $f ]; then
echo "$0: no such file $f"
exit 1;
fi
done
if [ ! -d $srcdata/split$nj -o $srcdata/split$nj -ot $srcdata/feats.scp ]; then
utils/split_data.sh $srcdata $nj
fi
#cut the MLP
nnet=$bnfeadir/feature_extractor.nnet
copy-nnet --remove-last-layers=$remove_last_layers --binary=false $nndir/final.nnet $nnet 2>$logdir/feature_extractor.log
#get the feature transform
feature_transform=$nndir/final.feature_transform
echo "Creating bn-feats into $data"
###
### Prepare feature pipeline
feats="ark,s,cs:copy-feats scp:$srcdata/split$nj/JOB/feats.scp ark:- |"
# Optionally add cmvn
if [ -f $nndir/norm_vars ]; then
norm_vars=$(cat $nndir/norm_vars 2>/dev/null)
feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$srcdata/utt2spk scp:$srcdata/cmvn.scp ark:- ark:- |"
fi
# Optionally add deltas
if [ -f $nndir/delta_order ]; then
delta_order=$(cat $nndir/delta_order)
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
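# (For illustration: with both cmvn and deltas configured, $feats ends up as a
# pipeline of the form
#   copy-feats scp:... ark:- | apply-cmvn ... ark:- ark:- | add-deltas ... ark:- ark:- |
# which is then fed to nnet-forward below.)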
###
###
#Run the forward pass
$cmd JOB=1:$nj $logdir/make_bnfeats.JOB.log \
nnet-forward --feature-transform=$feature_transform $nnet "$feats" \
ark,scp:$bnfeadir/raw_bnfea_$name.JOB.ark,$bnfeadir/raw_bnfea_$name.JOB.scp \
|| exit 1;
N0=$(cat $srcdata/feats.scp | wc -l)
N1=$(cat $bnfeadir/raw_bnfea_$name.*.scp | wc -l)
if [[ "$N0" != "$N1" ]]; then
echo "Error producing bnfea features for $name:"
echo "Original feats : $N0 Bottleneck feats : $N1"
exit 1;
fi
# concatenate the .scp files together.
for ((n=1; n<=nj; n++)); do
cat $bnfeadir/raw_bnfea_$name.$n.scp >> $data/feats.scp
done
echo "Succeeded creating MLP-BN features for $name ($data)"

Просмотреть файл

@ -0,0 +1,146 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# Create denominator lattices for MMI/MPE training.
# Creates its output in $dir/lat.*.gz
# Begin configuration section.
nj=4
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
transform_dir=
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
num_threads=1
parallel_opts=
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/make_denlats.sh [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
echo " e.g.: steps/make_denlats.sh data/train data/lang exp/tri1 exp/tri1_denlats"
echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
echo " plus transforms."
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --sub-split <n-split> # e.g. 40; use this for "
echo " # large databases so your jobs will be smaller and"
echo " # will (individually) finish reasonably soon."
echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
echo " --num-threads <n> # number of threads per decoding job"
echo " --parallel-opts <string> # if >1 thread, add this to 'cmd', e.g. -pe smp 6"
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir
cp -r $lang $dir/
# Compute grammar FST which corresponds to unigram decoding graph.
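# (A unigram LM is the usual choice for denominator lattices in MMI/MPE training:
# a weak LM keeps the lattices rich in confusable word sequences, so the
# discriminative objective sees plenty of competing hypotheses.)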
new_lang="$dir/"$(basename "$lang")
echo "Making unigram grammar FST in $new_lang"
cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \
|| exit 1;
# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.
echo "Compiling decoding graph in $dir/dengraph"
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
fi
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "$0: using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
[ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
&& echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
[ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
if [ -f $srcdir/final.alimdl ]; then
echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option.";
exit 1;
fi
fi
if [ $sub_split -eq 1 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode_den.JOB.log \
gmm-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
for n in `seq $nj`; do
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
else
sdata2=$data/split$nj/$n/split$sub_split;
if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then
split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
fi
mkdir -p $dir/log/$n
mkdir -p $dir/part
feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g`
$cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
gmm-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
echo Merging archives for data subset $n
rm $dir/.error 2>/dev/null;
for k in `seq $sub_split`; do
gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error;
done | gzip -c > $dir/lat.$n.gz || touch $dir/.error;
[ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1;
rm $dir/lat.$n.*.gz
touch $dir/.done.$n
fi
done
fi
echo "$0: done generating denominator lattices."

Просмотреть файл

@ -0,0 +1,177 @@
#!/bin/bash
# Copyright 2012-2013 Karel Vesely, Daniel Povey
# Apache 2.0.
# Create denominator lattices for MMI/MPE/sMBR training.
# Creates its output in $dir/lat.*.ark,$dir/lat.scp
# The lattices are uncompressed, we need random access for DNN training.
# Begin configuration section.
nj=4
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
nnet=
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
# End configuration section.
use_gpu_id=-1 # disable gpu
parallel_opts="-pe smp 2"
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/$0 [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
echo " e.g.: steps/$0 data/train data/lang exp/tri1 exp/tri1_denlats"
echo "Works for plain features (or CMN, delta), forwarded through feature-transform."
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --sub-split <n-split> # e.g. 40; use this for "
echo " # large databases so your jobs will be smaller and"
echo " # will (individually) finish reasonably soon."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir
cp -r $lang $dir/
# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")
echo "Making unigram grammar FST in $new_lang"
cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \
|| exit 1;
# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.
echo "Compiling decoding graph in $dir/dengraph"
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
fi
#Get the files we will need
cp $srcdir/{tree,final.mdl} $dir
[ -z "$nnet" ] && nnet=$srcdir/final.nnet;
[ ! -f "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1;
class_frame_counts=$srcdir/ali_train_pdf.counts
[ -z "$class_frame_counts" ] && echo "Error class_frame_counts '$class_frame_counts' does not exist!" && exit 1;
feature_transform=$srcdir/final.feature_transform
if [ ! -f $feature_transform ]; then
echo "Missing feature_transform '$feature_transform'"
exit 1
fi
model=$dir/final.mdl
[ -z "$model" ] && echo "Error transition model '$model' does not exist!" && exit 1;
###
### Prepare feature pipeline (same as for decoding)
###
# Create the feature stream:
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# Optionally add cmvn
if [ -f $srcdir/norm_vars ]; then
norm_vars=$(cat $srcdir/norm_vars 2>/dev/null)
[ ! -f $sdata/1/cmvn.scp ] && echo "$0: cannot find cmvn stats $sdata/1/cmvn.scp" && exit 1
feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
fi
# Optionally add deltas
if [ -f $srcdir/delta_order ]; then
delta_order=$(cat $srcdir/delta_order)
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
# Finally add feature_transform and the MLP
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |"
###
### We will produce lattices in which the correct path is not necessarily present.
###
#1) We don't use the reference path here...
echo "Generating denominator lattices"
#2) Generate the denominator lattices
if [ $sub_split -eq 1 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode_den.JOB.log \
latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats" "ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp" || exit 1;
else
for n in `seq $nj`; do
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
else
sdata2=$data/split$nj/$n/split$sub_split;
if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then
split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
fi
mkdir -p $dir/log/$n
mkdir -p $dir/part
feats_subset=$(echo $feats | sed s:JOB/:$n/split$sub_split/JOB/:g)
$cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats_subset" "ark,scp:$dir/lat.$n.JOB.ark,$dir/lat.$n.JOB.scp" || exit 1;
echo Merging lists for data subset $n
for k in `seq $sub_split`; do
cat $dir/lat.$n.$k.scp
done > $dir/lat.$n.all.scp
echo Merging ark files for data subset $n
lattice-copy scp:$dir/lat.$n.all.scp ark,scp:$dir/lat.$n.ark,$dir/lat.$n.scp || exit 1;
# remove the intermediate per-split data
rm $dir/lat.$n.*.ark $dir/lat.$n.*.scp $dir/lat.$n.all.scp
touch $dir/.done.$n
fi
done
fi
#3) Merge the SCPs to create full list of lattices (will use random access)
echo Merging to single list $dir/lat.scp
for ((n=1; n<=nj; n++)); do
cat $dir/lat.$n.scp
done > $dir/lat.scp
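# The merged lat.scp supports the random access needed by the sequence-training tools;
# as a quick check (a sketch), a single lattice can be printed in text form with
#   lattice-copy "scp:head -n1 $dir/lat.scp |" ark,t:- | head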
echo "$0: done generating denominator lattices."


@@ -0,0 +1,146 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# Create denominator lattices for MMI/MPE training.
# Creates its output in $dir/lat.*.gz
# Begin configuration section.
transform_dir= # dir to find fMLLR transforms.
nj=4 # number of decoding jobs. If --transform-dir set, must match that number!
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes -- multiply by roughly 5 to 10
# to estimate the actual memory use (it is not clear why the ratio is so large).
# End configuration section.
num_threads=1 # Number of threads used in nnet-logprob computation. If you set
# this to a different value, make sure to also set the appropriate
# queue options. If you set this too high it won't use all the
# threads as most of the time will be taken in the decoder.
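# For example (hypothetical queue options): to run 4 threads per decoding job under
# GridEngine you might pass
#   --num-threads 4 --cmd "queue.pl -pe smp 4"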
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/make_denlats_nnet_cpu.sh [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
echo " e.g.: steps/make_denlats_nnet_cpu.sh data/train data/lang exp/tri1 exp/tri1_denlats"
echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
echo " plus transforms."
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --sub-split <n-split> # e.g. 40; use this for "
echo " # large databases so your jobs will be smaller and"
echo " # will (individually) finish reasonably soon."
echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir
cp -r $lang $dir/
# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")
echo "Making unigram grammar FST in $new_lang"
cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \
|| exit 1;
# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.
echo "Compiling decoding graph in $dir/dengraph"
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
fi
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "$0: using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
[ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
&& echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
[ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
echo "$0: LDA transforms differ between $srcdir and $transform_dir" && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
if [ -f $srcdir/final.alimdl ]; then
echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option.";
exit 1;
fi
fi
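# Optional sanity check (a sketch, assuming split 1 exists): print the dimension of
# the features that will be piped into nnet-logprob-parallel below, e.g.
#   feat-to-dim "`echo $feats | sed s/JOB/1/g`" -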
if [ $sub_split -eq 1 ]; then
$cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats" ark:- \| \
latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
for n in `seq $nj`; do
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
else
sdata2=$data/split$nj/$n/split$sub_split;
if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then
split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
fi
mkdir -p $dir/log/$n
mkdir -p $dir/part
feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g`
$cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats_subset" ark:- \| \
latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
echo Merging archives for data subset $n
rm $dir/.error 2>/dev/null;
for k in `seq $sub_split`; do
gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error;
done | gzip -c > $dir/lat.$n.gz || touch $dir/.error;
[ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1;
rm $dir/lat.$n.*.gz
touch $dir/.done.$n
fi
done
fi
echo "$0: done generating denominator lattices."


@@ -0,0 +1,159 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# Create denominator lattices for MMI/MPE training, with SGMM models. If the
# features have fMLLR transforms you have to supply the --transform-dir option.
# It gets any speaker vectors from the "alignment dir" ($alidir). Note: this is
# possibly a slight mismatch because the speaker vectors come from supervised
# adaptation.
# Begin configuration section.
nj=4
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
transform_dir=
max_mem=20000000 # This will stop the processes getting too large.
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/make_denlats_sgmm.sh [options] <data-dir> <lang-dir> <src-dir|alidir> <exp-dir>"
echo " e.g.: steps/make_denlats_sgmm.sh data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats"
echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
echo " plus transforms."
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --sub-split <n-split> # e.g. 40; use this for "
echo " # large databases so your jobs will be smaller and"
echo " # will (individually) finish reasonably soon."
echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
exit 1;
fi
data=$1
lang=$2
alidir=$3 # could also be $srcdir, but only if no vectors supplied.
dir=$4
sdata=$data/split$nj
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir
cp -r $lang $dir/
# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")
echo "Making unigram grammar FST in $new_lang"
cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \
|| exit 1;
# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $alidir; the output HCLG.fst goes in $dir/graph.
echo "Compiling decoding graph in $dir/dengraph"
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $alidir/final.mdl ]; then
echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
utils/mkgraph.sh $new_lang $alidir $dir/dengraph || exit 1;
fi
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
cp $alidir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "$0: using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
[ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
&& echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
[ -f $alidir/final.mat ] && ! cmp $transform_dir/final.mat $alidir/final.mat && \
echo "$0: LDA transforms differ between $alidir and $transform_dir" && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
echo "Assuming you don't have a SAT system, since no --transform-dir option supplied "
fi
if [ -f $alidir/gselect.1.gz ]; then
gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|"
else
echo "$0: no such file $alidir/gselect.1.gz" && exit 1;
fi
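# For illustration (a sketch of how the option expands): for job 1 this becomes
#   --gselect=ark,s,cs:gunzip -c $alidir/gselect.1.gz|
# i.e. the Gaussian-selection indices from the alignment stage are reused here.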
if [ -f $alidir/vecs.1 ]; then
spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
else
if [ -f $alidir/final.alimdl ]; then
echo "You seem to have an SGMM system with speaker vectors,"
echo "yet we can't find speaker vectors. Perhaps you supplied"
echo "the model director instead of the alignment directory?"
exit 1;
fi
fi
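# For illustration (a sketch of how the option expands): for job 1 the decoder below
# receives
#   --spk-vecs=ark:$alidir/vecs.1 --utt2spk=ark:$sdata/1/utt2spk
# i.e. the speaker vectors estimated during alignment are reused for denominator decoding.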
if [ $sub_split -eq 1 ]; then
$cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
sgmm-latgen-faster $spkvecs_opt "$gselect_opt" --beam=$beam \
--lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $alidir/final.mdl \
$dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
for n in `seq $nj`; do
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
else
sdata2=$data/split$nj/$n/split$sub_split;
if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then
split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
fi
mkdir -p $dir/log/$n
mkdir -p $dir/part
feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g`
spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"`
gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"`
$cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
sgmm-latgen-faster $spkvecs_opt_subset "$gselect_opt_subset" \
--beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \
--word-symbol-table=$lang/words.txt $alidir/final.mdl \
$dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
echo Merging archives for data subset $n
rm $dir/.error 2>/dev/null;
for k in `seq $sub_split`; do
gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error;
done | gzip -c > $dir/lat.$n.gz || touch $dir/.error;
[ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1;
rm $dir/lat.$n.*.gz
touch $dir/.done.$n
fi
done
fi
echo "$0: done generating denominator lattices with SGMMs."


@@ -0,0 +1,170 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# Create denominator lattices for MMI/MPE training, with SGMM models. If the
# features have fMLLR transforms you have to supply the --transform-dir option.
# It gets any speaker vectors from the "alignment dir" ($alidir). Note: this is
# possibly a slight mismatch because the speaker vectors come from supervised
# adaptation.
# Begin configuration section.
nj=4
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
transform_dir=
max_mem=20000000 # This will stop the processes getting too large.
num_threads=1
parallel_opts=
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/make_denlats_sgmm2.sh [options] <data-dir> <lang-dir> <src-dir|alidir> <exp-dir>"
echo " e.g.: steps/make_denlats_sgmm2.sh data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats"
echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
echo " plus transforms."
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --sub-split <n-split> # e.g. 40; use this for "
echo " # large databases so your jobs will be smaller and"
echo " # will (individually) finish reasonably soon."
echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
echo " --num-threads <n> # number of threads per decoding job"
echo " --parallel-opts <string> # if >1 thread, add this to 'cmd', e.g. -pe smp 6"
exit 1;
fi
data=$1
lang=$2
alidir=$3 # could also be $srcdir, but only if no vectors supplied.
dir=$4
sdata=$data/split$nj
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
if [ $num_threads -gt 1 ]; then
# the -parallel becomes part of the binary name we decode with.
thread_string="-parallel --num-threads=$num_threads"
fi
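# For example (hypothetical settings): with --num-threads 4 the decoder invoked below
# becomes
#   sgmm2-latgen-faster-parallel --num-threads=4 ...
# and you would normally pass matching queue options, e.g. --parallel-opts "-pe smp 4".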
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir
cp -r $lang $dir/
# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")
echo "Making unigram grammar FST in $new_lang"
cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \
|| exit 1;
# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $alidir; the output HCLG.fst goes in $dir/graph.
echo "Compiling decoding graph in $dir/dengraph"
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $alidir/final.mdl ]; then
echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
utils/mkgraph.sh $new_lang $alidir $dir/dengraph || exit 1;
fi
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
cp $alidir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "$0: using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
[ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
&& echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
[ -f $alidir/final.mat ] && ! cmp $transform_dir/final.mat $alidir/final.mat && \
echo "$0: LDA transforms differ between $alidir and $transform_dir" && exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
echo "Assuming you don't have a SAT system, since no --transform-dir option supplied "
fi
if [ -f $alidir/gselect.1.gz ]; then
gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|"
else
echo "$0: no such file $alidir/gselect.1.gz" && exit 1;
fi
if [ -f $alidir/vecs.1 ]; then
spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
[ "`cat $alidir/num_jobs`" -ne "$nj" ] \
&& echo "$0: mismatch in number of jobs with $alidir" && exit 1;
else
if [ -f $alidir/final.alimdl ]; then
echo "$0: You seem to have an SGMM system with speaker vectors,"
echo "yet we can't find speaker vectors. Perhaps you supplied"
echo "the model director instead of the alignment directory?"
exit 1;
fi
fi
if [ $sub_split -eq 1 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode_den.JOB.log \
sgmm2-latgen-faster$thread_string $spkvecs_opt "$gselect_opt" --beam=$beam \
--lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $alidir/final.mdl \
$dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
for n in `seq $nj`; do
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
else
sdata2=$data/split$nj/$n/split$sub_split;
if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then
split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
fi
mkdir -p $dir/log/$n
mkdir -p $dir/part
feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g`
spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"`
gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"`
$cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
sgmm2-latgen-faster$thread_string $spkvecs_opt_subset "$gselect_opt_subset" \
--beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \
--word-symbol-table=$lang/words.txt $alidir/final.mdl \
$dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
echo Merging archives for data subset $n
rm $dir/.error 2>/dev/null;
for k in `seq $sub_split`; do
gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error;
done | gzip -c > $dir/lat.$n.gz || touch $dir/.error;
[ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1;
rm $dir/lat.$n.*.gz
touch $dir/.done.$n
fi
done
fi
echo "$0: done generating denominator lattices with SGMMs."

Some files were not shown because too many files have changed in this diff.