sync with trunk

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/karel@844 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2012-04-17 11:18:27 +00:00 · 2012-04-17 11:18:27 +00:00 · 6c217ca5ce
--- a/README.txt
+++ b/README.txt
@ -21,4 +21,13 @@ should be Dan Povey (dpovey@microsoft.com).  In addition to specific questions,
 please let me know if there are specific aspects of the project that you feel
 could be improved, that you find confusing, etc., and which missing features you
 most wish it had.
+
+
+
+==SVN-MERGING==
+Merge with trunk:
+svn merge ^/trunk ^/sandbox/karel
+
+When merging, resolve the tree conflicts by:
+svn resolve --accept working -R .
 
--- a/egs/rm/s4/README.txt
+++ b/egs/rm/s4/README.txt
@ -0,0 +1,20 @@
+This recipe is using a publicly available subset of Resource Management data,
+consisting of freely distributed feature files distributed by CMU and some
+metadata (e.g. the word-pair grammar file) available from LDC's website.
+
+To run the recipe the data should be downloaded first, for which ./getdata.sh
+command can be used. Then ./run.sh script can be executed to automatically perform
+all steps or the commands in it can be started manually by copy/pasting them. 
+
+The script and data layout are based on egs/rm/s3 recipe, with several exceptions:
+
+- because this recipe uses pre-extracted feature vectors no conversion from .sph
+to .wav format and consequent feature extraction is needed. The features are just
+converted from CMU Sphinx feature files to Kaldi Tables.
+
+- only one test set is available instead of several (e.g. mar87, oct87 and so on)
+as in the original recipe
+
+- no speaker-dependent processing
+
+- on the plus side it requires less disk space (about 220MB)
--- a/egs/rm/s4/conf/mfcc.conf
+++ b/egs/rm/s4/conf/mfcc.conf
@ -0,0 +1 @@
+--use-energy=false   # only non-default option.
--- a/egs/rm/s4/conf/plp.conf
+++ b/egs/rm/s4/conf/plp.conf
@ -0,0 +1,2 @@
+# No non-default options for now.
+
--- a/egs/rm/s4/conf/topo.proto
+++ b/egs/rm/s4/conf/topo.proto
@ -0,0 +1,22 @@
+<Topology> 
+<TopologyEntry> 
+<ForPhones>
+NONSILENCEPHONES
+</ForPhones> 
+<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State> 
+<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State> 
+<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State> 
+<State> 3 </State>
+</TopologyEntry> 
+<TopologyEntry> 
+<ForPhones>
+SILENCEPHONES
+</ForPhones> 
+<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State> 
+<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State> 
+<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State> 
+<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State> 
+<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State> 
+<State> 5 </State>
+</TopologyEntry> 
+</Topology> 
--- a/egs/rm/s4/getdata.sh
+++ b/egs/rm/s4/getdata.sh
@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright 2012 Vassil Panayotov
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+source path.sh || exit 1
+: "${RM1_ROOT:?RM1_ROOT must be set by path.sh}"  # guards the mv / rm -rf below against an empty root
+# Download and extract CMU's feature files
+mkdir -p "$RM1_ROOT"
+wget -P "$RM1_ROOT" http://www.speech.cs.cmu.edu/databases/rm1/rm1_cepstra.tar.gz
+tar -C "$RM1_ROOT"/ -xf "$RM1_ROOT"/rm1_cepstra.tar.gz
+
+# Download the available LDC metadata
+# For some reason wget needs to be run twice in order to get all needed data ...
+wget -P "$RM1_ROOT" -mk --no-parent -r -c -v -nH http://www.ldc.upenn.edu/Catalog/docs/LDC93S3B/
+wget -P "$RM1_ROOT" -mk --no-parent -r -c -v -nH http://www.ldc.upenn.edu/Catalog/docs/LDC93S3B/
+mv "$RM1_ROOT"/Catalog/docs/LDC93S3B "$RM1_ROOT"/
+rm -rf "$RM1_ROOT"/Catalog
--- a/egs/rm/s4/local/decode.sh
+++ b/egs/rm/s4/local/decode.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+
+# This script calls the supplied decoding script on the single test set
+# of this recipe (as a background job that is waited for) and then
+# prints the resulting WER lines.
+# The interpretation of decode-dir-1 as input, output and so on
+# depends on the decoding script you call.
+
+# It assumes the model directory is one level up from decode-dir-1.
+
+mono_opt=
+
+if [ "$1" == "--mono" ]; then
+   mono_opt=$1;
+   shift;
+fi
+
+# Check the argument count before anything is derived from the arguments.
+if [ $# -ne 2 ]; then
+  echo "Usage: local/decode.sh [--mono] <decode-script> <decode-dir-1>"
+  exit 1;
+fi
+
+script=$1
+decode_dir_1=$2 # e.g. exp/sgmm3b/decode
+dir=$(dirname "$decode_dir_1") # e.g. exp/sgmm3b
+
+if [ ! -x "$script" ] || [ ! -d "$dir" ]; then
+  echo "local/decode.sh: Either no such script $script or not executable, or no such dir $dir"
+  exit 1;
+fi
+
+scripts/mkgraph.sh $mono_opt data/lang_test "$dir" "$dir/graph"  # $mono_opt unquoted: empty means no argument
+
+"$script" "$dir" data/test data/lang "$decode_dir_1/" &
+wait
+
+# The publicly available RM subset has just one test set(instead of mar87 etc.),
+# so no averaging is needed
+grep WER "$decode_dir_1"/wer* || echo "Error decoding $decode_dir_1: no WER results found."
--- a/egs/rm/s4/local/make_trans.pl
+++ b/egs/rm/s4/local/make_trans.pl
@ -0,0 +1,69 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# usage:  make_trans.pl prefix in.flist input.snr out.txt out.scp
+
+# prefix is first letters of the database "key" (rest are numeric)
+
+# in.flist is just a list of filenames of .mfc feature files.
+# input.snr is an snr format file from the RM dataset.  
+# out.txt is the output transcriptions in format "key word1 word\n"
+# out.scp is the output scp file, which is as in.flist but has the
+# database-key first on each line.
+
+# Reads the transcriptions from input.snr (e.g. .../disc_1/doc/al_sents.snr)
+# and the list of feature files from in.flist (e.g. train.flist).
+# Writes the transcriptions to out.txt and the keyed file list to out.scp.
+
+if(@ARGV != 5) {  # all five positional arguments are mandatory
+    die "usage:  make_trans.pl prefix in.flist input.snr out.txt out.scp\n";
+}
+($prefix, $in_flist, $input_snr, $out_txt, $out_scp) = @ARGV;
+
+open(F, "<$input_snr") || die "Opening SNOR file $input_snr";
+
+while(<F>) {
+    if(m/^;/) { next; }
+    m/(.+) \((.+)\)/ || die "bad line $_";
+    $T{$2} = $1;
+}
+
+close(F);
+open(G, "<$in_flist") || die "Opening file list $in_flist";
+
+open(O, ">$out_txt") || die "Open output transcription file $out_txt";
+
+open(P, ">$out_scp") || die "Open output scp file $out_scp";
+
+while(<G>) {
+    $_ =~ m:/(\w+)/(\w+)\.mfc\s+$:i || die "bad scp line $_";
+    $spkname = $1;
+    $uttname = $2;
+    $uttname  =~ tr/a-z/A-Z/;
+    defined $T{$uttname} || die "no trans for sent $uttname";
+    $spkname =~ s/_//g; # remove underscore from spk name to make key nicer.
+    $key = $prefix . "_" . $spkname . "_" . $uttname;
+    $key =~ tr/A-Z/a-z/; # Make it all lower case.
+     # to make the numerical and string-sorted orders the same.
+    print O "$key $T{$uttname}\n";
+    print P "$key $_";
+    $n++;
+} 
+close(O) || die "Closing output.";
+close(P) || die "Closing output.";
+
+
--- a/egs/rm/s4/local/rm_data_prep.sh
+++ b/egs/rm/s4/local/rm_data_prep.sh
@ -0,0 +1,80 @@
+#!/bin/bash
+#
+# Copyright 2012 Vassil Panayotov
+# modified from a file that was:
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from one directory above this script.
+
+# Note: when creating your own data preparation scripts, it's a good idea
+# to make sure that the speaker id (if present) is a prefix of the utterance
+# id, that the output scp file is sorted on utterance id, and that the 
+# transcription file is exactly the same length as the scp file and is also
+# sorted on utterance id (missing transcriptions should be removed from the
+# scp file using e.g. scripts/filter_scp.pl)
+
+if [ $# != 1 ]; then
+  echo "Usage: local/rm_data_prep.sh /path/to/RM"
+  exit 1; 
+fi 
+
+export LC_ALL=C
+
+RMROOT=$1  # dereferenced from inside data/local below, so it should be an absolute path
+
+mkdir -p data/local
+cd data/local || exit 1  # everything below writes into the current directory
+
+if [ ! -d "$RMROOT/LDC93S3B" ] || [ ! -d "$RMROOT/rm1" ]; then
+  echo "Speech data is missing. You can download the data by running ./getdata.sh"
+  exit 1; 
+fi
+
+# Make a list of files
+cat $RMROOT/rm1/etc/rm1_train.fileids | \
+    xargs -I_x_ echo $RMROOT/rm1/feat/_x_.mfc > train.flist
+cat $RMROOT/rm1/etc/rm1_test.fileids | \
+    xargs -I_x_ echo $RMROOT/rm1/feat/_x_.mfc > test.flist
+
+# make_trans.pl also creates the utterance id's and the kaldi-format scp file.
+
+# training set
+../../local/make_trans.pl trn train.flist $RMROOT/LDC93S3B/disc_1/doc/al_sents.snr train_trans.txt train.scp
+mv train_trans.txt tmp; sort -k 1 tmp > train_trans.txt
+mv train.scp tmp; sort -k 1 tmp > train.scp
+rm tmp
+
+# test set
+../../local/make_trans.pl test test.flist $RMROOT/LDC93S3B/disc_1/doc/al_sents.snr test_trans.txt test.scp
+mv test_trans.txt tmp; sort -k 1 tmp > test_trans.txt
+mv test.scp tmp; sort -k 1 tmp > test.scp
+rm tmp
+
+# We already have the features, so sph2pipe step is skipped and
+# given the limited data the speaker-dependent processing is also not used 
+
+../../scripts/make_rm_lm.pl $RMROOT/LDC93S3B/disc_1/doc/wp_gram.txt  > G.txt || exit 1;
+
+# Convert the CMU's lexicon to a form which the other scripts expect
+# (leave only the first pronunciation variant, convert "'" to "+", 
+# and convert the phones to lower case)
+cat $RMROOT/rm1/etc/rm1.dic | \
+  egrep -v '\(' | \
+  sed -e "s/'/\+/g" | \
+  sed -e "s/^\([[:alnum:]-]\+\(+[[:alpha:]]\+\)\?\)\(.*\)/\1\L\3/g" > lexicon.txt
+
+
+echo RM_data_prep succeeded.
--- a/egs/rm/s4/local/rm_format_data.sh
+++ b/egs/rm/s4/local/rm_format_data.sh
@ -0,0 +1,126 @@
+#!/bin/bash
+#
+# Copyright 2012 Vassil Panayotov
+# modified from:
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from one directory above this script.
+
+
+
+if [ -f path.sh ]; then . path.sh; fi
+
+data_list="train test"
+
+for x in lang lang_test $data_list; do
+  mkdir -p data/$x
+done
+
+# Copy stuff into its final location:
+
+for x in $data_list; do
+  cp data/local/${x}.scp data/$x/mfc.scp || exit 1;
+  cp data/local/${x}_trans.txt data/$x/text || exit 1;
+done
+
+# We are not using make_words_symtab.pl for symbol table creation in this
+# recipe, because CMU's lexicon has several words that are not in the
+# word-pair grammar
+cat data/local/lexicon.txt | \
+ awk 'BEGIN{print "<eps>\t0";} {print $1 "\t" NR;} END{print "!SIL\t" NR+1;}' \
+ > data/lang/words.txt
+scripts/make_phones_symtab.pl < data/local/lexicon.txt > data/lang/phones.txt
+cp data/lang/words.txt data/lang_test/words.txt
+
+silphones="sil"; # This would in general be a space-separated list of all silence phones.  E.g. "sil vn"
+# Generate colon-separated lists of silence and non-silence phones.
+scripts/silphones.pl data/lang/phones.txt "$silphones" data/lang/silphones.csl \
+  data/lang/nonsilphones.csl
+
+ndisambig=$(scripts/add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt) || exit 1;
+ndisambig=$((ndisambig+1)); # add one disambig symbol for silence in lexicon FST.
+scripts/add_disambig.pl data/lang/phones.txt $ndisambig > data/lang_test/phones_disambig.txt
+cp data/lang_test/phones_disambig.txt data/lang/ # needed for MMI.
+
+silprob=0.5  # same prob as word
+scripts/make_lexicon_fst.pl data/local/lexicon.txt $silprob sil  | \
+  fstcompile --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt \
+   --keep_isymbols=false --keep_osymbols=false | \
+   fstarcsort --sort_type=olabel > data/lang/L.fst
+
+# Create L_align.fst, which is as L.fst but with alignment symbols (#1 and #2 at the
+# beginning and end of words, on the input side)... useful if we
+# ever need to e.g. create ctm's-- these are used to work out the
+# word boundaries.
+
+
+cat data/local/lexicon.txt | \
+ awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' | \
+ scripts/make_lexicon_fst.pl - 0.5 sil | \
+ fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
+  --keep_isymbols=false --keep_osymbols=false | \
+ fstarcsort --sort_type=olabel > data/lang_test/L_align.fst
+
+# L_disambig.fst has the disambiguation symbols (c.f. Mohri's papers)
+
+scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt $silprob sil '#'$ndisambig | \
+   fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \
+   --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel \
+    > data/lang_test/L_disambig.fst
+
+cp data/lang_test/L_disambig.fst data/lang/  # Needed for MMI training.
+
+fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang/words.txt --keep_isymbols=false \
+    --keep_osymbols=false data/local/G.txt > data/lang_test/G.fst
+
+# Checking that G is stochastic [note, it wouldn't be for an Arpa]
+fstisstochastic data/lang_test/G.fst || echo Error: G is not stochastic
+
+# Checking that G.fst is determinizable.
+fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.
+
+# Checking that L_disambig.fst is determinizable.
+fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
+
+# Checking that disambiguated lexicon times G is determinizable
+fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
+   fstdeterminize >/dev/null || echo Error
+
+# Checking that LG is stochastic:
+fsttablecompose data/lang/L.fst data/lang_test/G.fst | \
+   fstisstochastic || echo Error: LG is not stochastic.
+
+# Checking that L_disambig.G is stochastic:
+fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
+   fstisstochastic || echo Error: L_disambig.G is not stochastic.
+
+
+## Check lexicon.
+## just have a look and make sure it seems sane.
+echo "First few lines of lexicon FST:"
+fstprint   --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst  | head
+
+
+silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'`
+nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'`
+cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \
+   sed "s:SILENCEPHONES:$silphonelist:" > data/lang/topo 
+
+for x in phones.txt words.txt silphones.csl nonsilphones.csl topo; do
+   cp data/lang/$x data/lang_test/$x || exit 1;
+done
+
+echo RM_format_data succeeded.
--- a/egs/rm/s4/path.sh
+++ b/egs/rm/s4/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+
+# path to Kaldi's root directory
+root=`pwd`/../../..
+
+export PATH=${root}/src/bin:${root}/tools/openfst/bin:${root}/src/fstbin/:${root}/src/gmmbin/:${root}/src/featbin/:${root}/src/fgmmbin:${root}/src/sgmmbin:${root}/src/lm:${root}/src/latbin:${root}/src/tiedbin/:$PATH  
+
+# path to the directory in which the subset of RM corpus is stored
+export RM1_ROOT=`pwd`/data/download
+
+export LC_ALL=C
+export LC_LOCALE_ALL=C
+
--- a/egs/rm/s4/run.sh
+++ b/egs/rm/s4/run.sh
@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+source ./path.sh
+
+# call the next line with the directory where the RM data is
+local/rm_data_prep.sh $RM1_ROOT || exit 1;
+
+local/rm_format_data.sh || exit 1;
+
+# the directory, where you want to store MFCC features.
+featdir=data/rm_feats
+
+# convert the Sphinx feature files to Kaldi tables
+for x in train test; do
+ steps/make_mfcc.sh data/$x exp/make_mfcc/$x $featdir  || exit 1;
+done
+
+scripts/subset_data_dir.sh data/train 1000 data/train.1k  || exit 1;
+
+# train monophone system.
+steps/train_mono.sh data/train.1k data/lang exp/mono  || exit 1;
+
+# monophone decoding
+local/decode.sh --mono steps/decode_deltas.sh exp/mono/decode || exit 1;
+
+# Get alignments from monophone system.
+steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali || exit 1;
+
+# train tri1 [first triphone pass]
+steps/train_deltas.sh data/train data/lang exp/mono_ali exp/tri1 || exit 1;
+
+# decode tri1
+local/decode.sh steps/decode_deltas.sh exp/tri1/decode || exit 1;
+
+# align tri1
+steps/align_deltas.sh --graphs "ark,s,cs:gunzip -c exp/tri1/graphs.fsts.gz|" \
+    data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
+
+# train tri2a [delta+delta-deltas]
+steps/train_deltas.sh data/train data/lang exp/tri1_ali exp/tri2a || exit 1;
+
+# decode tri2a
+local/decode.sh steps/decode_deltas.sh exp/tri2a/decode || exit 1;
--- a/egs/rm/s4/scripts/add_disambig.pl
+++ b/egs/rm/s4/scripts/add_disambig.pl
@ -0,0 +1,58 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Adds some specified number of disambig symbols to a symbol table.
+# Adds these as #1, #2, etc.
+# If the --include-zero option is specified, includes an extra one
+# #0.
+if(!(@ARGV == 2 || (@ARGV ==3 && $ARGV[0] eq "--include-zero"))) {
+    die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt ";
+}
+
+if(@ARGV  == 3) {
+    $include_zero = 1;
+    $ARGV[0] eq "--include-zero" || die "Bad option/first argument $ARGV[0]";
+    shift @ARGV;
+} else {
+    $include_zero = 0;
+}
+
+$input = $ARGV[0];
+$nsyms = $ARGV[1];
+
+open(F, "<$input") || die "Opening file $input";
+
+while(<F>) {
+    @A = split(" ", $_);
+    @A == 2 || die "Bad line $_";
+    $lastsym = $A[1];
+    print;
+}
+
+if(!defined($lastsym)){
+ die "Empty symbol file?";
+}
+
+if($include_zero) {
+    $lastsym++;
+    print "#0  $lastsym\n";
+}
+
+for($n = 1; $n <= $nsyms; $n++) {
+    $y = $n + $lastsym;
+    print "#$n  $y\n";
+}
--- a/egs/rm/s4/scripts/add_lex_disambig.pl
+++ b/egs/rm/s4/scripts/add_lex_disambig.pl
@ -0,0 +1,101 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Adds disambiguation symbols to a lexicon.
+# Outputs still in the normal lexicon format.
+# Disambig syms are numbered #1, #2, #3, etc. (#0 
+# reserved for symbol in grammar).
+# Outputs the number of disambig syms to the standard output.
+
+if(@ARGV != 2) {  # exactly two file names; this script parses no options
+    die "Usage: add_lex_disambig.pl lexicon.txt lexicon_disambig.txt "
+}
+
+
+$lexfn = shift @ARGV;
+$lexoutfn = shift @ARGV;
+
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+# (1)  Read in the lexicon.
+@L = ( );
+while(<L>) {
+    @A = split(" ", $_);
+    push @L, join(" ", @A);
+}
+
+# (2) Work out the count of each phone-sequence in the
+# lexicon.
+
+foreach $l (@L) {
+    @A = split(" ", $l);
+    shift @A; # Remove word.
+    $count{join(" ",@A)}++;
+}
+
+# (3) For each left sub-sequence of each phone-sequence, note down
+# that it exists (for identifying prefixes of longer strings).
+
+foreach $l (@L) {
+    @A = split(" ", $l);
+    shift @A; # Remove word.
+    while(@A > 0) {
+        pop @A;  # Remove last phone
+        $issubseq{join(" ",@A)} = 1;
+    }
+}
+
+# (4) For each entry in the lexicon:
+#  if the phone sequence is unique and is not a
+#  prefix of another word, no disambig symbol.
+#  Else output #1, or #2, #3, ... if the same phone-seq
+#  has already been assigned a disambig symbol.
+
+
+open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
+
+$max_disambig = 0; # highest disambig index used so far.
+foreach $l (@L) {
+    @A = split(" ", $l);
+    $word = shift @A;
+    $phnseq = join(" ",@A);
+    if(!defined $issubseq{$phnseq}
+       && $count{$phnseq}==1) {
+        ; # Do nothing.
+    } else {
+        if($phnseq eq "") { # need disambig symbols for the empty string
+            # that are not used anywhere else.
+            $max_disambig++;
+            $reserved{$max_disambig} = 1;
+            $phnseq = "#$max_disambig";
+        } else {
+            $curnumber = $disambig_of{$phnseq}; # last index given to this phone-seq, if any
+            if(!defined $curnumber) { $curnumber = 0; } # test the scalar itself, not a {...} construct
+            $curnumber++; # now 1 or 2, ... 
+            while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols
+            if($curnumber > $max_disambig) {
+                $max_disambig = $curnumber;
+            }
+            $disambig_of{$phnseq} = $curnumber;
+            $phnseq = $phnseq . " #" . $curnumber;
+         }
+    }
+    print O "$word\t$phnseq\n";
+}
+
+print $max_disambig . "\n";
+
--- a/egs/rm/s4/scripts/filter_scp.pl
+++ b/egs/rm/s4/scripts/filter_scp.pl
@ -0,0 +1,40 @@
+#!/usr/bin/perl -w
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script takes a list of utterance-ids and filters an scp
+# file (or any file whose first field is an utterance id), printing
+# out only those lines whose first field is in id_list.
+
+if(@ARGV < 1 || @ARGV > 2) {
+    die "Usage: filter_scp.pl id_list [in.scp] > out.scp ";
+}
+
+$idlist = shift @ARGV;
+open(F, "<$idlist") || die "Could not open id-list file $idlist";
+while(<F>) {
+    @A = split;
+    @A>=1 || die "Invalid id-list file line $_";
+    $seen{$A[0]} = 1;
+}
+
+while(<>) {
+    @A = split;
+    @A > 0 || die "Invalid scp file line $_";
+    if($seen{$A[0]}) {
+        print $_;
+    }
+}
--- a/egs/rm/s4/scripts/int2sym.pl
+++ b/egs/rm/s4/scripts/int2sym.pl
@ -0,0 +1,90 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+$ignore_noninteger = 0;
+$ignore_first_field = 0;
+$field = -1;
+for($x = 0; $x < 2; $x++) {
+    if($ARGV[0] eq "--ignore-noninteger") { $ignore_noninteger = 1; shift @ARGV; }
+    if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; }
+    if($ARGV[0] eq "--field") { 
+       shift @ARGV; $field = $ARGV[0]+0; shift @ARGV;
+       if ($field < 1) { die "Bad argument to --field option: $field"; }
+    }
+}
+
+if ($ignore_first_field && $field > 0) { die "Incompatible options ignore-first-field and field"; }
+$zfield = $field-1; # Change to zero-based indexing.
+
+$symtab = shift @ARGV;  # first argument left after option parsing
+if(!defined $symtab) {
+    die "Usage: int2sym.pl symtab [input] > output\n";
+}
+open(F, "<$symtab") || die "Error opening symbol table file $symtab";
+while(<F>) {
+    @A = split(" ", $_);
+    @A == 2 || die "bad line in symbol table file: $_";
+    $int2sym{$A[1]} = $A[0];
+}
+
+sub int2sym {  # map integer token $a (at zero-based position $pos) to its symbol via %int2sym
+    my $a = shift @_;
+    my $pos = shift @_;
+    if($a !~  m:^\d+$:) { # not all digits..
+        if($ignore_noninteger) {
+            # Hand the token back unchanged; the caller prints it, so every line is emitted in full.
+            return $a;
+        } else {
+            if($pos == 0) {
+                die "int2sym.pl: found noninteger token $a (try --ignore-first-field)\n";
+            } else {
+                die "int2sym.pl: found noninteger token $a (try --ignore-noninteger if valid input)\n";
+            }
+        }
+    }
+    my $s = $int2sym{$a};
+    if(!defined ($s)) {
+        die "int2sym.pl: integer $a not in symbol table $symtab.";
+    }
+    return $s;
+}
+
+$error = 0;
+while(<>) {
+    @A = split(" ", $_);
+    if($ignore_first_field) {
+        $key = shift @A;
+        print $key . " ";
+    }
+    if ($field != -1) {
+        if ($zfield <= $#A && $zfield >= 0) {
+            $a = $A[$zfield];
+            $A[$zfield] = int2sym($a, $zfield);
+        }
+        print join(" ", @A);
+    } else {
+        for ($pos = 0; $pos <= $#A; $pos++) {
+            $a = $A[$pos];
+            $s = int2sym($a, $pos);
+            print $s . " ";
+        }
+    }
+    print "\n";
+}
+
+
+
--- a/egs/rm/s4/scripts/make_lexicon_fst.pl
+++ b/egs/rm/s4/scripts/make_lexicon_fst.pl
@ -0,0 +1,122 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# makes lexicon FST (no pron-probs involved).
+
+if(@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {  # lexicon, optionally + silprob silphone, optionally + disambig symbol
+    die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] > lexiconfst.txt"
+}
+
+$lexfn = shift @ARGV;
+if(@ARGV == 0) {
+    $silprob = 0.0;
+} elsif (@ARGV == 2){ 
+    ($silprob,$silphone) = @ARGV;
+} else {
+    ($silprob,$silphone,$sildisambig) = @ARGV;
+}
+if($silprob != 0.0) {
+    $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
+    $silcost = -log($silprob);
+    $nosilcost = -log(1.0 - $silprob);
+}
+
+
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+
+
+sub is_sil {
+    # Return true (1) if provided with a phone-sequence
+    # that means silence.
+    # @_ is the parameters of the function
+    # This function returns true if @_ equals ( $silphone )
+    # or something of the form ( "#0", $silphone, "#1" )
+    # where the "#0" and "#1" are disambiguation symbols.
+    return ( @_ == 1 && $_[0] eq $silphone ||
+             (@_ == 3 && $_[1] eq $silphone &&
+              $_[0] =~ m/^\#\d+$/ &&
+              $_[2] =~ m/^\#\d+$/));
+}
+
+if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
+    $loopstate = 0;
+    $nextstate = 1; # next unallocated state (0 is taken by the loop state).
+    while(<L>) {
+        @A = split(" ", $_);
+        $w = shift @A;
+
+        $s = $loopstate;
+        $word_or_eps = $w; # the word label is emitted on the first arc only.
+        while (@A > 0) {
+            $p = shift @A;
+            if(@A > 0) {
+                $ns = $nextstate++;
+            } else {
+                $ns = $loopstate; # last phone: return to the loop state.
+            }
+            print "$s\t$ns\t$p\t$word_or_eps\n";
+            $word_or_eps = "<eps>";
+            $s = $ns;
+        }
+    }
+    print "$loopstate\t0\n"; # final-cost.
+} else { # have silence probs.
+    $startstate = 0;
+    $loopstate = 1;
+    $silstate = 2; # state from where we go to loopstate after emitting silence.
+    print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
+    if (!defined $sildisambig) {
+        print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+        print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
+        $nextstate = 3;
+    } else {
+        $disambigstate = 3;
+        $nextstate = 4;
+        print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+        print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
+        print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
+    }
+    while(<L>) {
+        @A = split(" ", $_);
+        $w = shift @A;
+
+        $s = $loopstate;
+        $word_or_eps = $w; # the word label is emitted on the first arc only.
+        while (@A > 0) {
+            $p = shift @A;
+            if(@A > 0) {
+                $ns = $nextstate++;
+                print "$s\t$ns\t$p\t$word_or_eps\n";
+                $word_or_eps = "<eps>";
+                $s = $ns;
+            } else {
+                if(!is_sil(@A)){ # NOTE(review): @A is already empty here, so is_sil(@A) is never true - confirm intent
+                    # This is non-deterministic but relatively compact,
+                    # and avoids epsilons.
+                    print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n";
+                    print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n";
+                } else {
+                    # no point putting opt-sil after silence word.
+                    print "$s\t$loopstate\t$p\t$word_or_eps\n";
+                }
+                $word_or_eps = "<eps>";
+            }
+        }
+    }
+    print "$loopstate\t0\n"; # final-cost.
+}
--- a/egs/rm/s4/scripts/make_phones_symtab.pl
+++ b/egs/rm/s4/scripts/make_phones_symtab.pl
@ -0,0 +1,37 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# make_phones_symtab.pl < lexicon.txt > phones.txt
+
+
+# Gather every distinct symbol that appears in the input.  Each line is split
+# on whitespace and only the fields from index 2 onwards are recorded; the
+# first two fields of every line are ignored.
+my %seen = ();
+while (my $line = <>) {
+    my @fields = split(" ", $line);
+    foreach my $sym (@fields[2 .. $#fields]) {
+        $seen{$sym} = 1; # seen it.
+    }
+}
+
+# Write the symbol table: <eps> is always id 0, then the collected symbols in
+# sorted order numbered from 1 (a literal "<eps>" found in the input is not
+# listed a second time).
+print "<eps>\t0\n";
+my $next_id = 1;
+foreach my $sym (grep { $_ ne "<eps>" } sort keys %seen) {
+    print "$sym\t$next_id\n";
+    $next_id++;
+}
+
+# "sil" is always appended last, with the next free id.
+print "sil\t$next_id\n";
+
--- a/egs/rm/s4/scripts/make_rm_lm.pl
+++ b/egs/rm/s4/scripts/make_rm_lm.pl
@ -0,0 +1,119 @@
+#!/usr/bin/perl
+
+# Copyright 2010-2011 Yanmin Qian  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This file takes as input the file wp_gram.txt that comes with the RM
+# distribution, and creates the language model as an acceptor in FST form.
+
+# make_rm_lm.pl   wp_gram.txt > G.txt
+
+# Exactly one argument is required: the word-pair grammar file.  A usage error
+# is reported on stderr with a non-zero exit status, so that a caller
+# redirecting stdout to G.txt does not silently get the usage text as output.
+if (@ARGV != 1) {
+    die "usage: make_rm_lm.pl  wp_gram.txt > G.txt\n";
+}
+unless (open(IN_FILE, $ARGV[0])) {
+    die ("can't open $ARGV[0]");
+}
+
+# Parsing state:
+#   $seen_header  - becomes 1 once the first line starting with '>' was read;
+#                   everything before that line is ignored.
+#   $init         - the word whose successor list is currently being read.
+#   $i            - number of successors read so far for $init.
+#   @LineArray    - the '>' words, in file order.
+#   %hash         - $hash{word}[k] is the k'th successor listed under word.
+#   %hashcnt      - $hashcnt{word} is the number of successors of word.
+#   %hashwrd      - $hashwrd{word} is the FST state id of word (0 = not yet
+#                   assigned).
+$seen_header = 0;
+$count_wrd = 0;
+$init = "";
+
+while ($line = <IN_FILE>) {
+    # chomp (not chop): only strip a trailing newline, so a last line without
+    # a newline does not lose its final character.
+    chomp($line);
+    $line =~ s/ //g;
+
+    if ($line =~ /^>/) {
+        # Start of a new successor list: store the count of the previous one.
+        $seen_header = 1;
+        $line =~ s/>//g;
+        $hashcnt{$init} = $i;
+        $init = $line;
+        $i = 0;
+        $LineArray[$count_wrd++] = $init;
+        $hashwrd{$init} = 0;
+    } elsif ($seen_header) {
+        $hash{$init}[$i++] = $line;
+    }
+}
+
+# Store the count for the last successor list in the file.
+$hashcnt{$init} = $i;
+
+# State 0 is the start state.  SENTENCE-END gets the state id equal to the
+# number of '>' words, and that state is made final at the end of the script.
+# The successors of SENTENCE-END are the words leaving the start state; they
+# get state ids 1, 2, ... in the order they are listed.
+$init_wrd = "SENTENCE-END";
+$hashwrd{$init_wrd} = @LineArray;
+$weight = 0;
+for ($i = 0; $i < $hashcnt{$init_wrd}; $i++) {
+    # Uniform cost over the successors: -log(1/count).
+    $weight = -log(1/$hashcnt{$init_wrd});
+    $wrd = $hash{$init_wrd}[$i];
+    $hashwrd{$wrd} = $i + 1;
+    print "0    $hashwrd{$wrd}    $wrd    $wrd    $weight\n";
+}
+$num = $i; # highest state id handed out so far.
+
+# Emit the arcs leaving every other word's state, assigning new state ids on
+# first use.
+for ($i = 0; $i < @LineArray; $i++) {
+    $wrd = $LineArray[$i];
+    next if ($wrd eq 'SENTENCE-END');
+
+    if ($hashwrd{$wrd} == 0) {
+        $hashwrd{$wrd} = ++$num;
+    }
+    for ($j = 0; $j < $hashcnt{$wrd}; $j++) {
+        $weight = -log(1/$hashcnt{$wrd});
+        $succ = $hash{$wrd}[$j];
+        if ($hashwrd{$succ} == 0) {
+            $hashwrd{$succ} = ++$num;
+        }
+        if ($succ eq 'SENTENCE-END') {
+            # The arc into the SENTENCE-END state carries no symbol.
+            print "$hashwrd{$wrd}    $hashwrd{$succ}    <eps>    <eps>    $weight\n";
+        } else {
+            print "$hashwrd{$wrd}    $hashwrd{$succ}    $succ    $succ    $weight\n";
+        }
+    }
+}
+
+# Final state (cost 0).
+print "$hashwrd{$init_wrd}    0\n";
+close(IN_FILE);
+
+
--- a/egs/rm/s4/scripts/make_roots.pl
+++ b/egs/rm/s4/scripts/make_roots.pl
@ -0,0 +1,102 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Written by Dan Povey 9/21/2010.  Apache 2.0 License.
+
+# This version of make_roots.pl is specialized for RM.
+
+# This script creates the file roots.txt which is an input to train-tree.cc.  It
+# specifies how the trees are built.  The input file phone-sets.txt is a partial
+# version of roots.txt in which phones are represented by their spelled form, not
+# their symbol id's.  E.g. at input, phone-sets.txt might contain;
+#  shared not-split  sil
+# Any phones not specified in phone-sets.txt but present in phones.txt will
+# be given a default treatment.  If the --separate option is given, we create
+# a separate tree root for each of them, otherwise they are all lumped in one set.
+# The arguments shared|not-shared and split|not-split are needed if any
+# phones are not specified in phone-sets.txt.  What they mean is as follows:
+# if shared=="shared" then we share the tree-root between different HMM-positions
+# (0,1,2).  If split=="split" then we actually do decision tree splitting on
+# that root, otherwise we forbid decision-tree splitting.  (The main reason we
+# might set this to "not-split" is for silence, when we want to ensure that
+# the HMM-positions will remain with a single PDF id.)
+
+
+# Optional leading flag: with --separate every non-silence phone gets its own
+# line (tree root) in the output; without it they all share one line.
+$separate = 0;
+if (@ARGV > 0 && $ARGV[0] eq "--separate") {
+    $separate = 1;
+    shift @ARGV;
+}
+
+if (@ARGV != 4) {
+    die "Usage: make_roots.pl [--separate] phones.txt silence-phone-list[integer,colon-separated] shared|not-shared split|not-split > roots.txt\n";
+}
+
+
+($phonesfile, $silphones, $shared, $split) = @ARGV;
+if ($shared ne "shared" && $shared ne "not-shared") {
+    die "Third argument must be \"shared\" or \"not-shared\"\n";
+}
+if ($split ne "split" && $split ne "not-split") {
+    die "Fourth argument must be \"split\" or \"not-split\"\n";
+}
+
+
+# Read the phone symbol table ("symbol id" per line).  Entries whose id is 0
+# are skipped, so they never appear in the output.
+open(F, "<$phonesfile") || die "Opening file $phonesfile";
+
+while (<F>) {
+    @A = split(" ", $_);
+    if (@A != 2) {
+        die "Bad line in phones symbol file: ".$_;
+    }
+    if ($A[1] != 0) {
+        $symbol2id{$A[0]} = $A[1];
+        $id2symbol{$A[1]} = $A[0];
+    }
+}
+close(F);
+
+# String comparison: a numeric "==" would treat every non-numeric argument as
+# equal to "" and report it as an empty list.
+if ($silphones eq "") {
+    die "Empty silence phone list in make_roots.pl";
+}
+foreach $silphoneid (split(":", $silphones)) {
+    defined $id2symbol{$silphoneid} || die "No such silence phone id $silphoneid";
+    # Give each silence phone its own separate pdfs in each state, but
+    # no sharing (in this recipe; WSJ is different.. in this recipe there
+    # is only one silence phone anyway.)
+    $issil{$silphoneid} = 1;
+    print "not-shared not-split $silphoneid\n";
+}
+
+# Remaining (non-silence) phone ids, in increasing numeric order so that the
+# output is the same on every run (plain "keys" order is not stable).
+@nonsil = grep { !defined $issil{$_} } sort { $a <=> $b } keys %id2symbol;
+
+if ($separate) {
+    foreach $id (@nonsil) {
+        print "$shared $split $id\n";
+    }
+} else {
+    print "$shared $split ";
+    foreach $id (@nonsil) {
+        print "$id ";
+    }
+    print "\n";
+}
--- a/egs/rm/s4/scripts/mkgraph.sh
+++ b/egs/rm/s4/scripts/mkgraph.sh
@ -0,0 +1,112 @@
+#!/bin/bash
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# N and P are passed to fstcomposecontext as --context-size and
+# --central-position; --mono switches them to 1 and 0.
+N=3
+P=1
+clean=false
+
+# Consume leading option flags.  "$1" is quoted so that the tests do not
+# produce a syntax error once the arguments run out.
+for x in 1 2 3; do
+  if [ "$1" == "--mono" ]; then
+    N=1;
+    P=0;
+    shift;
+  fi
+  if [ "$1" == "--clean" ]; then
+    clean=true
+    shift;
+  fi
+
+done
+
+if [ $# != 3 ]; then
+   echo "Usage: scripts/mkgraph.sh [--mono] [--clean] <test-lang-dir> <model-dir> <graphdir>"
+   echo "e.g.: scripts/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph"
+   exit 1;
+fi
+
+if [ -f path.sh ]; then . path.sh; fi
+
+lang=$1
+tree=$2/tree
+model=$2/final.mdl
+dir=$3
+
+# --clean: throw away the cached intermediate FSTs under $lang/tmp.
+# -f: no error if the directory is not there yet.
+# ${lang:?}: abort instead of removing /tmp if $lang is empty.
+if $clean; then rm -rf "${lang:?}/tmp"; fi
+
+mkdir -p "$dir" || exit 1;
+
+tscale=1.0
+loopscale=0.1
+
+# If $lang/tmp/LG.fst does not exist or is older than its sources, make it...
+# (note: the [[ ]] brackets make the || type operators work (inside [ ], we
+# would have to use -o instead),  -f means file exists, and -ot means older than).
+
+mkdir -p "$lang/tmp" || exit 1;
+if [[ ! -f "$lang/tmp/LG.fst" || "$lang/tmp/LG.fst" -ot "$lang/G.fst" || \
+      "$lang/tmp/LG.fst" -ot "$lang/L_disambig.fst" ]]; then
+  fsttablecompose "$lang/L_disambig.fst" "$lang/G.fst" | fstdeterminizestar --use-log=true | \
+    fstminimizeencoded  > "$lang/tmp/LG.fst" || exit 1;
+  fstisstochastic "$lang/tmp/LG.fst" || echo "warning: LG not stochastic."
+fi
+
+if [ ! -f "$lang/phones_disambig.txt" ]; then
+  echo "No such file $lang/phones_disambig.txt (supplied a training lang/ directory?)"
+  exit 1;
+fi
+
+# Second column of every phones_disambig.txt line that contains a '#'.
+grep '#' "$lang/phones_disambig.txt" | awk '{print $2}' > "$lang/tmp/disambig_phones.list"
+
+
+clg=$lang/tmp/CLG_${N}_${P}.fst
+
+# Stop on failure here: otherwise a truncated $clg would be left behind and
+# would look up to date on the next run.
+if [[ ! -f "$clg" || "$clg" -ot "$lang/tmp/LG.fst" ]]; then
+  fstcomposecontext --context-size=$N --central-position=$P \
+   --read-disambig-syms="$lang/tmp/disambig_phones.list" \
+   --write-disambig-syms="$lang/tmp/disambig_ilabels_${N}_${P}.list" \
+    "$lang/tmp/ilabels_${N}_${P}" < "$lang/tmp/LG.fst" > "$clg" || exit 1;
+  fstisstochastic "$clg"  || echo "warning: CLG not stochastic."
+fi
+
+if [[ ! -f "$dir/Ha.fst" || "$dir/Ha.fst" -ot "$model" ]]; then
+  make-h-transducer --disambig-syms-out="$dir/disambig_tid.list" \
+    --transition-scale=$tscale "$lang/tmp/ilabels_${N}_${P}" "$tree" "$model" \
+     > "$dir/Ha.fst"  || exit 1;
+fi
+
+if [[ ! -f "$dir/HCLGa.fst" || "$dir/HCLGa.fst" -ot "$dir/Ha.fst" || \
+      "$dir/HCLGa.fst" -ot "$clg" ]]; then
+  fsttablecompose "$dir/Ha.fst" "$clg" | fstdeterminizestar --use-log=true \
+    | fstrmsymbols "$dir/disambig_tid.list" | fstrmepslocal | \
+     fstminimizeencoded > "$dir/HCLGa.fst" || exit 1;
+  fstisstochastic "$dir/HCLGa.fst" || echo "HCLGa is not stochastic"
+fi
+
+if [[ ! -f "$dir/HCLG.fst" || "$dir/HCLG.fst" -ot "$dir/HCLGa.fst" ]]; then
+  add-self-loops --self-loop-scale=$loopscale --reorder=true \
+    "$model" < "$dir/HCLGa.fst" > "$dir/HCLG.fst" || exit 1;
+
+  if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
+    # No point doing this test if transition-scale not 1, as it is bound to fail. 
+    fstisstochastic "$dir/HCLG.fst" || echo "Final HCLG is not stochastic."
+  fi
+fi
+
+
+# to make const fst:
+# fstconvert --fst_type=const $dir/HCLG.fst $dir/HCLG_c.fst
+
--- a/egs/rm/s4/scripts/silphones.pl
+++ b/egs/rm/s4/scripts/silphones.pl
@ -0,0 +1,57 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# creates integer lists of silence and non-silence phones in files,
+# e.g. silphones.csl="1:2:3 \n"
+# and nonsilphones.csl="4:5:6:7:...:24\n";
+
+# Four arguments: the phone symbol table, a space-separated list of silence
+# phone names, and the two output file names.
+@ARGV == 4 ||
+    die "Usage: silphones.pl phones.txt \"sil1 sil2 sil3\" silphones.csl nonsilphones.csl";
+
+my ($symtab, $sillist, $silphones, $nonsilphones) = @ARGV;
+open(S,"<$symtab") || die "Opening symbol table $symtab";
+
+# Names given on the command line as silence phones.
+my %issil = map { $_ => 1 } split(" ", $sillist);
+
+# Sort the integer ids from the symbol table into the two lists.  Entries
+# whose id is 0 go into neither list.
+my (@sil, @nonsil, %seensil);
+while (my $line = <S>) {
+    my @fields = split(" ", $line);
+    die "Bad line $line in phone-symbol-table file $symtab" unless @fields == 2;
+    my ($sym, $int) = @fields;
+    next if $int == 0;
+    if ($issil{$sym}) {
+        push @sil, $int;
+        $seensil{$sym} = 1;
+    } else {
+        push @nonsil, $int;
+    }
+}
+
+# Every requested silence phone must have been found in the symbol table.
+foreach my $k (keys %issil) {
+    die "No such silence phone $k" unless $seensil{$k};
+}
+
+# Write both lists as single colon-separated lines.
+open(F, ">$silphones") || die "opening silphones file $silphones";
+open(G, ">$nonsilphones") || die "opening nonsilphones file $nonsilphones";
+print F join(":", @sil), "\n";
+print G join(":", @nonsil), "\n";
+close(F);
+close(G);
+
+print STDERR "Warning: silphones.pl no silence phones.\n" unless @sil;
+print STDERR "Warning: silphones.pl no non-silence phones.\n" unless @nonsil;
+
--- a/egs/rm/s4/scripts/subset_data_dir.sh
+++ b/egs/rm/s4/scripts/subset_data_dir.sh
@ -0,0 +1,99 @@
+#!/bin/bash
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script operates on a directory, such as in data/train/,
+# that contains some subset of the following files:
+#  feats.scp
+#  wav.scp
+#  spk2utt
+#  utt2spk
+#  text
+# It creates a subset of that data, consisting of some specified
+# number of utterances.  (The selected utterances are distributed
+# evenly throughout the file, by the program ./subset_scp.pl).
+
+# If you give the --per-spk option, it will attempt to select
+# the supplied number of utterances for each speaker (typically
+# you would supply a much smaller number in this case).
+
+perspk=false
+if [ "$1" == "--per-spk" ]; then
+  perspk=true;
+  shift;
+fi
+
+if [ $# != 3 ]; then
+  echo "Usage: subset_data_dir.sh [--per-spk] <srcdir> <num-utt> <destdir>"
+  exit 1;
+fi
+
+srcdir=$1
+numutt=$2
+destdir=$3
+
+
+if [ ! -f "$srcdir/feats.scp" ]; then
+  echo "subset_data_dir.sh: no such file $srcdir/feats.scp"
+  exit 1;
+fi
+
+
+## scripting note: $perspk evaluates to true or false
+## so this becomes the command true or false.
+if $perspk; then
+  mkdir -p "$destdir" || exit 1;
+  # For each speaker line of spk2utt ("spk utt1 utt2 ..."), keep at most n
+  # utterances, taking every skip'th one so they are spread over the line.
+  # n is passed with -v instead of being spliced into the program text.
+  awk -v n="$numutt" '{ printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; }
+         for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); }
+         printf("\n"); }' <"$srcdir/spk2utt" >"$destdir/spk2utt"
+  scripts/spk2utt_to_utt2spk.pl < "$destdir/spk2utt" > "$destdir/utt2spk"
+  scripts/filter_scp.pl "$destdir/utt2spk" <"$srcdir/feats.scp" >"$destdir/feats.scp"
+  # Optional files: filtered only when present in the source directory.
+  if [ -f "$srcdir/wav.scp" ]; then
+    scripts/filter_scp.pl "$destdir/feats.scp" <"$srcdir/wav.scp" >"$destdir/wav.scp"
+  fi
+  if [ -f "$srcdir/mfc.scp" ]; then
+    scripts/filter_scp.pl "$destdir/feats.scp" <"$srcdir/mfc.scp" >"$destdir/mfc.scp"
+  fi
+  if [ -f "$srcdir/text" ]; then
+    scripts/filter_scp.pl "$destdir/feats.scp" <"$srcdir/text" >"$destdir/text"
+  fi
+  if [ -f "$srcdir/spk2gender" ]; then
+    scripts/filter_scp.pl "$destdir/spk2utt" <"$srcdir/spk2gender" >"$destdir/spk2gender"
+  fi
+  srcutts=$(wc -l < "$srcdir/utt2spk")
+  destutts=$(wc -l < "$destdir/utt2spk")
+  echo "Retained $numutt utterances per speaker from data-dir $srcdir and put it in $destdir, reducing #utt from $srcutts to $destutts"
+  exit 0;
+else
+  if [ "$numutt" -gt $(wc -l < "$srcdir/feats.scp") ]; then
+    echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
+    exit 1;
+  fi
+
+  mkdir -p "$destdir" || exit 1;
+
+  # create feats.scp
+  scripts/subset_scp.pl "$numutt" "$srcdir/feats.scp" > "$destdir/feats.scp" || exit 1;
+
+  # mfc.scp: the test now looks at the file that is actually filtered (it used
+  # to test for wav.scp, so mfc.scp was skipped whenever wav.scp was absent and
+  # the filter failed whenever only wav.scp existed).
+  if [ -f "$srcdir/mfc.scp" ]; then
+    scripts/filter_scp.pl "$destdir/feats.scp" "$srcdir/mfc.scp" > "$destdir/mfc.scp" || exit 1;
+  else
+    rm -f "$destdir/mfc.scp" 2>/dev/null
+  fi
+
+  # wav.scp is handled the same way as in the --per-spk branch.
+  if [ -f "$srcdir/wav.scp" ]; then
+    scripts/filter_scp.pl "$destdir/feats.scp" "$srcdir/wav.scp" > "$destdir/wav.scp" || exit 1;
+  fi
+
+  if [ -f "$srcdir/utt2spk" ]; then
+    scripts/filter_scp.pl "$destdir/feats.scp" "$srcdir/utt2spk" > "$destdir/utt2spk" || exit 1;
+    scripts/utt2spk_to_spk2utt.pl "$destdir/utt2spk" > "$destdir/spk2utt" || exit 1;
+  fi
+
+  if [ -f "$srcdir/text" ]; then
+    scripts/filter_scp.pl "$destdir/feats.scp" <"$srcdir/text" >"$destdir/text"
+  fi
+
+  if [ -f "$srcdir/spk2gender" ]; then
+    scripts/filter_scp.pl "$destdir/spk2utt" <"$srcdir/spk2gender" >"$destdir/spk2gender"
+  fi
+
+  echo "Created a $numutt-utterance subset of $srcdir and put it in $destdir."
+
+  exit 0;
+fi
--- a/egs/rm/s4/scripts/subset_scp.pl
+++ b/egs/rm/s4/scripts/subset_scp.pl
@ -0,0 +1,59 @@
+#!/usr/bin/perl -w
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This program selects a subset of N elements in the scp.
+# It selects them evenly from throughout the scp, in order to
+# avoid selecting too many from the same speaker.
+# It prints them on the standard output.
+
+@ARGV >= 2 || die "Usage: subset_scp.pl N in.scp ";
+
+# First argument: how many lines to keep.
+$N = shift @ARGV;
+$N != 0 ||
+    die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
+
+# Second argument: the scp file, read completely into memory.
+$inscp = shift @ARGV;
+open(I, "<$inscp") || die "Opening input scp file $inscp";
+@entries = <I>;
+
+$numlines = @entries;
+$N <= $numlines ||
+    die "You requested from subset_scp.pl more elements than available: $N > $numlines";
+
+# select_n(lo, hi, want): print "want" of the lines with index in [lo, hi).
+# The range is cut in half and the request is divided between the two halves
+# (the second half gets the extra one when "want" is odd), until the range is
+# a single line, which is printed if one line is still wanted from it.
+sub select_n {
+    my ($lo, $hi, $want) = @_;
+    my $span = $hi - $lo;
+    die "select_n: code error" if $want > $span;
+    if ($span != 1) {
+        my $mid = $lo + int($span/2);
+        my $left_want = int($want/2);
+        select_n($lo, $mid, $left_want);
+        select_n($mid, $hi, $want - $left_want);
+        return;
+    }
+    print $entries[$lo] if $want > 0;
+}
+
+select_n(0, $numlines, $N);
+
--- a/egs/rm/s4/scripts/sym2int.pl
+++ b/egs/rm/s4/scripts/sym2int.pl
@ -0,0 +1,82 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Maps the symbols of each input line to integers using a symbol table.
+# Options (must come before the symbol-table argument):
+#   --ignore-oov          print symbols missing from the table unchanged
+#   --ignore-first-field  copy the first field of each line through unmapped
+#   --map-oov <sym>       map symbols missing from the table to the id of <sym>
+$ignore_oov = 0;
+$ignore_first_field = 0;
+# Three passes over the leading argument, so that up to three options can be
+# given; within one pass the options are tested in the order written below.
+for($x = 0; $x < 3; $x++) {
+    # Note: it will just print OOVS unmodified if you specify --ignore-oov.
+    # Else will complain and put nothing out.
+    if($ARGV[0] eq "--ignore-oov") { $ignore_oov = 1; shift @ARGV; } 
+    if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; }
+    if($ARGV[0] eq "--map-oov") { shift @ARGV; $map_oov = shift @ARGV; }
+}
+
+# The next argument is the symbol table; anything left in @ARGV afterwards is
+# read by the <> loop below (standard input if nothing is left).
+$symtab = shift @ARGV;
+if(!defined $symtab) {
+    die "Usage: sym2int.pl symtab [input transcriptions] > output transcriptions\n";
+}
+# Load the table: two fields per line, symbol then integer.
+open(F, "<$symtab") || die "Error opening symbol table file $symtab";
+while(<F>) {
+    @A = split(" ", $_);
+    @A == 2 || die "bad line in symbol table file: $_";
+    $sym2int{$A[0]} = $A[1] + 0; # "+ 0" stores the value as a number.
+}
+
+$num_warning = 0;
+$max_warning = 20; # at most this many "replacing" messages are printed.
+$error = 0; # never set to non-zero below, so the script exits 0 unless it dies.
+while(<>) {
+    @A = split(" ", $_);
+    if(@A == 0) {
+        die "Empty line in transcriptions input.";
+    }
+    if($ignore_first_field) {
+        # Pass the first field (e.g. an utterance id) through unchanged.
+        $key = shift @A;
+        print $key . " ";
+    }
+    @B = ();
+    foreach $a (@A) {
+        $i = $sym2int{$a};
+        if(!defined ($i)) {
+            # Symbol not in the table; --map-oov takes precedence over
+            # --ignore-oov when both were given.
+            if (defined $map_oov) {
+                if (!defined $sym2int{$map_oov}) {
+                    die "sym2int.pl: invalid map-oov option $map_oov (symbol not defined in $symtab)";
+                }
+                if ($num_warning++ < $max_warning) {
+                    print STDERR "sym2int.pl: replacing $a with $map_oov\n";
+                    if ($num_warning == $max_warning) {
+                        print STDERR "sym2int.pl: not warning for OOVs any more times\n";
+                    }
+                }
+                $i = $sym2int{$map_oov};
+            } elsif($ignore_oov) {
+                $i = $a; # just print them out unmodified..
+            } else {
+                die "sym2int.pl: undefined symbol $a\n";
+            }
+        }
+        push @B, $i;
+    }
+    print join(" ", @B);
+    print "\n";
+}
+
+if($error) { exit(1); }
+else { exit(0); }
+
+
+
--- a/egs/rm/s4/steps/align_deltas.sh
+++ b/egs/rm/s4/steps/align_deltas.sh
@ -0,0 +1,78 @@
+#!/bin/bash
+# Copyright 2010-2011 Microsoft Corporation  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from ..
+
+# This script does training-data alignment given a model built using 
+# CMN + delta + delta-delta features.  Its output, all in its own
+# experimental directory, is cmvn.ark, ali, tree, and final.mdl 
+# (the last two are just copied from the source directory). 
+
+# Option to use precompiled graphs from last phase, if these
+# are available (i.e. if they were built with the same data).
+
+# Optional "--graphs <graphs>" must come first.  When given, its value is
+# passed unchanged to gmm-align-compiled as the graphs argument and the
+# transcripts in <data-dir>/text are not used.
+graphs=
+if [ "$1" == --graphs ]; then
+   shift;
+   graphs=$1
+   shift
+fi
+
+
+if [ $# != 4 ]; then
+   echo "Usage: steps/align_deltas.sh <data-dir> <lang-dir> <src-dir> <exp-dir>"
+   echo " e.g.: steps/align_deltas.sh data/train data/lang exp/tri1 exp/tri1_ali"
+   exit 1;
+fi
+
+if [ -f path.sh ]; then . path.sh; fi
+
+data=$1     # reads $data/feats.scp and $data/text
+lang=$2     # reads $lang/words.txt and $lang/L.fst
+srcdir=$3   # source of tree, final.mdl and final.occs
+dir=$4      # output directory
+
+
+
+mkdir -p $dir
+cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1;  # Create copy of the tree and model and occs...
+
+# Scaling options passed to both aligner commands below.
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+
+
+
+echo "Computing cepstral mean and variance statistics"
+compute-cmvn-stats scp:$data/feats.scp \
+     ark:$dir/cmvn.ark 2>$dir/cmvn.log || exit 1;
+
+# Feature pipeline given to the aligner: apply the CMVN stats computed above
+# (with --norm-vars=false), then add-deltas.
+feats="ark:apply-cmvn --norm-vars=false ark:$dir/cmvn.ark scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+# Align all training data using the supplied model.
+
+echo "Aligning all training data"
+if [ -z "$graphs" ]; then # --graphs option not supplied [-z means empty string]
+  # compute integer form of transcripts.
+  scripts/sym2int.pl --ignore-first-field $lang/words.txt < $data/text > $dir/train.tra \
+    || exit 1;
+  gmm-align $scale_opts --beam=8 --retry-beam=40 $dir/tree $dir/final.mdl $lang/L.fst \
+   "$feats" ark:$dir/train.tra ark:$dir/ali 2> $dir/align.log || exit 1;
+  # The integer transcripts were only needed for the alignment above.
+  rm $dir/train.tra
+else
+  gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $dir/final.mdl \
+   "$graphs" "$feats" ark:$dir/ali 2> $dir/align.log || exit 1;
+fi
+
+echo "Done."
--- a/egs/rm/s4/steps/decode_deltas.sh
+++ b/egs/rm/s4/steps/decode_deltas.sh
@ -0,0 +1,77 @@
+#!/bin/bash
+
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# Decoding script that works with a GMM model and delta-delta plus
+# cepstral mean subtraction features.  Used, for example, to decode
+# mono/ and tri1/
+# This script generates lattices and rescores them with different
+# acoustic weights, in order to explore a range of different
+# weights.
+
+if [ $# != 4 ]; then
+   echo "Usage: steps/decode_deltas.sh <model-dir> <data-dir> <lang-dir> <decode-dir>"
+   echo " e.g.: steps/decode_deltas.sh exp/mono data/test_feb89 data/lang_test exp/mono/decode/feb89"
+   exit 1;
+fi
+
+srcdir=$1               # reads $srcdir/final.mdl
+data=$2                 # reads $data/feats.scp and $data/text
+lang=$3                 # reads $lang/words.txt
+dir=$4                  # all outputs and logs are written here
+graphdir=$srcdir/graph  # the decoding graph is expected at $graphdir/HCLG.fst
+
+mkdir -p $dir
+
+if [ -f path.sh ]; then . path.sh; fi
+
+if [ ! -f $srcdir/final.mdl ]; then
+   echo No model file $srcdir/final.mdl
+   exit 1;
+fi
+
+# Refuse to decode with a graph that is missing or older than the model.
+if [[ ! -f $graphdir/HCLG.fst || $graphdir/HCLG.fst -ot $srcdir/final.mdl ]]; then
+   echo "Graph $graphdir/HCLG.fst does not exist or is too old."
+   exit 1;
+fi
+
+# We only do one decoding pass, so there is no point caching the
+# CMVN stats-- we make them part of a pipe.
+feats="ark:compute-cmvn-stats scp:$data/feats.scp ark:- | apply-cmvn --norm-vars=false  ark:- scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+# For Resource Management, we use beam of 20 and acwt of 1/10.
+# More normal, LVCSR setups would have a beam of 13 and acwt of 1/15 or so.
+# If you decode with a beam of 20 on an LVCSR setup it will be very slow.
+
+# Writes gzipped lattices to $dir/lat.gz, plus $dir/test.tra and
+# $dir/test.ali in text form; stderr goes to $dir/decode.log.
+gmm-latgen-simple --beam=20.0 --acoustic-scale=0.1 --word-symbol-table=$lang/words.txt \
+  $srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.gz" \
+  ark,t:$dir/test.tra ark,t:$dir/test.ali \
+     2> $dir/decode.log || exit 1;
+
+# In this setup there are no non-scored words, so
+# scoring is simple.
+
+# Now rescore lattices with various acoustic scales, and compute the WER.
+# For each inverse scale k the best path goes to $dir/k.tra and the
+# compute-wer output to $dir/wer_k.  Neither command is checked for failure
+# here, so a failed scale does not stop the loop.
+for inv_acwt in 4 5 6 7 8 9 10; do
+  acwt=`perl -e "print (1.0/$inv_acwt);"`
+  lattice-best-path --acoustic-scale=$acwt --word-symbol-table=$lang/words.txt \
+     "ark:gunzip -c $dir/lat.gz|" ark,t:$dir/${inv_acwt}.tra \
+     2>$dir/rescore_${inv_acwt}.log
+
+  scripts/sym2int.pl --ignore-first-field $lang/words.txt $data/text | \
+   compute-wer --mode=present ark:-  ark,p:$dir/${inv_acwt}.tra \
+    >& $dir/wer_${inv_acwt}
+done
--- a/egs/rm/s4/steps/make_mfcc.sh
+++ b/egs/rm/s4/steps/make_mfcc.sh
@ -0,0 +1,48 @@
+#!/bin/bash 
+# Copyright 2012 Vassil Panayotov
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from .. (one directory up from here)
+
+if [ $# != 3 ]; then
+   echo "usage: make_mfcc.sh <data-dir> <log-dir> <abs-path-to-mfccdir>";
+   exit 1;
+fi
+
+if [ -f path.sh ]; then . path.sh; fi
+
+data=$1      # must contain mfc.scp; feats.scp is written here
+logdir=$2    # make_mfcc.log is written here
+mfccdir=$3   # the feature archive is written here
+
+# use "name" as part of name of the archive.
+name=`basename "$data"`
+
+mkdir -p "$mfccdir" || exit 1;
+mkdir -p "$logdir" || exit 1;
+
+scp=$data/mfc.scp
+if [ ! -f "$scp" ]; then
+   # Report the file that was actually tested (this used to print the
+   # undefined variable $f).
+   echo "make_mfcc.sh: no such file $scp";
+   exit 1;
+fi
+
+log=$logdir/make_mfcc.log
+
+# Copy the features listed in mfc.scp (read with --sphinx-in=true) into one
+# archive under $mfccdir, writing the matching index to $data/feats.scp.
+# Stop on failure, so that success is not reported for a failed conversion.
+if ! copy-feats --sphinx-in=true \
+ "scp:$scp" "ark,scp:$mfccdir/raw_mfcc_$name.ark,$data/feats.scp" 2>"$log"; then
+   echo "make_mfcc.sh: copy-feats failed, see $log";
+   exit 1;
+fi
+
+echo "Succeeded creating MFCC features for $name"
+
--- a/egs/rm/s4/steps/train_deltas.sh
+++ b/egs/rm/s4/steps/train_deltas.sh
@ -0,0 +1,126 @@
+#!/bin/bash
+# Copyright 2010-2011 Microsoft Corporation  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from ..
+# Triphone model training, using delta-delta features and cepstral
+# mean normalization.  It starts from an existing directory (e.g.
+# exp/mono), supplied as an argument, which is assumed to be built using
+# the same type of features.
+
+if [ $# != 4 ]; then
+   echo "Usage: steps/train_deltas.sh <data-dir> <lang-dir> <ali-dir> <exp-dir>"
+   echo " e.g.: steps/train_deltas.sh data/train data/lang exp/mono_ali exp/tri1"
+   exit 1;
+fi
+
+if [ -f path.sh ]; then . path.sh; fi
+
+data=$1     # reads $data/feats.scp and $data/text
+lang=$2     # reads phones.txt, words.txt, topo, L.fst and silphones.csl
+alidir=$3   # reads final.mdl, ali, tree and cmvn.ark
+dir=$4      # output directory
+
+if [ ! -f $alidir/final.mdl -o ! -f $alidir/ali ]; then
+  echo "Error: alignment dir $alidir does not contain final.mdl and ali"
+  exit 1;
+fi
+
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+# Iterations on which the data is re-aligned before accumulating stats.
+realign_iters="5 10 15 20";  
+silphonelist=`cat $lang/silphones.csl`
+numiters=25    # Number of iterations of training
+maxiterinc=15 # Last iter to increase #Gauss on.
+numleaves=1800 # target num-leaves in tree building.
+# $[ ... ] is bash integer arithmetic: 1800 + 1800/2 = 2700.
+numgauss=$[$numleaves + $numleaves/2];  # starting num-Gauss.
+     # Initially mix up to avg. 1.5 Gauss/state ( a bit more
+     # than this, due to state clustering... then slowly mix 
+     # up to final amount.
+totgauss=9000 # Target #Gaussians
+# (9000 - 2700) / 15 = 420 with the values above.
+incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
+
+
+mkdir -p $dir
+
+
+# Feature pipeline: apply the CMVN stats stored in the alignment directory
+# (with --norm-vars=false), then add-deltas.
+feats="ark:apply-cmvn --norm-vars=false ark:$alidir/cmvn.ark scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+
+
+echo "Accumulating tree stats"
+acc-tree-stats  --ci-phones=$silphonelist $alidir/final.mdl "$feats" \
+   ark:$alidir/ali $dir/treeacc 2> $dir/acc.tree.log  || exit 1;
+
+
+echo "Computing questions for tree clustering"
+
+# phones.list: the last field of every phones.txt line, dropping the line
+# whose value is 0.
+cat $lang/phones.txt | awk '{print $NF}' | grep -v -w 0 > $dir/phones.list
+cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/questions.log || exit 1;
+scripts/int2sym.pl $lang/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
+compile-questions $lang/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;
+
+# Have to make silence root not-shared because we will not split it.
+# (make_roots.pl prints the "not-shared not-split" lines for the silence
+# phones itself; the "shared split" arguments here apply to the other phones.)
+scripts/make_roots.pl --separate $lang/phones.txt $silphonelist shared split \
+    > $dir/roots.txt 2>$dir/roots.log || exit 1;
+
+
+echo "Building tree"
+build-tree --verbose=1 --max-leaves=$numleaves \
+    $dir/treeacc $dir/roots.txt \
+    $dir/questions.qst $lang/topo $dir/tree  2> $dir/train_tree.log || exit 1;
+
+gmm-init-model  --write-occs=$dir/1.occs  \
+    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/init_model.log || exit 1;
+
+# 1.mdl is both the input and the output of this command.
+gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl \
+   2>$dir/mixup.log || exit 1;
+
+#rm $dir/treeacc
+
+# Convert alignments generated from monophone model, to use as initial alignments.
+
+# NOTE: this command has no "|| exit 1" of its own; a failure is only caught
+# by the comparison in the debug step that follows.
+convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree ark:$alidir/ali ark:$dir/cur.ali 2>$dir/convert.log 
+  # Debug step only: convert back and check they're the same.
+  convert-ali $dir/1.mdl $alidir/final.mdl $alidir/tree ark:$dir/cur.ali ark:- \
+   2>/dev/null | cmp - $alidir/ali || exit 1; 
+
+# Make training graphs
+echo "Compiling training graphs"
+compile-train-graphs $dir/tree $dir/1.mdl  $lang/L.fst  \
+  "ark:scripts/sym2int.pl --ignore-first-field $lang/words.txt < $data/text |" \
+  "ark:|gzip -c >$dir/graphs.fsts.gz"  2>$dir/compile_graphs.log  || exit 1;
+
+# Training loop: x runs from 1 to numiters-1.  Each pass reads $x.mdl and
+# writes $[x+1].mdl and $[x+1].occs, then deletes the files of pass x.
+x=1
+while [ $x -lt $numiters ]; do
+   echo Pass $x
+   # Re-align only on the iterations listed in realign_iters (whole-word match).
+   if echo $realign_iters | grep -w $x >/dev/null; then
+     echo "Aligning data"
+     gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $dir/$x.mdl \
+             "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" \
+             ark:$dir/cur.ali 2> $dir/align.$x.log || exit 1;
+   fi
+   gmm-acc-stats-ali --binary=false $dir/$x.mdl "$feats" ark:$dir/cur.ali $dir/$x.acc 2> $dir/acc.$x.log  || exit 1;
+   gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$[$x+1].mdl 2> $dir/update.$x.log || exit 1;
+   rm $dir/$x.mdl $dir/$x.acc
+   rm $dir/$x.occs 
+   # Raise the --mix-up target for the next pass, up to and including
+   # pass maxiterinc.
+   if [[ $x -le $maxiterinc ]]; then 
+      numgauss=$[$numgauss+$incgauss];
+   fi
+   x=$[$x+1];
+done
+
+# After the loop x equals numiters, so the links point at the last model and
+# occs written.  Only a previous final.mdl link is removed first; final.occs
+# is not.
+( cd $dir; rm final.mdl 2>/dev/null; ln -s $x.mdl final.mdl; ln -s $x.occs final.occs )
+
+echo Done
--- a/egs/rm/s4/steps/train_mono.sh
+++ b/egs/rm/s4/steps/train_mono.sh
@ -0,0 +1,105 @@
+#!/bin/bash
+# Copyright 2010-2011 Microsoft Corporation  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from ..
+# Flat start and monophone training, with delta-delta features.
+# This script applies cepstral mean normalization (per utterance: no spk2utt
+# map is given to compute-cmvn-stats), unlike the corresponding script in s1/
+
+# Exactly three positional arguments are required; otherwise print the usage.
+if [ $# -ne 3 ]; then
+  echo "Usage: steps/train_mono.sh <data-dir> <lang-dir> <exp-dir>"
+  echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono"
+  exit 1
+fi
+
+data=$1   # data directory: feats.scp and text are read from it
+lang=$2   # lang directory: topo, L.fst and words.txt are read from it
+dir=$3    # experiment directory: models, alignments and logs go here
+
+# Pull in the recipe's environment settings when a path.sh is present.
+if [ -f path.sh ]; then
+  . path.sh
+fi
+
+# Configuration:
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+numiters=30    # Number of iterations of training
+maxiterinc=20 # Last iter to increase #Gauss on.
+numgauss=250 # Initial num-Gauss (must be more than #states=3*phones).
+totgauss=1000 # Target #Gaussians.  
+incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
+realign_iters="1 2 3 4 5 6 7 8 9 10 12 15 20 25";
+
+mkdir -p $dir
+echo "Computing cepstral mean and variance statistics"
+
+compute-cmvn-stats  scp:$data/feats.scp ark:$dir/cmvn.ark 2>$dir/cmvn.log || exit 1;
+
+feats="ark:apply-cmvn --norm-vars=false ark:$dir/cmvn.ark scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+# compute integer form of transcripts.
+scripts/sym2int.pl --ignore-first-field $lang/words.txt < $data/text > $dir/train.tra \
+  || exit 1;
+
+echo "Initializing monophone system."
+
+gmm-init-mono "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo 39  \
+   $dir/0.mdl $dir/tree 2> $dir/init.log || exit 1;
+
+
+echo "Compiling training graphs"
+compile-train-graphs $dir/tree $dir/0.mdl  $lang/L.fst \
+  ark:$dir/train.tra  "ark:|gzip -c >$dir/graphs.fsts.gz"  \
+  2>$dir/compile_graphs.log || exit 1 
+
+echo Pass 0
+
+align-equal-compiled "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" \
+   ark,t,f:-  2>$dir/align.0.log | \
+ gmm-acc-stats-ali --binary=true $dir/0.mdl "$feats" ark:- \
+     $dir/0.acc 2> $dir/acc.0.log  || exit 1;
+
+# In the following steps, the --min-gaussian-occupancy=3 option is important, otherwise
+# we fail to est "rare" phones and later on, they never align properly.
+
+gmm-est --min-gaussian-occupancy=3  --mix-up=$numgauss \
+    $dir/0.mdl $dir/0.acc $dir/1.mdl 2> $dir/update.0.log || exit 1;
+
+rm $dir/0.acc
+
+beam=4 # will change to 8 below after 1st pass
+x=1
+while [ $x -lt $numiters ]; do
+  echo "Pass $x"
+  if echo $realign_iters | grep -w $x >/dev/null; then
+    echo "Aligning data"
+    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] $dir/$x.mdl \
+        "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" t,ark:$dir/cur.ali \
+        2> $dir/align.$x.log || exit 1;
+  fi
+  gmm-acc-stats-ali --binary=false $dir/$x.mdl "$feats" ark:$dir/cur.ali $dir/$x.acc 2> $dir/acc.$x.log  || exit 1;
+  gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$[$x+1].mdl 2> $dir/update.$x.log || exit 1;
+  rm $dir/$x.mdl $dir/$x.acc $dir/$x.occs 2>/dev/null
+  if [ $x -le $maxiterinc ]; then
+     numgauss=$[$numgauss+$incgauss];
+  fi
+  beam=8
+  x=$[$x+1]
+done
+
+( cd $dir; rm final.mdl 2>/dev/null; ln -s $x.mdl final.mdl; ln -s $x.occs final.occs )
+
+# example of showing the alignments:
+# show-alignments data/lang/phones.txt $dir/30.mdl ark:$dir/cur.ali | head -4
+
--- a/egs/timit/s4/RESULTS
+++ b/egs/timit/s4/RESULTS
@ -0,0 +1,24 @@
+exp/mono/decode_dev_bg/wer_3
+compute-wer --text --mode=present ark:exp/mono/decode_dev_bg/test_trans.filt ark,p:- 
+%WER 33.73 [ 5079 / 15057, 392 ins, 1716 del, 2971 sub ]
+%SER 100.00 [ 400 / 400 ]
+Scored 400 sentences, 0 not present in hyp.
+
+exp/mono/decode_test_bg/wer
+compute-wer --text --mode=present ark:exp/mono/decode_test_bg/test.trans ark,p:exp/mono/decode_test_bg/text 
+%WER 35.68 [ 2574 / 7215, 204 ins, 848 del, 1522 sub ]
+%SER 100.00 [ 192 / 192 ]
+Scored 192 sentences, 0 not present in hyp.
+
+exp/tri1/decode_dev_bg/wer_6
+compute-wer --text --mode=present ark:exp/tri1/decode_dev_bg/test.trans ark,p:- 
+%WER 28.68 [ 4319 / 15057, 474 ins, 1333 del, 2512 sub ]
+%SER 100.00 [ 400 / 400 ]
+Scored 400 sentences, 0 not present in hyp.
+
+exp/tri1/decode_test_bg/wer
+compute-wer --text --mode=present ark:exp/tri1/decode_test_bg/test.trans ark,p:exp/tri1/decode_test_bg/text 
+%WER 31.02 [ 2238 / 7215, 226 ins, 704 del, 1308 sub ]
+%SER 100.00 [ 192 / 192 ]
+Scored 192 sentences, 0 not present in hyp.
+
--- a/egs/timit/s4/conf/dev_spk.list
+++ b/egs/timit/s4/conf/dev_spk.list
@ -0,0 +1,50 @@
+faks0
+fdac1
+fjem0
+mgwt0
+mjar0
+mmdb1
+mmdm2
+mpdf0
+fcmh0
+fkms0
+mbdg0
+mbwm0
+mcsh0
+fadg0
+fdms0
+fedw0
+mgjf0
+mglb0
+mrtk0
+mtaa0
+mtdt0
+mthc0
+mwjg0
+fnmr0
+frew0
+fsem0
+mbns0
+mmjr0
+mdls0
+mdlf0
+mdvc0
+mers0
+fmah0
+fdrw0
+mrcs0
+mrjm4
+fcal1
+mmwh0
+fjsj0
+majc0
+mjsw0
+mreb0
+fgjd0
+fjmg0
+mroa0
+mteb0
+mjfc0
+mrjr0
+fmml0
+mrws1
--- a/egs/timit/s4/conf/mfcc.conf
+++ b/egs/timit/s4/conf/mfcc.conf
@ -0,0 +1 @@
+--use-energy=false   # only non-default option.
--- a/egs/timit/s4/conf/phones.60-48-39.map
+++ b/egs/timit/s4/conf/phones.60-48-39.map
@ -0,0 +1,61 @@
+aa	aa	aa
+ae	ae	ae
+ah	ah	ah
+ao	ao	aa
+aw	aw	aw
+ax	ax	ah
+ax-h	ax	ah
+axr	er	er
+ay	ay	ay
+b	b	b
+bcl	vcl	sil
+ch	ch	ch
+d	d	d
+dcl	vcl	sil
+dh	dh	dh
+dx	dx	dx
+eh	eh	eh
+el	el	l
+em	m	m
+en	en	n
+eng	ng	ng
+epi	epi	sil
+er	er	er
+ey	ey	ey
+f	f	f
+g	g	g
+gcl	vcl	sil
+h#	sil	sil
+hh	hh	hh
+hv	hh	hh
+ih	ih	ih
+ix	ix	ih
+iy	iy	iy
+jh	jh	jh
+k	k	k
+kcl	cl	sil
+l	l	l
+m	m	m
+n	n	n
+ng	ng	ng
+nx	n	n
+ow	ow	ow
+oy	oy	oy
+p	p	p
+pau	sil	sil
+pcl	cl	sil
+q
+r	r	r
+s	s	s
+sh	sh	sh
+t	t	t
+tcl	cl	sil
+th	th	th
+uh	uh	uh
+uw	uw	uw
+ux	uw	uw
+v	v	v
+w	w	w
+y	y	y
+z	z	z
+zh	zh	sh
--- a/egs/timit/s4/conf/test_spk.list
+++ b/egs/timit/s4/conf/test_spk.list
@ -0,0 +1,24 @@
+mdab0
+mwbt0
+felc0
+mtas1
+mwew0
+fpas0
+mjmp0
+mlnt0
+fpkt0
+mlll0
+mtls0
+fjlm0
+mbpm0
+mklt0
+fnlp0
+mcmj0
+mjdh0
+fmgd0
+mgrt0
+mnjm0
+fdhc0
+mjln0
+mpam0
+fmld0
--- a/egs/timit/s4/conf/topo.proto
+++ b/egs/timit/s4/conf/topo.proto
@ -0,0 +1,20 @@
+<Topology> 
+<TopologyEntry> 
+<ForPhones>
+NONSILENCEPHONES
+</ForPhones> 
+<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State> 
+<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State> 
+<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State> 
+<State> 3 </State>
+</TopologyEntry> 
+<TopologyEntry> 
+<ForPhones>
+SILENCEPHONES
+</ForPhones> 
+<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
+<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
+<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
+<State> 3 </State>
+</TopologyEntry> 
+</Topology> 
--- a/egs/timit/s4/local/timit_data_prep.sh
+++ b/egs/timit/s4/local/timit_data_prep.sh
@ -0,0 +1,110 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+
+# Print the given message on stderr (echo -e interprets backslash escapes)
+# and terminate the current shell with status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Extract DIR from an argument of the form --option=DIR, check that it is an
+# existing directory and print its absolute path on stdout.
+# Arguments: $1 - the complete option string, e.g. --work-dir=/some/path
+# Exits with status 1 (through error_exit) when DIR is not a directory or
+# cannot be entered.  Callers use it inside a command substitution, so the
+# exit status is what tells them that the lookup failed.
+function read_dirname () {
+  local dir_name retval
+  # expr exits non-zero when the captured value is empty; that case is
+  # reported by the -d test below, so the status is ignored here.
+  dir_name=$(expr "X$1" : '[^=]*=\(.*\)') || true
+  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory";
+  # Declaration and assignment are separate so that a failing 'cd' is not
+  # masked by the exit status of 'local'.
+  retval=$(cd "$dir_name" 2>/dev/null && pwd) \
+    || error_exit "Cannot enter directory '$dir_name'"
+  echo "$retval"
+}
+
+PROG=$(basename "$0")
+usage="Usage: $PROG <arguments>\n
+Prepare train, dev, test file lists for TIMIT.\n\n
+Required arguments:\n
+  --config-dir=DIR\tDirectory containing the necessary config files\n
+  --corpus-dir=DIR\tDirectory for the TIMIT corpus\n
+  --work-dir=DIR\t\tWorking directory\n
+";
+
+# $usage is expanded unquoted on purpose: word splitting drops the literal
+# newlines of the string so only the explicit \n escapes create line breaks.
+if [ $# -lt 3 ]; then
+  error_exit $usage;
+fi
+
+# Every option has the form --name=DIR; read_dirname validates DIR and
+# prints its absolute path.
+while [ $# -gt 0 ]; do
+  case "$1" in
+  --help) echo -e $usage; exit 0 ;;
+  --config-dir=*)
+    CONFDIR=$(read_dirname "$1"); shift ;;
+  --corpus-dir=*)
+    CORPUS=$(read_dirname "$1"); shift ;;
+  --work-dir=*)
+    WDIR=$(read_dirname "$1"); shift ;;
+  *)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+  esac
+done
+
+# All three options are required.  Without this check a missing one would
+# only show up later as an "unbound variable" abort under 'bash -u'.
+if [ -z "${CONFDIR:-}" ] || [ -z "${CORPUS:-}" ] || [ -z "${WDIR:-}" ]; then
+  error_exit $usage;
+fi
+
+# (1) check if the config files are in place.  All three files are read by
+# the steps below, so each one is verified up front.
+cd "$CONFDIR"
+[ -f test_spk.list ] || error_exit "$PROG: Eval-set speaker list not found.";
+[ -f dev_spk.list ] || error_exit "$PROG: Dev-set speaker list not found.";
+[ -f phones.60-48-39.map ] || error_exit "$PROG: Phone map not found.";
+
+cd "$WDIR"
+[ -f path.sh ] && . path.sh  # Sets the PATH to contain necessary executables
+
+# (2) Build the audio / transcription file lists under data/local.
+mkdir -p data/local
+timit_prep_flists.sh --corpus-dir=$CORPUS \
+  --dev-spk=$CONFDIR/dev_spk.list --test-spk=$CONFDIR/test_spk.list \
+  --work-dir=data
+
+# (3) Normalize the transcripts: the training set is mapped to 48 phones,
+# the dev and test sets to 39.
+phone_map=$CONFDIR/phones.60-48-39.map
+timit_norm_trans.pl -i data/local/train.trans -m $phone_map -to 48 \
+  > data/local/train.trans2
+for x in dev test; do
+  timit_norm_trans.pl -i data/local/${x}.trans -m $phone_map -to 39 \
+    > data/local/${x}.trans2
+done
+
+# The lexicon is an identity mapping: every phone seen in the training
+# transcripts is a "word" pronounced as itself.
+cut -d' ' -f2- data/local/train.trans2 \
+  | tr ' ' '\n' \
+  | sort -u > data/local/p
+paste data/local/p data/local/p > data/local/lexicon.txt
+
+# Add disambiguation symbols to the lexicon (TODO in the original: delete).
+# The count printed by add_lex_disambig.pl is increased by one for the
+# silence disambiguation symbol and stored for later steps.
+ndisambig=$(add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt)
+ndisambig=$(($ndisambig + 1))
+echo $ndisambig > data/local/lex_ndisambig
+
+# Phone symbol table: <eps> is 0, the phones are numbered from 1.
+cut -f2 data/local/lexicon.txt \
+  | awk 'BEGIN{ print "<eps> 0"; } { printf("%s %d\n", $1, NR); }' \
+  > data/local/phones.txt
+
+# Word symbol table: same numbering, with #0 appended as the last symbol.
+cut -f1 data/local/lexicon.txt \
+  | awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} 
+         END{printf("#0 %d\n", NR+1);}' > data/local/words.txt
+
+# (4) Create the phone bigram LM
+# IRSTLM is checked before stderr is redirected into the log so that the
+# message reaches the terminal; ${IRSTLM:-} keeps 'bash -u' from aborting
+# with "unbound variable" before the message can be printed.
+if [ -z "${IRSTLM:-}" ]; then
+  error_exit "LM building won't work without setting the IRSTLM env variable"
+fi
+(
+  # Wrap every training transcript (utterance ID removed) in <s> ... </s>.
+  cut -d' ' -f2- data/local/train.trans2 | sed -e 's:^:<s> :' -e 's:$: </s>:' \
+    > data/local/lm_train.txt
+  build-lm.sh -i data/local/lm_train.txt -n 2 -o data/local/lm_phone_bg.ilm.gz
+  # Convert to ARPA text format, dropping the lines that mention "unk".
+  compile-lm data/local/lm_phone_bg.ilm.gz --text yes /dev/stdout \
+    | grep -v unk | gzip -c > data/local/lm_phone_bg.arpa.gz
+
+) >& data/prepare_lm.log
+
+echo "Finished data preparation."
--- a/egs/timit/s4/local/timit_format_data.sh
+++ b/egs/timit/s4/local/timit_format_data.sh
@ -0,0 +1,136 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+set -o pipefail
+
+# Print the given message on stderr (echo -e interprets backslash escapes)
+# and terminate the current shell with status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Extract DIR from an argument of the form --option=DIR, check that it is an
+# existing directory and print its absolute path on stdout.
+# Arguments: $1 - the complete option string, e.g. --work-dir=/some/path
+# Exits with status 1 (through error_exit) when DIR is not a directory or
+# cannot be entered.  Callers use it inside a command substitution, so the
+# exit status is what tells them that the lookup failed.
+function read_dirname () {
+  local dir_name retval
+  # expr exits non-zero when the captured value is empty; that case is
+  # reported by the -d test below, so the status is ignored here.
+  dir_name=$(expr "X$1" : '[^=]*=\(.*\)') || true
+  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory";
+  # Declaration and assignment are separate so that a failing 'cd' is not
+  # masked by the exit status of 'local'.
+  retval=$(cd "$dir_name" 2>/dev/null && pwd) \
+    || error_exit "Cannot enter directory '$dir_name'"
+  echo "$retval"
+}
+
+PROG=$(basename "$0")
+usage="Usage: $PROG <arguments>\n
+Prepare train, dev, test file lists.\n\n
+Required arguments:\n
+  --hmm-proto=FILE\tPrototype of the HMM topology\n
+  --work-dir=DIR\t\tWorking directory\n
+";
+
+# $usage is expanded unquoted on purpose: word splitting drops the literal
+# newlines of the string so only the explicit \n escapes create line breaks.
+if [ $# -lt 2 ]; then
+  error_exit $usage;
+fi
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+  --help) echo -e $usage; exit 0 ;;
+  --hmm-proto=*)
+    # expr exits non-zero for an empty value; the quoted -f test below then
+    # reports it (unquoted, '[ -f ]' with an empty value would succeed).
+    PROTO=$(expr "X$1" : '[^=]*=\(.*\)') || true
+    [ -f "$PROTO" ] || error_exit "Cannot find HMM prototype file '$PROTO'";
+    # Make the path absolute: the file is read only after this script has
+    # changed into the working directory, where a relative path given on
+    # the command line could point somewhere else.
+    PROTO=$(cd "$(dirname "$PROTO")" && pwd)/$(basename "$PROTO")
+    shift ;;
+  --work-dir=*)
+    WDIR=$(read_dirname "$1"); shift ;;
+  *)  echo "Unknown argument: $1, exiting"; error_exit $usage ;;
+  esac
+done
+
+# Both options are required.  Without this check a missing one would only
+# show up later as an "unbound variable" abort under 'bash -u'.
+if [ -z "${PROTO:-}" ] || [ -z "${WDIR:-}" ]; then
+  error_exit $usage;
+fi
+
+# Everything below uses paths relative to the working directory.
+cd $WDIR
+. path.sh
+
+echo "Preparing train data"
+
+# (0) Populate data/{train,dev,test} with the files prepared in data/local.
+for x in train dev test; do
+  mkdir -p data/$x
+  cp data/local/${x}_wav.scp data/$x/wav.scp
+  cp data/local/${x}.trans2 data/$x/text
+  for f in spk2utt utt2spk; do
+    cp data/local/${x}.$f data/$x/$f
+  done
+done
+
+mkdir -p data/lang
+cp data/local/phones.txt data/lang/
+cp data/local/words.txt data/lang/
+
+# (1) Colon-separated lists of the silence and non-silence phones.
+silphones="cl epi sil vcl"
+silphones.pl data/lang/phones.txt "$silphones" \
+  data/lang/silphones.csl data/lang/nonsilphones.csl
+
+# (2) Training lexicon FST (no disambiguation symbols).
+make_lexicon_fst.pl data/local/lexicon.txt 0.5 sil \
+  | fstcompile --isymbols=data/lang/phones.txt \
+      --osymbols=data/lang/words.txt \
+      --keep_isymbols=false --keep_osymbols=false \
+  | fstarcsort --sort_type=olabel > data/lang/L.fst
+
+# (3) Phone sets for monophone training, roots for tree building, and the
+# clustering phone sets (the monophone sets minus the lines containing sil).
+timit_make_questions.pl -i data/lang/phones.txt \
+  -m data/lang/phonesets_mono.txt -r data/lang/roots.txt
+grep -v sil data/lang/phonesets_mono.txt > data/lang/phonesets_cluster.txt
+
+# (4) Instantiate the HMM topology prototype.  NONSILENCEPHONES is replaced
+# first because SILENCEPHONES is a substring of it.
+silphonelist=$(tr ':' ' ' < data/lang/silphones.csl)
+nonsilphonelist=$(tr ':' ' ' < data/lang/nonsilphones.csl)
+sed -e "s:NONSILENCEPHONES:$nonsilphonelist:" \
+  -e "s:SILENCEPHONES:$silphonelist:" $PROTO > data/lang/topo
+
+echo "Preparing test data"
+
+# (0) Files shared between the training and test lang directories.
+mkdir -p data/lang_test
+for f in phones.txt words.txt L.fst silphones.csl nonsilphones.csl; do
+  cp data/lang/$f data/lang_test/
+done
+
+# (1) Phone list including the disambiguation symbols.
+#     --include-zero adds the #0 symbol that is passed through from G.fst.
+ndisambig=$(cat data/local/lex_ndisambig)
+add_disambig.pl --include-zero data/lang_test/phones.txt $ndisambig \
+  > data/lang_test/phones_disambig.txt
+# The same list is also placed in data/lang (for MMI).
+cp data/lang_test/phones_disambig.txt data/lang/
+
+# (2) Lexicon FST with disambiguation symbols.  The self-loops added by
+#     fstaddselfloops let the #0 symbol coming from G.fst pass through.
+phone_disambig_symbol=$(grep '#0' data/lang_test/phones_disambig.txt | awk '{print $2}')
+word_disambig_symbol=$(grep '#0' data/lang_test/words.txt | awk '{print $2}')
+
+make_lexicon_fst.pl data/local/lexicon_disambig.txt 0.5 sil '#'$ndisambig \
+  | fstcompile --isymbols=data/lang_test/phones_disambig.txt \
+      --osymbols=data/lang_test/words.txt \
+      --keep_isymbols=false --keep_osymbols=false \
+  | fstaddselfloops "echo $phone_disambig_symbol |" \
+      "echo $word_disambig_symbol |" \
+  | fstarcsort --sort_type=olabel > data/lang_test/L_disambig.fst
+
+# Also needed in data/lang for discriminative training.
+cp data/lang_test/L_disambig.fst data/lang/
+
+# (3) Convert the language model to an FST and create the decoding setup.
+timit_format_lms.sh data
+
+echo "Succeeded in formatting data."
--- a/egs/timit/s4/local/timit_format_lms.sh
+++ b/egs/timit/s4/local/timit_format_lms.sh
@ -0,0 +1,71 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+#set -o pipefail
+
+# Print the given message on stderr (echo -e interprets backslash escapes)
+# and terminate the current shell with status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Check that $1 is an existing directory and print its absolute path.
+# Exits with status 1 (through error_exit) when it is not a directory or
+# cannot be entered.  The caller uses it inside a command substitution, so
+# the exit status is what tells it that the lookup failed.
+function read_dirname () {
+  local retval
+  [ -d "$1" ] || error_exit "Argument '$1' not a directory";
+  # Declaration and assignment are separate so that a failing 'cd' is not
+  # masked by the exit status of 'local'.
+  retval=$(cd "$1" 2>/dev/null && pwd) \
+    || error_exit "Cannot enter directory '$1'"
+  echo "$retval"
+}
+
+# Build the decoding lang directory <work_dir>/lang_test_<lm_suffix>: copy
+# the symbol tables and lexicon FSTs from <work_dir>/lang_test, then compile
+# <work_dir>/local/lm_<lm_suffix>.arpa.gz into G.fst.
+# Arguments: $1 - LM suffix (e.g. phone_bg)
+#            $2 - work directory
+function format_lms () {
+  local lm_suffix=$1;
+  local work_dir=$2
+  local test=$work_dir/lang_test_${lm_suffix}
+  local f
+
+  mkdir -p "$test"
+  for f in phones.txt words.txt phones_disambig.txt L.fst L_disambig.fst \
+           silphones.csl nonsilphones.csl; do
+    cp "$work_dir/lang_test/$f" "$test"
+  done
+
+  # Removing all "illegal" combinations of <s> and </s>, which are supposed
+  # to occur only at the beginning/end of an utterance.  These can cause
+  # determinization failures of CLG [ends up being epsilon cycles].
+  # grep -E replaces the obsolescent egrep.
+  gunzip -c "$work_dir/local/lm_${lm_suffix}.arpa.gz" \
+    | grep -E -v '<s> <s>|</s> <s>|</s> </s>' \
+    | arpa2fst - | fstprint \
+    | eps2disambig.pl | s2eps.pl \
+    | fstcompile --isymbols="$test/words.txt" --osymbols="$test/words.txt" \
+      --keep_isymbols=false --keep_osymbols=false \
+    | fstrmepsilon > "$test/G.fst"
+  # fstisstochastic is run for its report only; its exit status must not
+  # abort the script under errexit.
+  fstisstochastic "$test/G.fst" || true
+}
+
+PROG=$(basename $0)
+usage="Usage: $PROG data_dir\n
+ Convert ARPA-format language models to FSTs.\n";
+
+# Exactly one argument, the data directory, is expected.
+[ $# -eq 1 ] || error_exit $usage
+WDIR=$(read_dirname $1)
+
+# For each type of language model, create the corresponding FST and the
+# corresponding lang_test directory.  Only the phone bigram is built here;
+# stdout and stderr of the conversion both go to format_lms.log.
+
+echo "Preparing language models for test"
+format_lms phone_bg $WDIR > $WDIR/format_lms.log 2>&1
--- a/egs/timit/s4/local/timit_make_questions.pl
+++ b/egs/timit/s4/local/timit_make_questions.pl
@ -0,0 +1,58 @@
+#!/usr/bin/perl -w
+
+# Copyright 2012  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# 'phonesets_mono' contains sets of phones that are shared when building the 
+# monophone system and when asking questions based on an automatic clustering 
+# of phones, for the triphone system.  
+# 'roots' contain the information about which phones share a common root in 
+# the phonetic decision tree and which have distinct pdfs. It also states 
+# whether the tree-building should split the roots or not.
+
+my $usage = "Usage: timit_make_questions.pl -i phones -m phoneset_mono -r roots\
+Creates sharerd phonesets for monophone and context-dependent training.\
+Required arguments:\
+  -i\tInput list of phones (can contain stress/position markers)\
+  -m\tOutput shared phoneset for use in monophone training\
+  -r\tOutput sharing and splitting info for context-dependent training\n";
+
+use strict;
+use Getopt::Long;
+my ($in_phones, $mono, $roots, %phoneset);
+GetOptions ("i=s" => \$in_phones,  # Input list of phones
+            "m=s" => \$mono,       # Shared phone-set for monophone system
+            "r=s" => \$roots );    # roots file for context-dependent systems
+
+die "$usage" unless(defined($in_phones) && defined($mono) && defined($roots));
+
+open(P, "<$in_phones") or die "Cannot read from file '$in_phones': $!";
+open(MONO, ">$mono") or die "Cannot write to file '$mono': $!";
+open(ROOTS, ">$roots") or die "Cannot write to file '$roots': $!";
+
+# Each input line is "<phone> <integer-id>".  Phones are grouped by their
+# base name, i.e. the symbol without an optional one-character "_X" marker.
+while (<P>) {
+  chomp;
+  # Skip <eps> and the silence/closure phones, which are written as one
+  # fixed group below.  The match is anchored to the whole symbol so that a
+  # phone that merely contains "cl", "sil", ... is not dropped by accident.
+  next if m/^(?:<eps>|sil|vcl|cl|epi)(?:_\S)?\s/;
+  # The base name is matched non-greedily; a greedy \S+ would swallow the
+  # marker, leaving $2 undefined and every marked phone in its own group.
+  m/^(\S+?)(_.)?\s+\S+$/ or die "Bad line: $_\n";
+  my $full_phone = defined($2)? $1.$2 : $1;
+  push @{$phoneset{$1}}, $full_phone;
+}
+close(P);
+
+# The silence phones share one monophone set and are neither shared nor
+# split in the tree; every other group is shared and split.
+print MONO "cl epi sil vcl\n";
+print ROOTS "not-shared not-split cl epi sil vcl\n";
+foreach my $p (sort keys %phoneset) {
+  print MONO join(" ", @{$phoneset{$p}}), "\n";
+  print ROOTS "shared split ", join(" ", @{$phoneset{$p}}), "\n";
+}
+close(MONO) or die "Cannot close file '$mono': $!";
+close(ROOTS) or die "Cannot close file '$roots': $!";
--- a/egs/timit/s4/local/timit_norm_trans.pl
+++ b/egs/timit/s4/local/timit_norm_trans.pl
@ -0,0 +1,89 @@
+#!/usr/bin/perl -w
+
+# Copyright 2012  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script normalizes the TIMIT phonetic transcripts that have been 
+# extracted in a format where each line contains an utterance ID followed by 
+# the transcript, e.g.:
+# fcke0_si1111 h# hh ah dx ux w iy dcl d ix f ay n ih q h#
+
+my $usage = "Usage: timit_norm_trans.pl -i transcript -m phone_map -from [60|48] -to [48|39] > normalized\n
+Normalizes phonetic transcriptions for TIMIT, by mapping the phones to a 
+smaller set defined by the -m option. This script assumes that the mapping is 
+done in the \"standard\" fashion, i.e. to 48 or 39 phones.  The input is 
+assumed to have 60 phones (+1 for glottal stop, which is deleted), but that can
+be changed using the -from option. The input format is assumed to be utterance 
+ID followed by transcript on the same line.\n";
+
+use strict;
+use Getopt::Long;
+
+# At least one command-line argument is needed; otherwise show the usage.
+die "$usage" unless(@ARGV >= 1);
+
+my $num_phones_in = 60;   # default size of the input phone set
+my ($in_trans, $phone_map, $num_phones_out);
+GetOptions ("i=s"    => \$in_trans,          # Input transcription
+            "m=s"    => \$phone_map,         # File containing phone mappings
+            "from=i" => \$num_phones_in,     # Input #phones: must be 60 or 48
+            "to=i"   => \$num_phones_out );  # Output #phones: must be 48 or 39
+
+# -i, -m and -to have no defaults and must all be supplied.
+die $usage unless(defined($in_trans) && defined($phone_map) &&
+                  defined($num_phones_out));
+
+# Only 60 or 48 input phones and 48 or 39 output phones are accepted, and
+# the mapping must strictly shrink the phone set.
+die "Can only used 60 or 48 for -from (used $num_phones_in)."
+  unless ($num_phones_in == 60 || $num_phones_in == 48);
+die "Can only used 48 or 39 for -to (used $num_phones_out)."
+  unless ($num_phones_out == 48 || $num_phones_out == 39);
+die "Argument to -from ($num_phones_in) must be greater than that to -to ($num_phones_out)."
+  if ($num_phones_out >= $num_phones_in);
+
+
+
+
+# Load the phone map.  Each line holds three columns: the 60-, 48- and
+# 39-phone names.  The source column is chosen by -from and the target
+# column by -to.
+open(M, "<$phone_map") or die "Cannot open mappings file '$phone_map': $!";
+my (%phonemap, %seen_phones);
+my $num_seen_phones = 0;
+while (my $map_line = <M>) {
+  chomp $map_line;
+  next if ($map_line =~ /^q\s*.*$/); # Ignore glottal stops.
+  my ($p60, $p48, $p39) = ($map_line =~ m:^(\S+)\s+(\S+)\s+(\S+)$:)
+    or die "Bad line: $map_line";
+  my $mapped_from = ($num_phones_in == 60) ? $p60 : $p48;
+  my $mapped_to = ($num_phones_out == 48) ? $p48 : $p39;
+  # Count every distinct target phone once.
+  unless (defined($seen_phones{$mapped_to})) {
+    $seen_phones{$mapped_to} = 1;
+    $num_seen_phones += 1;
+  }
+  $phonemap{$mapped_from} = $mapped_to;
+}
+# The map must yield exactly as many distinct target phones as requested.
+die "Trying to map to $num_phones_out phones, but seen only $num_seen_phones"
+  if ($num_seen_phones != $num_phones_out);
+
+# Map every transcript line "<utt-id> <phone> <phone> ..." through %phonemap
+# and print "<utt-id> <mapped phones>" on stdout.
+open(T, "<$in_trans") or die "Cannot open transcription file '$in_trans': $!";
+while (<T>) {
+  chomp;
+  $_ =~ m:^(\S+)\s+(.+): or die "Bad line: $_";
+  my $utt_id = $1;
+  my $trans = $2;
+
+  # Remove glottal stops as whole tokens.  Deleting the letter "q" with a
+  # substitution would also corrupt any other symbol that contains a "q".
+  my @phones = grep { $_ ne 'q' } split(' ', $trans);
+
+  # Fail loudly on a phone missing from the map rather than printing an
+  # empty field, which would silently corrupt the transcript.
+  my @mapped;
+  for my $phone (@phones) {
+    defined($phonemap{$phone})
+      or die "Phone '$phone' of utterance '$utt_id' is not in the phone map '$phone_map'\n";
+    push @mapped, $phonemap{$phone};
+  }
+  print join(" ", $utt_id, @mapped), "\n";
+}
+close(T);
--- a/egs/timit/s4/local/timit_prep_flists.sh
+++ b/egs/timit/s4/local/timit_prep_flists.sh
@ -0,0 +1,121 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+set -o pipefail
+
+# Extract DIR from an argument of the form --option=DIR, check that it is an
+# existing directory and print its absolute path on stdout.
+# Arguments: $1 - the complete option string, e.g. --work-dir=/some/path
+# Exits with status 1 and a message on stderr when DIR is not a directory
+# or cannot be entered.  Callers use it inside a command substitution, so
+# the exit status is what tells them that the lookup failed.
+function read_dirname () {
+  local dir_name retval
+  # expr exits non-zero when the captured value is empty; that case is
+  # reported by the -d test below, so the status is ignored here.
+  dir_name=$(expr "X$1" : '[^=]*=\(.*\)') || true
+  [ -d "$dir_name" ] || { echo "Argument '$dir_name' not a directory" >&2; \
+    exit 1; }
+  # Declaration and assignment are separate so that a failing 'cd' is not
+  # masked by the exit status of 'local'.
+  retval=$(cd "$dir_name" 2>/dev/null && pwd) \
+    || { echo "Cannot enter directory '$dir_name'" >&2; exit 1; }
+  echo "$retval"
+}
+
+PROG=$(basename "$0")
+usage="Usage: $PROG <arguments>\n
+Prepare train, dev, test file lists for TIMIT.\n\n
+Required arguments:\n
+  --corpus-dir=DIR\tDirectory for the TIMIT corpus\n
+  --dev-spk=FILE\tDevelopment set speaker list\n
+  --test-spk=FILE\tCore test set speaker list\n
+  --work-dir=DIR\t\tPlace to write the files (in a 'local' subdirectory)\n
+";
+
+# $usage is expanded unquoted on purpose: word splitting drops the literal
+# newlines of the string so only the explicit \n escapes create line breaks.
+if [ $# -lt 3 ]; then
+  echo -e $usage; exit 1;
+fi
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+  --help) echo -e $usage; exit 0 ;;
+  --corpus-dir=*)
+    CORPUS=$(read_dirname "$1"); shift ;;
+  --dev-spk=*)
+    # expr exits non-zero for an empty value; the check after the loop
+    # reports that case.
+    DEVSPK=$(expr "X$1" : '[^=]*=\(.*\)') || true; shift ;;
+  --test-spk=*)
+    TESTSPK=$(expr "X$1" : '[^=]*=\(.*\)') || true; shift ;;
+  --work-dir=*)
+    WDIR=$(read_dirname "$1"); shift ;;
+  *)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+  esac
+done
+
+# All four options are required.  Without this check a missing one would
+# only show up later as an "unbound variable" abort under 'bash -u'.
+if [ -z "${CORPUS:-}" ] || [ -z "${DEVSPK:-}" ] || [ -z "${TESTSPK:-}" ] \
+  || [ -z "${WDIR:-}" ]; then
+  echo -e $usage >&2; exit 1;
+fi
+[ -f "$DEVSPK" ] || { echo "Cannot find speaker list '$DEVSPK'" >&2; exit 1; }
+[ -f "$TESTSPK" ] || { echo "Cannot find speaker list '$TESTSPK'" >&2; exit 1; }
+
+# The rest of this script addresses the corpus only through the lower-case
+# paths $CORPUS/train and $CORPUS/test, so those are the directories that
+# must exist; an upper-case-only TRAIN/TEST layout would fail further down.
+if [ ! -d "$CORPUS/train" ] || [ ! -d "$CORPUS/test" ]; then
+  echo "Expecting directories $CORPUS/train and $CORPUS/test to exist" \
+    "(lower-case names are required)." >&2
+  exit 1;
+fi
+
+# Scratch directory for intermediate lists; removed when the script exits.
+tmpdir=$(mktemp -d);
+trap 'rm -rf "$tmpdir"' EXIT
+
+# Get the list of speakers. The list of speakers in the 24-speaker core test 
+# set and the 50-speaker development set must be supplied to the script. All
+# speakers in the 'train' directory are used for training.
+tr '[:upper:]' '[:lower:]' < "$DEVSPK" > $tmpdir/dev_spk    # Just in case!
+tr '[:upper:]' '[:lower:]' < "$TESTSPK" > $tmpdir/test_spk  # Just in case!
+
+# Training speakers are the directory names found below train/dr*/.
+ls -d "$CORPUS"/train/dr*/* | sed -e "s:^.*/::" > $tmpdir/train_spk
+
+
+ODIR=$WDIR/local  # Directory to write file lists & transcripts
+mkdir -p $ODIR
+
+for x in train dev test; do
+  # Turn the speaker list into fixed-string patterns of the form "/spk/":
+  # a speaker ID can then only match the speaker directory component of a
+  # path, and a blank line in the list can no longer match every file.
+  sed -e '/^[[:space:]]*$/d' -e 's:.*:/&/:' $tmpdir/${x}_spk \
+    > $tmpdir/${x}_spk.pat
+
+  # First, find the list of audio files (use only si & sx utterances).
+  # Note: train & test sets are under different directories, but doing find on 
+  # both and grepping for the speakers will work correctly.
+  find $CORPUS/{train,test} -not \( -name 'sa*' \) -name '*.wav' \
+    | grep -F -f $tmpdir/${x}_spk.pat > $ODIR/${x}_sph.flist
+  # Utterance IDs are <speaker-dir>_<file-basename>.
+  sed -e 's:.*/\(.*\)/\(.*\).wav$:\1_\2:' $ODIR/${x}_sph.flist \
+    > $tmpdir/${x}_sph.uttids
+  paste $tmpdir/${x}_sph.uttids $ODIR/${x}_sph.flist \
+    | sort -k1,1 > $ODIR/${x}_sph.scp
+
+  # Now, get the transcripts: each line of the output contains an utterance 
+  # ID followed by the transcript.
+  find $CORPUS/{train,test} -not \( -name 'sa*' \) -name '*.phn' \
+    | grep -F -f $tmpdir/${x}_spk.pat > $tmpdir/${x}_phn.flist
+  sed -e 's:.*/\(.*\)/\(.*\).phn$:\1_\2:' $tmpdir/${x}_phn.flist \
+    > $tmpdir/${x}_phn.uttids
+  # The third space-separated field of every .phn line is the phone label;
+  # the labels of one file are joined into a single line.  The error is
+  # reported inline because this script does not define error_exit.
+  while IFS= read -r line; do
+    [ -f "$line" ] \
+      || { echo "Cannot find transcription file '$line'" >&2; exit 1; }
+    cut -f3 -d' ' "$line" | tr '\n' ' ' | sed -e 's: *$:\n:'
+  done < $tmpdir/${x}_phn.flist > $tmpdir/${x}_phn.trans
+  paste $tmpdir/${x}_phn.uttids $tmpdir/${x}_phn.trans \
+    | sort -k1,1 > $ODIR/${x}.trans
+
+  # # Intersect the set of utterances with transcripts with the set of those
+  # # with valid audio.
+  # cut -f1 $tmpdir/${x}.trans \
+  #   | join $tmpdir/${x}_basenames_wav2 - > $tmpdir/${x}_basenames
+  # # Get the common set of WAV files and transcripts.
+  # join $tmpdir/${x}_basenames $tmpdir/${x}_wav.scp \
+  #   > $ODIR/${x}_wav.scp
+  # join $tmpdir/${x}_basenames $tmpdir/${x}.trans \
+  #   > $ODIR/${x}.trans
+
+  # wav.scp entries pipe each file through sph2pipe.
+  awk '{printf("%s sph2pipe -f wav %s |\n", $1, $2);}' < $ODIR/${x}_sph.scp \
+    > $ODIR/${x}_wav.scp
+
+  # The speaker of an utterance is the part of its ID before the first "_".
+  sed -e 's:_.*$::' $tmpdir/${x}_sph.uttids \
+    | paste -d' ' $tmpdir/${x}_sph.uttids - | sort -k1,1 \
+    > $ODIR/${x}.utt2spk
+  utt2spk_to_spk2utt.pl $ODIR/${x}.utt2spk \
+    > $ODIR/${x}.spk2utt;
+done
--- a/egs/timit/s4/path.sh
+++ b/egs/timit/s4/path.sh
@ -0,0 +1,34 @@
+# This contains the locations of the tools and data required for running
+# the GlobalPhone experiments.
+
+KALDIROOT=`cd ../../..; pwd`
+KALDISRC=$KALDIROOT/src
+KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin  
+KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin
+KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/tiedbin:$KALDISRC/lm
+
+FSTBIN=$KALDIROOT/tools/openfst/bin
+LMBIN=$KALDIROOT/tools/irstlm/bin
+
+[ -d $PWD/local ] || { echo "Expecting 'local' subdirectory"; exit 1; }
+[ -d $PWD/utils ] || { echo "Expecting 'utils' subdirectory"; exit 1; }
+[ -d $PWD/steps ] || { echo "Expecting 'steps' subdirectory"; exit 1; }
+
+LOCALUTILS=$PWD/local
+KALDIUTILS=$PWD/utils
+KALDISTEPS=$PWD/steps
+SCRIPTS=$LOCALUTILS:$KALDIUTILS:$KALDISTEPS
+
+# If you already have shorten and sox on your path, comment the following out.
+# Else use install.sh to install them first in the specified locations.
+SPH2PIPE=$KALDIROOT/tools/sph2pipe_v2.5
+[ -x $SPH2PIPE/sph2pipe ] || { echo "Cannot find sph2pipe executable"; }
+TOOLS=$SPH2PIPE
+
+export PATH=$PATH:$KALDIBIN:$FSTBIN:$LMBIN:$SCRIPTS:$TOOLS
+export LC_ALL=C
+export IRSTLM=$KALDIROOT/tools/irstlm
+
+# Site-specific configs:
+[ `hostname -y` == ecdf ] && \
+  { . /etc/profile.d/modules.sh; module add intel/mkl; }
--- a/egs/timit/s4/run.sh
+++ b/egs/timit/s4/run.sh
@ -0,0 +1,77 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE(review): this unconditional exit stops the script before any step below
+# is run; presumably a guard so that the placeholder paths (WORKDIR, the TIMIT
+# corpus directory) are edited, or the commands pasted by hand -- confirm.
+exit 1;
+# This script shows the steps needed to build a phone recognizer for TIMIT.
+
+# This recipe follows the setup first described in:
+#   K. F. Lee and H. W. Hon, "Speaker-independent phone recognition using hidden Markov models," 1988
+# where the training set is mapped to 48 phones and the results are presented
+# on a 39-phone subset of that.
+
+# Set WORKDIR to someplace with enough disk space. That is where MFCCs will
+# get created, as well as the LM in ARPA & FST formats.
+# The recipe directories and path.sh are copied there and every later command
+# runs from inside WORKDIR.
+WORKDIR=/path/with/disk/space
+cp -r conf local utils steps path.sh $WORKDIR
+cd $WORKDIR
+local/timit_data_prep.sh --config-dir=$PWD/conf --corpus-dir=/path/to/TIMIT --work-dir=$WORKDIR
+
+local/timit_format_data.sh --hmm-proto=conf/topo.proto --work-dir=$PWD
+
+# Now make MFCC features.
+mfccdir=$WORKDIR/data/MFCC
+for x in train dev test; do
+  steps/make_mfcc.sh --num-jobs 6 data/$x exp/make_mfcc/$x $mfccdir
+done
+
+# Job submission commands, handed to the steps/ scripts through --qcmd.
+# The queue name and resource requests are presumably site-specific -- adjust.
+decode_cmd="qsub -q all.q@@blade -l ram_free=500M,mem_free=500M"
+train_cmd="qsub -q all.q@@blade -l ram_free=200M,mem_free=200M"
+
+# Monophone system: train, build the decoding graph, then decode the dev set
+# with lattice generation and score the lattices.
+steps/train_mono.sh --num-jobs 10 --qcmd "$train_cmd" \
+  data/train data/lang exp/mono
+utils/mkgraph.sh --mono data/lang_test_phone_bg exp/mono exp/mono/graph_bg
+steps/decode_deltas.sh --accwt 1.0 --beam 20.0 --latgen --num-jobs 6 \
+  --qcmd "$decode_cmd" exp/mono/graph_bg data/dev exp/mono/decode_dev_bg
+utils/score_lats.sh exp/mono/decode_dev_bg exp/mono/graph_bg/words.txt \
+  data/dev conf/phones.60-48-39.map 
+# Reduce each "wer_N:%WER value [ ..." line to "N value", keep the line with
+# the smallest value, and use 1/N as the acoustic weight for the test set.
+opt_accwt=`grep WER exp/mono/decode_dev_bg/wer_* \
+  | sed -e 's?.*wer_??' -e 's?:%WER??' -e 's?\[.*??' | sort -k2,2 -g \
+  | head -1 | awk '{print 1/$1}'`
+steps/decode_deltas.sh --accwt $opt_accwt --beam 20.0 --num-jobs 4 \
+  --qcmd "$decode_cmd" exp/mono/graph_bg data/test exp/mono/decode_test_bg
+utils/score_text.sh exp/mono/decode_test_bg exp/mono/graph_bg/words.txt \
+  data/test conf/phones.60-48-39.map 
+
+# Align the training data with the monophone model; the alignments are the
+# starting point for the tri1 system trained next.
+steps/align_deltas.sh --num-jobs 10 --qcmd "$train_cmd" \
+  data/train data/lang exp/mono exp/mono_ali
+
+# Arguments 2000 and 10000 are <num-leaves> and <tot-gauss> of train_deltas.sh.
+steps/train_deltas.sh --num-jobs 10 --qcmd "$train_cmd" \
+  2000 10000 data/train data/lang exp/mono_ali exp/tri1
+
+# Same graph / dev decode / weight selection / test decode sequence as for the
+# monophone system, now with the tri1 model.
+utils/mkgraph.sh data/lang_test_phone_bg exp/tri1 exp/tri1/graph_bg
+steps/decode_deltas.sh --accwt 1.0 --beam 20.0 --latgen --num-jobs 6 \
+  --qcmd "$decode_cmd" exp/tri1/graph_bg data/dev exp/tri1/decode_dev_bg
+utils/score_lats.sh exp/tri1/decode_dev_bg exp/tri1/graph_bg/words.txt \
+  data/dev conf/phones.60-48-39.map 
+opt_accwt=`grep WER exp/tri1/decode_dev_bg/wer_* \
+  | sed -e 's?.*wer_??' -e 's?:%WER??' -e 's?\[.*??' | sort -k2,2 -g \
+  | head -1 | awk '{print 1/$1}'`
+steps/decode_deltas.sh --accwt $opt_accwt --beam 20.0 --num-jobs 4 \
+  --qcmd "$decode_cmd" exp/tri1/graph_bg data/test exp/tri1/decode_test_bg
+utils/score_text.sh exp/tri1/decode_test_bg exp/tri1/graph_bg/words.txt \
+  data/test conf/phones.60-48-39.map 
+
--- a/egs/timit/s4/steps/align_deltas.sh
+++ b/egs/timit/s4/steps/align_deltas.sh
@ -0,0 +1,138 @@
+#!/bin/bash
+
+# Copyright 2010-2012  Microsoft Corporation;  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from ..
+
+# This script does training-data alignment given a model built using 
+# CMN + delta + delta-delta features.  It splits the data into
+# four chunks and does everything in parallel on the same machine.
+# Its output, all in its own experimental directory, is (assuming
+# you don't change the #jobs with the --num-jobs option),
+# {0,1,2,3}.cmvn {0,1,2,3}.ali.gz, tree, final.mdl 
+# and final.occs (the last three are just copied from the source directory). 
+
+
+# Option to use precompiled graphs from last phase, if these
+# are available (i.e. if they were built with the same data).
+# These must be split into four pieces.
+
+# Report a fatal error: write the arguments to stderr through 'echo -e' (so
+# that \n and \t escapes in the message are expanded) and exit with status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Echo the integer given in $1 in canonical form (sign kept, leading zeros
+# removed).  A "--switch=ARG" prefix, i.e. everything up to the last '=', is
+# dropped first.  Calls error_exit if what remains is not an optionally
+# signed decimal integer.  When the output is captured with command
+# substitution, error_exit only ends that subshell, so the caller has to
+# check the exit status as well.
+function readint () {
+  local retval=${1/#*=/}   # In case --switch=ARG format was used
+  local sign=""
+  [[ "$retval" =~ ^-?[0-9]+$ ]] \
+    || error_exit "Argument \"$retval\" not an integer."
+  if [[ "$retval" == -* ]]; then
+    sign="-"; retval=${retval#-}
+  fi
+  # Strip every leading 0 but keep a lone "0", so that 0 itself is a valid
+  # value and inputs such as 007 are accepted as 7.
+  while [[ "$retval" == 0?* ]]; do
+    retval=${retval#0}
+  done
+  [[ "$retval" == 0 ]] && sign=""   # print "-0" as plain 0
+  echo "$sign$retval"
+}
+
+njobs=4          # Default number of jobs
+qcmd=""          # Options for the submit_jobs.sh script
+oldgraphs=false  # Set to true by --use-graphs
+
+PROG=`basename $0`;
+usage="Usage: $PROG [options]  <data-dir> <lang-dir> <src-dir> <exp-dir>\n
+e.g.: $PROG data/train data/lang exp/tri1 exp/tri1_ali\n\n
+Options:\n
+  --help\t\tPrint this message and exit\n
+  --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n
+  --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
+  --use-graphs\tReuse older graphs\n
+";
+
+# Parse the leading options; the first argument that does not start with '-'
+# ends option parsing.  $usage is expanded unquoted below: word splitting
+# removes its literal newlines, so only the \n escapes produce line breaks.
+while [ $# -gt 0 ]; do
+  case "${1# *}" in  # ${1# *} strips one leading space from the argument
+    --help) echo -e $usage; exit 0 ;;
+    --num-jobs)
+      # readint runs in a subshell, so its error_exit cannot stop this
+      # script; abort here when it reports a bad value.
+      shift; njobs=`readint "$1"` || exit 1;
+      [ "$njobs" -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive.";
+      shift ;;
+    --qcmd)
+      shift; qcmd=" --qcmd=${1}"; shift ;;
+    --use-graphs)
+      oldgraphs=true; shift ;;
+    -*)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+    *)   break ;;   # end of options: interpreted as the data-dir
+  esac
+done
+
+# Exactly four positional arguments must remain.
+if [ $# != 4 ]; then
+  error_exit $usage;
+fi
+
+[ -f path.sh ] && . path.sh
+
+data=$1
+lang=$2
+srcdir=$3
+dir=$4
+
+# Option handed to sym2int.pl: the contents of $lang/oov.txt via --map-oov
+# when that file exists, --ignore-oov otherwise.
+if [ -f $lang/oov.txt ]; then
+  oov_opt="--map-oov '"`cat $lang/oov.txt`"'"
+else
+  oov_opt='--ignore-oov'
+fi
+
+mkdir -p $dir
+# Create copy of the tree and model and occs...
+cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1;
+
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+
+# (Re)split the data when the split directory is missing or older than
+# feats.scp.
+if [ ! -d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then
+  split_data.sh $data $njobs
+fi
+
+# TASK_ID in the arguments below is presumably replaced with the job index by
+# submit_jobs.sh (not visible here) -- confirm.
+echo "Computing cepstral mean and variance statistics"
+submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/cmvnTASK_ID.log \
+  compute-cmvn-stats --spk2utt=ark:$data/split$njobs/TASK_ID/spk2utt \
+    scp:$data/split$njobs/TASK_ID/feats.scp ark:$dir/TASK_ID.cmvn \
+    || error_exit "Computing CMN/CVN stats failed.";
+
+
+# Align all training data using the supplied model.
+echo "Aligning data from $data"
+feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+if $oldgraphs; then
+  # Check that every per-job graph archive exists.  A brace range such as
+  # {1..$njobs} cannot be used for this: brace expansion is performed before
+  # variable expansion, so the range would be left as a literal string.
+  for ((n=1; n<=njobs; n++)); do
+    [ -f $srcdir/$n.fsts.gz ] \
+      || error_exit "Missing FSTs with --use-graphs option specified."
+  done
+  submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/alignTASK_ID.log \
+    gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/final.mdl \
+      "ark:gunzip -c $srcdir/TASK_ID.fsts.gz|" "$feats" "ark:|gzip -c >$dir/TASK_ID.ali.gz" \
+      || error_exit "Error doing alignment.";
+
+else
+  # compute integer form of transcripts.
+  tra="ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt $data/split$njobs/TASK_ID/text|";
+  # We could just use gmm-align in the next line, but it's less efficient as
+  # it compiles the training graphs one by one.
+  # The pipe is escaped so that it is passed on to submit_jobs.sh as part of
+  # the command instead of being interpreted by this shell.
+  submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/alignTASK_ID.log \
+    compile-train-graphs $dir/tree $dir/final.mdl  $lang/L.fst "$tra" ark:- \| \
+      gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/final.mdl \
+      ark:- "$feats" "ark:|gzip -c >$dir/TASK_ID.ali.gz" \
+      || error_exit "Error doing alignment.";
+fi
+
+echo "Done aligning data."
--- a/egs/timit/s4/steps/decode_deltas.sh
+++ b/egs/timit/s4/steps/decode_deltas.sh
@ -0,0 +1,125 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# Decoding script that works with a GMM model and delta-delta plus
+# cepstral mean subtraction features.  Used, for example, to decode
+# mono/ and tri1/
+# This script just generates lattices for a single broken-up
+# piece of the data.
+
+# Report a fatal error: write the arguments to stderr through 'echo -e' (so
+# that \n and \t escapes in the message are expanded) and exit with status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Echo the real number given in $1 unchanged after validating it.  A
+# "--switch=ARG" prefix, i.e. everything up to the last '=', is dropped
+# first.  Accepted forms: optional sign, then digits with an optional
+# fraction (20, 20., 20.0, 0.0833) or a bare fraction (.5), then an optional
+# exponent (1e-05).  Calls error_exit for anything else, including the empty
+# string.  When the output is captured with command substitution, error_exit
+# only ends that subshell, so the caller has to check the exit status too.
+function readfloat () {
+  local retval=${1/#*=/}   # In case --switch=ARG format was used
+  # Kept in a variable so the regex needs no shell quoting inside [[ ]].
+  local float_re='^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
+  [[ "$retval" =~ $float_re ]] \
+    || error_exit "Argument \"$retval\" not a real number."
+  echo "$retval"
+}
+
+# Echo the integer given in $1 in canonical form (sign kept, leading zeros
+# removed).  A "--switch=ARG" prefix, i.e. everything up to the last '=', is
+# dropped first.  Calls error_exit if what remains is not an optionally
+# signed decimal integer.  When the output is captured with command
+# substitution, error_exit only ends that subshell, so the caller has to
+# check the exit status as well.
+function readint () {
+  local retval=${1/#*=/}   # In case --switch=ARG format was used
+  local sign=""
+  [[ "$retval" =~ ^-?[0-9]+$ ]] \
+    || error_exit "Argument \"$retval\" not an integer."
+  if [[ "$retval" == -* ]]; then
+    sign="-"; retval=${retval#-}
+  fi
+  # Strip every leading 0 but keep a lone "0", so that 0 itself is a valid
+  # value and inputs such as 007 are accepted as 7.
+  while [[ "$retval" == 0?* ]]; do
+    retval=${retval#0}
+  done
+  [[ "$retval" == 0 ]] && sign=""   # print "-0" as plain 0
+  echo "$sign$retval"
+}
+
+accwt=1.0   # Default acoustic scale
+beam=30.0   # Default decoder beam
+latgen=0    # Set to 1 by --latgen
+njobs=4     # Default number of jobs
+qcmd=""     # Options for the submit_jobs.sh script
+
+PROG=`basename $0`;
+usage="Usage: $PROG [options] <graph-dir> <data-dir> <decode-dir>\n
+e.g.: $PROG exp/mono/graph_bg data/dev exp/mono/decode_dev_bg\n\n
+Options:\n
+  --help\t\tPrint this message and exit\n
+  --accwt FLOAT\tScaling for acoustic likelihoods (default=$accwt).\n
+  --beam FLOAT\tDecoder beam (default=$beam)\n
+  --latgen\tGenerate lattices (off by default)\n
+  --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n
+  --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
+";
+
+# Parse the leading options; the first argument that does not start with '-'
+# ends option parsing.  readfloat and readint run in a subshell, so their
+# error_exit cannot stop this script: each call is followed by '|| exit 1'.
+while [ $# -gt 0 ]; do
+  case "${1# *}" in  # ${1# *} strips one leading space from the argument
+    --help) echo -e $usage; exit 0 ;;
+    --accwt)
+      shift; accwt=`readfloat "$1"` || exit 1; shift ;;
+    --beam)
+      shift; beam=`readfloat "$1"` || exit 1; shift ;;
+    --latgen) shift; latgen=1 ;;
+    --num-jobs)
+      shift; njobs=`readint "$1"` || exit 1;
+      [ "$njobs" -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive.";
+      shift ;;
+    --qcmd)
+      shift; qcmd="--qcmd=${1}"; shift ;;
+    -*)  error_exit "Unknown argument: $1, exiting\n$usage" ;;
+    *)   break ;;   # end of options: interpreted as the graph-dir
+  esac
+done
+
+# Exactly three positional arguments must remain.
+if [ $# != 3 ]; then
+  error_exit $usage;
+fi
+
+# Pick up PATH and tool locations when path.sh is present.
+[ -f path.sh ] && . path.sh
+
+graphdir=$1
+data=$2
+dir=$3
+srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
+
+mkdir -p $dir
+
+# Stop early when any of the required input files is missing.
+requirements="$data/feats.scp $srcdir/final.mdl $graphdir/HCLG.fst"
+for f in $requirements; do
+  if [ ! -f $f ]; then
+    echo "decode_deltas.sh: no such file $f";
+    exit 1;
+  fi
+done
+
+# We only do one decoding pass, so there is no point caching the
+# CMVN stats-- we make them part of a pipe.
+feats="ark:compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:- scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |"
+# With more than one job the same pipe is built on the per-job split of the
+# data; the split is (re)created when missing or older than feats.scp.
+# TASK_ID is presumably substituted per job by submit_jobs.sh -- confirm.
+if [ $njobs -gt 1 ]; then
+  if [ ! -d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then
+    split_data.sh $data $njobs
+  fi
+  mydata=$data/split$njobs/TASK_ID
+  feats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | add-deltas ark:- ark:- |"
+fi
+
+# --latgen writes gzipped lattices (lat.TASK_ID.gz); otherwise the decoder
+# output goes to text-mode archives (test.TASK_ID.tra).
+if [ $latgen -eq 1 ]; then
+  submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/decode.TASK_ID.log \
+    gmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 \
+      --acoustic-scale=$accwt --word-symbol-table=$graphdir/words.txt \
+      $srcdir/final.mdl $graphdir/HCLG.fst "$feats" \
+      "ark:|gzip -c > $dir/lat.TASK_ID.gz" || error_exit "Decoding failed.";
+else
+  submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/decode.TASK_ID.log \
+    gmm-decode-faster --beam=$beam --acoustic-scale=$accwt \
+      --word-symbol-table=$graphdir/words.txt $srcdir/final.mdl \
+      $graphdir/HCLG.fst "$feats" ark,t:$dir/test.TASK_ID.tra \
+      || error_exit "Decoding failed.";
+fi
--- a/egs/timit/s4/steps/make_mfcc.sh
+++ b/egs/timit/s4/steps/make_mfcc.sh
@ -0,0 +1,111 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from .. (one directory up from here)
+
+# Report a fatal error: write the arguments to stderr through 'echo -e' (so
+# that \n and \t escapes in the message are expanded) and exit with status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Echo the integer given in $1 in canonical form (sign kept, leading zeros
+# removed).  A "--switch=ARG" prefix, i.e. everything up to the last '=', is
+# dropped first.  Calls error_exit if what remains is not an optionally
+# signed decimal integer.  When the output is captured with command
+# substitution, error_exit only ends that subshell, so the caller has to
+# check the exit status as well.
+function readint () {
+  local retval=${1/#*=/}   # In case --switch=ARG format was used
+  local sign=""
+  [[ "$retval" =~ ^-?[0-9]+$ ]] \
+    || error_exit "Argument \"$retval\" not an integer."
+  if [[ "$retval" == -* ]]; then
+    sign="-"; retval=${retval#-}
+  fi
+  # Strip every leading 0 but keep a lone "0", so that 0 itself is a valid
+  # value and inputs such as 007 are accepted as 7.
+  while [[ "$retval" == 0?* ]]; do
+    retval=${retval#0}
+  done
+  [[ "$retval" == 0 ]] && sign=""   # print "-0" as plain 0
+  echo "$sign$retval"
+}
+
+njobs=4   # Default number of jobs
+stage=-4  # NOTE(review): not read anywhere in this script -- confirm it can go
+qcmd=""   # Options for the submit_jobs.sh script
+
+PROG=`basename $0`;
+usage="Usage: $PROG [options] <data-dir> <log-dir> <abs-path-to-mfccdir>\n\n
+Options:\n
+  --help\t\tPrint this message and exit\n
+  --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n
+  --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
+";
+
+# Parse the leading options; the first argument that does not start with '-'
+# ends option parsing.  $usage is expanded unquoted below: word splitting
+# removes its literal newlines, so only the \n escapes produce line breaks.
+while [ $# -gt 0 ]; do
+  case "${1# *}" in  # ${1# *} strips one leading space from the argument
+    --help) echo -e $usage; exit 0 ;;
+    --num-jobs)
+      # readint runs in a subshell, so its error_exit cannot stop this
+      # script; abort here when it reports a bad value.
+      shift; njobs=`readint "$1"` || exit 1;
+      [ "$njobs" -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive.";
+      shift ;;
+    --qcmd)
+      shift; qcmd="--qcmd=${1}"; shift ;;
+    -*)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+    *)   break ;;   # end of options: interpreted as the data-dir
+  esac
+done
+
+# Exactly three positional arguments must remain.
+if [ $# != 3 ]; then
+  error_exit $usage;
+fi
+
+# Pick up PATH and tool locations when path.sh is present.
+[ -f path.sh ] && . path.sh
+
+data=$1
+logdir=$2
+mfccdir=$3
+
+# use "name" as part of name of the archive.
+name=`basename $data`
+
+mkdir -p $mfccdir || exit 1;
+mkdir -p $logdir || exit 1;
+
+# Inputs: the list of audio entries and the MFCC configuration file.
+scp=$data/wav.scp
+config=conf/mfcc.conf
+required="$scp $config"
+
+for f in $required; do
+  if [ ! -f $f ]; then
+    echo "make_mfcc.sh: no such file $f"
+    exit 1;
+  fi
+done
+
+# note: in general, the double-parenthesis construct in bash "((" is "C-style
+# syntax" where we can get rid of the $ for variable names, and omit spaces.
+# The "for" loop in this style is a special construct.
+
+# Build the list of per-job scp files ($logdir/wav1.scp ... wav$njobs.scp)
+# that split_scp.pl is asked to write.
+split_scps=""
+for ((n=1; n<=njobs; n++)); do
+  split_scps="$split_scps $logdir/wav$n.scp"
+done
+
+split_scp.pl $scp $split_scps || exit 1;
+
+rm -f $logdir/.error.$name 2>/dev/null
+# One job per split; each writes an ark/scp pair into $mfccdir.  On failure
+# the tail of the job logs is appended to the error message.
+submit_jobs.sh "$qcmd" --njobs=$njobs --log=$logdir/make_mfcc.TASK_ID.log \
+  compute-mfcc-feats --verbose=2 --config=$config scp:$logdir/wavTASK_ID.scp \
+  ark,scp:$mfccdir/mfcc_$name.TASK_ID.ark,$mfccdir/mfcc_$name.TASK_ID.scp \
+  || error_exit "Error producing mfcc features for $name:"`tail $logdir/make_mfcc.*.log`
+
+# concatenate the .scp files together.
+# (any existing feats.scp is removed first because the loop appends)
+rm $data/feats.scp 2>/dev/null
+for ((n=1; n<=njobs; n++)); do
+  cat $mfccdir/mfcc_$name.$n.scp >> $data/feats.scp
+done
+
+# rm $logdir/wav*.scp
+
+echo "Succeeded creating MFCC features for $name"
--- a/egs/timit/s4/steps/train_deltas.sh
+++ b/egs/timit/s4/steps/train_deltas.sh
@ -0,0 +1,256 @@
+#!/bin/bash
+
+# Copyright 2010-2012  Microsoft Corporation;  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from ..
+# Triphone model training, using (e.g. MFCC) + delta + acceleration features and
+# cepstral mean normalization.  It starts from an existing directory (e.g.
+# exp/mono), supplied as an argument, which is assumed to be built using the same
+# type of features.
+#
+# This script starts from previously generated state-level alignments
+# (in $alidir), e.g. generated by a previous monophone or triphone
+# system.  To build a context-dependent triphone system, we build 
+# decision trees that map a 3-phone phonetic context window to a
+# pdf index.  It's not really clear which is the right reference, but
+# one is "Tree-based state tying for high accuracy acoustic modelling"
+# by Steve Young et al.  
+# In a typical approach, there are decision trees for
+# each monophone HMM-state (i.e. 3 per phone), and each one gets to
+# ask questions about the left and right phone.  These questions
+# correspond to sets of phones, corresponding to phonetic classes
+# (e.g. vowel, consonant, liquid, solar, ... ).  In Kaldi, we prefer
+# fully automatic algorithms, and anyway we're not sure where to get
+# these types of lists, so we just generate the classes automatically.
+# This is based on a top-down binary tree clustering of the phones
+# (see "cluster-phones"), where we take single-Gaussian statistics for 
+# just the central state of each phone (assuming this to be more 
+# representative of the phones), and we get a tree structure on the
+# phones; each class corresponds to a node of the tree (it contains all 
+# the phones that are children of that node).  Note: you could
+# replace questions.txt with something derived from manually written
+# questions.
+#  Also, the roots of the tree correspond to classes of phones (typically
+# corresponding to "real phones", because the actual phones may contain
+# word-begin/end and stress information), and the tree gets to ask
+# questions also about the central phone, and about the state in the HMM.
+#  After building the tree, we do a number of iterations of Gaussian
+# Mixture Model training; on selected iterations we redo the Viterbi
+# alignments (initially, these are taken from the previous system).
+# The Gaussian mixture splitting, whereby we go from a single Gaussian
+# per state to multiple Gaussians, is done on all iterations (although
+# we stop doing this a few iterations before the end).  We don't have
+# a fixed number of Gaussians per state, but we have an overall target
+# #Gaussians that's specified on each iteration, and we allocate
+# the Gaussians among states according to a power-law where the #Gaussians
+# is proportional to the count to the power 0.2.  The target
+# increases linearly during training [note: logarithmically seems more
+# natural but didn't work as well.]
+
+# Report a fatal error: write the arguments to stderr through 'echo -e' (so
+# that \n and \t escapes in the message are expanded) and exit with status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Echo the integer given in $1 in canonical form (sign kept, leading zeros
+# removed).  A "--switch=ARG" prefix, i.e. everything up to the last '=', is
+# dropped first.  Calls error_exit if what remains is not an optionally
+# signed decimal integer.  When the output is captured with command
+# substitution, error_exit only ends that subshell, so the caller has to
+# check the exit status as well.
+function readint () {
+  local retval=${1/#*=/}   # In case --switch=ARG format was used
+  local sign=""
+  [[ "$retval" =~ ^-?[0-9]+$ ]] \
+    || error_exit "Argument \"$retval\" not an integer."
+  if [[ "$retval" == -* ]]; then
+    sign="-"; retval=${retval#-}
+  fi
+  # Strip every leading 0 but keep a lone "0", so that 0 itself is a valid
+  # value (e.g. for a stage number) and inputs such as 007 are accepted as 7.
+  while [[ "$retval" == 0?* ]]; do
+    retval=${retval#0}
+  done
+  [[ "$retval" == 0 ]] && sign=""   # print "-0" as plain 0
+  echo "$sign$retval"
+}
+
+njobs=4    # Default number of jobs
+stage=-4   # Default starting stage (start with tree building)
+qcmd=""    # Options for the submit_jobs.sh script
+
+PROG=`basename $0`;
+usage="Usage: $PROG [options] <num-leaves> <tot-gauss> <data-dir> <lang-dir> <ali-dir> <exp-dir>\n
+e.g.: $PROG 2000 10000 data/train_si84 data/lang exp/mono_ali exp/tri1\n\n
+Options:\n
+  --help\t\tPrint this message and exit\n
+  --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n
+  --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
+  --stage INT\tStarting stage (e.g. -4 for tree building; 2 for iter 2; default=$stage)\n
+";
+
+# Parse the leading options; the first argument that does not start with '-'
+# ends option parsing.  readint runs in a subshell, so its error_exit cannot
+# stop this script: each call is followed by '|| exit 1'.  $usage is expanded
+# unquoted below: word splitting removes its literal newlines, so only the \n
+# escapes produce line breaks.
+while [ $# -gt 0 ]; do
+  case "${1# *}" in  # ${1# *} strips one leading space from the argument
+    --help) echo -e $usage; exit 0 ;;
+    --num-jobs)
+      shift; njobs=`readint "$1"` || exit 1;
+      [ "$njobs" -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive.";
+      shift ;;
+    --qcmd)
+      shift; qcmd=" --qcmd=${1}"; shift ;;
+    --stage)
+      shift; stage=`readint "$1"` || exit 1; shift ;;
+    -*)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+    *)   break ;;   # end of options: interpreted as num-leaves
+  esac
+done
+
+# Exactly six positional arguments must remain.
+if [ $# != 6 ]; then
+  error_exit $usage;
+fi
+
+# Pick up PATH and tool locations when path.sh is present.
+[ -f path.sh ] && . path.sh
+
+numleaves=$1
+totgauss=$2
+data=$3
+lang=$4
+alidir=$5
+dir=$6
+
+if [ ! -f $alidir/final.mdl ]; then
+  echo "Error: alignment dir $alidir does not contain final.mdl"
+  exit 1;
+fi
+
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+realign_iters="10 20 30";
+silphonelist=`cat $lang/silphones.csl`
+numiters=35    # Number of iterations of training
+maxiterinc=25 # Last iter to increase #Gauss on.
+# Start with one Gaussian per leaf and grow by a fixed (integer) step on each
+# of the first $maxiterinc iterations.
+numgauss=$numleaves
+incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
+
+# Option handed to sym2int.pl: the contents of $lang/oov.txt via --map-oov
+# when that file exists, --ignore-oov otherwise.
+if [ -f $lang/oov.txt ]; then
+  oov_opt="--map-oov '"`cat $lang/oov.txt`"'"
+else
+  oov_opt='--ignore-oov'
+fi
+
+mkdir -p $dir/log
+# (Re)split the data when the split directory is missing or older than
+# feats.scp.
+if [ ! -d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then
+  split_data.sh $data $njobs
+fi
+
+# Per-job feature pipe.  It reads the CMVN stats from $alidir, and TASK_ID is
+# presumably substituted per job by submit_jobs.sh -- confirm.
+# for n in `get_splits.pl $njobs`; do
+featspart="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$njobs/TASK_ID/utt2spk ark:$alidir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+if [ $stage -le -3 ]; then
+# The next stage assumes we won't need the context of silence, which
+# assumes something about $lang/roots.txt, but it seems pretty safe.
+  echo "Accumulating tree stats"
+  # for n in `get_splits.pl $njobs`; do
+  submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/acc_tree.TASK_ID.log \
+    acc-tree-stats  --ci-phones=$silphonelist $alidir/final.mdl "$featspart" \
+      "ark:gunzip -c $alidir/TASK_ID.ali.gz|" $dir/TASK_ID.treeacc \
+      || error_exit "Error accumulating tree stats";
+  
+  # Merge the per-job statistics into $dir/treeacc, then drop the pieces.
+  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log \
+      || error_exit "Error summing tree stats.";
+  rm $dir/*.treeacc
+fi
+
+if [ $stage -le -2 ]; then
+# preparing questions, roots file...
+  echo "Computing questions for tree clustering"
+  # The subshell's status is that of its last command (the roots.txt
+  # conversion), so that is the failure the error_exit below reports.
+  ( sym2int.pl $lang/phones.txt $lang/phonesets_cluster.txt > $dir/phonesets.txt
+    cluster-phones $dir/treeacc $dir/phonesets.txt $dir/questions.txt \
+      2> $dir/log/questions.log
+    [ -f $lang/extra_questions.txt ] && \
+      sym2int.pl $lang/phones.txt $lang/extra_questions.txt \
+      >> $dir/questions.txt
+    compile-questions $lang/topo $dir/questions.txt $dir/questions.qst \
+      2>$dir/log/compile_questions.log
+    sym2int.pl --ignore-oov $lang/phones.txt $lang/roots.txt > $dir/roots.txt
+  ) || error_exit "Error in generating questions for tree clustering."
+
+  echo "Building tree"
+  submit_jobs.sh "$qcmd" --log=$dir/log/train_tree.log \
+    build-tree --verbose=1 --max-leaves=$numleaves $dir/treeacc $dir/roots.txt \
+      $dir/questions.qst $lang/topo $dir/tree \
+      || error_exit "Error in building tree.";
+
+  # Initial model (1.mdl) and occupancies (1.occs) from the tree statistics.
+  gmm-init-model --write-occs=$dir/1.occs \
+    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log \
+    || error_exit "Error in initializing the model.";
+
+  # 1.mdl is both input and output of the mix-up.
+  gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl \
+    2>$dir/log/mixup.log || error_exit "Error mixing up to $numgauss Gaussains";
+
+  rm $dir/treeacc
+fi
+
+
+if [ $stage -le -1 ]; then
+# Convert alignments in $alidir, to use as initial alignments.
+# This assumes that $alidir was split in $njobs pieces, just like the
+# current dir.  Just do this locally-- it's very fast.
+# (hence no "$qcmd" argument in the submit_jobs.sh call below)
+  echo "Converting old alignments"
+  # for n in `get_splits.pl $njobs`; do
+  submit_jobs.sh --njobs=$njobs --log=$dir/log/convertTASK_ID.log \
+    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
+      "ark:gunzip -c $alidir/TASK_ID.ali.gz|" \
+      "ark:|gzip -c >$dir/TASK_ID.ali.gz" \
+      || error_exit "Error converting old alignments.";
+fi
+
+if [ $stage -le 0 ]; then
+# Make training graphs (this is split in $njobs parts).
+  echo "Compiling training graphs"
+  # for n in `get_splits.pl $njobs`; do
+  submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/compile_graphsTASK_ID.log \
+    compile-train-graphs $dir/tree $dir/1.mdl  $lang/L.fst  \
+      "ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt < $data/split$njobs/TASK_ID/text |" \
+      "ark:|gzip -c >$dir/TASK_ID.fsts.gz" \
+      || error_exit "Error compiling training graphs";
+fi
+
+# Main training loop over passes 1 .. numiters-1.  Pass x reads $x.mdl and
+# writes $[x+1].mdl and $[x+1].occs, so the last files written are
+# $numiters.mdl / $numiters.occs.  Passes below $stage are skipped, but the
+# Gaussian target is still advanced for them so that a restart continues with
+# the same schedule.
+x=1
+while [ $x -lt $numiters ]; do
+  echo Pass $x
+  if [ $stage -le $x ]; then
+    # Redo the alignments on the iterations listed in $realign_iters.
+    if echo $realign_iters | grep -w $x >/dev/null; then
+      echo "Aligning data"
+      submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/align.$x.TASK_ID.log \
+        gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/$x.mdl \
+          "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \
+          "ark:|gzip -c >$dir/TASK_ID.ali.gz" \
+          || error_exit "Error aligning data on iteration $x";
+    fi  # Realign iters
+
+    # Accumulate statistics per job ...
+    submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/acc.$x.TASK_ID.log \
+      gmm-acc-stats-ali  $dir/$x.mdl "$featspart" \
+        "ark,s,cs:gunzip -c $dir/TASK_ID.ali.gz|" $dir/$x.TASK_ID.acc \
+        || error_exit "Error accumulating stats on iteration $x";
+
+    # ... then sum them and re-estimate the model in a single job.
+    submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log \
+      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
+        "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl \
+        || error_exit "Error in pass $x extimation.";
+    # Remove this pass's inputs now that $[x+1].mdl exists.  All three paths
+    # are under $dir; a mistyped directory ("r/") and a stray "rm" operand
+    # previously left the old models on disk.
+    rm -f $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
+  fi  # Completed a training stage.
+  if [[ $x -le $maxiterinc ]]; then
+    numgauss=$[$numgauss+$incgauss];
+  fi
+  x=$[$x+1];
+done
+
+# Here x equals $numiters: link the last model and occupancies as final.*.
+( cd $dir; rm -f final.{mdl,occs}; ln -s $x.mdl final.mdl; \
+  ln -s $x.occs final.occs; )
+
+# Print out summary of the warning messages.
+for logfile in $dir/log/*.log; do
+  n=`grep -c WARNING $logfile`;
+  if [ $n -ne 0 ]; then echo $n warnings in $logfile; fi;
+done
+
+echo Done
--- a/egs/timit/s4/steps/train_mono.sh
+++ b/egs/timit/s4/steps/train_mono.sh
@ -0,0 +1,202 @@
+#!/bin/bash
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from ..
+# Flat start and monophone training, with delta-delta features.
+# This script applies cepstral mean normalization (per speaker).
+
+function error_exit () {
+  echo -e "$@" >&2; exit 1;
+}
+
+function readint () {  # Validate an integer option value and print it.
+  local retval=${1/#*=/};  # In case --switch=ARG format was used
+  # Optional sign, then digits; 0 itself is valid (e.g. "--stage 0").
+  [[ "$retval" =~ ^(-?)0*([0-9]+)$ ]] \
+    || error_exit "Argument \"$retval\" not an integer."
+  echo "${BASH_REMATCH[1]}${BASH_REMATCH[2]}"  # leading 0's dropped
+}
+
+njobs=4   # Default number of jobs
+stage=-4  # Default starting stage (start with calculating CMN/CVN stats)
+qcmd=""   # Options for the submit_jobs.sh script
+
+PROG=`basename $0`;
+usage="Usage: $PROG [options] <data-dir> <lang-dir> <exp-dir>\n
+e.g.: $PROG data/train.1k data/lang exp/mono\n\n
+Options:\n
+  --help\t\tPrint this message and exit\n
+  --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n
+  --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
+  --stage INT\tStarting stage (e.g. -4 for CMN/CVN stats; 2 for iter 2; default=$stage)\n
+";
+
+while [ $# -gt 0 ]; do
+  case "${1# *}" in  # ${1# *} strips a single leading space from the argument
+    --help) echo -e $usage; exit 0 ;;
+    --num-jobs)  # readint runs in a subshell, so its failure must be propagated
+      shift; njobs=`readint "$1"` || exit 1;
+      [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive.";
+      shift ;;
+    --qcmd)
+      shift; qcmd="--qcmd=${1}"; shift ;;
+    --stage)
+      shift; stage=`readint "$1"` || exit 1; shift ;;
+    -*)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+    *)   break ;;   # end of options: interpreted as the data-dir
+  esac
+done
+
+if [ $# != 3 ]; then
+  error_exit $usage;
+fi
+
+data=$1
+lang=$2
+dir=$3
+
+[ -f path.sh ] && . path.sh
+
+# Configuration:
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+numiters=40    # Number of iterations of training
+maxiterinc=30 # Last iter to increase #Gauss on.
+numgauss=300 # Initial num-Gauss (must be more than #states=3*phones).
+totgauss=1000 # Target #Gaussians.  
+incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
+realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
+if [ -f $lang/oov.txt ]; then
+  oov_opt="--map-oov '"`cat $lang/oov.txt`"'"
+else
+  oov_opt='--ignore-oov'
+fi
+
+mkdir -p $dir/log
+if [ ! -d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then
+  split_data.sh $data $njobs
+fi
+
+if [ $stage -le -3 ]; then
+  echo "Computing cepstral mean and variance statistics"
+  # for n in `get_splits.pl $njobs`; do # do this locally; it's fast.
+  submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/cmvnTASK_ID.log \
+    compute-cmvn-stats --spk2utt=ark:$data/split$njobs/TASK_ID/spk2utt \
+      scp:$data/split$njobs/TASK_ID/feats.scp ark:$dir/TASK_ID.cmvn \
+      || error_exit "Computing CMN/CVN stats failed.";
+fi
+
+feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk \"ark:cat $dir/*.cmvn|\" scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+# for n in `get_splits.pl $njobs`; do
+# for n in `seq 1 $njobs`; do
+featspart="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$njobs/TASK_ID/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
+
+
+if [ $stage -le -2 ]; then
+  echo "Initializing monophone system."
+  if [ -f $lang/phonesets_mono.txt ]; then
+    echo "Using shared phones from $lang/phonesets_mono.txt"
+    # In recipes with stress and position markers, this pools together
+    # the stats for the different versions of the same phone (also for 
+    # the various silence phones).
+    sym2int.pl $lang/phones.txt $lang/phonesets_mono.txt > $dir/phonesets.int
+    shared_phones_opt="--shared-phones=$dir/phonesets.int"
+  fi
+
+  gmm-init-mono $shared_phones_opt \
+    "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo 39  \
+    $dir/0.mdl $dir/tree 2> $dir/log/init.log \
+    || error_exit "Monophone model initialization failed.";
+fi
+
+if [ $stage -le -1 ]; then
+  echo "Compiling training graphs"
+  submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/compile_graphsTASK_ID.log \
+    compile-train-graphs $dir/tree $dir/0.mdl  $lang/L.fst \
+      "ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt < $data/split$njobs/TASK_ID/text|" \
+      "ark:|gzip -c >$dir/TASK_ID.fsts.gz" \
+      || error_exit "Error compiling training graphs.";
+fi
+
+if [ $stage -le 0 ]; then
+  echo "Aligning data equally (pass 0)"
+# for n in `get_splits.pl $njobs`; do
+  submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/align.0.TASK_ID.log \
+    align-equal-compiled "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \
+      ark,t,f:- \| \
+    gmm-acc-stats-ali --binary=true $dir/0.mdl "$featspart" \
+      ark:- $dir/0.TASK_ID.acc \
+      || error_exit "Error in pass 0 accumulation";
+
+# In the following steps, the --min-gaussian-occupancy=3 option is important, 
+# otherwise we cannot est "rare" phones and later on, they never align properly.
+  gmm-est --min-gaussian-occupancy=3 --mix-up=$numgauss \
+    $dir/0.mdl "gmm-sum-accs - $dir/0.*.acc|" $dir/1.mdl \
+    2> $dir/log/update.0.log || error_exit "Error in pass 0 estimation.";
+
+  rm $dir/0.*.acc
+fi  # Finished 0'th training iteration.
+
+beam=6  # will change to 10 below after 1st pass
+x=1
+while [ $x -lt $numiters ]; do
+  echo "Pass $x"
+  if [ $stage -le $x ]; then
+    if echo $realign_iters | grep -w $x >/dev/null; then
+      echo "Aligning data"
+      # Realign with the current model; the retry beam is 4x the main beam.
+      submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/align.$x.TASK_ID.log \
+        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] \
+          $dir/$x.mdl "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \
+          "ark,t:|gzip -c >$dir/TASK_ID.ali.gz" \
+          || error_exit "Error in pass $x alignment.";
+    fi  # Realign iters
+
+    # Accumulate statistics from the most recent alignments.
+    submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/acc.$x.TASK_ID.log \
+      gmm-acc-stats-ali $dir/$x.mdl "$featspart" \
+        "ark:gunzip -c $dir/TASK_ID.ali.gz|" $dir/$x.TASK_ID.acc \
+        || error_exit "Error in pass $x accumulation.";
+
+    submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log \
+      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
+        "gmm-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl \
+        || error_exit "Error in pass $x estimation.";
+    rm -f $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
+  fi  # Completed a training stage.
+  if [ $x -le $maxiterinc ]; then
+    numgauss=$[$numgauss+$incgauss];
+  fi
+  beam=10
+  x=$[$x+1];
+done
+
+( cd $dir; rm -f final.{mdl,occs}; ln -s $x.mdl final.mdl; \
+  ln -s $x.occs final.occs; )
+
+# Print out summary of the warning messages.
+for x in $dir/log/*.log; do 
+  n=`grep WARNING $x | wc -l`; 
+  if [ $n -ne 0 ]; then echo $n warnings in $x; fi; 
+done
+
+echo Done
+
+# example of showing the alignments:
+# show-alignments data/lang/phones.txt $dir/30.mdl "ark:gunzip -c $dir/0.ali.gz|" | head -4
+
--- a/egs/timit/s4/utils/add_disambig.pl
+++ b/egs/timit/s4/utils/add_disambig.pl
@ -0,0 +1,58 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Adds some specified number of disambig symbols to a symbol table.
+# Adds these as #1, #2, etc.
+# If the --include-zero option is specified, includes an extra one
+# #0.
+
+$include_zero = 0;
+if($ARGV[0] eq "--include-zero") {
+    $include_zero = 1;
+    shift @ARGV;
+}
+
+if(@ARGV != 2) {
+    die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt ";
+}
+
+
+$input = $ARGV[0];
+$nsyms = $ARGV[1];
+
+open(F, "<$input") || die "Opening file $input";
+
+while(<F>) {
+    @A = split(" ", $_);
+    @A == 2 || die "Bad line $_";
+    $lastsym = $A[1];
+    print;
+}
+
+if(!defined($lastsym)){
+ die "Empty symbol file?";
+}
+
+if($include_zero) {
+    $lastsym++;
+    print "#0  $lastsym\n";
+}
+
+for($n = 1; $n <= $nsyms; $n++) {
+    $y = $n + $lastsym;
+    print "#$n  $y\n";
+}
--- a/egs/timit/s4/utils/add_lex_disambig.pl
+++ b/egs/timit/s4/utils/add_lex_disambig.pl
@ -0,0 +1,101 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Adds disambiguation symbols to a lexicon.
+# Outputs still in the normal lexicon format.
+# Disambig syms are numbered #1, #2, #3, etc. (#0 
+# reserved for symbol in grammar).
+# Outputs the number of disambig syms to the standard output.
+
+if(@ARGV != 2) {
+    die "Usage: add_lex_disambig.pl  lexicon.txt lexicon_disambig.txt "
+}
+
+
+$lexfn = shift @ARGV;
+$lexoutfn = shift @ARGV;
+
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+# (1)  Read in the lexicon.
+@L = ( );
+while(<L>) {
+    @A = split(" ", $_);
+    push @L, join(" ", @A);
+}
+
+# (2) Work out the count of each phone-sequence in the
+# lexicon.
+
+foreach $l (@L) {
+    @A = split(" ", $l);
+    shift @A; # Remove word.
+    $count{join(" ",@A)}++;
+}
+
+# (3) For each left sub-sequence of each phone-sequence, note down
+# that it exists (for identifying prefixes of longer strings).
+
+foreach $l (@L) {
+    @A = split(" ", $l);
+    shift @A; # Remove word.
+    while(@A > 0) {
+        pop @A;  # Remove last phone
+        $issubseq{join(" ",@A)} = 1;
+    }
+}
+
+# (4) For each entry in the lexicon:
+#  if the phone sequence is unique and is not a
+#  prefix of another word, no disambig symbol.
+#  Else output #1, or #2, #3, ... if the same phone-seq
+#  has already been assigned a disambig symbol.
+
+
+open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
+
+$max_disambig = 0;  # highest disambig symbol number handed out so far
+foreach $l (@L) {
+    @A = split(" ", $l);
+    $word = shift @A;
+    $phnseq = join(" ",@A);
+    if(!defined $issubseq{$phnseq}
+       && $count{$phnseq}==1) {
+        ; # Do nothing.
+    } else {
+        if($phnseq eq "") { # need disambig symbols for the empty string
+            # that are not used anywhere else.
+            $max_disambig++;
+            $reserved{$max_disambig} = 1;
+            $phnseq = "#$max_disambig";
+        } else {
+            $curnumber = $disambig_of{$phnseq};
+            if(!defined $curnumber) { $curnumber = 0; } # first use of this phone-seq
+            $curnumber++; # now 1 or 2, ...
+            while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols
+            if($curnumber > $max_disambig) {
+                $max_disambig = $curnumber;
+            }
+            $disambig_of{$phnseq} = $curnumber;
+            $phnseq = $phnseq . " #" . $curnumber;
+         }
+    }
+    print O "$word\t$phnseq\n";
+}
+
+print $max_disambig . "\n";
+
--- a/egs/timit/s4/utils/decode.sh
+++ b/egs/timit/s4/utils/decode.sh
@ -0,0 +1,145 @@
+#!/bin/bash   
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+function error_exit () {
+  echo -e "$@" >&2; exit 1;
+}
+
+function readint () {  # Validate an integer option value and print it.
+  local retval=${1/#*=/};  # In case --switch=ARG format was used
+  # Optional sign, then digits; 0 itself is a valid value.
+  [[ "$retval" =~ ^(-?)0*([0-9]+)$ ]] \
+    || error_exit "Argument \"$retval\" not an integer."
+  echo "${BASH_REMATCH[1]}${BASH_REMATCH[2]}"  # leading 0's dropped
+}
+
+function read_dirname () {
+  local dir_name=${1/#*=/};  # In case --switch=ARG format was used
+  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory";
+  local retval=`cd $dir_name 2>/dev/null && pwd || exit 1`
+  echo $retval
+}
+
+orig_args="$*"
+njobs=""  # Total number of jobs unset by default. Will set to #speakers (if 
+          # using a grid) or 4 (if not), unless specified by user.
+lang=""   # Option for sclite scoring (off by default)
+opts=""
+qcmd=""   # Options for the submit_jobs.sh script
+
+PROG=`basename $0`;
+usage="Usage: $PROG [options] <decode_script> <graph-dir> <data-dir> <decode-dir> [extra-args...]\n\n
+Options:\n
+  --help\t\tPrint this message and exit\n
+  -l DIR\t\tDirectory to find L_align.fst (needed for sclite scoring)\n
+  --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n
+  --opts STRING\tOptions for the decoder script\n
+  --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
+";
+
+while [ $# -gt 0 ]; do
+  case "${1# *}" in  # ${1# *} strips a single leading space from the argument
+    --help) echo -e $usage; exit 0 ;;
+    -l)  # helpers run in a subshell, so their failure must be propagated
+      shift; lang=`read_dirname "$1"` || exit 1;
+      [ ! -f "$lang/phones_disambig.txt" -o ! -f "$lang/L_align.fst" ] && \
+        error_exit "Invalid argument to -l option; expected $lang/phones_disambig.txt and $lang/L_align.fst to exist."
+      shift ;;
+    --num-jobs)
+      shift; njobs=`readint "$1"` || exit 1;
+      [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive.";
+      shift ;;
+    --opts)
+      shift; opts="$1"; shift ;;
+    --qcmd)
+      shift; qcmd="--qcmd=${1}"; shift ;;
+    --stage)
+      shift; stage=`readint "$1"` || exit 1; shift ;;
+    -*)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+    *)   break ;;   # end of options: interpreted as the script to execute
+  esac
+done
+
+
+if [ $# -lt 4 ]; then
+  error_exit $usage;
+fi
+
+script=$1
+graphdir=$2
+data=$3
+dir=$4
+# Make "dir" an absolute pathname.
+dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`
+mkdir -p $dir || exit 1
+shift;shift;shift;shift;
+# Remaining args will be supplied to decoding script.
+extra_args=$* 
+
+[ -f path.sh ] && . path.sh
+
+for file in $script $scp $data/utt2spk; do
+  if [ ! -f "$file" ]; then
+     echo "decode.sh: no such file $file"
+     exit 1
+  fi 
+done
+
+if [ ! -f $graphdir/HCLG.fst -a ! -f $graphdir/G.fst ]; then
+  # Note: most scripts expect HCLG.fst in graphdir, but the
+  # "*_fromlats.sh" script(s) require(s) a "lang" dir in that
+  # position
+  echo No such file: $graphdir/HCLG.fst or $graphdir/G.fst
+  exit 1;
+fi
+
+if [ -z "$njobs" ]; then # Figure out num-jobs; user did not specify.
+  if [ -z "$qcmd" ]; then
+    njobs=4
+  else  # running on queue...
+    njobs=`utt2spk_to_spk2utt.pl $data/utt2spk | wc -l`
+  fi
+fi
+
+echo "Decoding with num-jobs = $njobs"
+if [[ $njobs -gt 1 || ! -d $data/split$njobs || \
+      $data/split$njobs -ot $data/feats.scp ]]; then
+  split_data.sh $data $njobs
+fi
+
+#for n in `get_splits.pl $njobs`; do
+submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/partTASK_ID.log \
+  $script $opts -j $njobs TASK_ID $graphdir $data $dir $extra_args \
+  || error_exit "Error in decoding script: command was decode.sh $orig_args"
+
+if ls $dir/lat.*.gz >&/dev/null; then  # lattices were produced
+  if [ -n "$lang" ]; then
+  # sclite scoring: $lang directory supplied only for this reason.
+    [ ! -f $data/stm ] && \
+      error_exit "Expected $data/stm to exist (-l only used for sclite scoring)"
+    score_lats_ctm.sh $dir $lang $data || \
+      error_exit "Error in scoring of lattices using sclite."
+  else  # NOTE(review): score_lats.sh usage lists 4 args (incl. <phone-map>); 3 passed here - confirm
+    score_lats.sh $dir $graphdir/words.txt $data || \
+      error_exit "Error in scoring of lattices.";
+  fi
+elif ls $dir/*.txt >&/dev/null; then  # text hypotheses were produced
+  score_text.sh $dir $data || error_exit "Error in scoring of hypotheses.";
+else
+  error_exit "No output found in $dir, not scoring.";
+fi
--- a/egs/timit/s4/utils/eps2disambig.pl
+++ b/egs/timit/s4/utils/eps2disambig.pl
@ -0,0 +1,23 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script replaces epsilon with #0 on the input side only, of the G.fst
+# acceptor.  
+
+while(<>){
+    s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
+    print;
+}
--- a/egs/timit/s4/utils/filter_scp.pl
+++ b/egs/timit/s4/utils/filter_scp.pl
@ -0,0 +1,41 @@
+#!/usr/bin/perl -w
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script takes a list of utterance-ids or any file whose first field
+# of each line is an utterance-id, and filters an scp
+# file (or any file whose first field is an utterance id), printing
+# out only those lines whose first field is in id_list.
+
+if(@ARGV < 1 || @ARGV > 2) {
+    die "Usage: filter_scp.pl id_list [in.scp] > out.scp ";
+}
+
+$idlist = shift @ARGV;
+open(F, "<$idlist") || die "Could not open id-list file $idlist";
+while(<F>) {
+    @A = split;
+    @A>=1 || die "Invalid id-list file line $_";
+    $seen{$A[0]} = 1;
+}
+
+while(<>) {
+    @A = split;
+    @A > 0 || die "Invalid scp file line $_";
+    if($seen{$A[0]}) {
+        print $_;
+    }
+}
--- a/egs/timit/s4/utils/int2sym.pl
+++ b/egs/timit/s4/utils/int2sym.pl
@ -0,0 +1,90 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+$ignore_noninteger = 0;
+$ignore_first_field = 0;
+$field = -1;
+for($x = 0; $x < 2; $x++) {
+    if($ARGV[0] eq "--ignore-noninteger") { $ignore_noninteger = 1; shift @ARGV; }
+    if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; }
+    if($ARGV[0] eq "--field") { 
+       shift @ARGV; $field = $ARGV[0]+0; shift @ARGV;
+       if ($field < 1) { die "Bad argument to --field option: $field"; }
+    }
+}
+
+if ($ignore_first_field && $field > 0) { die "Incompatible options ignore-first-field and field"; }
+$zfield = $field-1; # Change to zero-based indexing.
+
+$symtab = shift @ARGV;  # first remaining argument: the symbol table file
+if(!defined $symtab) {
+    die "Usage: int2sym.pl symtab [input] > output\n";
+}
+open(F, "<$symtab") || die "Error opening symbol table file $symtab";
+while(<F>) {  # each line must be "<symbol> <integer>"
+    @A = split(" ", $_);
+    @A == 2 || die "bad line in symbol table file: $_";
+    $int2sym{$A[1]} = $A[0];
+}
+
+sub int2sym {  # args: token, zero-based position; returns the symbol for the token
+    my $a = shift @_;
+    my $pos = shift @_;
+    if($a !~  m:^\d+$:) { # not all digits..
+        if($ignore_noninteger) {
+            # Pass the token through unchanged; the caller does the printing.
+            return $a;
+        } else {
+            if($pos == 0) {
+                die "int2sym.pl: found noninteger token $a (try --ignore-first-field)\n";
+            } else {
+                die "int2sym.pl: found noninteger token $a (try --ignore-noninteger if valid input)\n";
+            }
+        }
+    }
+    $s = $int2sym{$a};
+    if(!defined ($s)) {
+        die "int2sym.pl: integer $a not in symbol table $symtab.";
+    }
+    return $s;
+}
+
+$error = 0;
+while(<>) {
+    @A = split(" ", $_);
+    if($ignore_first_field) {
+        $key = shift @A;
+        print $key . " ";
+    }
+    if ($field != -1) {
+        if ($zfield <= $#A && $zfield >= 0) {
+            $a = $A[$zfield];
+            $A[$zfield] = int2sym($a, $zfield);
+        }
+        print join(" ", @A);
+    } else {
+        for ($pos = 0; $pos <= $#A; $pos++) {
+            $a = $A[$pos];
+            $s = int2sym($a, $pos);
+            print $s . " ";
+        }
+    }
+    print "\n";
+}
+
+
+
--- a/egs/timit/s4/utils/make_lexicon_fst.pl
+++ b/egs/timit/s4/utils/make_lexicon_fst.pl
@ -0,0 +1,122 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# makes lexicon FST (no pron-probs involved).
+
+if(@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {  # the FST text is written to stdout
+    die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] > lexiconfst.txt"
+}
+
+$lexfn = shift @ARGV;
+if(@ARGV == 0) {
+    $silprob = 0.0;
+} elsif (@ARGV == 2){
+    ($silprob,$silphone) = @ARGV;
+} else {
+    ($silprob,$silphone,$sildisambig) = @ARGV;
+}
+if($silprob != 0.0) {
+    $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
+    $silcost = -log($silprob);  # arc costs are negative log probabilities
+    $nosilcost = -log(1.0 - $silprob);
+}
+
+
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+
+
+sub is_sil {
+    # Return true (1) if provided with a phone-sequence
+    # that means silence.
+    # @_ is the parameters of the function
+    # This function returns true if @_ equals ( $silphone )
+    # or something of the form ( "#0", $silphone, "#1" )
+    # where the "#0" and "#1" are disambiguation symbols.
+    return ( @_ == 1 && $_[0] eq $silphone ||
+             (@_ == 3 && $_[1] eq $silphone &&
+              $_[0] =~ m/^\#\d+$/ &&
+              $_[2] =~ m/^\#\d+$/));
+}
+
+if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
+    $loopstate = 0;
+    $nextstate = 1; # next unallocated state.
+    while(<L>) {
+        @A = split(" ", $_);
+        $w = shift @A;
+
+        $s = $loopstate;
+        $word_or_eps = $w;  # the word is emitted on the first arc only
+        while (@A > 0) {
+            $p = shift @A;
+            if(@A > 0) {
+                $ns = $nextstate++;
+            } else {
+                $ns = $loopstate;  # last phone: return to the loop state
+            }
+            print "$s\t$ns\t$p\t$word_or_eps\n";
+            $word_or_eps = "<eps>";
+            $s = $ns;
+        }
+    }
+    print "$loopstate\t0\n"; # final-cost.
+} else { # have silence probs.
+    $startstate = 0;
+    $loopstate = 1;
+    $silstate = 2; # state from where we go to loopstate after emitting silence.
+    print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
+    if (!defined $sildisambig) {
+        print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+        print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
+        $nextstate = 3;
+    } else {
+        $disambigstate = 3;
+        $nextstate = 4;
+        print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+        print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
+        print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
+    }
+    while(<L>) {
+        @A = split(" ", $_);
+        $w = shift @A;
+
+        $s = $loopstate;
+        $word_or_eps = $w;
+        while (@A > 0) {
+            $p = shift @A;
+            if(@A > 0) {
+                $ns = $nextstate++;
+                print "$s\t$ns\t$p\t$word_or_eps\n";
+                $word_or_eps = "<eps>";
+                $s = $ns;
+            } else {
+                if(!is_sil(@A)){ # NOTE(review): @A is empty here, so is_sil() is always false - confirm intent
+                    # This is non-deterministic but relatively compact,
+                    # and avoids epsilons.
+                    print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n";
+                    print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n";
+                } else {
+                    # no point putting opt-sil after silence word.
+                    print "$s\t$loopstate\t$p\t$word_or_eps\n";
+                }
+                $word_or_eps = "<eps>";
+            }
+        }
+    }
+    print "$loopstate\t0\n"; # final-cost.
+}
--- a/egs/timit/s4/utils/mkgraph.sh
+++ b/egs/timit/s4/utils/mkgraph.sh
@ -0,0 +1,134 @@
+#!/bin/bash
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script creates a fully expanded decoding graph (HCLG) that represents
+# all the language-model, pronunciation dictionary (lexicon), context-dependency,
+# and HMM structure in our model.  The output is a Finite State Transducer
+# that has word-ids on the output, and pdf-ids on the input (these are indexes
+# that resolve to Gaussian Mixture Models).  
+# See
+#  http://kaldi.sourceforge.net/graph_recipe_test.html
+# (this is compiled from this repository using Doxygen,
+# the source for this part is in src/doc/graph_recipe_test.dox)
+
+
+N=3
+P=1
+clean=false
+
+for x in 1 2 3; do  # flags may come in either order; "$1" quoted so an empty arg list is safe
+  if [ "$1" == "--mono" ]; then
+    N=1;
+    P=0;
+    shift;
+  fi
+  if [ "$1" == "--clean" ]; then
+    clean=true
+    shift;
+  fi
+
+done
+
+if [ $# != 3 ]; then
+   echo "Usage: scripts/mkgraph.sh <test-lang-dir> <model-dir> <graphdir>"
+   echo "e.g.: scripts/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph"
+   exit 1;
+fi
+
+if [ -f path.sh ]; then . path.sh; fi
+
+lang=$1
+tree=$2/tree
+model=$2/final.mdl
+dir=$3
+
+if $clean; then rm -r $lang/tmp; fi
+
+mkdir -p $dir
+
+tscale=1.0
+loopscale=0.1
+
+# If $lang/tmp/LG.fst does not exist or is older than its sources, make it...
+# (note: the [[ ]] brackets make the || type operators work (inside [ ], we
+# would have to use -o instead),  -f means file exists, and -ot means older than).
+
+required="$lang/L.fst $lang/G.fst $lang/phones_disambig.txt $lang/words.txt $lang/silphones.csl $model $tree"
+for f in $required; do
+  [ ! -f $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1;
+done
+
+mkdir -p $lang/tmp
+if [[ ! -f $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \
+      $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then
+  fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \
+    fstminimizeencoded  > $lang/tmp/LG.fst || exit 1;
+  fstisstochastic $lang/tmp/LG.fst || echo "warning: LG not stochastic."
+fi
+
+if [ ! -f $lang/phones_disambig.txt ]; then
+  echo "No such file $lang/phones_disambig.txt (supplied a training lang/ directory?)"
+  exit 1;
+fi
+
+grep '#' $lang/phones_disambig.txt | awk '{print $2}' > $lang/tmp/disambig_phones.list
+
+
+clg=$lang/tmp/CLG_${N}_${P}.fst
+
+if [[ ! -f $clg || $clg -ot $lang/tmp/LG.fst ]]; then  # rebuild if missing or stale
+  fstcomposecontext --context-size=$N --central-position=$P \
+   --read-disambig-syms=$lang/tmp/disambig_phones.list \
+   --write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.list \
+    $lang/tmp/ilabels_${N}_${P} < $lang/tmp/LG.fst >$clg || exit 1;
+  fstisstochastic $clg  || echo "warning: CLG not stochastic."
+fi
+
+if [[ ! -f $dir/Ha.fst || $dir/Ha.fst -ot $model  \
+    || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then
+  make-h-transducer --disambig-syms-out=$dir/disambig_tid.list \
+    --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \
+     > $dir/Ha.fst  || exit 1;
+fi
+
+if [[ ! -f $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \
+      $dir/HCLGa.fst -ot $clg ]]; then
+  fsttablecompose $dir/Ha.fst $clg | fstdeterminizestar --use-log=true \
+    | fstrmsymbols $dir/disambig_tid.list | fstrmepslocal | \
+     fstminimizeencoded > $dir/HCLGa.fst || exit 1;
+  fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic"
+fi
+
+if [[ ! -f $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then
+  add-self-loops --self-loop-scale=$loopscale --reorder=true \
+    $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1;
+
+  if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
+    # No point doing this test if transition-scale not 1, as it is bound to fail. 
+    fstisstochastic $dir/HCLG.fst || echo "Final HCLG is not stochastic."
+  fi
+fi
+
+# keep a copy of the word symbol table and a list of silence phones with HCLG...
+# this means we can decode without reference to the $lang directory.
+cp $lang/words.txt $dir/
+cp $lang/silphones.csl $dir/
+
+# to make const fst:
+# fstconvert --fst_type=const $dir/HCLG.fst $dir/HCLG_c.fst
+
+echo "Finished making decoding graphs in $dir"
--- a/egs/timit/s4/utils/s2eps.pl
+++ b/egs/timit/s4/utils/s2eps.pl
@ -0,0 +1,27 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script replaces <s> and </s> with <eps> (on both input and output sides),
+# for the G.fst acceptor.
+
+# Copy arcs from stdin (or the files named on the command line) to stdout,
+# re-joined with tabs.  On lines with at least four fields, the third and
+# fourth fields are rewritten to <eps> when they are <s> or </s>.
+my %is_boundary = ("<s>" => 1, "</s>" => 1);
+while (my $line = <>) {
+    my @fields = split(" ", $line);
+    if (scalar(@fields) >= 4) {
+        foreach my $pos (2, 3) {
+            $fields[$pos] = "<eps>" if $is_boundary{$fields[$pos]};
+        }
+    }
+    print join("\t", @fields), "\n";
+}
--- a/egs/timit/s4/utils/score_lats.sh
+++ b/egs/timit/s4/utils/score_lats.sh
@ -0,0 +1,56 @@
+#!/bin/bash
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+
+# Scores the lattices of one decoding run.  For every inverse acoustic scale
+# N in 1..7 the best path through <decode-dir>/lat.*.gz is written to
+# <decode-dir>/N.tra, converted back to symbols with int2sym.pl, passed through
+# timit_norm_trans.pl (-from 48 -to 39) and compared against <data-dir>/text
+# by compute-wer.  The result is written to <decode-dir>/wer_N.
+if [ $# -ne 4 ]; then
+  echo "Usage: score_lats.sh <decode-dir> <word-symbol-table> <data-dir> <phone-map>"
+  exit 1;
+fi
+
+dir=$1
+symtab=$2
+data=$3
+phonemap=$4
+
+if [ ! -f "$symtab" ]; then
+  echo "No such word symbol table file $symtab"
+  exit 1;
+fi
+if [ ! -f "$data/text" ]; then
+  echo "Could not find transcriptions in $data/text"
+  exit 1
+fi
+
+
+trans=$data/text
+# Stop here if the reference cannot be copied (e.g. the decode directory is
+# missing); otherwise every WER below would be computed against a stale file.
+cp "$trans" "$dir/test.trans" || exit 1
+
+for inv_acwt in $(seq 1 7); do
+  acwt=$(perl -e "print (1.0/$inv_acwt);")
+  # The lat.*.gz glob is deliberately left inside the quoted rspecifier: it is
+  # expanded by the shell that runs the gunzip pipe, not by this script.
+  lattice-best-path --acoustic-scale="$acwt" --word-symbol-table="$symtab" \
+    "ark:gunzip -c $dir/lat.*.gz|" "ark,t:$dir/${inv_acwt}.tra" \
+    2>"$dir/rescore_${inv_acwt}.log" \
+    || { echo "score_lats.sh: lattice-best-path failed, see $dir/rescore_${inv_acwt}.log" >&2; exit 1; }
+
+  cat "$dir/${inv_acwt}.tra" \
+    | int2sym.pl --ignore-first-field "$symtab" \
+    | timit_norm_trans.pl -i - -m "$phonemap" -from 48 -to 39 \
+    | compute-wer --text --mode=present "ark:$dir/test.trans" ark,p:- \
+    >& "$dir/wer_$inv_acwt"
+done
+
--- a/egs/timit/s4/utils/score_text.sh
+++ b/egs/timit/s4/utils/score_text.sh
@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Copyright 2012  Arnab Ghoshal
+# Copyright 2010-2011  Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+
+# Scores already-decoded integer transcripts.  All <decode-dir>/*.tra files
+# are concatenated, sorted on the utterance id, converted to symbols with
+# int2sym.pl and passed through timit_norm_trans.pl (-from 48 -to 39); the
+# result is written to <decode-dir>/text and compared against the sorted
+# <data-dir>/text by compute-wer.  The WER line is echoed at the end.
+if [ $# -ne 4 ]; then
+  echo "Usage: score_text.sh <decode-dir> <word-symbol-table> <data-dir> <phone-map>"
+  exit 1;
+fi
+
+dir=$1
+symtab=$2
+data=$3
+phonemap=$4
+
+# Same sanity check as score_lats.sh: without the symbol table int2sym.pl
+# cannot work and the WER would be computed on an empty hypothesis file.
+if [ ! -f "$symtab" ]; then
+  echo "No such word symbol table file $symtab"
+  exit 1;
+fi
+if [ ! -f "$data/text" ]; then
+  echo "Could not find transcriptions in $data/text"
+  exit 1
+fi
+
+trans=$data/text
+sort -k1,1 "$trans" > "$dir/test.trans"
+
+# We assume the transcripts are already in integer form.
+cat "$dir"/*.tra | sort -k1,1 \
+  | int2sym.pl --ignore-first-field "$symtab" \
+  | timit_norm_trans.pl -i - -m "$phonemap" -from 48 -to 39 \
+  > "$dir/text"
+
+compute-wer --text --mode=present "ark:$dir/test.trans" "ark,p:$dir/text" \
+  >& "$dir/wer"
+
+grep WER "$dir/wer"
+
--- a/egs/timit/s4/utils/silphones.pl
+++ b/egs/timit/s4/utils/silphones.pl
@ -0,0 +1,57 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# creates integer lists of silence and non-silence phones in files,
+# e.g. silphones.csl="1:2:3 \n"
+# and nonsilphones.csl="4:5:6:7:...:24\n";
+
+# Arguments: the phone symbol table, a space-separated list of silence phone
+# names, and the two output files.  Each output file receives one line of
+# colon-separated integer ids.
+if (scalar(@ARGV) != 4) {
+    die "Usage: silphones.pl phones.txt \"sil1 sil2 sil3\" silphones.csl nonsilphones.csl";
+}
+
+my ($symtab, $sillist, $silphones, $nonsilphones) = @ARGV;
+open(S,"<$symtab") || die "Opening symbol table $symtab";
+
+# Names the caller declared to be silence phones.
+my %issil;
+$issil{$_} = 1 foreach split(" ", $sillist);
+
+my @sil = ();
+my @nonsil = ();
+my %seensil;
+while(<S>){
+    my @A = split(" ", $_);
+    @A == 2 || die "Bad line $_ in phone-symbol-table file $symtab";
+    my ($sym, $int) = @A;
+    next if $int == 0;  # id 0 goes into neither list.
+    if ($issil{$sym}) {
+        $seensil{$sym} = 1;
+        push @sil, $int;
+    } else {
+        push @nonsil, $int;
+    }
+}
+
+# Every requested silence phone must have appeared in the symbol table.
+foreach my $k (keys %issil) {
+    $seensil{$k} || die "No such silence phone $k";
+}
+open(F, ">$silphones") || die "opening silphones file $silphones";
+open(G, ">$nonsilphones") || die "opening nonsilphones file $nonsilphones";
+print F join(":", @sil) . "\n";
+print G join(":", @nonsil) . "\n";
+close(F);
+close(G);
+print STDERR "Warning: silphones.pl no silence phones.\n" if @sil == 0;
+print STDERR "Warning: silphones.pl no non-silence phones.\n" if @nonsil == 0;
+
--- a/egs/timit/s4/utils/split_data.sh
+++ b/egs/timit/s4/utils/split_data.sh
@ -0,0 +1,79 @@
+#!/bin/bash
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+
+if [ $# != 2 ]; then
+  echo "Usage: split_data.sh data-dir num-to-split"
+  exit 1
+fi
+
+data=$1
+numsplit=$2
+
+if [ $numsplit -le 0 ]; then
+  echo "Invalid num-split argument $numsplit";
+  exit 1;
+fi
+
+n=0;
+feats=""
+wavs=""
+utt2spks=""
+texts=""
+
+nu=`cat $data/utt2spk | wc -l`
+nf=`cat $data/feats.scp | wc -l`
+nt=`cat $data/text | wc -l`
+if [ $nu -ne $nf ]; then
+  echo "split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf);"
+  echo "this script may produce incorrectly split data."
+  echo "use utils/fix_data_dir.sh to fix this."
+fi
+if [ $nt -ne 0 -a $nu -ne $nt ]; then
+  echo "split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt);"
+  echo "this script may produce incorrectly split data."
+  echo "use utils/fix_data_dir.sh to fix this."
+fi
+
+# utilsscripts/get_split.pl returns "0 1 2 3" or "00 01 .. 18 19" or whatever.
+# for n in `get_splits.pl $numsplit`; do
+for n in `seq 1 $numsplit`; do  # Changed this to usual number sequence -Arnab
+  mkdir -p $data/split$numsplit/$n
+  feats="$feats $data/split$numsplit/$n/feats.scp"
+  wavs="$wavs $data/split$numsplit/$n/wav.scp"
+  texts="$texts $data/split$numsplit/$n/text"
+  utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk"
+done
+
+split_scp.pl --utt2spk=$data/utt2spk $data/utt2spk $utt2spks
+split_scp.pl --utt2spk=$data/utt2spk $data/feats.scp $feats
+[ -f $data/wav.scp ] && \
+  split_scp.pl --utt2spk=$data/utt2spk $data/wav.scp $wavs
+[ -f $data/text ] && \
+  split_scp.pl --utt2spk=$data/utt2spk $data/text $texts
+
+# for n in `get_splits.pl $numsplit`; do
+for n in `seq 1 $numsplit`; do  # Changed this to usual number sequence -Arnab
+  utt2spk_to_spk2utt.pl $data/split$numsplit/$n/utt2spk \
+    > $data/split$numsplit/$n/spk2utt
+  # for completeness, also split the spk2gender file
+  [ -f $data/spk2gender ] && \
+    filter_scp.pl $data/split$numsplit/$n/spk2utt $data/spk2gender \
+    > $data/split$numsplit/$n/spk2gender 
+done
+
+exit 0
--- a/egs/timit/s4/utils/split_scp.pl
+++ b/egs/timit/s4/utils/split_scp.pl
@ -0,0 +1,211 @@
+#!/usr/bin/perl -w
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+
+# This program splits up any kind of .scp or archive-type file.
+# If there is no utt2spk option it will work on any text  file and
+# will split it up with an approximately equal number of lines in
+# each part.
+# With the --utt2spk option it will work on anything that has the 
+# utterance-id as the first entry on each line; the utt2spk file is
+# of the form "utterance speaker" (on each line).
+# It splits it into equal size chunks as far as it can.  If you use
+# the utt2spk option it will make sure these chunks coincide with
+# speaker boundaries.  In this case, if there are more chunks
+# than speakers (and in some other circumstances), some of the 
+# resulting  chunks will be empty and it
+# will print a warning.
+# You will normally call this like:
+# split_scp.pl scp scp.1 scp.2 scp.3 ...
+# or
+# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
+# Note that you can use this script to split the utt2spk file itself,
+# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
+
+# You can also call the scripts like:
+# split_scp.pl -j 3 0 scp scp.0
+# [note: with this option, it assumes zero-based indexing of the split parts,
+# i.e. the second number must be 0 <= n < num-jobs.]
+
+# Command-line handling.  After this section:
+#   $num_jobs / $job_id  are non-zero only when -j was given,
+#   $utt2spk_file        is the argument of --utt2spk= (or ""),
+#   $inscp               is the input file,
+#   @OUTPUTS             holds one output file name per split.
+$num_jobs = 0;
+$job_id = 0;
+$utt2spk_file = "";
+
+# Two passes, so that -j and --utt2spk may be given in either order.  The
+# @ARGV > 0 guards avoid reading an undefined $ARGV[0] (a warning under -w)
+# when the options are the only arguments.
+for ($x = 1; $x <= 2; $x++) {
+    if (@ARGV > 0 && $ARGV[0] eq "-j") {
+        shift @ARGV;
+        @ARGV >= 2 || die "split_scp.pl: option -j requires two arguments: num-jobs job-id";
+        $num_jobs = shift @ARGV;
+        $job_id = shift @ARGV;
+        if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
+            die "Invalid num-jobs and job-id: $num_jobs and $job_id";
+        }
+    }
+    # Anchored at the start so that only a real option matches.
+    if (@ARGV > 0 && $ARGV[0] =~ m/^--utt2spk=(.+)/) {
+        $utt2spk_file=$1;
+        shift;
+    }
+}
+
+if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
+    die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" .
+        " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
+        " ... where 0 <= job-id < num-jobs.";
+}
+   
+$inscp = shift @ARGV;
+if ($num_jobs == 0) { # without -j option
+    @OUTPUTS = @ARGV;
+} else {
+    # With -j only the part belonging to this job is kept: it goes to the
+    # named output (or "-" when none was given); all other parts are sent to
+    # /dev/null.
+    for ($j = 0; $j < $num_jobs; $j++) {
+        if ($j == $job_id) { 
+            if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
+            else { push @OUTPUTS, "-"; }
+        } else {
+            push @OUTPUTS, "/dev/null";
+        }
+    }
+} 
+
+# Note on the open() calls below: output names are opened with the
+# two-argument form on purpose, because the -j mode may pass "-" as an output
+# name and ">-" is how two-argument open selects standard output.
+if ($utt2spk_file ne "") {  # We have the --utt2spk option...
+    # Load the utterance -> speaker map.
+    open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
+    while(<U>) {
+        @A = split;
+        @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
+        ($u,$s) = @A;
+        $utt2spk{$u} = $s;
+    }
+    close(U);
+    # Group the input lines by speaker, remembering the order in which the
+    # speakers first appear.
+    open(I, "<$inscp") || die "Opening input scp file $inscp";
+    @spkrs = ();
+    while(<I>) {
+        @A = split;
+        if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
+        $u = $A[0];
+        $s = $utt2spk{$u};
+        if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; }
+        if(!defined $spk_count{$s}) { 
+            push @spkrs, $s; 
+            $spk_count{$s} = 0;
+            $spk_data{$s} = "";
+        }
+        $spk_count{$s}++;
+        $spk_data{$s} = $spk_data{$s} . $_;
+    }
+    close(I);
+    # Now split as equally as possible ..
+    # First allocate spks to files by allocating an approximately
+    # equal number of speakers.
+    $numspks = @spkrs;  # number of speakers.
+    $numscps = @OUTPUTS; # number of output files.
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+        $scparray[$scpidx] = []; # [] is array reference.
+        # Start every count at zero: a split that receives no speaker would
+        # otherwise stay undefined and trigger "uninitialized value" warnings
+        # in the balancing loop below.
+        $scpcount[$scpidx] = 0;
+    }
+    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
+        $scpidx = int(($spkidx*$numscps) / $numspks);
+        $spk = $spkrs[$spkidx];
+        push @{$scparray[$scpidx]}, $spk;
+        $scpcount[$scpidx] += $spk_count{$spk};
+    }
+
+    # Now will try to reassign beginning + ending speakers
+    # to different scp's and see if it gets more balanced.
+    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
+    # We can show that if considering changing just 2 scp's, we minimize
+    # this by minimizing the squared difference in sizes.  This is
+    # equivalent to minimizing the absolute difference in sizes.  This
+    # shows this method is bound to converge.
+
+    $changed = 1;
+    while($changed) {
+        $changed = 0;
+        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+            # First try to reassign ending spk of this scp.
+            if($scpidx < $numscps-1) {
+                $sz = @{$scparray[$scpidx]};
+                if($sz > 0) {
+                    $spk = $scparray[$scpidx]->[$sz-1];
+                    $count = $spk_count{$spk};
+                    $nutt1 = $scpcount[$scpidx];
+                    $nutt2 = $scpcount[$scpidx+1];
+                    if( abs( ($nutt2+$count) - ($nutt1-$count))
+                        < abs($nutt2 - $nutt1))  { # Would decrease
+                        # size-diff by reassigning spk...
+                        $scpcount[$scpidx+1] += $count;
+                        $scpcount[$scpidx] -= $count;
+                        pop @{$scparray[$scpidx]};
+                        unshift @{$scparray[$scpidx+1]}, $spk;
+                        $changed = 1;
+                    }
+                }
+            }
+            # Then try to move the first speaker of this scp to the previous one.
+            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
+                $spk = $scparray[$scpidx]->[0];
+                $count = $spk_count{$spk};
+                $nutt1 = $scpcount[$scpidx-1];
+                $nutt2 = $scpcount[$scpidx];
+                if( abs( ($nutt2-$count) - ($nutt1+$count))
+                    < abs($nutt2 - $nutt1))  { # Would decrease
+                    # size-diff by reassigning spk...
+                    $scpcount[$scpidx-1] += $count;
+                    $scpcount[$scpidx] -= $count;
+                    shift @{$scparray[$scpidx]};
+                    push @{$scparray[$scpidx-1]}, $spk;
+                    $changed = 1;
+                }
+            }
+        }
+    }
+    # Now print out the files...
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+        $scpfn = $OUTPUTS[$scpidx];
+        open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
+        $count = 0;
+        if(@{$scparray[$scpidx]} == 0) {
+            print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
+        } else {
+            foreach $spk ( @{$scparray[$scpidx]} ) {
+                print F $spk_data{$spk};
+                $count += $spk_count{$spk};
+            }
+            if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
+        }
+        # Checked like in the other branch, so a failed write is not missed.
+        close(F) || die "Closing scp file $scpfn";
+    }
+} else { 
+   # This block is the "normal" case where there is no --utt2spk 
+   # option and we just break into equal size chunks.
+
+    open(I, "<$inscp") || die "Opening input scp file $inscp";
+
+    $numscps = @OUTPUTS;  # size of array.
+    @F = ();
+    while(<I>) {
+        push @F, $_;
+    }
+    close(I);
+    $numlines = @F;
+    if($numlines == 0) {
+        print STDERR "split_scp.pl: warning: empty input scp file $inscp\n";
+    }
+    $linesperscp = int( ($numlines+($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up.
+# [just doing int() rounds down].
+    # Because of the rounding up, the last output file(s) may receive fewer
+    # lines than the others, possibly none.
+    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
+        $scpfile = $OUTPUTS[$scpidx];
+        open(O, ">$scpfile") || die "Opening output scp file $scpfile";
+        for($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) {
+            print O $F[$n];
+        }
+        close(O) || die "Closing scp file $scpfile";
+    }
+}
--- a/egs/timit/s4/utils/submit_jobs.sh
+++ b/egs/timit/s4/utils/submit_jobs.sh
@ -0,0 +1,125 @@
+#!/bin/bash -u
+
+# Copyright 2012  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+
+# Writes its arguments to stderr (backslash escapes are interpreted by
+# echo -e) and terminates the script with status 1.
+function error_exit () {
+  echo -e "$@" 1>&2
+  exit 1
+}
+
+# Extracts the value of an "--option=VALUE" argument, checks that it is a
+# positive integer and prints it without leading zeros.
+# Arguments: $1 - the complete option string, e.g. --njobs=08
+# Outputs:   the integer on stdout
+# Exits (through error_exit) when the value is not a positive integer.
+function readposint () {
+  local retval=""
+  # Everything after the first '='; an argument without '=' has no value.
+  case "$1" in
+    *=*) retval=${1#*=} ;;
+  esac
+  # Strip ALL leading zeros.  A single ${retval#0*} removes only one of them,
+  # so a value such as 007 would be rejected by the check below.
+  while [[ "$retval" == 0* ]]; do
+    retval=${retval#0}
+  done
+  [[ "$retval" =~ ^[1-9][0-9]*$ ]] \
+    || error_exit "Argument \"$retval\" not a positive integer."
+  echo "$retval"
+}
+
+PROG=`basename $0`;
+# NOTE: $usage is expanded unquoted on purpose below: word splitting folds its
+# literal newlines into spaces, and the \n / \t escapes are then rendered by
+# echo -e.
+usage="Usage: $PROG [options] --log=logfile command\n
+Runs the supplied command and redirect the stdout & stderr to logfile.\n
+With the --qcmd option, the command is submitted to a grid engine.\n
+Any 'TASK_ID' in logfile or command is replaced with job number or \$SGE_TASK_ID (for SGE).\n\n
+Required arguments:\n
+  --log=FILE\tOutput of command redirected to this file.\n\n
+Options:\n
+  --njobs=INT\tNumber of jobs to run (default=1). Assumes split data exists.\n
+  --qcmd=STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
+";
+
+if [ $# -lt 2 ]; then
+  error_exit $usage;
+fi
+
+NJOBS=1     # Default number of jobs
+QCMD=""     # No grid usage by default
+LOGF=""     # Required; checked after the option loop
+while [ $# -gt 1 ]; do
+  case "${1# *}" in  # ${1# *} strips one leading space from the argument
+  --help) echo -e $usage; exit 0 ;;
+  # Values are taken with ${1#*=} rather than expr: expr exits with status 1
+  # when the value is empty (e.g. --qcmd=), which would silently terminate the
+  # script under errexit.
+  --qcmd=*)
+  QCMD=${1#*=}; shift ;;
+  --njobs=*)
+  NJOBS=`readposint $1`; shift ;;
+  --log=*)
+  LOGF=${1#*=}; shift ;;
+  -*)  echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
+  '')  shift ;;  # Handle any empty arguments
+  *)   break ;;  # interpreted as the command to execute
+  esac
+done
+
+# --log is mandatory; without this check the script (run with -u) would stop
+# with an "unbound variable" message instead of the usage text.
+[ -n "$LOGF" ] || error_exit "$PROG: missing required argument --log=FILE\n" $usage
+
+logfile_base=`basename "$LOGF" .log`
+logfile_dir=`dirname "$LOGF"`
+mkdir -p "$logfile_dir";
+
+# Now, parse the command to execute: the remaining arguments are joined into
+# one string, re-quoting those that contain double quotes or spaces so that
+# they survive the later eval / job script.
+exec_cmd="";
+while [ $# -gt 0 ]; do
+  case "$1" in
+  *\"*) exec_cmd=$exec_cmd"'''$1''' "; shift ;;
+  *\ *) exec_cmd=$exec_cmd"\"$1\" "; shift ;;
+     *) exec_cmd=$exec_cmd"$1 "; shift ;;
+  esac
+done
+
+function run_locally {
+  rm -f $logfile_dir/.error;
+  for n in `seq 1 $NJOBS`; do
+    local this_logfile=${logfile_base//TASK_ID/$n}
+    this_logfile=$logfile_dir"/"$this_logfile".log"
+    local this_command=${exec_cmd//TASK_ID/$n}
+    ( echo -e "# Command:\n# $this_command";
+      echo "# Running on: "`hostname`;
+      echo "# Started at: "`date`;
+      eval $this_command || touch $logfile_dir/.error
+      echo "# Finished at: "`date` ) >> $this_logfile 2>&1 &
+  done
+  wait;
+  [ -f $logfile_dir/.error ] && { rm -f $logfile_dir/.error; \
+      error_exit "One (or more) locally run jobs failed."; }
+  exit 0;
+}
+
+# Writes a job script and submits it as an array job of $NJOBS tasks with
+# "$QCMD -sync y".  TASK_ID in the log file name and in the command is
+# replaced by a literal $SGE_TASK_ID, which is expanded when the job script
+# runs.  This function does not return: it exits with the status of the
+# submission command.
+function run_on_grid {
+  local this_logfile=${logfile_base//TASK_ID/\$SGE_TASK_ID}
+  this_logfile=$logfile_dir"/"$this_logfile".log"
+  # If log files are in a separate 'log' directory, create the job submission
+  # scripts one level up.
+  local qdir=${logfile_dir/%log/q}
+  mkdir -p "$qdir"
+  local qlog=$qdir/queue.log
+  local this_command=${exec_cmd//TASK_ID/\$SGE_TASK_ID}
+  # Removing TASK_ID can leave ".." in the file name (e.g. "align..sh").
+  # Squeeze that in the file name only: doing it on the whole path would also
+  # turn a "../" directory component into "./".
+  local script_name=${logfile_base//TASK_ID/}".sh"
+  script_name=${script_name//../.}
+  local run_this=$qdir"/"$script_name
+  # The command, paths and $QCMD are passed as %s arguments and never as part
+  # of the printf format, so '%' or backslashes in them are written verbatim.
+  { printf '#!/bin/bash\n#$ -S /bin/bash\n#$ -V -cwd -j y\n'
+    printf 'set -e\n'
+    printf '{ cd %s\n  . path.sh\n  echo Running on: `hostname`\n' "$PWD"
+    printf '  echo Started at: `date`\n  %s\n  ret=$?\n' "$this_command"
+    printf '  echo Finished at: `date`\n} >& %s\nexit $ret\n' "$this_logfile"
+    printf '# Submitted with:\n'
+    printf '# %s -sync y -o %s -t 1-%s %s >> %s 2>&1\n' \
+      "$QCMD" "$qlog" "$NJOBS" "$run_this" "$qlog"
+  } > "$run_this"
+  # $QCMD is left unquoted on purpose: it may carry switches and has to be
+  # split into words.
+  $QCMD -sync y -o "$qlog" -t "1-${NJOBS}" "$run_this" >> "$qlog" 2>&1
+  exit $?
+}
+
+# Submit through the grid engine when a submission command was given,
+# otherwise run the jobs on this machine.  Both functions end with exit.
+if [ -n "$QCMD" ]; then
+  run_on_grid;
+else
+  run_locally;
+fi
+
--- a/egs/timit/s4/utils/sym2int.pl
+++ b/egs/timit/s4/utils/sym2int.pl
@ -0,0 +1,82 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Converts symbols to integers using a symbol table.
+# Options (accepted in any order, before the symbol table):
+#   --ignore-oov          print symbols missing from the table unmodified
+#   --ignore-first-field  copy the first field of every line through as is
+#   --map-oov SYM         replace symbols missing from the table by SYM's id
+# Without --ignore-oov or --map-oov, a missing symbol is a fatal error.
+$ignore_oov = 0;
+$ignore_first_field = 0;
+foreach $pass (1 .. 3) {
+    if ($ARGV[0] eq "--ignore-oov") { shift @ARGV; $ignore_oov = 1; }
+    if ($ARGV[0] eq "--ignore-first-field") { shift @ARGV; $ignore_first_field = 1; }
+    if ($ARGV[0] eq "--map-oov") { shift @ARGV; $map_oov = shift @ARGV; }
+}
+
+$symtab = shift @ARGV;
+defined $symtab
+    || die "Usage: sym2int.pl symtab [input transcriptions] > output transcriptions\n";
+
+# Load the table: one "symbol integer" pair per line.
+open(F, "<$symtab") || die "Error opening symbol table file $symtab";
+while (<F>) {
+    @A = split(" ", $_);
+    if (@A != 2) { die "bad line in symbol table file: $_"; }
+    ($sym, $id) = @A;
+    $sym2int{$sym} = $id + 0;
+}
+
+$num_warning = 0;
+$max_warning = 20;
+$error = 0;
+while (<>) {
+    @A = split(" ", $_);
+    @A > 0 || die "Empty line in transcriptions input.";
+    if ($ignore_first_field) {
+        print shift(@A) . " ";
+    }
+    @B = ();
+    foreach $word (@A) {
+        if (defined $sym2int{$word}) {
+            push @B, $sym2int{$word};
+            next;
+        }
+        # The symbol is not in the table.
+        if (defined $map_oov) {
+            defined $sym2int{$map_oov}
+                || die "sym2int.pl: invalid map-oov option $map_oov (symbol not defined in $symtab)";
+            # Report only the first $max_warning replacements.
+            if ($num_warning++ < $max_warning) {
+                print STDERR "sym2int.pl: replacing $word with $map_oov\n";
+                if ($num_warning == $max_warning) {
+                    print STDERR "sym2int.pl: not warning for OOVs any more times\n";
+                }
+            }
+            push @B, $sym2int{$map_oov};
+        } elsif ($ignore_oov) {
+            push @B, $word; # just print them out unmodified..
+        } else {
+            die "sym2int.pl: undefined symbol $word\n";
+        }
+    }
+    print join(" ", @B) . "\n";
+}
+
+exit($error ? 1 : 0);
+
+
+
--- a/egs/timit/s4/utils/utt2spk_to_spk2utt.pl
+++ b/egs/timit/s4/utils/utt2spk_to_spk2utt.pl
@ -0,0 +1,39 @@
+#!/usr/bin/perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# converts an utt2spk file to a spk2utt file.
+# Takes input from the stdin or from a file argument;
+# output goes to the standard out.
+
+# At most one argument (the utt2spk file); with none, stdin is read.
+if (scalar(@ARGV) > 1) {
+    die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
+}
+
+# Collect the utterances of every speaker, keeping the speakers in order of
+# first appearance and the utterances in input order.
+while (<>) {
+    my @fields = split(" ", $_);
+    scalar(@fields) == 2 || die "Invalid line in utt2spk file: $_";
+    my ($utt, $spk) = @fields;
+    if (!exists $utts_of{$spk}) {
+        $utts_of{$spk} = [];
+        push @spklist, $spk;
+    }
+    push @{$utts_of{$spk}}, $utt;
+}
+# One output line per speaker: "speaker utt1 utt2 ...".
+foreach my $spk (@spklist) {
+    print "$spk " . join(" ", @{$utts_of{$spk}}) . "\n";
+}
--- a/src/Makefile
+++ b/src/Makefile
@ -61,3 +61,4 @@ nnet_cpu: base util matrix
 rnn: base util matrix lat 


+FSTROOT = /mnt/matylda5/iveselyk/DEVEL/kaldi/sandbox/karel/tools/openfst
--- a/src/configure
+++ b/src/configure
@ -31,16 +31,22 @@ ATLASROOT=`rel2abs ../tools/ATLAS/`
 FSTROOT=`rel2abs ../tools/openfst`

 function usage {
-  echo 'Usage: ./configure [--atlas-root=ATLASROOT] [--fst-root=FSTROOT] 
+  echo 'Usage: ./configure [--threaded-atlas={yes|no}] [--atlas-root=ATLASROOT] [--fst-root=FSTROOT] 
  [--mkl-root=MKLROOT] [--mkl-libdir=MKLLIBDIR] [--mathlib=ATLAS|MKL|CLAPACK]';
 }

+threaded_atlas=false #  By default, use the un-threaded version of ATLAS.
+
 while [ $# -gt 0 ];
 do
  case "$1" in
  --help) usage; exit 0 ;;
  --atlas-root=*) 
  ATLASROOT=`read_dirname $1`; shift ;;
+  --threaded-atlas=yes)
+  threaded_atlas=true; shift ;;
+  --threaded-atlas=no)
+  threaded_atlas=false; shift ;;
  --fst-root=*)
  FSTROOT=`read_dirname $1`; shift ;;
  --mkl-root=*)
@ -137,10 +143,12 @@ function linux_check_static {
 }

 function linux_configure_static {
-  if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the first one below.
+  if $threaded_atlas; then pt=pt; else pt=""; fi
+
+  if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below.
    for dir in /usr{,/local}/lib{64,}{,/atlas,/atlas-sse2,/atlas-sse3} \
      `pwd`/../tools/ATLAS/build/install/lib/ $ATLASROOT/lib; do
-     linux_check_static &&  ATLASLIBDIR=$dir && break
+     linux_check_static &&  ATLASLIBDIR=$dir
    done
    if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below.
      echo "Could not find libatlas.a in any of the obvious places... will try dynamic libraries."
@ -168,13 +176,14 @@ function linux_configure_static {
    return ;
  fi
   
-  for x in libcblas.a libatlas.a libf77blas.a; do
+  for x in lib${pt}cblas.a libatlas.a lib${pt}f77blas.a; do
    if [ ! -f $ATLASLIBDIR/$x ]; then
      echo "Configuring static ATLAS libraries failed: Could not find library $x in directory $ATLASLIBDIR"
      return 1;
    fi
    ATLASLIBS="$ATLASLIBS $ATLASLIBDIR/$x"
  done    
+  if $threaded_atlas; then ATLASLIBS="$ATLASLIBS"; fi

  echo ATLASINC = $ATLASROOT/include >> kaldi.mk
  echo ATLASLIBS = $ATLASLIBS >> kaldi.mk
@ -189,21 +198,24 @@ function linux_check_dynamic {
  # will exit with success if $dir seems to contain ATLAS libraries with
  # right architecture (compatible with default "nm")
  if [ -f $dir/libatlas.so ]; then # candidate...
-    if nm $dir/libatlas.so 2>&1 | grep "File format not recognized" >/dev/null; then
+    if nm --dynamic $dir/libatlas.so 2>&1 | grep "File format not recognized" >/dev/null; then
      echo "Directory $dir may contain dynamic ATLAS libraries but seems to be wrong architecture";
      return 1;
    fi
+    echo "Atlas found in $dir";
    return 0;
  else
-    return 1;
+      echo "No libatlas.so in $dir";
+      return 1;
  fi
 }

 function linux_configure_dynamic {
+  if $threaded_atlas; then pt=pt; else pt=""; fi
  if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below.
    for dir in /usr{,/local}/lib{,64}{,/atlas,/atlas-sse2,/atlas-sse3} \
      `pwd`/../tools/ATLAS/build/install/lib/ $ATLASROOT/lib; do
-       linux_check_dyamic && ATLASLIBDIR=$dir
+       linux_check_dynamic && ATLASLIBDIR=$dir
    done
    if [ -z $ATLASLIBDIR ]; then
      echo "Could not find libatlas.so in any of the obvious places."
@ -221,7 +233,7 @@ function linux_configure_dynamic {
  # for all the names we have encountered. 
  for libname in lapack lapack_atlas  clapack; do
    if [ -f $ATLASLIBDIR/lib${libname}.so -a "$ATLASLIBS" == "" ]; then
-      if nm  $ATLASLIBDIR/lib${libname}.so  | grep ATL_cgetrf >/dev/null; then
+      if nm  --dynamic $ATLASLIBDIR/lib${libname}.so  | grep ATL_cgetrf >/dev/null; then
         ATLASLIBS="-L$ATLASLIBDIR -l${libname}"
         echo "Using library $ATLASLIBS as ATLAS's CLAPACK library."
      fi
@ -232,13 +244,14 @@ function linux_configure_dynamic {
    return 1;
  fi
   
-  for x in cblas atlas f77blas; do
+  for x in ${pt}cblas atlas ${pt}f77blas; do
    if [ ! -f $ATLASLIBDIR/lib$x.so ]; then
      echo "Configuring dynamic ATLAS libraries failed: Could not find library $x in directory $ATLASLIBDIR"
      return 1;
    fi
    ATLASLIBS="$ATLASLIBS -l$x"
  done
+  if $threaded_atlas; then ATLASLIBS="$ATLASLIBS"; fi

  echo ATLASINC = $ATLASROOT/include >> kaldi.mk
  echo ATLASLIBS = $ATLASLIBS >> kaldi.mk
@ -274,6 +287,11 @@ fi

 cp makefiles/common.mk kaldi.mk

+# Removing any previously defined FSTROOT in Makefile
+cp Makefile Makefile.bak
+grep -v ^'FSTROOT =' Makefile.bak > Makefile
+[ cmp Makefile Makefile.bak >&/dev/null ] || rm Makefile.bak
+
 # Most of the OS-specific steps below will append to kaldi.mk
 echo "Doing OS specific configurations ..."

@ -289,6 +307,7 @@ if [ "`uname`" == "Darwin"  ]; then
    failure "Static OpenFST library not found:  See ../tools/INSTALL"
  fi
  echo FSTROOT = $FSTROOT >> kaldi.mk
+  echo FSTROOT = $FSTROOT >> Makefile
  # posix_memalign and gcc -rdynamic options not present on OS X 10.5.*
  osx_ver=`sw_vers | grep ProductVersion | awk '{print $2}' | sed -e 's?\.[^.]*$??'`
  echo "Configuring for OS X version $osx_ver ..."
@ -316,6 +335,8 @@ if [ "`uname -o`" == "Cygwin"  ]; then
    if [ ! -f /usr/lib/lapack/cygblas-0.dll ]; then
       failure "please first install package liblapack0"
    fi
+    echo FSTROOT = $FSTROOT >> kaldi.mk
+    echo FSTROOT = $FSTROOT >> Makefile
    cat makefiles/cygwin.mk >> kaldi.mk
    echo "Configuration succeeded for platform cygwin"
    exit 0
@ -326,6 +347,7 @@ if [ "`uname`" == "Linux" ]; then
    failure "Static OpenFST library not found:  See ../tools/INSTALL"
  fi
  echo FSTROOT = $FSTROOT >> kaldi.mk
+  echo FSTROOT = $FSTROOT >> Makefile

  echo "On Linux: Checking for linear algebra header files ..."
  if [ $MATHLIB == "ATLAS" ]; then
				`@ -0,0 +1 @@`
				`--use-energy=false # only non-default option.`