Updates to various data preparation scripts so validation checks on 'lang' directories will pass. It's possible some of these changes will break some setups, but it's not feasible to fully test this right now.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4739 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2015-01-02 03:38:04 +00:00
Parent 2610d3ba84
Commit 8dc30c3b6b
24 changed files with 68 additions and 2029 deletions
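In practice, the change boils down to the pattern shown in the diffs below: instead of hand-copying a few files from data/lang into each lang_test directory, the scripts now copy the whole directory and run the validation script on the result. A minimal sketch (assuming the standard Kaldi egs layout; the "tg" suffix is just an example):

  test=data/lang_test_tg
  cp -rT data/lang $test          # -T copies the *contents* of data/lang into $test
  # ... build $test/G.fst from the corresponding ARPA LM, as before ...
  utils/validate_lang.pl --skip-determinizability-check $test || exit 1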

View file

@ -38,11 +38,8 @@ echo Preparing language models for test
 for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
   test=data/lang_test_${lm_suffix}
-  mkdir -p $test
-  for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
-     phones/; do
-    cp -r data/lang/$f $test
-  done
+  cp -rT data/lang $test
   gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
     utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
@ -60,26 +57,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
     utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
       --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
     fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-  fstisstochastic $test/G.fst
-  # The output is like:
-  # 9.14233e-05 -0.259833
-  # we do expect the first of these 2 numbers to be close to zero (the second is
-  # nonzero because the backoff weights make the states sum to >1).
-  # Because of the <s> fiasco for these particular LMs, the first number is not
-  # as close to zero as it could be.
-  # Everything below is only for diagnostic.
-  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-  # this might cause determinization failure of CLG.
-  # #0 is treated as an empty word.
-  mkdir -p $tmpdir/g
-  awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-    < "$lexicon" >$tmpdir/g/select_empty.fst.txt
-  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
-    fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
-    echo "Language model has cycles with empty words" && exit 1
-  rm -r $tmpdir/g
+  utils/validate_lang.pl --skip-determinizability-check $test || exit 1;
 done
 echo "Succeeded in formatting data."

View file

@ -1,201 +0,0 @@
#!/bin/bash
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
if [ $# -le 3 ]; then
echo "Arguments should be a list of WSJ directories, see ../run.sh for example."
exit 1;
fi
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command
# line arguments being absolute pathnames.
rm -r links/ 2>/dev/null
mkdir links/
ln -s $* links
# Do some basic checks that we have what we expected.
if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then
echo "wsj_data_prep.sh: Spot check of command line arguments failed"
echo "Command line arguments must be absolute pathnames to WSJ directories"
echo "with names like 11-13.1."
exit 1;
fi
# This version for SI-84
cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist
nl=`cat train_si84.flist | wc -l`
[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl"
# This version for SI-284
cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \
links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist
nl=`cat train_si284.flist | wc -l`
[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl"
# Now for the test sets.
# links/13-34.1/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.
# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92.flist
# Nov'92 (330 utts, 5k vocab)
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92_5k.flist
# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93.flist
# Nov'93: (213 utts, 5k)
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93_5k.flist
# Dev-set for Nov'93 (503 utts)
cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93.flist
# Dev-set for Nov'93 (513 utts, 5k vocab)
cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93_5k.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist
# Finding the transcript files:
for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
done
# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
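# Illustration (not part of the original script): assuming WSJ-style utterance IDs whose
# first three characters are the speaker code, the perl one-liner above maps e.g.
#   011c0201  ->  "011c0201 011"      (utt2spk)
# and utt2spk_to_spk2utt.pl then groups these into lines like "011 011c0201 011c0202 ...".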
# In case we want to limit LMs to the most frequent words, copy the LM training word-frequency list.
cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz
# trigram would be:
cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \
gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;
prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;
# repeat for 5k language models
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz
# trigram would be: !only closed vocabulary here!
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa
prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
rm wsj0-train-spkrinfo.txt
! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \
echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt
fi
if [ ! -f wsj0-train-spkrinfo.txt ]; then
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
echo "This is possibly omitted from the training disks; couldn't find it."
echo "Everything else may have worked; we just may be missing gender info"
echo "which is only needed for VTLN-related diagnostics anyway."
exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.
cat links/11-13.1/wsj0/doc/spkrinfo.txt \
links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/train/spkrinfo.txt \
./wsj0-train-spkrinfo.txt | \
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
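# Illustration (not part of the original script): after lowercasing, dropping comment lines
# and keeping the first two fields, spk2gender ends up with one "<speaker-id> <m|f>" line
# per speaker (hypothetical example: "011 f").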
echo "Data preparation succeeded"

View file

@ -1,173 +0,0 @@
#!/bin/bash
# This script builds a larger word-list and dictionary
# than used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to fill-in words in
# the LM training data but not in CMUdict. One is
# to generate special prons for possible acronyms, that
# just consist of the constituent letters. The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/
if [ $# -ne 1 ]; then
echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/"
exit 1
fi
if [ "`basename $1`" != 13-32.1 ]; then
echo "Expecting the argument to this script to end in 13-32.1"
exit 1
fi
# e.g.
#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
export PATH=$PATH:`pwd`/local/dict/
srcdir=$1
mkdir -p data/local/dict_larger
dir=data/local/dict_larger
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
# are there; we just want to copy them as the phoneset is the same.
rm data/local/dict_larger/lexicon.txt # we don't want this.
rm data/local/dict_larger/lexiconp.txt # we don't want this either.
mincount=2 # Minimum count of an OOV we will try to generate a pron for.
[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
# Remove comments from cmudict; print first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n". Note: we found this by looking at
# oov.counts below (before adding this rule).
touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
| awk '/^</{next}{print toupper($0)}' | perl -e '
open(F, "<$ARGV[0]")||die;
while(<F>){ chop; $isword{$_} = 1; }
while(<STDIN>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
# and have no other "." in them: treat as period.
print "$a";
if ($n+1 < @A) { print "\n"; }
} else { print "$a "; }
}
print "\n";
}
' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
fi
# get unigram counts
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
> $dir/oov.counts
echo "Most frequent unseen unigrams are: "
head $dir/oov.counts
# Prune away singleton counts, and remove things with numbers in
# (which should have been normalized) and with no letters at all.
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
| awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
# Automatic rule-finding...
# First make some prons for possible acronyms.
# Note: we don't do this for things like U.K or U.N,
# or A.B. (which doesn't exist anyway),
# as we consider this normalization/spelling errors.
cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms
mkdir $dir/f $dir/b # forward, backward directions of rules...
# forward is normal suffix
# rules, backward is reversed (prefix rules). These
# dirs contain stuff we create while making the rule-based
# extensions to the dictionary.
# Remove ; and , from words, if they are present; these
# might crash our scripts, as they are used as separators there.
filter_dict.pl $dir/dict.cmu > $dir/f/dict
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
reverse_dict.pl $dir/f/dict > $dir/b/dict
reverse_dict.pl $dir/f/oovs > $dir/b/oovs
# The next stage takes a few minutes.
# Note: the forward stage takes longer, as English is
# mostly a suffix-based language, and there are more rules
# that it finds.
for d in $dir/f $dir/b; do
(
cd $d
cat dict | get_rules.pl 2>get_rules.log >rules
get_rule_hierarchy.pl rules >hierarchy
awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
limit_candidate_prons.pl hierarchy | \
score_prons.pl dict | \
count_rules.pl >rule.counts
# the sort command below is just for convenience of reading.
score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
get_candidate_prons.pl rules.with_scores dict oovs | \
limit_candidate_prons.pl hierarchy > oovs.candidates
) &
done
wait
# Merge the candidates.
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
> $dir/dict.oovs
cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
# add_counts.pl attaches the original counts to the lists of handled/not-handled OOVs
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
echo "**Top OOVs we handled are:**";
head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
head $dir/oovlist.not_handled.counts
echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
`awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.
cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF
echo "Created $dir/lexicon.txt"

View file

@ -1,86 +0,0 @@
#!/bin/bash
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si284, data/train_si84, etc.
# Don't bother doing train_si84 separately (although we have the file lists
# in data/local/) because it's just the first 7138 utterances in train_si284.
# We'll create train_si84 after doing the feature extraction.
. ./path.sh || exit 1;
echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
done
# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.
echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at begin/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done
echo "Succeeded in formatting data."
rm -r $tmpdir

View file

@ -1,52 +0,0 @@
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
. ./path.sh
[ ! -d data/lang_bd ] && echo "Expect data/lang_bd to exist" && exit 1;
lm_srcdir_3g=data/local/local_lm/3gram-mincount
lm_srcdir_4g=data/local/local_lm/4gram-mincount
[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1;
[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1;
for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do
rm -r $d 2>/dev/null
cp -r data/lang_bd $d
done
lang=data/lang_bd
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst
exit 0;

View file

@ -1,202 +0,0 @@
#!/bin/bash
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes no command-line arguments
dir=data/local/local_lm
srcdir=data/local/dict_larger
mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH
( # First make sure the kaldi_lm toolkit is installed.
cd $KALDI_ROOT/tools || exit 1;
if [ -d kaldi_lm ]; then
echo Not installing the kaldi_lm toolkit since it is already there.
else
echo Downloading and installing the kaldi_lm tools
if [ ! -f kaldi_lm.tar.gz ]; then
wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;
cd kaldi_lm
make || exit 1;
echo Done making the kaldi_lm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
# Get a wordlist-- keep everything but silence, which should not appear in
# the LM.
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/train_nounk.gz
# Get unigram counts (without bos/eos, but this doesn't matter here, it's
# only to get the word-map, which treats them specially & doesn't need their
# counts).
# Add a 1-count for each word in word-list by including that in the data,
# so all words appear.
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
# Get "mapped" words-- a character encoding of the words that makes the common words very short.
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map
gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
{ for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz
# To save disk space, remove the un-mapped training data. We could
# easily generate it again if needed.
rm $dir/train_nounk.gz
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
exit 0
### Below here, this script is showing various commands that
## were run during LM tuning.
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
# 2.5 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 3.0 $dir/4gram-mincount
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
# 2.6 million N-grams.
prune_lm.sh --arpa 4.0 $dir/4gram-mincount
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
# 2.15 million N-grams.
prune_lm.sh --arpa 5.0 $dir/4gram-mincount
# 1.86 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
train_lm.sh --arpa --lmtype 3gram $dir
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
# 20.0 million N-grams
! which ngram-count \
&& echo "SRILM tools not installed so not doing the comparison" && exit 1;
#################
# You could finish the script here if you wanted.
# Below is to show how to do baselines with SRILM.
# You'd have to install the SRILM toolkit first.
heldout_sent=10000 # Don't change this if you want result to be comparable with
# kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s
# 3-gram:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
# Trying 4-gram:
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
#3-gram with pruning:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
# Around 2.25M N-grams.
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
# above, which gave 2.5 million N-grams and a perplexity of 156.
# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
# the kaldi_lm experiments above without "-mincount").
## From here on is how to train with
# IRSTLM. This is not really working at the moment.
export IRSTLM=$KALDI_ROOT/tools/irstlm/
idir=$dir/irstlm
mkdir $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
gzip -c > $idir/train.gz
$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
{print $0;}}' > vocab.irstlm.20k
$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
-n 3 -s improved-kneser-ney -b yes
# Testing perplexity with SRILM tools:
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
# The perplexity is much worse than expected (175 rather than ~141, even though we
# used the -p option). Adding -debug 3 to the command line shows that
# the IRSTLM LM does not seem to sum to one properly, so it seems that
# it produces an LM that isn't interpretable in the normal way as an ARPA LM.

View file

@ -1,153 +0,0 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes no command-line arguments but takes the --cmd option.
# Begin configuration section.
rand_seed=0
cmd=run.pl
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
hidden=30
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
# End configuration section.
[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh
if [ $# != 1 ]; then
echo "Usage: local/wsj_train_rnnlms.sh [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi
dir=$1
srcdir=data/local/dict_larger
mkdir -p $dir
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH
( # First make sure the rnnlm toolkit is installed.
# Note: this didn't work out of the box for me; I had to
# change the compiler setting to just "g++" (no cross-compilation
# needed for me, as I ran on a machine that had been set up
# as 64-bit by default).
cd $KALDI_ROOT/tools || exit 1;
if [ -d $rnnlm_ver ]; then
echo Not installing the rnnlm toolkit since it is already there.
else
echo Downloading and installing the rnnlm tools
# http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz
if [ ! -f $rnnlm_ver.tgz ]; then
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1;
fi
mkdir $rnnlm_ver
cd $rnnlm_ver
tar -xvzf ../$rnnlm_ver.tgz || exit 1;
make CC=g++ || exit 1;
echo Done making the rnnlm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/all.gz
echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
> $dir/train.in # training data
# The rest will consist of a word-class represented by <RNN_UNK>, that
# maps (with probabilities) to a whole class of words.
# Get unigram counts from our training data, and use this to select word-list
# for RNNLM training; e.g. 10k most frequent words. Rest will go in a class
# that we (manually, at the shell level) assign probabilities for words that
# are in that class. Note: this word-list doesn't need to include </s>; this
# automatically gets added inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn
tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts
tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs
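# Illustration (not part of the original script): unk.probs holds one "<word> <probability>"
# line per out-of-shortlist word, and the probabilities sum to 1 over the <RNN_UNK> class,
# since each count is divided by the class total computed just above.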
for type in train valid; do
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
> $dir/$type
done
rm $dir/train.in # no longer needed-- and big.
# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train
# OK we'll train the RNNLM on this data.
# todo: change 100 to 320.
# using 100 classes as square root of 10k.
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &
$cmd $dir/rnnlm.log \
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \
-direct-order 4 -direct $direct -binary || exit 1;
# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids
utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \
$dir/valid.scores
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes the utterance-ids, which
# add one extra token per sentence; this accounts for the </s> at the end of each sentence,
# so it is the correct count to normalize by.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
echo Perplexity is $p | tee $dir/perplexity.log
rm $dir/train $dir/all.gz
# This is a better setup, but takes a long time to train:
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 2000 -binary

View file

@ -1,201 +0,0 @@
#!/bin/bash
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
if [ $# -le 3 ]; then
echo "Arguments should be a list of WSJ directories, see ../run.sh for example."
exit 1;
fi
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command
# line arguments being absolute pathnames.
rm -r links/ 2>/dev/null
mkdir links/
ln -s $* links
# Do some basic checks that we have what we expected.
if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then
echo "wsj_data_prep.sh: Spot check of command line arguments failed"
echo "Command line arguments must be absolute pathnames to WSJ directories"
echo "with names like 11-13.1."
exit 1;
fi
# This version for SI-84
cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist
nl=`cat train_si84.flist | wc -l`
[ "$nl" -eq 7138 ] || echo "Warning: expected 37416 lines in train_si84.flist, got $nl"
# This version for SI-284
cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \
links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist
nl=`cat train_si284.flist | wc -l`
[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl"
# Now for the test sets.
# links/13-34.1/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.
# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92.flist
# Nov'92 (330 utts, 5k vocab)
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92_5k.flist
# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93.flist
# Nov'93: (213 utts, 5k)
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93_5k.flist
# Dev-set for Nov'93 (503 utts)
cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93.flist
# Dev-set for Nov'93 (513 utts, 5k vocab)
cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93_5k.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist
# Finding the transcript files:
for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
done
# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
# In case we want to limit LMs to the most frequent words, copy the LM training word-frequency list.
cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz
# trigram would be:
cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \
gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;
prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;
# repeat for 5k language models
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz
# trigram would be: !only closed vocabulary here!
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa
prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
rm wsj0-train-spkrinfo.txt
! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \
echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt
fi
if [ ! -f wsj0-train-spkrinfo.txt ]; then
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
echo "This is possibly omitted from the training disks; couldn't find it."
echo "Everything else may have worked; we just may be missing gender info"
echo "which is only needed for VTLN-related diagnostics anyway."
exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.
cat links/11-13.1/wsj0/doc/spkrinfo.txt \
links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/train/spkrinfo.txt \
./wsj0-train-spkrinfo.txt | \
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
echo "Data preparation succeeded"

View file

@ -1,172 +0,0 @@
#!/bin/bash
# This script builds a larger word-list and dictionary
# than used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to fill-in words in
# the LM training data but not in CMUdict. One is
# to generate special prons for possible acronyms, that
# just consist of the constituent letters. The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/
if [ $# -ne 1 ]; then
echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/"
exit 1
fi
if [ "`basename $1`" != 13-32.1 ]; then
echo "Expecting the argument to this script to end in 13-32.1"
exit 1
fi
# e.g.
#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
export PATH=$PATH:`pwd`/local/dict/
srcdir=$1
mkdir -p data/local/dict_larger
dir=data/local/dict_larger
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
# are there; we just want to copy them as the phoneset is the same.
rm data/local/dict_larger/lexicon.txt # we don't want this.
mincount=2 # Minimum count of an OOV we will try to generate a pron for.
[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
# Remove comments from cmudict; print first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n". Note: we found this by looking at
# oov.counts below (before adding this rule).
touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
| awk '/^</{next}{print toupper($0)}' | perl -e '
open(F, "<$ARGV[0]")||die;
while(<F>){ chop; $isword{$_} = 1; }
while(<STDIN>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
# and have no other "." in them: treat as period.
print "$a";
if ($n+1 < @A) { print "\n"; }
} else { print "$a "; }
}
print "\n";
}
' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
fi
# get unigram counts
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
> $dir/oov.counts
echo "Most frequent unseen unigrams are: "
head $dir/oov.counts
# Prune away singleton counts, and remove things with numbers in
# (which should have been normalized) and with no letters at all.
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
| awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
# Automatic rule-finding...
# First make some prons for possible acronyms.
# Note: we don't do this for things like U.K or U.N,
# or A.B. (which doesn't exist anyway),
# as we consider this normalization/spelling errors.
cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms
mkdir $dir/f $dir/b # forward, backward directions of rules...
# forward is normal suffix
# rules, backward is reversed (prefix rules). These
# dirs contain stuff we create while making the rule-based
# extensions to the dictionary.
# Remove ; and , from words, if they are present; these
# might crash our scripts, as they are used as separators there.
filter_dict.pl $dir/dict.cmu > $dir/f/dict
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
reverse_dict.pl $dir/f/dict > $dir/b/dict
reverse_dict.pl $dir/f/oovs > $dir/b/oovs
# The next stage takes a few minutes.
# Note: the forward stage takes longer, as English is
# mostly a suffix-based language, and there are more rules
# that it finds.
for d in $dir/f $dir/b; do
(
cd $d
cat dict | get_rules.pl 2>get_rules.log >rules
get_rule_hierarchy.pl rules >hierarchy
awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
limit_candidate_prons.pl hierarchy | \
score_prons.pl dict | \
count_rules.pl >rule.counts
# the sort command below is just for convenience of reading.
score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
get_candidate_prons.pl rules.with_scores dict oovs | \
limit_candidate_prons.pl hierarchy > oovs.candidates
) &
done
wait
# Merge the candidates.
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
> $dir/dict.oovs
cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
# add_counts.pl attaches the original counts to the lists of handled/not-handled OOVs
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
echo "**Top OOVs we handled are:**";
head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
head $dir/oovlist.not_handled.counts
echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
`awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.
cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF
echo "Created $dir/lexicon.txt"

View file

@ -1,86 +0,0 @@
#!/bin/bash
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si284, data/train_si84, etc.
# Don't bother doing train_si84 separately (although we have the file lists
# in data/local/) because it's just the first 7138 utterances in train_si284.
# We'll create train_si84 after doing the feature extraction.
. ./path.sh || exit 1;
echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexicon.txt
mkdir -p $tmpdir
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
done
# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.
echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at begin/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done
echo "Succeeded in formatting data."
rm -r $tmpdir

View file

@ -1,54 +0,0 @@
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
. ./path.sh
[ ! -d data/lang_bd ] && echo "Expect data/lang_bd to exist" && exit 1;
lm_srcdir_3g=data/local/local_lm/3gram-mincount
lm_srcdir_4g=data/local/local_lm/4gram-mincount
[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1;
[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1;
for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do
rm -r $d 2>/dev/null
cp -r data/lang_bd $d
done
lang=data/lang_bd
exit
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst
exit 0;

Просмотреть файл

@ -1,83 +0,0 @@
#!/bin/bash
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Call this script from one level above, e.g. from the s3/ directory. It puts
# its output in data/local/.
# The parts of the output of this that will be needed are
# [in data/local/dict/ ]
# lexicon.txt
# extra_questions.txt
# nonsilence_phones.txt
# optional_silence.txt
# silence_phones.txt
# run this from ../
dir=data/local/dict
mkdir -p $dir
# (1) Get the CMU dictionary
svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict \
$dir/cmudict || exit 1;
# can add -r 10966 for strict compatibility.
#(2) Dictionary preparation:
# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
# silence phones, one per line.
(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt
echo SIL > $dir/optional_silence.txt
# nonsilence phones; on each line is a list of phones that correspond
# really to the same base phone.
cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
perl -e 'while(<>){
chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
$phones_of{$1} .= "$_ "; }
foreach $list (values %phones_of) {print $list . "\n"; } ' \
> $dir/nonsilence_phones.txt || exit 1;
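# For illustration, each line of nonsilence_phones.txt groups the stress variants
# of one base phone, e.g. a line like:
#   AA AA0 AA1 AA2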
# A few extra questions that will be added to those obtained by automatically clustering
# the "real" phones. These ask about stress; there's also one for silence.
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
>> $dir/extra_questions.txt || exit 1;
grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
> $dir/lexicon1_raw_nosil.txt || exit 1;
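# (For illustration: the perl expression above strips the alternate-pronunciation
# numbering from cmudict entries, so a hypothetical line
#   "ABOUT(2)  AH0 B AW1 T" becomes "ABOUT  AH0 B AW1 T".)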
# Add to cmudict the silences, noises etc.
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
# lexicon.txt is without the _B, _E, _S, _I markers.
# This is the input to wsj_format_data.sh
cp $dir/lexicon2_raw.txt $dir/lexicon.txt
echo "Dictionary preparation succeeded"

Просмотреть файл

@ -1,202 +0,0 @@
#!/bin/bash
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes no command-line arguments
dir=data/local/local_lm
srcdir=data/local/dict_larger
mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH
( # First make sure the kaldi_lm toolkit is installed.
cd $KALDI_ROOT/tools || exit 1;
if [ -d kaldi_lm ]; then
echo Not installing the kaldi_lm toolkit since it is already there.
else
echo Downloading and installing the kaldi_lm tools
if [ ! -f kaldi_lm.tar.gz ]; then
wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;
cd kaldi_lm
make || exit 1;
echo Done making the kaldi_lm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
# Get a wordlist-- keep everything but silence, which should not appear in
# the LM.
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/train_nounk.gz
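# (Hypothetical example of the mapping above: if SUPERCALIFRAGILISTIC were not in
# wordlist.txt, the line "I SAID SUPERCALIFRAGILISTIC" would come out as
# "I SAID <UNK>" in train_nounk.gz.)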
# Get unigram counts (without bos/eos, but this doesn't matter here, it's
# only to get the word-map, which treats them specially & doesn't need their
# counts).
# Add a 1-count for each word in word-list by including that in the data,
# so all words appear.
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
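# unigram.counts now has one "<count> <word>" pair per line, sorted by decreasing
# count, e.g. (with made-up numbers) "249623 THE".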
# Get "mapped" words-- a character encoding of the words that makes the common words very short.
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map
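# (word_map is expected to have two columns, the original word and its short code --
# that is how the awk command below uses it, via map[$1]=$2; the most frequent words
# get the shortest codes.)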
gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
{ for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz
# To save disk space, remove the un-mapped training data. We could
# easily generate it again if needed.
rm $dir/train_nounk.gz
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
exit 0
### Below here, this script is showing various commands that
## were run during LM tuning.
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
# 2.5 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 3.0 $dir/4gram-mincount
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
# 2.6 million N-grams.
prune_lm.sh --arpa 4.0 $dir/4gram-mincount
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
# 2.15 million N-grams.
prune_lm.sh --arpa 5.0 $dir/4gram-mincount
# 1.86 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
train_lm.sh --arpa --lmtype 3gram $dir
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
# 20.0 million N-grams
! which ngram-count \
&& echo "SRILM tools not installed so not doing the comparison" && exit 1;
#################
# You could finish the script here if you wanted.
# Below is to show how to do baselines with SRILM.
# You'd have to install the SRILM toolkit first.
heldout_sent=10000 # Don't change this if you want the results to be comparable
# with the kaldi_lm results.
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s
# 3-gram:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
# Trying 4-gram:
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
#3-gram with pruning:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
# Around 2.25M N-grams.
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
# above, which gave 2.5 million N-grams and a perplexity of 156.
# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
# the kaldi_lm experiments above without "-mincount").
## From here on is how to train with IRSTLM.
# This is not really working at the moment.
export IRSTLM=$KALDI_ROOT/tools/irstlm/
idir=$dir/irstlm
mkdir $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
gzip -c > $idir/train.gz
$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
{print $0;}}' > vocab.irstlm.20k
$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
-n 3 -s improved-kneser-ney -b yes
# Testing perplexity with SRILM tools:
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
# Perplexity is very bad (175, whereas it should be ~141, given that we used
# the -p option),
# but adding -debug 3 to the command line shows that
# the IRSTLM LM does not seem to sum to one properly, so it seems that
# it produces an LM that isn't interpretable in the normal way as an ARPA
# LM.

Просмотреть файл

@ -1,153 +0,0 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes no command-line arguments but takes the --cmd option.
# Begin configuration section.
rand_seed=0
cmd=run.pl
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
hidden=30
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
# End configuration section.
[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh
if [ $# != 1 ]; then
echo "Usage: local/wsj_train_rnnlms.sh [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi
dir=$1
srcdir=data/local/dict_larger
mkdir -p $dir
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH
( # First make sure the rnnlm toolkit is installed.
# Note: this didn't work out of the box for me; I had to
# change the g++ version to just "g++" (no cross-compilation
# was needed for me, as I ran on a machine that had been set up
# as 64-bit by default).
cd $KALDI_ROOT/tools || exit 1;
if [ -d $rnnlm_ver ]; then
echo Not installing the rnnlm toolkit since it is already there.
else
echo Downloading and installing the rnnlm tools
# http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz
if [ ! -f $rnnlm_ver.tgz ]; then
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1;
fi
mkdir $rnnlm_ver
cd $rnnlm_ver
tar -xvzf ../$rnnlm_ver.tgz || exit 1;
make CC=g++ || exit 1;
echo Done making the rnnlm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/all.gz
echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
> $dir/train.in # training data
# The rest of the words will be represented by a word-class, <RNN_UNK>, which
# maps (with probabilities) to a whole class of words.
# Get unigram counts from our training data, and use them to select a word-list
# for RNNLM training, e.g. the 10k most frequent words. The rest will go into a
# class whose word probabilities we assign manually, at the shell level.
# Note: this word-list doesn't need to include </s>; this
# automatically gets added inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn
tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts
tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs
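# (Hypothetical example: unk.probs now has lines like "ZYMURGY 0.000012", i.e. each
# out-of-shortlist word with its probability within the <RNN_UNK> class; these
# probabilities sum to 1 over the class.)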
for type in train valid; do
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
> $dir/$type
done
rm $dir/train.in # no longer needed-- and big.
# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train
# OK we'll train the RNNLM on this data.
# todo: change 100 to 320.
# using 100 classes as square root of 10k.
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &
$cmd $dir/rnnlm.log \
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \
-direct-order 4 -direct $direct -binary || exit 1;
# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids
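# (For illustration, valid.with_ids now has lines like
#   uttid-1 <first validation sentence>
# i.e. a fake utterance-id followed by the original line, which is the table-style
# input that utils/rnnlm_compute_scores.sh below expects.)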
utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \
$dir/valid.scores
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes the utterance-ids,
# which add one extra token per sentence; this accounts for the </s> at the end of
# each sentence, so it is the correct number to normalize by.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
echo Perplexity is $p | tee $dir/perplexity.log
rm $dir/train $dir/all.gz
# This is a better setup, but takes a long time to train:
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 2000 -binary

Просмотреть файл

@ -38,11 +38,8 @@ echo Preparing language models for test
 for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
 test=data/lang_test_${lm_suffix}
-mkdir -p $test
-for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
-phones/; do
-cp -r data/lang/$f $test
-done
+cp -rT data/lang $test
 gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
 utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
@ -60,26 +57,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
 utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
 --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
 fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-fstisstochastic $test/G.fst
-# The output is like:
-# 9.14233e-05 -0.259833
-# we do expect the first of these 2 numbers to be close to zero (the second is
-# nonzero because the backoff weights make the states sum to >1).
-# Because of the <s> fiasco for these particular LMs, the first number is not
-# as close to zero as it could be.
-# Everything below is only for diagnostic.
-# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-# this might cause determinization failure of CLG.
-# #0 is treated as an empty word.
-mkdir -p $tmpdir/g
-awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-< "$lexicon" >$tmpdir/g/select_empty.fst.txt
-fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
-fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
-echo "Language model has cycles with empty words" && exit 1
-rm -r $tmpdir/g
+utils/validate_lang.pl $test || exit 1;
 done
 echo "Succeeded in formatting data."

Просмотреть файл

@ -256,6 +256,3 @@ $cuda_cmd $dir/_train_nnet.log \
 utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri7a_dnn exp/tri7a_dnn/graph_tgpr_5k || exit 1;
 steps/nnet/decode.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
 exp/tri7a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1;

Просмотреть файл

@ -27,7 +27,7 @@ mkdir -p $tmpdir
 for lm_suffix in tgpr; do
 test=data/lang_test_${lm_suffix}
 mkdir -p $test
-for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones/; do
+for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones oov.txt oov.int; do
 cp -r data/lang/$f $test
 done
 gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
@ -47,26 +47,8 @@ for lm_suffix in tgpr; do
 utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
 --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
 fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-fstisstochastic $test/G.fst
-# The output is like:
-# 9.14233e-05 -0.259833
-# we do expect the first of these 2 numbers to be close to zero (the second is
-# nonzero because the backoff weights make the states sum to >1).
-# Because of the <s> fiasco for these particular LMs, the first number is not
-# as close to zero as it could be.
-# Everything below is only for diagnostic.
-# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-# this might cause determinization failure of CLG.
-# #0 is treated as an empty word.
-mkdir -p $tmpdir/g
-awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-< "$lexicon" >$tmpdir/g/select_empty.fst.txt
-fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
-fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
-echo "Language model has cycles with empty words" && exit 1
-rm -r $tmpdir/g
+utils/validate_lang.pl $test || exit 1;
 done
 echo "Succeeded in formatting data."

Просмотреть файл

@ -6,7 +6,15 @@
 # Prepares the test time language model(G) transducers
 # (adapted from wsj/s5/local/wsj_format_data.sh)
-. path.sh
+. ./path.sh || exit 1;
+# begin configuration section
+src_dict=data/local/dict/lexicon.txt # only needed for diagnostics, to identify empty words.
+src_dir=data/lang
+# end configuration section
+. utils/parse_options.sh || exit 1;
 set -e
 if [ $# -ne 1 ]; then
@ -14,29 +22,41 @@ if [ $# -ne 1 ]; then
 echo "e.g.: $0 /export/a15/vpanayotov/data/lm"
 echo ", where:"
 echo " <lm-dir> is the directory in which the language model is stored/downloaded"
+echo "Options:"
+echo " --src-dir <dir> # source lang directory, default data/lang"
 exit 1
 fi
 lm_dir=$1
-tmpdir=data/local/lm_tmp
-lexicon=data/local/lang_tmp/lexiconp.txt
+if [ ! -d $lm_dir ]; then
+echo "$0: expected source LM directory $lm_dir to exist"
+exit 1;
+fi
+if [ ! -f $src_dir/words.txt ]; then
+echo "$0: expected $src_dir/words.txt to exist."
+exit 1;
+fi
+tmpdir=data/local/lm_tmp.$$
+trap "rm -r $tmpdir" EXIT
 mkdir -p $tmpdir
-for lm_suffix in tgsmall tgmed tglarge; do
-test=data/lang_test_${lm_suffix}
-mkdir -p $test
-for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones/; do
-cp -r data/lang/$f $test
-done
+for lm_suffix in tgsmall tgmed; do
+# tglarge is prepared by a separate command, called from run.sh; we don't
+# want to compile G.fst for tglarge, as it takes a while.
+test=${src_dir}_test_${lm_suffix}
+cp -rT ${src_dir} $test
 gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
 utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1
 # grep -v '<s> <s>' because the LM seems to have some strange and useless
-# stuff in it with multiple <s>'s in the history. Encountered some other similar
-# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
-# which are supposed to occur only at being/end of utt. These can cause
-# determinization failures of CLG [ends up being epsilon cycles].
+# stuff in it with multiple <s>'s in the history. Encountered some other
+# similar things in a LM from Geoff. Removing all "illegal" combinations of
+# <s> and </s>, which are supposed to occur only at being/end of utt. These
+# can cause determinization failures of CLG [ends up being epsilon cycles].
 gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
 grep -v '<s> <s>' | \
 grep -v '</s> <s>' | \
@ -44,31 +64,12 @@ for lm_suffix in tgsmall tgmed tglarge; do
 arpa2fst - | fstprint | \
 utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
 utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
 --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
 fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-fstisstochastic $test/G.fst || true
-# The output is like:
-# 9.14233e-05 -0.259833
-# we do expect the first of these 2 numbers to be close to zero (the second is
-# nonzero because the backoff weights make the states sum to >1).
-# Because of the <s> fiasco for these particular LMs, the first number is not
-# as close to zero as it could be.
-# Everything below is only for diagnostic.
-# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-# this might cause determinization failure of CLG.
-# #0 is treated as an empty word.
-mkdir -p $tmpdir/g
-awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-< "$lexicon" >$tmpdir/g/select_empty.fst.txt
-fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
-fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
-echo "Language model has cycles with empty words" && exit 1
-rm -r $tmpdir/g
+utils/validate_lang.pl --skip-determinization-check $test || exit 1;
 done
 echo "Succeeded in formatting data."
-rm -r $tmpdir
 exit 0

Просмотреть файл

@ -50,10 +50,7 @@ idngram2lm -linear -idngram $lmdir/sprak.idngram -vocab \
 test=data/lang_test_${lm_suffix}
 mkdir -p $test
-for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
-phones/; do
-cp -r data/lang/$f $test
-done
+cp -rT data/lang $test
 cat $lmdir/sprak.arpa | \
 utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
@ -72,25 +69,9 @@ cat $lmdir/sprak.arpa | \
 utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
 --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
 fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-fstisstochastic $test/G.fst
-# The output is like:
-# 9.14233e-05 -0.259833
-# we do expect the first of these 2 numbers to be close to zero (the second is
-# nonzero because the backoff weights make the states sum to >1).
-# Because of the <s> fiasco for these particular LMs, the first number is not
-# as close to zero as it could be.
-# Everything below is only for diagnostic.
-# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-# this might cause determinization failure of CLG.
-# #0 is treated as an empty word.
-mkdir -p $tmpdir
-awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-< "$lexicon" >$tmpdir/select_empty.fst.txt
-fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \
-fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst
-fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' &&
-echo "Language model has cycles with empty words" && exit 1
-echo "Succeeded in formatting data."
-rm -r $tmpdir
+utils/validate_lang.pl $test || exit 1;
+exit 0;

Просмотреть файл

@ -17,7 +17,6 @@ lm_suffix=$3
 N=$4
 lmdir=$5
 extdict=${srcdict}_$lm_suffix
-tmpdir=data/local/lm_tmp
 lang_tmp=data/local/lang_tmp
 extlang=data/lang_$lm_suffix
@ -137,10 +136,8 @@ tlm -tr=$lmdir/extra4.ngt -n=$N -lm=wb -o=$lmdir/extra${N}$lm_suffix
 test=data/lang_test_${N}${lm_suffix}
 mkdir -p $test
-for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
-phones/; do
-cp -r $extlang/$f $test
-done
+cp -r $extlang $test
 cat $lmdir/extra${N}$lm_suffix | \
 utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
@ -159,29 +156,7 @@ cat $lmdir/extra${N}$lm_suffix | \
 utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
 --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
 fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-fstisstochastic $test/G.fst
-echo "Succeeded in formatting data."
-# The output is like:
-# 9.14233e-05 -0.259833
-# we do expect the first of these 2 numbers to be close to zero (the second is
-# nonzero because the backoff weights make the states sum to >1).
-# Because of the <s> fiasco for these particular LMs, the first number is not
-# as close to zero as it could be.
-# Everything below is only for diagnostic.
-# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-# this might cause determinization failure of CLG.
-# #0 is treated as an empty word.
-echo "Running diagnostics. Investigate if the LM has cycles."
-mkdir -p $tmpdir
-awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-< "$lmdir/text.filt" >$tmpdir/select_empty.fst.txt
-fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \
-fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst
-fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' &&
-echo "Language model has cycles with empty words" && exit 1
-rm -rf $tmpdir
+utils/validate_lang.pl $test || exit 1;
+exit 0;

Просмотреть файл

@ -51,13 +51,10 @@ wait
 test=data/lang_test_${lm_suffix}
 mkdir -p $test
-for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
-phones/; do
-cp -r $srcdir/$f $test
-done
+cp -rT $srcdir $test
 cat $lmdir/train${ngram}.arpa | \
 utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
 # grep -v '<s> <s>' because the LM seems to have some strange and useless
 # stuff in it with multiple <s>'s in the history. Encountered some other similar
@ -73,27 +70,10 @@ cat $lmdir/train${ngram}.arpa | \
 utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
 --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
 fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-fstisstochastic $test/G.fst
-# The output is like:
-# 9.14233e-05 -0.259833
-# we do expect the first of these 2 numbers to be close to zero (the second is
-# nonzero because the backoff weights make the states sum to >1).
-# Because of the <s> fiasco for these particular LMs, the first number is not
-# as close to zero as it could be.
-# Everything below is only for diagnostic.
-# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-# this might cause determinization failure of CLG.
-# #0 is treated as an empty word.
-awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-< "$lmdir/lm_input" >$tmpdir/select_empty.fst.txt
-fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \
-fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst
-fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' &&
-echo "Language model has cycles with empty words" && exit 1
+utils/validate_lang.pl $test || exit 1;
 echo "Succeeded in formatting data."
+exit 0;
 #rm -rf $tmpdir
 #rm -f $ccs

Просмотреть файл

@ -22,7 +22,7 @@ cp $srcdict $dir/lexicon0.txt || exit 1;
 patch <local/dict.patch $dir/lexicon0.txt || exit 1;
 #(2a) Dictionary preparation:
-# Pre-processing (Upper-case, remove comments)
+# Pre-processing (lower-case, remove comments)
 awk 'BEGIN{getline}($0 !~ /^#/) {$0=tolower($0); print}' \
 $srcdict | sort | awk '($0 !~ /^[[:space:]]*$/) {print}' \
 > $dir/lexicon1.txt || exit 1;

Просмотреть файл

@ -38,11 +38,9 @@ echo Preparing language models for test
 for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
 test=data/lang_test_${lm_suffix}
-mkdir -p $test
-for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
-phones/; do
-cp -r data/lang/$f $test
-done
+cp -rT data/lang $test || exit 1;
 gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
 utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
@ -60,26 +58,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
 utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
 --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
 fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-fstisstochastic $test/G.fst
-# The output is like:
-# 9.14233e-05 -0.259833
-# we do expect the first of these 2 numbers to be close to zero (the second is
-# nonzero because the backoff weights make the states sum to >1).
-# Because of the <s> fiasco for these particular LMs, the first number is not
-# as close to zero as it could be.
-# Everything below is only for diagnostic.
-# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-# this might cause determinization failure of CLG.
-# #0 is treated as an empty word.
-mkdir -p $tmpdir/g
-awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-< "$lexicon" >$tmpdir/g/select_empty.fst.txt
-fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
-fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
-echo "Language model has cycles with empty words" && exit 1
-rm -r $tmpdir/g
+utils/validate_lang.pl --skip-determinization-check $test || exit 1;
 done
 echo "Succeeded in formatting data."

Просмотреть файл

@ -1,6 +1,7 @@
 #!/bin/bash
-# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
+# Copyright 2010-2012 Microsoft Corporation
+# 2012-2014 Johns Hopkins University (Author: Daniel Povey)
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -70,14 +71,16 @@ grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
 # Add to cmudict the silences, noises etc.
+# the sort | uniq is to remove a duplicated pron from cmudict.
 (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
-cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
+cat - $dir/lexicon1_raw_nosil.txt | sort | uniq > $dir/lexicon2_raw.txt || exit 1;
 # lexicon.txt is without the _B, _E, _S, _I markers.
 # This is the input to wsj_format_data.sh
 cp $dir/lexicon2_raw.txt $dir/lexicon.txt
+rm $dir/lexiconp.txt 2>/dev/null
 echo "Dictionary preparation succeeded"