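Update arpa2fst invocations in individual egs/*/local scripts

Replace the multi-stage "grep -v | arpa2fst | fstprint | eps2disambig.pl |
s2eps.pl | fstcompile | fstrmepsilon | fstarcsort" pipelines with a single
arpa2fst call that takes --disambig-symbol=#0 and --read-symbol-table, and
strip trailing whitespace in the touched scripts.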

2016-04-10 20:15:08 -07:00 · 2016-04-10 20:15:08 -07:00 · 829432d05b
--- a/egs/aspire/s5/local/fisher_create_test_lang.sh
+++ b/egs/aspire/s5/local/fisher_create_test_lang.sh
@ -1,4 +1,4 @@
-#!/bin/bash 
+#!/bin/bash
 #

 if [ -f path.sh ]; then . path.sh; fi
@ -10,26 +10,12 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz

 cp -rT data/lang data/lang_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
-     --osymbols=data/lang_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
-  fstisstochastic data/lang_test/G.fst
-
+   arpa2fst --disambig-symbol=#0 \
+            --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst

 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst 
+fstisstochastic data/lang_test/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@ -61,4 +47,3 @@ utils/build_const_arpa_lm.sh \
    data/local/lm/4gram-mincount/lm_unpruned.gz data/lang data/lang_test_fg

 echo "$0 succeeded"
-
--- a/egs/aurora4/s5/local/aurora4_format_data.sh
+++ b/egs/aurora4/s5/local/aurora4_format_data.sh
@ -21,7 +21,7 @@ tmpdir=data/local/lm_tmp
 lexicon=data/local/lang_tmp/lexiconp.txt
 mkdir -p $tmpdir

-for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do 
+for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
  mkdir -p data/$x
  cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
  cp $srcdir/$x.txt data/$x/text || exit 1;
@ -42,23 +42,9 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
  cp -r data/lang/* $test

  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-   utils/find_arpa_oovs.pl $test/words.txt  > $tmpdir/oovs_${lm_suffix}.txt
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$test/words.txt - $test/G.fst

-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt.  These can cause 
-  # determinization failures of CLG [ends up being epsilon cycles].
-  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-    grep -v '<s> <s>' | \
-    grep -v '</s> <s>' | \
-    grep -v '</s> </s>' | \
-    arpa2fst - | fstprint | \
-    utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-      --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-  
  utils/validate_lang.pl --skip-determinization-check $test || exit 1;
 done

--- a/egs/babel/s5/local/arpa2G.sh
+++ b/egs/babel/s5/local/arpa2G.sh
@ -39,14 +39,8 @@ destdir=$3
 mkdir $destdir 2>/dev/null || true

 gunzip -c $lmfile | \
-    grep -v '<s> <s>' | grep -v '</s> <s>' |  grep -v '</s> </s>' | \
-    arpa2fst - | \
-    fstprint | \
-    utils/eps2disambig.pl | \
-    utils/s2eps.pl | \
-    fstcompile --isymbols=$langdir/words.txt \
-    --osymbols=$langdir/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1
 fstisstochastic $destdir/G.fst || true

 exit 0
--- a/egs/babel/s5/local/arpa2G_syllables.sh
+++ b/egs/babel/s5/local/arpa2G_syllables.sh
@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
 [ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
 rho=$[$last_id+1]

-# state 0 is start-state.  state 1 is state after we saw silence.  state 2 is 
+# state 0 is start-state.  state 1 is state after we saw silence.  state 2 is
 # "dead state/failure state" that is not coaccessible.
 cat <<EOF | fstcompile > $destdir/rho.fst
 0 1 $silence_id $silence_id
@ -35,16 +35,11 @@ EOF


 gunzip -c $lmfile | \
-    grep -v '<s> <s>' | grep -v '</s> <s>' |  grep -v '</s> </s>' | \
    sed 's/<unk>/<oov>/g' | \
-    arpa2fst - | \
-    fstprint | \
-    utils/eps2disambig.pl | \
-    utils/s2eps.pl | \
-    fstcompile --isymbols=$langdir/words.txt \
-    --osymbols=$langdir/words.txt  --keep_isymbols=false --keep_osymbols=false | \
+    arpa2fst --disambig-symbol=#0 --ilabel-sort=false \
+             --read-symbol-table=$langdir/words.txt - | \
    fstrhocompose "$rho" - $destdir/rho.fst | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
+    fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1

 fstisstochastic $destdir/G.fst || true

--- a/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh
+++ b/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh
@ -8,7 +8,7 @@
 # This script trains LMs on the WSJ LM-training data.
 # It requires that you have already run wsj_extend_dict.sh,
 # to get the larger-size dictionary including all of CMUdict
-# plus any OOVs and possible acronyms that we could easily 
+# plus any OOVs and possible acronyms that we could easily
 # derive pronunciations for.

 # This script takes as command-line arguments the relevant data/lang
@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n<NF;n++) printf("%s ", $n); printf "\n";}' | \
 gzip -c > $dir/train_in.gz || exit 1;

 # Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
-echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)" 
+echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
 gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \
  'BEGIN{while((getline<w)>0) v[$1]=1;}
  {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<

 # To save disk space, remove the un-mapped training data.  We could
 # easily generate it again if needed.
-rm $dir/train_nounk.gz 
+rm $dir/train_nounk.gz


 ##################################################################
@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram
 # The default LM chosen to be the last pruned 4gram-mincount
 #
 # Note: One can cheat and provide an external ARPA LM here!!!
-#       To do so, make sure that 
+#       To do so, make sure that
 #         -- its vocabulary is fully covered by $lang/words.txt,
 #         -- it is gzipped and
 #         -- it is placed in the $dir directory.
@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst"

 . ./path.sh || exit 1;
 gunzip -c $gzipped_ARPA_LM | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  arpa2fst - | fstprint | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
-      --osymbols=$lang/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1;
-  fstisstochastic $lang/G.fst
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1;
+fstisstochastic $lang/G.fst

 ##################################################################
 # Redo the FST step after reviewing perplexities reported by the
@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \
 ##################################################################

 exit 0
-
--- a/egs/babel/s5b/local/arpa2G.sh
+++ b/egs/babel/s5b/local/arpa2G.sh
@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then
 fi

 set -e           #Exit on non-zero return code from any command
-set -o pipefail  #Exit if any of the commands in the pipeline will 
+set -o pipefail  #Exit if any of the commands in the pipeline will
                 #return non-zero return code

 lmfile=$1
@ -58,7 +58,7 @@ if [ ! -z "$oov_prob_file" ]; then
    exit 1;
  fi

-  min_prob=$(gunzip -c $lmfile | perl -e '  $minlogprob = 0.0; 
+  min_prob=$(gunzip -c $lmfile | perl -e '  $minlogprob = 0.0;
     while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
      if ($order == 1) { @A = split;
       if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then
      while(<STDIN>) {
      if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
      else { print; } # print all lines unchanged except the one that says ngram 1=X.
-      if (m/^\\1-grams:$/) { 
+      if (m/^\\1-grams:$/) {
        foreach $l (@OOVS) {
          @A = split(" ", $l);
          @A == 2 || die "bad line in oov2prob: $_;";
@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then
 else
  decompress="cat $lmfile"
 fi
- 
+
 $decompress | \
-  grep -v '<s> <s>' | grep -v '</s> <s>' |  grep -v '</s> </s>' | \
-  arpa2fst - | \
-  fstprint | \
-  utils/eps2disambig.pl | \
-  utils/s2eps.pl | \
-  fstcompile --isymbols=$langdir/words.txt \
-  --osymbols=$langdir/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-  fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1
+
 fstisstochastic $destdir/G.fst || true;

 if $cleanup; then
--- a/egs/babel/s5b/local/arpa2G_syllables.sh
+++ b/egs/babel/s5b/local/arpa2G_syllables.sh
@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
 [ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
 rho=$[$last_id+1]

-# state 0 is start-state.  state 1 is state after we saw silence.  state 2 is 
+# state 0 is start-state.  state 1 is state after we saw silence.  state 2 is
 # "dead state/failure state" that is not coaccessible.
 cat <<EOF | fstcompile > $destdir/rho.fst
 0 1 $silence_id $silence_id
@ -35,16 +35,11 @@ EOF


 gunzip -c $lmfile | \
-    grep -v '<s> <s>' | grep -v '</s> <s>' |  grep -v '</s> </s>' | \
    sed 's/<unk>/<oov>/g' | \
-    arpa2fst - | \
-    fstprint | \
-    utils/eps2disambig.pl | \
-    utils/s2eps.pl | \
-    fstcompile --isymbols=$langdir/words.txt \
-    --osymbols=$langdir/words.txt  --keep_isymbols=false --keep_osymbols=false | \
+    arpa2fst --disambig-symbol=#0 --ilabel-sort=false \
+             --read-symbol-table=$langdir/words.txt - | \
    fstrhocompose "$rho" - $destdir/rho.fst | \
-    fstrmepsilon > $destdir/G.fst || exit 1
+    fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1

 fstisstochastic $destdir/G.fst || true

--- a/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh
+++ b/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh
@ -8,7 +8,7 @@
 # This script trains LMs on the WSJ LM-training data.
 # It requires that you have already run wsj_extend_dict.sh,
 # to get the larger-size dictionary including all of CMUdict
-# plus any OOVs and possible acronyms that we could easily 
+# plus any OOVs and possible acronyms that we could easily
 # derive pronunciations for.

 # This script takes as command-line arguments the relevant data/lang
@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n<NF;n++) printf("%s ", $n); printf "\n";}' | \
 gzip -c > $dir/train_in.gz || exit 1;

 # Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
-echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)" 
+echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
 gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \
  'BEGIN{while((getline<w)>0) v[$1]=1;}
  {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<

 # To save disk space, remove the un-mapped training data.  We could
 # easily generate it again if needed.
-rm $dir/train_nounk.gz 
+rm $dir/train_nounk.gz


 ##################################################################
@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram
 # The default LM chosen to be the last pruned 4gram-mincount
 #
 # Note: One can cheat and provide an external ARPA LM here!!!
-#       To do so, make sure that 
+#       To do so, make sure that
 #         -- its vocabulary is fully covered by $lang/words.txt,
 #         -- it is gzipped and
 #         -- it is placed in the $dir directory.
@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst"

 . ./path.sh || exit 1;
 gunzip -c $gzipped_ARPA_LM | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  arpa2fst - | fstprint | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
-      --osymbols=$lang/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon > $lang/G.fst || exit 1;
-  fstisstochastic $lang/G.fst
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1;
+fstisstochastic $lang/G.fst

 ##################################################################
 # Redo the FST step after reviewing perplexities reported by the
@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \
 ##################################################################

 exit 0
-
--- a/egs/babel/s5c/local/arpa2G.sh
+++ b/egs/babel/s5c/local/arpa2G.sh
@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then
 fi

 set -e           #Exit on non-zero return code from any command
-set -o pipefail  #Exit if any of the commands in the pipeline will 
+set -o pipefail  #Exit if any of the commands in the pipeline will
                 #return non-zero return code

 lmfile=$1
@ -58,7 +58,7 @@ if [ ! -z "$oov_prob_file" ]; then
    exit 1;
  fi

-  min_prob=$(gunzip -c $lmfile | perl -e '  $minlogprob = 0.0; 
+  min_prob=$(gunzip -c $lmfile | perl -e '  $minlogprob = 0.0;
     while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
      if ($order == 1) { @A = split;
       if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then
      while(<STDIN>) {
      if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
      else { print; } # print all lines unchanged except the one that says ngram 1=X.
-      if (m/^\\1-grams:$/) { 
+      if (m/^\\1-grams:$/) {
        foreach $l (@OOVS) {
          @A = split(" ", $l);
          @A == 2 || die "bad line in oov2prob: $_;";
@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then
 else
  decompress="cat $lmfile"
 fi
- 
+
 $decompress | \
-  grep -v '<s> <s>' | grep -v '</s> <s>' |  grep -v '</s> </s>' | \
-  arpa2fst - | \
-  fstprint | \
-  utils/eps2disambig.pl | \
-  utils/s2eps.pl | \
-  fstcompile --isymbols=$langdir/words.txt \
-  --osymbols=$langdir/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-  fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1
+
 fstisstochastic $destdir/G.fst || true;

 if $cleanup; then
--- a/egs/babel/s5c/local/arpa2G_syllables.sh
+++ b/egs/babel/s5c/local/arpa2G_syllables.sh
@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
 [ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
 rho=$[$last_id+1]

-# state 0 is start-state.  state 1 is state after we saw silence.  state 2 is 
+# state 0 is start-state.  state 1 is state after we saw silence.  state 2 is
 # "dead state/failure state" that is not coaccessible.
 cat <<EOF | fstcompile > $destdir/rho.fst
 0 1 $silence_id $silence_id
@ -35,16 +35,11 @@ EOF


 gunzip -c $lmfile | \
-    grep -v '<s> <s>' | grep -v '</s> <s>' |  grep -v '</s> </s>' | \
    sed 's/<unk>/<oov>/g' | \
-    arpa2fst - | \
-    fstprint | \
-    utils/eps2disambig.pl | \
-    utils/s2eps.pl | \
-    fstcompile --isymbols=$langdir/words.txt \
-    --osymbols=$langdir/words.txt  --keep_isymbols=false --keep_osymbols=false | \
+    arpa2fst --disambig-symbol=#0 --ilabel-sort=false \
+             --read-symbol-table=$langdir/words.txt - | \
    fstrhocompose "$rho" - $destdir/rho.fst | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
+    fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1

 fstisstochastic $destdir/G.fst || true

--- a/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh
+++ b/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh
@ -8,7 +8,7 @@
 # This script trains LMs on the WSJ LM-training data.
 # It requires that you have already run wsj_extend_dict.sh,
 # to get the larger-size dictionary including all of CMUdict
-# plus any OOVs and possible acronyms that we could easily 
+# plus any OOVs and possible acronyms that we could easily
 # derive pronunciations for.

 # This script takes as command-line arguments the relevant data/lang
@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n<NF;n++) printf("%s ", $n); printf "\n";}' | \
 gzip -c > $dir/train_in.gz || exit 1;

 # Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
-echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)" 
+echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
 gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \
  'BEGIN{while((getline<w)>0) v[$1]=1;}
  {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<

 # To save disk space, remove the un-mapped training data.  We could
 # easily generate it again if needed.
-rm $dir/train_nounk.gz 
+rm $dir/train_nounk.gz


 ##################################################################
@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram
 # The default LM chosen to be the last pruned 4gram-mincount
 #
 # Note: One can cheat and provide an external ARPA LM here!!!
-#       To do so, make sure that 
+#       To do so, make sure that
 #         -- its vocabulary is fully covered by $lang/words.txt,
 #         -- it is gzipped and
 #         -- it is placed in the $dir directory.
@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst"

 . ./path.sh || exit 1;
 gunzip -c $gzipped_ARPA_LM | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  arpa2fst - | fstprint | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
-      --osymbols=$lang/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1;
-  fstisstochastic $lang/G.fst
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1;
+fstisstochastic $lang/G.fst

 ##################################################################
 # Redo the FST step after reviewing perplexities reported by the
@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \
 ##################################################################

 exit 0
-
--- a/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh
+++ b/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh
@ -1,4 +1,4 @@
-#!/bin/bash 
+#!/bin/bash
 #

 if [ -f path.sh ]; then . path.sh; fi
@ -12,25 +12,12 @@ mkdir -p data/lang_test
 cp -r data/lang/* data/lang_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
-     --osymbols=data/lang_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
-  fstisstochastic data/lang_test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst 
+fstisstochastic data/lang_test/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@ -59,4 +52,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \


 echo "$0 succeeded"
-
--- a/egs/chime2/s5/local/chime_format_data.sh
+++ b/egs/chime2/s5/local/chime_format_data.sh
@ -17,11 +17,9 @@
 echo "Preparing train and test data"
 srcdir=data/local/data
 lmdir=data/local/nist_lm
-tmpdir=data/local/lm_tmp
 lexicon=data/local/lang_tmp/lexiconp.txt
-mkdir -p $tmpdir

-for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do 
+for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do
  mkdir -p data/$x
  cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
  cp $srcdir/$x.txt data/$x/text || exit 1;
@ -42,25 +40,10 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
  cp -r data/lang/* $test

  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-   utils/find_arpa_oovs.pl $test/words.txt  > $tmpdir/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt.  These can cause 
-  # determinization failures of CLG [ends up being epsilon cycles].
-  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-    grep -v '<s> <s>' | \
-    grep -v '</s> <s>' | \
-    grep -v '</s> </s>' | \
-    arpa2fst - | fstprint | \
-    utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-      --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$test/words.txt - $test/G.fst

  utils/validate_lang.pl $test || exit 1;
 done

 echo "Succeeded in formatting data."
-rm -r $tmpdir
--- a/egs/chime3/s5/local/chime3_train_lms.sh
+++ b/egs/chime3/s5/local/chime3_train_lms.sh
@ -18,7 +18,7 @@ if [ $# -ne 1 ]; then
  exit 1;
 fi

-# check data directories 
+# check data directories
 chime3_data=$1
 wsj0_data=$chime3_data/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory
 if [ ! -d $chime3_data ]; then
@ -70,7 +70,7 @@ else
   | awk -v voc=$dir/vocab_5k.txt '
   BEGIN{ while((getline<voc)>0) { invoc[$1]=1; }}
   /^</{next}{
-     for (x=1;x<=NF;x++) { 
+     for (x=1;x<=NF;x++) {
       w=toupper($x);
       if (invoc[w]) { printf("%s ",w); } else { printf("<UNK> "); }
     }
@ -88,7 +88,7 @@ else
                 $chime3_data/data/transcriptions/dt05_simu.trn_all \
      |gzip -c > $dir/valid.gz
 fi
-  
+
 # train a large n-gram language model
 lm_suffix=${order}gkn_5k
 if [ -f $dir/lm_${lm_suffix}.arpa.gz ]; then
@ -121,22 +121,8 @@ for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
  cp -r data/lang/$f $test
 done
 gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \
- utils/find_arpa_oovs.pl $test/words.txt  > $tmpdir/oovs_${lm_suffix}.txt
-
-# grep -v '<s> <s>' because the LM seems to have some strange and useless
-# stuff in it with multiple <s>'s in the history.  Encountered some other similar
-# things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-# which are supposed to occur only at being/end of utt.  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  arpa2fst - | fstprint | \
-  utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
-  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-    --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-   fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=$test/words.txt - $test/G.fst
 fstisstochastic $test/G.fst
 # The output is like:
 # 9.14233e-05 -0.259833
@ -154,10 +140,9 @@ awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "
    < "$lexicon"  >$tmpdir/g/select_empty.fst.txt
 fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
 fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && 
+fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
  echo "Language model has cycles with empty words" && exit 1
 rm -r $tmpdir/g

 echo "Succeeded in preparing a large ${order}-gram LM"
 rm -r $tmpdir
-
--- a/egs/chime3/s5/local/clean_chime3_format_data.sh
+++ b/egs/chime3/s5/local/clean_chime3_format_data.sh
@ -20,7 +20,7 @@ tmpdir=data/local/lm_tmp
 lexicon=data/local/lang_tmp/lexiconp.txt
 mkdir -p $tmpdir

-for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do 
+for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do
  mkdir -p data/$x
  cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
  cp $srcdir/$x.txt data/$x/text || exit 1;
@ -43,29 +43,15 @@ for lm_suffix in tgpr_5k; do
    cp -r data/lang/$f $test
  done
  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-   utils/find_arpa_oovs.pl $test/words.txt  > $tmpdir/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt.  These can cause 
-  # determinization failures of CLG [ends up being epsilon cycles].
-  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-    grep -v '<s> <s>' | \
-    grep -v '</s> <s>' | \
-    grep -v '</s> </s>' | \
-    arpa2fst - | fstprint | \
-    utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-      --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$test/words.txt - $test/G.fst
  fstisstochastic $test/G.fst
- # The output is like:
- # 9.14233e-05 -0.259833
- # we do expect the first of these 2 numbers to be close to zero (the second is
- # nonzero because the backoff weights make the states sum to >1).
- # Because of the <s> fiasco for these particular LMs, the first number is not
- # as close to zero as it could be.
+  # The output is like:
+  # 9.14233e-05 -0.259833
+  # we do expect the first of these 2 numbers to be close to zero (the second is
+  # nonzero because the backoff weights make the states sum to >1).
+  # Because of the <s> fiasco for these particular LMs, the first number is not
+  # as close to zero as it could be.

  # Everything below is only for diagnostic.
  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
@ -76,7 +62,7 @@ for lm_suffix in tgpr_5k; do
    < "$lexicon"  >$tmpdir/g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
   fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && 
+  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r $tmpdir/g
 done
--- a/egs/farsdat/s5/local/farsdat_format_data.sh
+++ b/egs/farsdat/s5/local/farsdat_format_data.sh
@ -25,13 +25,10 @@ for lm_suffix in bg; do
  test=data/lang_test_${lm_suffix}
  mkdir -p $test
  cp -r data/lang/* $test
-  
+
  gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \
-    egrep -v '<s> <s>|</s> <s>|</s> </s>' | \
-    arpa2fst - | fstprint | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-     --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$test/words.txt - $test/G.fst
  fstisstochastic $test/G.fst
 # The output is like:
 # 9.14233e-05 -0.259833
@ -49,7 +46,7 @@ for lm_suffix in bg; do
    < "$lexicon"  >$tmpdir/g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
   fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && 
+  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r $tmpdir/g
 done
--- a/egs/farsdat/s5/local/farsdat_prepare_lm.sh
+++ b/egs/farsdat/s5/local/farsdat_prepare_lm.sh
@ -25,13 +25,10 @@ for lm_suffix in bg; do
  test=data/lang_test_${lm_suffix}
  mkdir -p $test
  cp -r data/lang/* $test
-  
+
  gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \
-    egrep -v '<s> <s>|</s> <s>|</s> </s>' | \
-    arpa2fst - | fstprint | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-     --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$test/words.txt - $test/G.fst
  fstisstochastic $test/G.fst
 # The output is like:
 # 9.14233e-05 -0.259833
@ -49,7 +46,7 @@ for lm_suffix in bg; do
    < "$lexicon"  >$tmpdir/g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
   fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && 
+  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r $tmpdir/g
 done
--- a/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh
+++ b/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh
@ -1,4 +1,4 @@
-#!/bin/bash 
+#!/bin/bash
 # Copyright 2014  Gaurav Kumar.   Apache 2.0
 #

@ -12,26 +12,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
 mkdir -p data/lang_test
 cp -r data/lang/* data/lang_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
-     --osymbols=data/lang_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
-  fstisstochastic data/lang_test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst 
+fstisstochastic data/lang_test/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@ -60,4 +47,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \


 echo "$0 succeeded"
-
--- a/egs/fisher_english/s5/local/fisher_create_test_lang.sh
+++ b/egs/fisher_english/s5/local/fisher_create_test_lang.sh
@ -1,4 +1,4 @@
-#!/bin/bash 
+#!/bin/bash
 #

 if [ -f path.sh ]; then . path.sh; fi
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
 mkdir -p data/lang_test
 cp -r data/lang/* data/lang_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
-     --osymbols=data/lang_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
-  fstisstochastic data/lang_test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst 
+fstisstochastic data/lang_test/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \


 echo "$0 succeeded"
-
--- a/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh
+++ b/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh
@ -1,4 +1,4 @@
-#!/bin/bash 
+#!/bin/bash
 #

 if [ -f path.sh ]; then . path.sh; fi
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
 mkdir -p data/lang_test
 cp -r data/lang/* data/lang_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
-     --osymbols=data/lang_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
-  fstisstochastic data/lang_test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst 
+fstisstochastic data/lang_test/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \


 echo "$0 succeeded"
-
--- a/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh
+++ b/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh
@ -1,4 +1,4 @@
-#!/bin/bash 
+#!/bin/bash
 #

 if [ -f path.sh ]; then . path.sh; fi
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
 mkdir -p data/lang_test_fsh
 cp -r data/lang/* data/lang_test_fsh

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test_fsh/words.txt \
-     --osymbols=data/lang_test_fsh/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_fsh/G.fst
-  fstisstochastic data/lang_test_fsh/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_test_fsh/words.txt - data/lang_test_fsh/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test_fsh/G.fst 
+fstisstochastic data/lang_test_fsh/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test_fsh/G.fst | \


 echo "$0 succeeded"
-
--- a/egs/gale_arabic/s5/local/gale_format_data.sh
+++ b/egs/gale_arabic/s5/local/gale_format_data.sh
@ -6,9 +6,9 @@
 if [ -f path.sh ]; then
  . path.sh; else
   echo "missing path.sh"; exit 1;
-fi 
+fi

-for dir in test train; do 
+for dir in test train; do
   cp -pr data/local/$dir data/$dir
 done

@ -21,26 +21,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
 rm -r data/lang_test
 cp -r data/lang data/lang_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
-     --osymbols=data/lang_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
-  fstisstochastic data/lang_test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst 
+fstisstochastic data/lang_test/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
--- a/egs/gale_mandarin/s5/local/gale_format_data.sh
+++ b/egs/gale_mandarin/s5/local/gale_format_data.sh
@ -6,9 +6,9 @@
 if [ -f path.sh ]; then
  . path.sh; else
   echo "missing path.sh"; exit 1;
-fi 
+fi

-for dir in dev train; do 
+for dir in dev train; do
   cp -pr data/local/$dir data/$dir
 done

@ -22,26 +22,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
 rm -r data/lang_dev
 cp -r data/lang data/lang_dev

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_dev/words.txt \
-     --osymbols=data/lang_dev/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_dev/G.fst
-  fstisstochastic data/lang_dev/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_dev/words.txt - data/lang_dev/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_dev/G.fst 
+fstisstochastic data/lang_dev/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
--- a/egs/gp/s1/local/gp_format_lms_edin.sh
+++ b/egs/gp/s1/local/gp_format_lms_edin.sh
@ -40,20 +40,10 @@ function format_lms () {
    cp $work_dir/lang_test/$f $test
  done

+  # kkm: I am removing fstdeterminizelog from the following pipe, no point.
  gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \
-    | find_arpa_oovs.pl $test/words.txt > $test/oovs_${lm_suffix}.txt
-
-  # Removing all "illegal" combinations of <s> and </s>, which are supposed to 
-  # occur only at being/end of utt.  These can cause determinization failures 
-  # of CLG [ends up being epsilon cycles].
-  gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \
-    | egrep -v '<s> <s>|</s> <s>|</s> </s>' \
-    | arpa2fst - | fstprint \
-    | remove_oovs.pl $test/oovs_${lm_suffix}.txt \
-    | eps2disambig.pl | s2eps.pl \
-    | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \
-      --keep_isymbols=false --keep_osymbols=false \
-    | fstrmepsilon | fstdeterminizelog > $test/G.fst
+    | arpa2fst --disambig-symbol=#0 \
+               --read-symbol-table=$test/words.txt - $test/G.fst
  set +e
  fstisstochastic $test/G.fst
  set -e
@ -73,7 +63,7 @@ function format_lms () {
    < $work_dir/local/lexicon_??.txt  >tmpdir.g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \
   fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst
-  fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && 
+  fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r tmpdir.g

@ -99,7 +89,7 @@ echo "Preparing language models for test"
  format_lms GE17k_tg $WDIR/GE;
  format_lms GE17k_tg_pr $WDIR/GE; } >& $WDIR/GE/format_lms.log

-# German - 60K 
+# German - 60K
 { format_lms GE60k_bg $WDIR/GE;
  format_lms GE60k_tg $WDIR/GE;
  format_lms GE60k_tg_pr $WDIR/GE; } >> $WDIR/GE/format_lms.log 2>&1
@ -115,7 +105,7 @@ echo "Preparing language models for test"
  format_lms SP23k_tg_pr $WDIR/SP; } >& $WDIR/SP/format_lms.log

 # Swedish - 24K
-# TODO(arnab): Something going wrong with the Swedish trigram LM. 
+# TODO(arnab): Something going wrong with the Swedish trigram LM.
 { # format_lms SW24k_tg $WDIR/SW;
  # format_lms SW24k_tg_pr $WDIR/SW;
  format_lms SW24k_bg $WDIR/SW; } >& $WDIR/SW/format_lms.log
--- a/egs/hkust/s5/local/hkust_format_data.sh
+++ b/egs/hkust/s5/local/hkust_format_data.sh
@ -1,4 +1,4 @@
-#!/bin/bash 
+#!/bin/bash
 #

 if [ -f path.sh ]; then . path.sh; fi
@ -23,26 +23,13 @@ done
 rm -r data/lang_test
 cp -r data/lang data/lang_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
-     --osymbols=data/lang_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
-  fstisstochastic data/lang_test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst 
+fstisstochastic data/lang_test/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@ -71,4 +58,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \


 echo hkust_format_data succeeded.
-
--- a/egs/librispeech/s5/local/decode_example.sh
+++ b/egs/librispeech/s5/local/decode_example.sh
@ -34,22 +34,10 @@ mfccdir=mfcc
 # here.
 lang=data/lang
 lang_test=data/lang_test
-lang_test_tmp=data/local/lang_test_tmp/
-mkdir -p $lang_test_tmp
 mkdir -p $lang_test
 cp -r $lang/* $lang_test
-gunzip -c $lm | utils/find_arpa_oovs.pl $lang_test/words.txt \
-  > $lang_test_tmp/oovs.txt || exit 1
-gunzip -c $lm | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  arpa2fst - | fstprint | \
-  utils/remove_oovs.pl $lang_test_tmp/oovs.txt | \
-  utils/eps2disambig.pl | utils/s2eps.pl | \
-  fstcompile --isymbols=$lang_test/words.txt --osymbols=$lang_test/words.txt  \
-  --keep_isymbols=false --keep_osymbols=false | \
-  fstrmepsilon | fstarcsort --sort_type=ilabel > $lang_test/G.fst
+gunzip -c $lm | arpa2fst --disambig-symbol=#0 \
+                 --read-symbol-table=$lang_test/words.txt - $lang_test/G.fst
 utils/validate_lang.pl --skip-determinization-check $lang_test || exit 1;

 # Compiles decoding graph.
--- a/egs/reverb/s5/local/wsjcam0_format_data.sh
+++ b/egs/reverb/s5/local/wsjcam0_format_data.sh
@ -50,22 +50,8 @@ for lm_suffix in bg_5k tg_5k; do
    cp -r data/lang/$f $test
  done
  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-   utils/find_arpa_oovs.pl $test/words.txt  > $tmpdir/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt.  These can cause 
-  # determinization failures of CLG [ends up being epsilon cycles].
-  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-    grep -v '<s> <s>' | \
-    grep -v '</s> <s>' | \
-    grep -v '</s> </s>' | \
-    arpa2fst - | fstprint | \
-    utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-      --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon  | fstarcsort --sort_type=ilabel > $test/G.fst
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$test/words.txt - $test/G.fst
  fstisstochastic $test/G.fst
 # The output is like:
 # 9.14233e-05 -0.259833
@ -83,7 +69,7 @@ for lm_suffix in bg_5k tg_5k; do
    < "$lexicon"  >$tmpdir/g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
   fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && 
+  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r $tmpdir/g
 done
--- a/egs/sprakbanken/s5/local/sprak_train_cmulm.sh
+++ b/egs/sprakbanken/s5/local/sprak_train_cmulm.sh
@ -52,25 +52,10 @@ mkdir -p $test
 cp -r data/lang/* $test

 cat $lmdir/sprak.arpa | \
-utils/find_arpa_oovs.pl $test/words.txt  > $lmdir/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt.  These can cause 
-  # determinization failures of CLG [ends up being epsilon cycles].
-cat $lmdir/sprak.arpa | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  arpa2fst - | fstprint | \
-  utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \
-  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-    --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-   fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=$test/words.txt - $test/G.fst


 utils/validate_lang.pl $test || exit 1;

 exit 0;
-
--- a/egs/sprakbanken/s5/local/sprak_train_irstlm.sh
+++ b/egs/sprakbanken/s5/local/sprak_train_irstlm.sh
@ -61,8 +61,8 @@ fi


 # Checks if espeak is available on the system. espeak is necessary to extend
-# the setup because the original transcriptions were created with espeak and 
-# filtered 
+# the setup because the original transcriptions were created with espeak and
+# filtered

 if ! which espeak >&/dev/null; then
  echo "espeak is not available on your system. You must install espeak before proceeding."
@ -95,7 +95,7 @@ if [ ! -f $extdict/lexicon.txt ];


  # Filter transcription
-  # Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove 
+  # Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
  # initial and trailing spaces and collapse 2 or more spaces to one space

  cat $dir/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dir/plist2.txt
@ -128,7 +128,7 @@ if [ ! -f $lmdir/extra4.ngt ];

  grep -P -v '^[\s?|\.|\!]*$' $newtext | \
  awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt
-    
+
  # Envelop LM training data in context cues
  add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input

@ -151,22 +151,8 @@ mkdir -p $test
 cp -r $extlang $test

 cat $lmdir/extra${N}$lm_suffix | \
-utils/find_arpa_oovs.pl $test/words.txt  > $lmdir/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt.  These can cause 
-  # determinization failures of CLG [ends up being epsilon cycles].
-cat $lmdir/extra${N}$lm_suffix | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  arpa2fst - | fstprint | \
-  utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \
-  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-    --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-   fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=$test/words.txt - $test/G.fst

 utils/validate_lang.pl $test || exit 1;

--- a/egs/sprakbanken/s5/local/train_irstlm.sh
+++ b/egs/sprakbanken/s5/local/train_irstlm.sh
@ -66,22 +66,8 @@ mkdir -p $test
 cp -r $srcdir/* $test

 cat $lmdir/train${ngram}.arpa | \
-  utils/find_arpa_oovs.pl $test/words.txt  > $lmdir/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt.  These can cause 
-  # determinization failures of CLG [ends up being epsilon cycles].
-cat $lmdir/train${ngram}.arpa | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  arpa2fst - | fstprint | \
-  utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \
-  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-    --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-   fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=$test/words.txt - $test/G.fst

 utils/validate_lang.pl $test || exit 1;

--- a/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh
+++ b/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh
@ -1,4 +1,4 @@
-#!/bin/bash 
+#!/bin/bash
 #

 if [ -f path.sh ]; then . path.sh; fi
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
 mkdir -p data/lang_test
 cp -r data/lang/* data/lang_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
-     --osymbols=data/lang_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
-  fstisstochastic data/lang_test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst 
+fstisstochastic data/lang_test/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \


 echo "$0 succeeded"
-
--- a/egs/swahili/s5/local/prepare_lm.sh
+++ b/egs/swahili/s5/local/prepare_lm.sh
@ -4,13 +4,5 @@

 cd data
 #convert to FST format for Kaldi
-cat local/swahili.arpa | ../utils/find_arpa_oovs.pl lang/words.txt  > lang/oovs.txt
-cat local/swahili.arpa |    \
-    grep -v '<s> <s>' | \
-    grep -v '</s> <s>' | \
-    grep -v '</s> </s>' | \
-    arpa2fst - | fstprint | \
-    ../utils/remove_oovs.pl lang/oovs.txt | \
-    ../utils/eps2disambig.pl | ../utils/s2eps.pl | fstcompile --isymbols=lang/words.txt \
-      --osymbols=lang/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-     fstrmepsilon > lang/G.fst
+arpa2fst --disambig-symbol=#0 --read-symbol-table=lang/words.txt \
+  local/swahili.arpa lang/G.fst
--- a/egs/swbd/s5/local/swbd_p1_format_data.sh
+++ b/egs/swbd/s5/local/swbd_p1_format_data.sh
@ -1,4 +1,4 @@
-#!/bin/bash 
+#!/bin/bash
 #

 if [ -f path.sh ]; then . path.sh; fi
@ -20,26 +20,13 @@ done
 rm -r data/lang_test
 cp -r data/lang data/lang_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
 gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
-     --osymbols=data/lang_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
-  fstisstochastic data/lang_test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst 
+fstisstochastic data/lang_test/G.fst

 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@ -68,4 +55,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \


 echo swbd_p1_format_data succeeded.
-
--- a/egs/tedlium/s5/local/prepare_lm.sh
+++ b/egs/tedlium/s5/local/prepare_lm.sh
@ -1,6 +1,6 @@
-#!/bin/bash 
+#!/bin/bash
 #
-# Copyright  2014 Nickolay V. Shmyrev 
+# Copyright  2014 Nickolay V. Shmyrev
 # Apache 2.0


@ -12,21 +12,8 @@ arpa_lm=db/cantab-TEDLIUM/cantab-TEDLIUM-pruned.lm3.gz
 rm -rf data/lang_nosp_test
 cp -r data/lang_nosp data/lang_nosp_test

-# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
-# LM doesn't have these "invalid combinations".  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-# Note: remove_oovs.pl takes a list of words in the LM that aren't in
-# our word list.  Since our LM doesn't have any, we just give it
-# /dev/null [we leave it in the script to show how you'd do it].
-gunzip -c "$arpa_lm" | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   arpa2fst - | fstprint | \
-   utils/remove_oovs.pl /dev/null | \
-   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_nosp_test/words.txt \
-     --osymbols=data/lang_nosp_test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_nosp_test/G.fst
+gunzip -c "$arpa_lm" | arpa2fst --disambig-symbol=#0 \
+  --read-symbol-table=data/lang_nosp_test/words.txt - data/lang_nosp_test/G.fst


 echo  "Checking how stochastic G is (the first of these numbers should be small):"
--- a/egs/timit/s5/local/timit_format_data.sh
+++ b/egs/timit/s5/local/timit_format_data.sh
@ -16,7 +16,7 @@ tmpdir=data/local/lm_tmp
 lexicon=data/local/dict/lexicon.txt
 mkdir -p $tmpdir

-for x in train dev test; do 
+for x in train dev test; do
  mkdir -p data/$x
  cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
  cp $srcdir/$x.text data/$x/text || exit 1;
@ -37,13 +37,10 @@ for lm_suffix in bg; do
  test=data/lang_test_${lm_suffix}
  mkdir -p $test
  cp -r data/lang/* $test
-  
+
  gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \
-    egrep -v '<s> <s>|</s> <s>|</s> </s>' | \
-    arpa2fst - | fstprint | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-     --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+    arpa2fst --disambig-symbol=#0 \
+             --read-symbol-table=$test/words.txt - $test/G.fst
  fstisstochastic $test/G.fst
 # The output is like:
 # 9.14233e-05 -0.259833
@ -61,7 +58,7 @@ for lm_suffix in bg; do
    < "$lexicon"  >$tmpdir/g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
   fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && 
+  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r $tmpdir/g
 done
--- a/egs/voxforge/s5/local/voxforge_format_data.sh
+++ b/egs/voxforge/s5/local/voxforge_format_data.sh
@ -12,7 +12,7 @@ tmpdir=data/local/lm_tmp
 lexicon=data/local/dict/lexicon.txt
 mkdir -p $tmpdir

-for x in train test; do 
+for x in train test; do
  mkdir -p data/$x
  cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
  cp $srcdir/${x}_trans.txt data/$x/text || exit 1;
@ -33,22 +33,8 @@ for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones; do
    cp -r data/lang/$f $test
 done
 cat $lmdir/lm.arpa | \
-   utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs.txt
-
-# grep -v '<s> <s>' because the LM seems to have some strange and useless
-# stuff in it with multiple <s>'s in the history.  Encountered some other similar
-# things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-# which are supposed to occur only at being/end of utt.  These can cause 
-# determinization failures of CLG [ends up being epsilon cycles].
-cat $lmdir/lm.arpa | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  arpa2fst - | fstprint | \
-  utils/remove_oovs.pl $tmpdir/oovs.txt | \
-  utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-    --osymbols=$test/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-  fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+  arpa2fst --disambig-symbol=#0 \
+           --read-symbol-table=$test/words.txt - $test/G.fst
 fstisstochastic $test/G.fst
 # The output is like:
 # 9.14233e-05 -0.259833
@ -67,9 +53,8 @@ awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "
 fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \
  $tmpdir/g/select_empty.fst.txt | \
 fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && 
+fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
  echo "Language model has cycles with empty words" && exit 1
 rm -rf $tmpdir

 echo "*** Succeeded in formatting data."
-
--- a/egs/vystadial_cz/s5/local/create_G.sh
+++ b/egs/vystadial_cz/s5/local/create_G.sh
@ -17,7 +17,7 @@ for lm in $LMs ; do
    lmp=$lmdir/`basename $lm`

    tmpdir=$tgt/tmp
-    mkdir -p $tgt 
+    mkdir -p $tgt
    mkdir -p $tmpdir

    echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..."
@ -26,21 +26,9 @@ for lm in $LMs ; do
        ln -s $langdir/$f $tgt/$f 2> /dev/null
    done

-    cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt
-
-     # grep -v '<s> <s>' because the LM seems to have some strange and useless
-     # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-     # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-     # which are supposed to occur only at being/end of utt.  These can cause 
-     # determinization failures of CLG [ends up being epsilon cycles].
-
    cat $lmp | \
-      grep -v '<s> <s>\|</s> <s>\|</s> </s>' | \
-      arpa2fst - | fstprint | \
-      utils/remove_oovs.pl $tmpdir/oovs.txt | \
-      utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \
-        --osymbols=$tgt/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-      fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst
+      arpa2fst --disambig-symbol=#0 \
+               --read-symbol-table=$tgt/words.txt - $tgt/G.fst
    fstisstochastic $tgt/G.fst
    # The output is like:
    # 9.14233e-05 -0.259833
@ -48,7 +36,7 @@ for lm in $LMs ; do
    # nonzero because the backoff weights make the states sum to >1).
    # Because of the <s> fiasco for these particular LMs, the first number is not
    # as close to zero as it could be.
-    
+
    # Everything below is only for diagnostic.
    # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
    # this might cause determinization failure of CLG.
@ -59,7 +47,7 @@ for lm in $LMs ; do
    fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \
      $tmpdir/g/select_empty.fst.txt | \
    fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst
-    fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && 
+    fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
      echo "Language model has cycles with empty words" && exit 1

    # rm -rf $tmpdir  # TODO debugging
--- a/egs/vystadial_en/s5/local/create_G.sh
+++ b/egs/vystadial_en/s5/local/create_G.sh
@ -17,7 +17,7 @@ for lm in $LMs ; do
    lmp=$lmdir/`basename $lm`

    tmpdir=$tgt/tmp
-    mkdir -p $tgt 
+    mkdir -p $tgt
    mkdir -p $tmpdir

    echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..."
@ -26,21 +26,9 @@ for lm in $LMs ; do
        ln -s $langdir/$f $tgt/$f 2> /dev/null
    done

-    cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt
-
-     # grep -v '<s> <s>' because the LM seems to have some strange and useless
-     # stuff in it with multiple <s>'s in the history.  Encountered some other similar
-     # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
-     # which are supposed to occur only at being/end of utt.  These can cause 
-     # determinization failures of CLG [ends up being epsilon cycles].
-
    cat $lmp | \
-      grep -v '<s> <s>\|</s> <s>\|</s> </s>' | \
-      arpa2fst - | fstprint | \
-      utils/remove_oovs.pl $tmpdir/oovs.txt | \
-      utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \
-        --osymbols=$tgt/words.txt  --keep_isymbols=false --keep_osymbols=false | \
-      fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst
+      arpa2fst --disambig-symbol=#0 \
+               --read-symbol-table=$tgt/words.txt - $tgt/G.fst
    fstisstochastic $tgt/G.fst
    # The output is like:
    # 9.14233e-05 -0.259833
@ -48,7 +36,7 @@ for lm in $LMs ; do
    # nonzero because the backoff weights make the states sum to >1).
    # Because of the <s> fiasco for these particular LMs, the first number is not
    # as close to zero as it could be.
-    
+
    # Everything below is only for diagnostic.
    # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
    # this might cause determinization failure of CLG.
@ -59,7 +47,7 @@ for lm in $LMs ; do
    fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \
      $tmpdir/g/select_empty.fst.txt | \
    fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst
-    fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && 
+    fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
      echo "Language model has cycles with empty words" && exit 1

    # rm -rf $tmpdir  # TODO debugging
--- a/egs/yesno/s5/local/prepare_lm.sh
+++ b/egs/yesno/s5/local/prepare_lm.sh
@ -1,7 +1,7 @@
 #!/bin/bash

 . path.sh
- 
+
 echo Preparing language models for test

 for lm_suffix in tg; do
@ -10,10 +10,10 @@ for lm_suffix in tg; do
  rm -rf data/lang_test_${lm_suffix}
  cp -r data/lang data/lang_test_${lm_suffix}

-  cat input/task.arpabo | arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-  #cat input/G.txt | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+  arpa2fst --disambig-symbol=#0 --read-symbol-table=$test/words.txt input/task.arpabo $test/G.fst
+
  fstisstochastic $test/G.fst
-      
+
 # The output is like:
 # 9.14233e-05 -0.259833
 # we do expect the first of these 2 numbers to be close to zero (the second is
@ -30,7 +30,7 @@ for lm_suffix in tg; do
    < data/local/dict/lexicon.txt  >tmpdir.g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \
   fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst
-  fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && 
+  fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r tmpdir.g
 done