Basic work towards the KW search -- create the keywords fsts and prepare the index using lattices and the KWfst

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1940 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Jan Trmal 2013-01-24 23:39:35 +00:00
Parent f88a45bc6b
Commit cc620ded07
11 changed files: 593 additions and 8 deletions

Просмотреть файл

@ -17,3 +17,6 @@ numGaussSGMM=50000
# Change it to the LM with the best perplexity after run.sh finishes
lmForDecoding=$SysDir/sriLM/32hLM.gz
glmFile=`readlink -f ./conf/glm`

Просмотреть файл

@ -17,8 +17,16 @@ dev_data_dir=/export/babel/oguz/10Hsubsets/106B-delivery-v0.2f_10hSubset/convers
lexicon_file=/export/a09/jtrmal/babel/egs/Tagalog-10hSystem2a/Lexicon/lexicon.txt
# Scoring protocols (dummy GLM file to appease the scoring script)
glmFile=`readlink -f ./conf/glm`
glmFile=./conf/glm
train_nj=12
decode_nj=9
#keyword search settings
ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
kwlist_file=/export/babel/data/scoring/IndusDB.20121102/babel106b-v0.2g_conv-dev.kwlist.xml
rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.rttm
#ln -sf `readlink -f babel/subsets/ecf.xml` $kwsdatadir/ecf.xml
. /export/babel/data/software/env.sh

Просмотреть файл

@ -27,6 +27,14 @@ lexiconFlags="-oov <unk>"
# Scoring protocols (dummy GLM file to appease the scoring script)
glmFile=./conf/glm
train_nj=32
train_nj=16
decode_nj=18
#keyword search settings
ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
# BUGFIX: this line read "kws_list= /path" -- the space after '=' assigned an
# empty value and then tried to execute the xml path as a command.  Also
# renamed to kwlist_file, the name used by the sibling configs and read by
# the kws_setup.sh invocation.
kwlist_file=/export/babel/data/scoring/IndusDB.20121102/babel106b-v0.2g_conv-dev.kwlist.xml
rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.rttm
#ln -sf `readlink -f babel/subsets/ecf.xml` $kwsdatadir/ecf.xml
. /export/babel/data/software/env.sh

Просмотреть файл

@ -101,6 +101,33 @@ function CHECKPOINT {
eval export $COUNTER_NAME=$COUNTER
}
# Kill every background job of the current shell that is still "Running".
# Parses the output of the `jobs` builtin with perl to extract the job
# numbers, then signals each one with `kill %N`.
# NOTE(review): the `while read` stage runs in a pipeline subshell; whether
# `kill %N` can resolve the parent's job table there depends on the bash
# version -- confirm this actually kills the jobs in the target environment.
function KILLBG_JOBS {
jobs \
| perl -ne 'print "$1\n" if m/^\[(\d+)\][+-]? +Running/;' \
| while read -r ; do kill %"$REPLY" ; done
}
# Print a colored diagnostic (to stderr) when a checkpointed command fails.
# Scans the shell variables for CHECKPOINT_*_COUNTER values; if any exist,
# prints them rewritten as "LAST_GOOD_<NAME>=<n>" (the DEFAULT counter
# becomes plain "LAST_GOOD=<n>") so the user can resume the script with
# "-c COUNTER_NAME=COUNTER_VALUE".  Otherwise prints a generic failure note.
function ONEXIT_HANDLER {
  local COLOR_RED='\e[00;31m'
  local COLOR_DEFAULT='\e[00m'
  local counters
  # Turn "CHECKPOINT_FOO_COUNTER=n" into "LAST_GOOD_FOO=n".
  counters=$(set | egrep "^CHECKPOINT_[_A-Z]+_COUNTER=" \
    | sed 's/^CHECKPOINT\(_[_A-Z][_A-Z]*\)_COUNTER=/LAST_GOOD\1=/g' \
    | sed "s/^LAST_GOOD_DEFAULT=/LAST_GOOD=/g")
  if [[ -n "$counters" ]]; then
    echo -e "${COLOR_RED}CHECKPOINT FAILURE: The last command returned non-zero status${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}look at the counters and try to rerun this script (after figuring the issue)${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}using the -c COUNTER_NAME=COUNTER_VALUE parameters;${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}You can use -c \"COUNTER_NAME1=COUNTER_VALUE1;COUNTER_NAME2=COUNTER_VALUE2\" as well${COLOR_DEFAULT}" >&2
    # BUGFIX: $counters is now quoted -- unquoted, word splitting collapsed
    # the multi-line counter list onto a single line.
    echo -e "${COLOR_RED}The counters: \n $counters${COLOR_DEFAULT}" >&2
  else
    echo -e "${COLOR_RED}CHECKPOINT FAILURE: The last command returned non-zero status${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}No checkpoint was found. Try to figure out the problem and ${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}run the script again${COLOR_DEFAULT}" >&2
  fi
}
trap "ONEXIT_HANDLER; exit; " SIGINT SIGKILL SIGTERM ERR
while getopts ":c:i" opt; do
case $opt in
c)

Просмотреть файл

@ -3,6 +3,13 @@
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.
# Begin configuration section.
case_insensitive=true
# End configuration section.
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/kws_data_prep.sh <lang-dir> <data-dir> <kws-data-dir>"
@ -13,9 +20,8 @@ fi
langdir=$1;
datadir=$2;
kwsdatadir=$3;
keywords=$kwsdatadir/kws.xml
keywords=$kwsdatadir/kwlist.xml
case_insensitive=true
mkdir -p $kwsdatadir;
@ -39,9 +45,10 @@ cat $keywords | perl -e '
# are not in our $langdir/words.txt, as we won't find them anyway...
#cat $kwsdatadir/keywords.txt | babel/filter_keywords.pl $langdir/words.txt - - | \
# sym2int.pl --map-oov 0 -f 2- $langdir/words.txt | \
if $case_insensitive ; then
if $case_insensitive ; then
echo "Running case insensitive processing"
cat $langdir/words.txt | tr '[:lower:]' '[:upper:]' > $kwsdatadir/words.txt
[ `cut -f 1 -d ' ' $kwsdatadir/words.txt | wc -l` -ne `cat $kwsdatadir/words.txt | wc -l` ] && echo "Warning, multiple words in dictionary differ only in case..."
[ `cut -f 1 -d ' ' $kwsdatadir/words.txt | sort -u | wc -l` -ne `cat $kwsdatadir/words.txt | wc -l` ] && echo "Warning, multiple words in dictionary differ only in case..."
cat $kwsdatadir/keywords.txt | tr '[:lower:]' '[:upper:]' | \
sym2int.pl --map-oov 0 -f 2- $kwsdatadir/words.txt > $kwsdatadir/keywords_all.int
@ -55,11 +62,11 @@ cat $kwsdatadir/keywords_all.int | \
grep -v " 0 " | grep -v " 0$" > $kwsdatadir/keywords.int
cut -f 1 -d ' ' $kwsdatadir/keywords.int | \
babel/subset_kwslist.pl $keywords > $kwsdatadir/keyword_invocab.xml
local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_invocab.xml
cat $kwsdatadir/keywords_all.int | \
egrep " 0 | 0$" | cut -f 1 -d ' ' | \
babel/subset_kwslist.pl $keywords > $kwsdatadir/keyword_outvocab.xml
local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_outvocab.xml

Просмотреть файл

@ -0,0 +1,47 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.

# Run keyword search (KWS) against an existing decode directory.
#   Usage: kws_search.sh <lang-dir> <data-dir> <decode-dir>
# Builds an index over the decoding lattices (local/make_index.sh), searches
# it with the prepared keyword FSTs (local/search_index.sh), and writes the
# post-processed hit list to <decode-dir>/kws/kwslist.xml.
. ./path.sh
. ./cmd.sh
# Begin configuration section.
acwt=0.0909091 # acoustic scale (1/11) used when building the index
duptime=0.6 # hits closer than this many seconds are treated as duplicates
cmd=run.pl
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
# NOTE(review): positional args are deliberately read out of order
# (lang=$1, data=$2, decode=$3); there is no argument-count check.
datadir=$2
langdir=$1
decodedir=$3
kwsdatadir=$datadir/kws
kwsoutdir=$decodedir/kws
mkdir -p $kwsdatadir
mkdir -p $kwsoutdir
# Build the inverted index from the lattices, then search it.
local/make_index.sh --cmd "$cmd" --acwt $acwt \
$kwsdatadir $langdir $decodedir $kwsoutdir || exit 1
local/search_index.sh $kwsdatadir $kwsoutdir || exit 1
# Pull the total duration out of the ecf.xml header line; the perl stage
# halves it ($_/2) -- presumably to count only one channel of the
# two-channel conversational audio. TODO confirm.
duration=`head -1 $kwsdatadir/ecf.xml |\
grep -o -E "duration=\"[0-9]*[ \.]*[0-9]*\"" |\
grep -o -E "[0-9]*[\.]*[0-9]*" |\
perl -e 'while(<>) {print $_/2;}'`
# Merge per-job results, normalize scores into a kwslist, and drop
# duplicate hits that fall within $duptime seconds of each other.
cat $kwsoutdir/result.* | \
utils/write_kwslist.pl --flen=0.01 --duration=$duration \
--segments=$datadir/segments --normalize=true \
--map-utter=$kwsdatadir/utter_map \
- - | \
utils/filter_kwslist.pl $duptime > $kwsoutdir/kwslist.xml

32
egs/babel/s5/local/kws_setup.sh Executable file
Просмотреть файл

@ -0,0 +1,32 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen, Yenda Trmal)
# Apache 2.0.

# Set up the keyword-search (KWS) data directory.
#   Usage: kws_setup.sh <ecf-file> <kwlist-file> <rttm-file> <lang-dir> <data-dir>
# Copies the ECF, keyword list and reference RTTM into <data-dir>/kws and
# runs local/kws_data_prep.sh to prepare the keyword FSTs.
. ./path.sh
. ./cmd.sh
# Begin configuration section.
cmd=run.pl
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
ecf_file=$1
kwlist_file=$2
rttm_file=$3
langdir=$4
datadir=$5
kwsdatadir=$datadir/kws
mkdir -p $kwsdatadir
# `readlink -f` resolves symlinks so the copies come from the real files.
cp `readlink -f $ecf_file` $kwsdatadir/ecf.xml || exit 1
cp `readlink -f $kwlist_file` $kwsdatadir/kwlist.xml || exit 1
cp `readlink -f $rttm_file` $kwsdatadir/rttm || exit 1
local/kws_data_prep.sh --case-insensitive true $langdir $datadir $kwsdatadir || exit 1

Просмотреть файл

@ -0,0 +1,35 @@
#!/bin/bash
# Create a subset of a Babel corpus directory by symlinking the audio (.sph)
# and transcription (.txt) files named in a file list into a new directory.
#
# Usage: make_corpus_subset.sh <input-data-dir> <input-filelist> <output-data-dir>
#   <input-data-dir>   source corpus dir containing audio/ and transcription/
#   <input-filelist>   one file basename (no extension) per line
#   <output-data-dir>  target dir; audio/ and transcription/ are created in it
# Exits non-zero if any listed file is missing from the source directory.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
input_data_dir=$1
input_filelist=$2
output_data_dir=$3
mkdir -p $output_data_dir/transcription
mkdir -p $output_data_dir/audio
abs_src_dir=`readlink -f $input_data_dir`
abs_tgt_dir=`readlink -f $output_data_dir`
# BUGFIX: the loop previously read $input_data_list, which is never set (the
# second positional argument is stored as $input_filelist above), so the
# loop body never ran.
for file_basename in `cat $input_filelist`; do
  if [[ -e $abs_src_dir/audio/$file_basename.sph ]] ; then
    ln -sf $abs_src_dir/audio/$file_basename.sph $abs_tgt_dir/audio || exit 1
  else
    echo "File $abs_src_dir/audio/$file_basename.sph does not exist!"
    exit 1
  fi
  if [[ -e $abs_src_dir/transcription/$file_basename.txt ]] ; then
    ln -sf $abs_src_dir/transcription/$file_basename.txt $abs_tgt_dir/transcription || exit 1
  else
    # BUGFIX: this message reported the audio/.sph path instead of the
    # missing transcription file.
    echo "File $abs_src_dir/transcription/$file_basename.txt does not exist!"
    exit 1
  fi
done

Просмотреть файл

@ -0,0 +1,30 @@
#!/bin/bash
# Filter a lexicon down to the words that actually occur in a set of
# transcriptions: collect the unique tokens from all *.txt transcripts
# (skipping pure time-mark lines such as "[12.345]"), then keep only the
# lexicon entries whose head word (first TAB-separated field) occurs there.
#
# Usage: <script> <transcription-dir> <input-lexicon> <output-lexicon>
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
# BUGFIX: these were written as "$transcriptions=$1" etc.; the leading '$'
# expands the (empty) variable so the shell tries to execute "=<arg>" as a
# command instead of performing an assignment.
transcriptions=$1
input_lexicon_file=$2
output_lexicon_file=$3
(
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
) | sort -u | awk '
BEGIN {
  # Load the lexicon (ARGV[2]) into an array keyed by head word.
  while(( getline line< ARGV[2] ) > 0 ) {
    split(line, e, "\t")
    LEXICON[ e[1] ]=line
  }
  FILENAME="-"
  i=0
  # Read the sorted word list (ARGV[1] == "-", i.e. stdin) and emit the
  # lexicon entry for every word that has one.
  while(( getline word< ARGV[1] ) > 0 ) {
    if (word in LEXICON)
      print LEXICON[word]
  }
}
' - $input_lexicon_file | sort -u > $output_lexicon_file

384
egs/babel/s5/run-limited-chk.sh Executable file
Просмотреть файл

@ -0,0 +1,384 @@
#!/bin/bash
# Checkpointed limited-language-pack Babel run: data subsetting and
# configuration, followed (further down the script) by the full
# mono -> tri -> SGMM -> MMI training pipeline.  Each major step is wrapped
# in the CHK checkpointing helper so a failed run can be resumed.
# System and data directories
#SCRIPT=$(readlink $0)
#SysDir=`dirname $SCRIPT`
SysDir=`pwd`
echo $SysDir
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="-oov <unk>"
# Scoring protocols (dummy GLM file to appease the scoring script)
glmFile=`readlink -f ./conf/glm`
# Include the checkpointing facility
. ./local/CHECKPOINT.sh
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
[ -f ./cmd.sh ] && . ./cmd.sh; # source train and decode cmds.
. parse_options.sh || exit 1;
# The single positional argument is a config file defining (at least)
# train_data_dir/train_data_list, dev_data_dir/dev_data_list and the
# numLeaves*/numGauss* model sizes; local.conf may override it.
configfile=$1
[ -f $configfile ] && . $configfile
[ -f ./local.conf ] && . ./local.conf
#Preparing dev and train directories
if test -f $train_data_list ; then
echo ---------------------------------------------------------------------
echo "Subsetting the TRAIN set"
echo ---------------------------------------------------------------------
CHK local/make_corpus_subset.sh $train_data_dir $train_data_list ./data/raw_train_data || exit 1
train_data_dir=`readlink -f ./data/raw_train_data`
# Cap the number of parallel jobs at the number of corpus files.
nj_max=`cat $train_data_list | wc -l`
if [[ "$nj_max" -lt "$train_nj" ]] ; then
echo "The maximum reasonable number of jobs is $nj_max (you have $train_nj)! (The training and decoding process has file-granularity)"
train_nj=$nj_max
fi
fi
if test -f $dev_data_list ; then
echo ---------------------------------------------------------------------
echo "Subsetting the DEV set"
echo ---------------------------------------------------------------------
CHK local/make_corpus_subset.sh $dev_data_dir $dev_data_list ./data/raw_dev_data || exit 1
dev_data_dir=`readlink -f ./data/raw_dev_data`
nj_max=`cat $dev_data_list | wc -l`
if [[ "$nj_max" -lt "$decode_nj" ]] ; then
echo "The maximum reasonable number of jobs is $nj_max "
echo "you have $decode_nj! (The training and decoding process has file-granularity)"
decode_nj=$nj_max
fi
fi
if [[ $filter_lexicon ]]; then
echo ---------------------------------------------------------------------
echo "Subsetting the LEXICON"
echo ---------------------------------------------------------------------
lexicon_dir=./data/raw_lex_data
mkdir -p $lexicon_dir
# NOTE(review): make_corpus_subset.sh takes (src-dir, filelist, out-dir);
# here it is passed (transcription-dir, lexicon, output-file) -- verify this
# is the intended helper for lexicon filtering.
CHK local/make_corpus_subset.sh $train_data_dir/transcriptions \
$lexicon_file $lexicon_dir/lexicon.txt || exit 1
lexicon_file=$lexicon_dir/lexicon.txt
fi
# ---- Lexicon, lang dir, train/dev data lists, STM and language model ----
echo ---------------------------------------------------------------------
echo "Preparing lexicon in data/local on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/local
CHK local/prepare_lexicon.pl \
$lexiconFlags $lexicon_file data/local || exit 1
echo ---------------------------------------------------------------------
echo "Creating L.fst etc in data/lang on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/lang
CHK utils/prepare_lang.sh \
--share-silence-phones true \
data/local $oovSymbol data/local/tmp.lang data/lang || exit 1
echo ---------------------------------------------------------------------
echo "Preparing acoustic training lists in data/train on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/train
# Out-of-vocabulary utterances (tolerating fragment markers -*~) are
# skipped and recorded in skipped_utts.log.
CHK local/prepare_acoustic_training_data.pl \
--vocab data/local/lexicon.txt --fragmentMarkers \-\*\~ \
$train_data_dir data/train > data/train/skipped_utts.log || exit 1
echo ---------------------------------------------------------------------
echo "Preparing dev data lists in data/dev on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/dev
CHK local/prepare_acoustic_training_data.pl \
--fragmentMarkers \-\*\~ \
$dev_data_dir data/dev > data/dev/skipped_utts.log || exit 1
echo ---------------------------------------------------------------------
echo "Preparing dev stm files in data/dev on" `date`
echo ---------------------------------------------------------------------
CHK local/prepare_stm.pl --fragmentMarkers \-\*\~ data/dev || exit 1
test -f "$glmFile" || exit 1
cp $glmFile data/dev/glm || exit 1
echo ---------------------------------------------------------------------
echo "Creating a basic G.fst in data/lang on" `date`
echo ---------------------------------------------------------------------
# We will simply override the default G.fst by the G.fst generated using SRILM
CHK local/train_lms_srilm.sh data data/srilm
CHK local/arpa2G.sh data/srilm/lm.gz data/lang data/lang
# Keyword-search setup/search steps exist but are not enabled yet here.
#CHK local/kws_setup.sh $ecf_file $kwlist_file $rttm_file data/lang data/dev || exit 1
#CHK local/kws_search.sh data/lang data/dev exp/sgmm5/decode_fmllr
cd $SysDir
# ---- PLP feature extraction for train and dev ----
echo ---------------------------------------------------------------------
echo "Starting plp feature extraction in plp on" `date`
echo ---------------------------------------------------------------------
CHK steps/make_plp.sh \
--cmd "$train_cmd" --nj $train_nj \
data/train exp/make_plp/train plp || exit 1
CHK steps/compute_cmvn_stats.sh \
data/train exp/make_plp/train plp || exit 1
# In case plp extraction failed on some utterance, delist them
CHK utils/fix_data_dir.sh data/train
CHK steps/make_plp.sh \
--cmd "$train_cmd" --nj $decode_nj \
data/dev exp/make_plp/dev plp || exit 1
CHK steps/compute_cmvn_stats.sh \
data/dev exp/make_plp/dev plp || exit 1
# In case plp extraction failed on some utterance, delist them
CHK utils/fix_data_dir.sh data/dev
mkdir -p exp
# ---- Monophone training on a 5000-utterance subset ----
echo ---------------------------------------------------------------------
echo "Subsetting monophone training data in data/train_sub1 on" `date`
echo ---------------------------------------------------------------------
CHK utils/subset_data_dir.sh data/train 5000 data/train_sub1 || exit 1
echo ---------------------------------------------------------------------
echo "Starting (small) monophone training in exp/mono on" `date`
echo ---------------------------------------------------------------------
CHK steps/train_mono.sh \
--boost-silence 1.5 --nj 8 --cmd "$train_cmd" \
data/train_sub1 data/lang exp/mono || exit 1
# ---- First triphone pass (tri1), then decode; the subshell/background
# ---- machinery is commented out, so decoding runs in the foreground. ----
echo ---------------------------------------------------------------------
echo "Starting (first) triphone training in exp/tri1 on" `date`
echo ---------------------------------------------------------------------
CHK steps/align_si.sh \
--boost-silence 1.5 --nj $(( $train_nj/2 )) --cmd "$train_cmd" \
data/train data/lang exp/mono exp/mono_ali || exit 1
CHK steps/train_deltas.sh \
--boost-silence 1.5 --cmd "$train_cmd" \
$numLeavesTri1 $numGaussTri1 data/train data/lang exp/mono_ali exp/tri1 || exit 1
echo ---------------------------------------------------------------------
echo "Spawning decoding with first triphone models in exp/tri1 on" `date`
echo ---------------------------------------------------------------------
#(
mkdir -p exp/tri1/graph
CHK utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph &> exp/tri1/mkgraph.log
mkdir -p exp/tri1/decode
CHK steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
exp/tri1/graph data/dev exp/tri1/decode &> exp/tri1/decode.log
#) &
#tri1decode=$!; # Grab the PID of the subshell
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/tri1/mkgraph.log and exp/tri1/decode.log for decoding outcomes"
# ---- Second triphone pass (tri2) and foreground decode ----
echo -----------------------------------------------------------------------------
echo "Starting second triphone training in exp/tri2 on" `date`
echo -----------------------------------------------------------------------------
CHK steps/align_si.sh \
--boost-silence 1.5 --nj $(( $train_nj/2 )) --cmd "$train_cmd" \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1
CHK steps/train_deltas.sh \
--boost-silence 1.5 --cmd "$train_cmd" \
$numLeavesTri2 $numGaussTri2 data/train data/lang exp/tri1_ali exp/tri2 || exit 1
echo -----------------------------------------------------------------------------
echo "Spawning decoding with triphone models in exp/tri2 on" `date`
echo -----------------------------------------------------------------------------
#(
mkdir -p exp/tri2/graph
CHK utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph &> exp/tri2/mkgraph.log
mkdir -p exp/tri2/decode
CHK steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
exp/tri2/graph data/dev exp/tri2/decode &> exp/tri2/decode.log
#) &
#tri2decode=$!; # Grab the PID of the subshell
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/tri2/mkgraph.log and exp/tri2/decode.log for decoding outcomes"
# ---- LDA+MLLT triphone pass (tri3) ----
echo ---------------------------------------------------------------------------------
echo "Starting (lda_mllt) triphone training in exp/tri4 on" `date`
echo ---------------------------------------------------------------------------------
CHK steps/align_si.sh \
--boost-silence 1.5 --nj $train_nj --cmd "$train_cmd" \
data/train data/lang exp/tri2 exp/tri2_ali || exit 1
CHK steps/train_lda_mllt.sh \
--boost-silence 1.5 --cmd "$train_cmd" \
$numLeavesMLLT $numGaussMLLT data/train data/lang exp/tri2_ali exp/tri3 || exit 1
echo ----------------------------------------------------------------------------------
echo "Spawning decoding with lda_mllt models in exp/tri3 on" `date`
echo ----------------------------------------------------------------------------------
#(
mkdir -p exp/tri3/graph
# NOTE(review): unlike every other step, these two tri3 commands are not
# wrapped in CHK -- verify whether they were meant to be checkpointed too.
utils/mkgraph.sh \
data/lang exp/tri3 exp/tri3/graph &> exp/tri3/mkgraph.log
mkdir -p exp/tri3/decode
steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
exp/tri3/graph data/dev exp/tri3/decode &> exp/tri3/decode.log
#) &
#tri3decode=$!; # Grab the PID of the subshell
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/tri3/mkgraph.log and exp/tri3/decode.log for decoding outcomes"
# ---- Speaker-adapted (SAT) triphone pass (tri4); the decode.started /
# ---- decode.finished marker files gate the later SGMM2 decoding. ----
echo ----------------------------------------------------------------------------
echo "Starting (SAT) triphone training in exp/tri4 on" `date`
echo ----------------------------------------------------------------------------
CHK steps/align_si.sh \
--boost-silence 1.5 --nj $train_nj --cmd "$train_cmd" \
data/train data/lang exp/tri3 exp/tri3_ali || exit 1
CHK steps/train_sat.sh \
--boost-silence 1.5 --cmd "$train_cmd" \
$numLeavesSAT $numGaussSAT data/train data/lang exp/tri3_ali exp/tri4 || exit 1
echo ------------------------------------------------------------------
echo "Spawning decoding with SAT models on" `date`
echo ------------------------------------------------------------------
#(
mkdir -p exp/tri4/graph
CHK utils/mkgraph.sh \
data/lang exp/tri4 exp/tri4/graph &> exp/tri4/mkgraph.log
mkdir -p exp/tri4/decode
CHK touch exp/tri4/decode.started # A signal to the SGMM2 decoding step
CHK steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" \
exp/tri4/graph data/dev exp/tri4/decode &> exp/tri4/decode.log
CHK touch exp/tri4/decode.finished # so SGMM2 decoding may proceed
#) &
#tri4decode=$!; # Grab the PID of the subshell; needed for SGMM2 decoding
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/tri4/mkgraph.log and exp/tri4/decode.log for decoding outcomes"
################################################################################
# Ready to start SGMM training
################################################################################
echo -------------------------------------------------
echo "Starting exp/ubm5 on" `date`
echo -------------------------------------------------
CHK steps/align_fmllr.sh --boost-silence 1.5 --nj $train_nj --cmd "$train_cmd" \
data/train data/lang exp/tri4 exp/tri4_ali || exit 1
CHK steps/train_ubm.sh --cmd "$train_cmd" \
$numGaussUBM data/train data/lang exp/tri4_ali exp/ubm5 || exit 1
echo --------------------------------------------------
echo "Starting exp/sgmm5 on" `date`
echo --------------------------------------------------
CHK steps/train_sgmm2.sh --cmd "$train_cmd" \
$numLeavesSGMM $numGaussSGMM data/train data/lang exp/tri4_ali exp/ubm5/final.ubm exp/sgmm5 || exit 1
################################################################################
# Ready to decode with SGMM2 models
################################################################################
echo -----------------------------------------------------------------
echo "Spawning exp/sgmm5/decode[_fmllr] on" `date`
echo -----------------------------------------------------------------
# NOTE(review): $tri4decode is only assigned in commented-out code above, so
# this message always prints an empty PID; everything runs in the foreground.
echo "exp/sgmm5/decode will wait on PID $tri4decode if necessary"
#wait $tri4decode; # Need lattices from the corresponding SGMM decoding passes
#(
# sleep 5; # Let the status message after the subshell get logged
## The next (now commented) block should ensure we starting decoding of sgmm5 only after
## the tri4 decoding finishes. The same can be achieved by "wait"int for tri4decode pid
# while [ ! -f exp/tri4/decode.started -o ! -f exp/tri4/decode.finished ]; do
# echo "exp/sgmm5/decode is waiting on SAT decoding ..." `date`
# sleep 5
# done
# while [ exp/tri4/decode.finished -ot exp/tri4/decode.started ]; do
# echo "exp/tri4/decode.finished is older than exp/tri4/decode.started"; \
# ls -lt exp/tri4/decode.finished exp/tri4/decode.started; \
# echo "Perhaps SAT decoding was restarted and is still running?"; \
# echo "exp/sgmm5/decode is still waiting on SAT decoding ..." `date`
# sleep 5
# done
# rm exp/tri4/decode.started exp/tri4/decode.finished
mkdir -p exp/sgmm5/graph
CHK utils/mkgraph.sh \
data/lang exp/sgmm5 exp/sgmm5/graph &> exp/sgmm5/mkgraph.log
mkdir -p exp/sgmm5/decode
# SGMM2 decoding reuses the fMLLR transforms from the tri4 (SAT) decode.
CHK steps/decode_sgmm2.sh \
--nj $decode_nj --cmd "$decode_cmd" --transform-dir exp/tri4/decode \
exp/sgmm5/graph data/dev/ exp/sgmm5/decode &> exp/sgmm5/decode.log
CHK steps/decode_sgmm2.sh --use-fmllr true --nj $decode_nj --cmd "$decode_cmd" \
--transform-dir exp/tri4/decode \
exp/sgmm5/graph data/dev/ exp/sgmm5/decode_fmllr &> exp/sgmm5/decode_fmllr.log
#) &
#sgmm5decode=$!; # Grab the PID of the subshell; needed for MMI rescoring
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/sgmm5/mkgraph.log, exp/sgmm5/decode.log and exp/sgmm5/decode_fmllr.log for decoding outcomes"
################################################################################
# Ready to start discriminative SGMM training
################################################################################
echo ------------------------------------------------------
echo "Starting exp/sgmm5_ali on" `date`
echo ------------------------------------------------------
CHK steps/align_sgmm2.sh \
--nj $train_nj --cmd "$train_cmd" --transform-dir exp/tri4_ali --use-graphs true --use-gselect true \
data/train data/lang exp/sgmm5 exp/sgmm5_ali || exit 1
echo ----------------------------------------------------------
echo "Starting exp/sgmm5_denlats on" `date`
echo ----------------------------------------------------------
CHK steps/make_denlats_sgmm2.sh \
--nj $train_nj --sub-split $train_nj \
--beam 10.0 --lattice-beam 6 --cmd "$decode_cmd" --transform-dir exp/tri4_ali \
data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats || exit 1
# Two boosted-MMI models are trained, with boost 0.1 and 0.2.
echo -----------------------------------------------------------
echo "Starting exp/sgmm5_mmi_b0.1 on" `date`
echo -----------------------------------------------------------
CHK steps/train_mmi_sgmm2.sh \
--cmd "$decode_cmd" --transform-dir exp/tri4_ali --boost 0.1 \
data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \
exp/sgmm5_mmi_b0.1 || exit 1
CHK steps/train_mmi_sgmm2.sh \
--cmd "$decode_cmd" --transform-dir exp/tri4_ali --boost 0.2 \
data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \
exp/sgmm5_mmi_b0.2 || exit 1
################################################################################
# Ready to decode with discriminative SGMM2 models
################################################################################
echo "exp/sgmm5_mmi_b0.1/decode will wait on PID $sgmm5decode if necessary"
# NOTE(review): $sgmm5decode is only set in commented-out code; with an
# empty argument this `wait` waits for all background jobs (there are none).
wait $sgmm5decode; # Need lattices from the corresponding SGMM decoding passes
echo --------------------------------------------------------------------------
echo "Starting exp/sgmm5_mmi_b0.1/decode[_fmllr] on" `date`
echo --------------------------------------------------------------------------
# Rescore the SGMM5 lattices with each MMI model at training iterations 1-4.
for iter in 1 2 3 4; do
CHK steps/decode_sgmm2_rescore.sh \
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
data/lang data/dev exp/sgmm5/decode exp/sgmm5_mmi_b0.1/decode_it$iter
CHK steps/decode_sgmm2_rescore.sh \
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
data/lang data/dev exp/sgmm5/decode_fmllr exp/sgmm5_mmi_b0.1/decode_fmllr_it$iter
CHK steps/decode_sgmm2_rescore.sh \
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
data/lang data/dev exp/sgmm5/decode exp/sgmm5_mmi_b0.2/decode_it$iter
CHK steps/decode_sgmm2_rescore.sh \
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
data/lang data/dev exp/sgmm5/decode_fmllr exp/sgmm5_mmi_b0.2/decode_fmllr_it$iter
done
wait
# No need to wait on $tri4decode ---> $sgmm5decode ---> sgmm5_mmi_b0.1decode
echo -----------------------------------------------------
echo "Finished successfully on" `date`
echo -----------------------------------------------------
exit 0

Просмотреть файл

@ -99,6 +99,7 @@ if [[ $filter_lexicon ]]; then
lexicon_file=$lexicon_dir/lexicon.txt
fi
echo ---------------------------------------------------------------------
echo "Preparing lexicon in data/local on" `date`
echo ---------------------------------------------------------------------
@ -144,6 +145,9 @@ echo ---------------------------------------------------------------------
local/train_lms_srilm.sh data data/srilm
local/arpa2G.sh data/srilm/lm.gz data/lang data/lang
#local/kws_setup.sh $ecf_file $kwlist_file $rttm_file data/lang data/dev || exit 1
#local/kws_search.sh data/lang data/dev exp/sgmm5/decode_fmllr
cd $SysDir
echo ---------------------------------------------------------------------
echo "Starting plp feature extraction in plp on" `date`