Mirror of https://github.com/mozilla/kaldi.git

(trunk) Reintegrate the sandbox/pitch back to trunk

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3252 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8

Commit f2aa400da5
@ -0,0 +1,44 @@
How to set up the BABEL database training environment
====================================================
a) Preparation: you need to make sure the BABEL data and the F4DE scoring software
are set up as they are at JHU, or change this setup accordingly. This will probably
be hard and will involve some trial and error. Some relevant pathnames can be
found in conf/lang/* and ./path.sh

Link one of the config files in conf/languages to ./lang.conf. E.g.:
ln -s conf/languages/105-turkish-limitedLP.official.conf lang.conf


b) If you plan to work on one or more languages, the following approach is
advised (a sketch follows this list):
aa) create an empty directory somewhere, wherever you choose
ab) symlink all the directories from here into that directory
ac) copy cmd.sh and path.sh (you will probably need to make some changes to these)
ad) link the necessary scripts (see below)
ae) link the appropriate language-specific config file to lang.conf in
each directory.

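For concreteness, a minimal sketch of steps aa)-ae) for a single language; the
working directory, the recipe path and the chosen config file below are only
examples, not prescribed by the recipe:

# aa) create an empty working directory
mkdir -p ~/expts/turkish-limitedLP && cd ~/expts/turkish-limitedLP
# ab) symlink the recipe's directories here (the recipe path is an example)
for d in /path/to/kaldi/egs/babel/s5/*/ ; do ln -s "$d" . ; done
# ac) copy cmd.sh and path.sh, then edit them as needed
cp /path/to/kaldi/egs/babel/s5/cmd.sh /path/to/kaldi/egs/babel/s5/path.sh .
# ad) link the run scripts
ln -s /path/to/kaldi/egs/babel/s5/run-*.sh .
# ae) pick the language-specific config
ln -s conf/languages/105-turkish-limitedLP.official.conf lang.conf
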
Running the training scripts
===================================================

Run the scripts in order, i.e.:
run-1-main.sh
run-2a-nnet.sh and run-2-bnf.sh may be run in parallel, but run-2-bnf.sh should be
run on a machine that has a GPU.
run-3-bnf-system.sh trains an SGMM system on top of the bottleneck features from run-2-bnf.sh
run-4-test.sh decodes with the provided segmentation (we get this from CMU)
run-5-anydecode.sh seems to decode with whatever segmentation is provided

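As a sketch, the whole sequence might be driven as below; backgrounding
run-2a-nnet.sh is optional, and the error handling here is minimal:

./run-1-main.sh || exit 1
./run-2a-nnet.sh &              # can run in parallel with the BNF training
./run-2-bnf.sh || exit 1        # run this one on a machine with a GPU
wait                            # let run-2a-nnet.sh finish
./run-3-bnf-system.sh || exit 1
./run-4-test.sh || exit 1
./run-5-anydecode.sh || exit 1
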

Official NIST submission preparation
==================================================
The make_release.sh script might come in handy.
The script evaluates the performance of the sgmm2_mmi_b.0.1 system on
the eval.uem dataset and chooses the same set of parameters to
determine the path inside the test.uem dataset.

./make_release.sh --relname defaultJHU --lp FullLP --lr BaseLR --ar NTAR \
    conf/languages/106-tagalog-fullLP.official.conf /export/babel/data/releases
@ -0,0 +1,8 @@
The results are by default to be found in <your-decode_directory>/decode_*, where
each individual <your-decode_directory>/decode_* directory corresponds to one
language model weight.

An aesthetically pleasing table with the results can be obtained, for example,
like this (YMMV, as may your aesthetic feelings):
find exp/sgmm5_mmi_b0.1 -name "*.ctm.sys" -not -name "*char.ctm.sys" -ipath "*fmllr_eval.pem*" | xargs grep 'Sum/Avg' | sed 's/:* *| */ /g' | sed 's/  */ /g' | sort -n -k 9 | column -t

Similarly, for the KWS outputs, the same kind of table can be obtained with:
find exp/sgmm5_mmi_b0.1 -name "sum.txt" -ipath "*fmllr_eval.pem*" | xargs grep "| Occurrence" | cut -f 1,13 -d '|' | sed 's/:|//g' | column -t | sort -k 2 -n -r
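
If this gets run often, the WER one-liner can be wrapped in a tiny helper
script; the script name and the default experiment directory and partition
below are illustrative, not part of the recipe:

#!/bin/bash
# show_wer.sh [exp-dir] [partition]  e.g.: ./show_wer.sh exp/sgmm5_mmi_b0.1 fmllr_eval.pem
exp_dir=${1:-exp/sgmm5_mmi_b0.1}
part=${2:-fmllr_eval.pem}
find "$exp_dir" -name "*.ctm.sys" -not -name "*char.ctm.sys" -ipath "*${part}*" \
  | xargs grep 'Sum/Avg' \
  | sed 's/:* *| */ /g' | sed 's/  */ /g' \
  | sort -n -k 9 | column -t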
@ -0,0 +1,29 @@
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of CPUs on your machine).

#a) JHU cluster options
export train_cmd="queue.pl -l arch=*64"
export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"

#export cuda_cmd="..."


#b) BUT cluster options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"

#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"

#c) run it locally...
#export train_cmd=run.pl
#export decode_cmd=run.pl
#export cuda_cmd=run.pl
#export mkgraph_cmd=run.pl
@ -0,0 +1,60 @@
#################################################
## PTDNN - Python Toolkit for Deep Neural Network
## Author: Yajie Miao
#################################################

import os
import sys

from utils.learn_rates import LearningRateExpDecay


class BnfExpConfig(object):

    def __init__(self):

        # working directory; by default, the pfiles should be here
        self.wdir = "WORK/"
        self.pretrain_data = self.wdir + 'train.pfile.gz'   # pretraining data
        self.pretrain_output = self.wdir + "rbm.ptr"        # pretraining output

        # finetuning data
        self.finetune_train_data = self.wdir + 'train.pfile.gz'  # finetune training data
        self.finetune_valid_data = self.wdir + 'valid.pfile.gz'  # finetune validation data
        self.finetune_output = self.wdir + "final.nnet.raw"      # finetune output
        self.nnet_kaldi_fmt = self.wdir + "final.nnet"

        # global config for nnet topo
        self.n_ins = 250      # size of input data
        self.n_outs = N_OUTS  # number of output targets.. we'll replace this with
                              # the correct number when we move this to the right place.
        self.hidden_layers_sizes = [1024, 1024, 1024, 1024, 1024, 42, 1024]  # hidden layer sizes
        self.bnf_layer_index = 6     # the index of the bottleneck layer
        self.pretrain_layer_num = 5  # number of hidden layers to be pretrained

        # global config for data
        self.shuffle = True
        self.chunk_size = '200m'

        # pretraining batch size
        self.pretrain_batch_size = 128  # batch size in pretraining

        # pretraining schedule
        self.pretrain_gbrbm_lr = 0.005   # learning rate for Gaussian-Bernoulli RBM
        self.pretrain_rbm_lr = 0.08      # learning rate for Bernoulli-Bernoulli RBM
        self.initial_momentum = 0.5      # initial momentum
        self.final_momentum = 0.9        # final momentum
        self.initial_momentum_epoch = 2  # for how many epochs we use initial_momentum
        self.pretraining_epochs = 4      # total epochs

        # finetuning batch size
        self.finetune_batch_size = 256  # batch size for finetuning

        # finetuning schedule
        self.finetune_momentum = 0.5  # momentum for finetuning
        self.lrate = LearningRateExpDecay(start_rate=0.04,  # starting learning rate
                                          scale_by=0.5,     # decaying factor in ramping
                                          max_epochs=1000,  # epoch limit, in practice never reached
                                          min_derror_ramp_start=0.01,  # min validation error difference to trigger ramping
                                          min_derror_stop=0.01,  # min validation error difference to stop finetuning, after ramping
                                          init_error=100)
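The N_OUTS placeholder above is meant to be filled in by the calling script, as
the comment says. A hypothetical substitution step in shell (the real
run-2-bnf.sh may do this differently; the alignment-model path and destination
below are just examples):

# number of DNN output targets = number of pdfs in the alignment model
num_targets=$(gmm-info exp/tri5_ali/final.mdl | grep 'number of pdfs' | awk '{print $NF}')
sed "s/N_OUTS/${num_targets}/" bnf_config.py > WORK/bnf_config.py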
@ -0,0 +1,61 @@
#################################################
## PTDNN - Python Toolkit for Deep Neural Network
## Author: Yajie Miao
#################################################

import os
import sys

from utils.learn_rates import LearningRateExpDecay


class BnfExpConfig(object):

    def __init__(self):

        # working directory; by default, the pfiles should be here
        self.wdir = "WORK/"  # Note: we'll replace CWD with the current directory
                             # when we move this to the right place.
        self.pretrain_data = self.wdir + 'train.pfile.gz'   # pretraining data
        self.pretrain_output = self.wdir + "rbm.ptr"        # pretraining output

        # finetuning data
        self.finetune_train_data = self.wdir + 'train.pfile.gz'  # finetune training data
        self.finetune_valid_data = self.wdir + 'valid.pfile.gz'  # finetune validation data
        self.finetune_output = self.wdir + "final.nnet.raw"      # finetune output
        self.nnet_kaldi_fmt = self.wdir + "final.nnet"

        # global config for nnet topo
        self.n_ins = 250      # size of input data
        self.n_outs = N_OUTS  # number of output targets.. we'll replace this with
                              # the correct number when we move this to the right place.
        self.hidden_layers_sizes = [1024, 1024, 1024, 1024, 42, 1024]  # hidden layer sizes
        self.bnf_layer_index = 5     # the index of the bottleneck layer
        self.pretrain_layer_num = 4  # number of hidden layers to be pretrained

        # global config for data
        self.shuffle = True
        self.chunk_size = '200m'

        # pretraining batch size
        self.pretrain_batch_size = 128  # batch size in pretraining

        # pretraining schedule
        self.pretrain_gbrbm_lr = 0.005   # learning rate for Gaussian-Bernoulli RBM
        self.pretrain_rbm_lr = 0.08      # learning rate for Bernoulli-Bernoulli RBM
        self.initial_momentum = 0.5      # initial momentum
        self.final_momentum = 0.9        # final momentum
        self.initial_momentum_epoch = 5  # for how many epochs we use initial_momentum
        self.pretraining_epochs = 10     # total epochs

        # finetuning batch size
        self.finetune_batch_size = 256  # batch size for finetuning

        # finetuning schedule
        self.finetune_momentum = 0.5  # momentum for finetuning
        self.lrate = LearningRateExpDecay(start_rate=0.08,  # starting learning rate
                                          scale_by=0.5,     # decaying factor in ramping
                                          max_epochs=1000,  # epoch limit, in practice never reached
                                          min_derror_ramp_start=0.01,  # min validation error difference to trigger ramping
                                          min_derror_stop=0.01,  # min validation error difference to stop finetuning, after ramping
                                          init_error=100)
@ -0,0 +1,47 @@
# DNN hybrid system training parameters
dnn_mixup=12000
dnn_num_jobs=16
dnn_initial_learning_rate=0.01
dnn_final_learning_rate=0.001
dnn_num_parameters=7000000
dnn_num_hidden_layers=4
dnn_mem_reqs="mem_free=2.0G,ram_free=0.5G"
dnn_extra_opts=

bnf_every_nth_frame=2 # take every 2nd frame.
babel_type=full

use_pitch=false

lmwt_plp_extra_opts=( --min-lmwt 8 --max-lmwt 12 )
lmwt_bnf_extra_opts=( --min-lmwt 13 --max-lmwt 18 )
lmwt_dnn_extra_opts=( --min-lmwt 8 --max-lmwt 12 )

icu_opt=(--use-icu true --icu-transform Any-Lower)

if [[ `hostname` == *.tacc.utexas.edu ]] ; then
  dnn_train_extra_opts=(--num-threads 6 --parallel-opts "-pe smp 6" --num-jobs-nnet 16 --stage 0)
  decode_extra_opts=( --num-threads 4 --parallel-opts "-pe smp 4" )
  sgmm_train_extra_opts=( )
  sgmm_group_extra_opts=( --num_iters 25 )
  sgmm_denlats_extra_opts=( --num-threads 2 )
else
  dnn_train_extra_opts=(--num-threads 8 --parallel-opts "-pe smp 7" --cmd "queue.pl -l arch=*64,mem_free=4.0G,ram_free=0.75G" )
  decode_extra_opts=(--num-threads 6 --parallel-opts "-pe smp 6 -l mem_free=4G,ram_free=0.7G")
  sgmm_train_extra_opts=( --num_iters 25 )
  sgmm_group_extra_opts=(--group 3 --parallel-opts "-pe smp 3 -l mem_free=7G,ram_free=2.75G" --cmd "queue.pl -l arch=*64 -l mem_free=3.0G,ram_free=3.0G")
  sgmm_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=0.8G")
  sgmm_mmi_extra_opts=(--cmd "queue.pl -l arch=*64 -l mem_free=3.2G,ram_free=3.2G")
fi

icu_transform="Any-Lower"
case_insensitive=true


max_states=150000
wip=0.5

use_pitch=false
use_ffv=false

phoneme_mapping=
@ -0,0 +1,47 @@
# DNN hybrid system training parameters
dnn_mixup=5000
dnn_num_jobs=8
dnn_initial_learning_rate=0.015
dnn_final_learning_rate=0.002
dnn_num_parameters=1500000
dnn_num_hidden_layers=2
dnn_mem_reqs="mem_free=1.0G,ram_free=0.2G"
dnn_extra_opts="--num_epochs 20 --num-epochs-extra 10 --add-layers-period 1 --shrink-interval 3"

bnf_every_nth_frame=1 # take all frames.
babel_type=limited

use_pitch=false

lmwt_plp_extra_opts=( --min-lmwt 8 --max-lmwt 12 )
lmwt_bnf_extra_opts=( --min-lmwt 13 --max-lmwt 18 )
lmwt_dnn_extra_opts=( --min-lmwt 8 --max-lmwt 12 )

icu_opt=(--use-icu true --icu-transform Any-Lower)

if [[ `hostname` == *.tacc.utexas.edu ]] ; then
  dnn_train_extra_opts=(--num-threads 6 --parallel-opts "-pe smp 6" --num-jobs-nnet 16 --stage 0)
  decode_extra_opts=( --num-threads 4 --parallel-opts "-pe smp 4" )
  sgmm_train_extra_opts=( )
  sgmm_group_extra_opts=( )
  sgmm_denlats_extra_opts=( --num-threads 1 )
else
  dnn_train_extra_opts=(--num-threads 8 --parallel-opts "-pe smp 7" --cmd "queue.pl -l arch=*64,mem_free=4.0G,ram_free=0.75G" )
  decode_extra_opts=(--num-threads 6 --parallel-opts "-pe smp 6 -l mem_free=4G,ram_free=0.7G")
  sgmm_train_extra_opts=( --num-iters 25 )
  sgmm_group_extra_opts=(--group 3 --parallel-opts "-pe smp 3 -l mem_free=7G,ram_free=2.3G" --cmd "queue.pl -l arch=*64 -l mem_free=2.0G,ram_free=2.0G")
  sgmm_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=0.8G")
  sgmm_mmi_extra_opts=(--cmd "queue.pl -l arch=*64 -l mem_free=1.5G,ram_free=1.5G")
fi

icu_transform="Any-Lower"
case_insensitive=true


max_states=150000
wip=0.5

use_pitch=false
use_ffv=false

phoneme_mapping=
@ -0,0 +1,13 @@
#keyword search default
glmFile=conf/glm
duptime=0.5
case_insensitive=false
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="-oov <unk>"
boost_sil=1.5 # note from Dan: I expect 1.0 might be better (equivalent to not
              # having the option)... should test.
cer=0

[ -f ./path.sh ] && . ./path.sh; # source the path.
[ -f ./cmd.sh ] && . ./cmd.sh; # source train and decode cmds.
@ -0,0 +1,13 @@
;;
;; File: ma970904.glm
;; Desc: This file contains the transcript filtering rules for the ARPA
;;       Mandarin Hub5-NE Evaluation.
;;
;; Date: 970904
;;       - initial creation
;;
;; Hesitation mappings
<hes> => %HESITATION / [ ] __ [ ]
<v-noise> => %HESITATION / [ ] __ [ ]
<noise> => %HESITATION / [ ] __ [ ]

@ -0,0 +1,81 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/101-cantonese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Cantonese_Babel101/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.3hr.list
dev2h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=20

#Official DEV data files
dev10h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/101-cantonese/release-current/conversational/eval
eval_data_list=/export/babel/data/splits/Cantonese_Babel101/eval.babel101b-v0.4c.list
eval_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-eval.kwlist.xml
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=/export/babel/data/101-cantonese/release-current/conversational/eval
evalpart1_data_list=/export/babel/data/splits/Cantonese_Babel101/evalpart1.babel101b-v0.4c.list
evalpart1_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
evalpart1_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.stm
evalpart1_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1.ecf.xml
evalpart1_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.mitllfa3.rttm
evalpart1_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.annot.kwlist.xml
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=/export/babel/data/splits/Cantonese_Babel101/cantonese.glm
lexicon_file=/export/babel/data/101-cantonese/release-current/conversational/reference_materials/lexicon.txt
cer=1

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
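These per-language configs are plain shell fragments; a run script is assumed
to pick them up through the ./lang.conf symlink, roughly like this
(hypothetical excerpt, the real run-1-main.sh may differ; variable names come
from the config above):

. ./lang.conf || exit 1
[ -d "$train_data_dir" ] || { echo "Missing training data dir: $train_data_dir" >&2; exit 1; }
echo "Training on $(wc -l < "$train_data_list") files with $train_nj parallel jobs"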
@ -0,0 +1,93 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/101-cantonese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Cantonese_Babel101/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.3hr.list
dev2h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=20

#RADICAL DEV8H data files
dev8h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev8h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.7hr.list
dev8h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev8h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev8h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev8h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev8h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev8h_subset_ecf=true
dev8h_nj=32

#Official DEV data files
dev10h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/101-cantonese/release-current/conversational/eval
eval_data_list=/export/babel/data/splits/Cantonese_Babel101/eval.babel101b-v0.4c.list
eval_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-eval.kwlist.xml
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=/export/babel/data/101-cantonese/release-current/conversational/eval
evalpart1_data_list=/export/babel/data/splits/Cantonese_Babel101/evalpart1.babel101b-v0.4c.list
evalpart1_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
evalpart1_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.stm
evalpart1_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1.ecf.xml
evalpart1_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.mitllfa3.rttm
evalpart1_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.annot.kwlist.xml
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true
use_ffv=true

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=/export/babel/data/splits/Cantonese_Babel101/cantonese.glm
lexicon_file=/export/babel/data/101-cantonese/release-babel101b-v0.4c_sub-train1/conversational/reference_materials/lexicon.sub-train1.txt
cer=1

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,93 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/101-cantonese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Cantonese_Babel101/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.3hr.list
dev2h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=20

#RADICAL DEV8H data files
dev8h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev8h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.7hr.list
dev8h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev8h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev8h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev8h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev8h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev8h_subset_ecf=true
dev8h_nj=32

#Official DEV data files
dev10h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/101-cantonese/release-current/conversational/eval
eval_data_list=/export/babel/data/splits/Cantonese_Babel101/eval.babel101b-v0.4c.list
eval_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-eval.kwlist.xml
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=/export/babel/data/101-cantonese/release-current/conversational/eval
evalpart1_data_list=/export/babel/data/splits/Cantonese_Babel101/evalpart1.babel101b-v0.4c.list
evalpart1_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
evalpart1_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.stm
evalpart1_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1.ecf.xml
evalpart1_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.mitllfa3.rttm
evalpart1_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.annot.kwlist.xml
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=false
use_ffv=true

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=/export/babel/data/splits/Cantonese_Babel101/cantonese.glm
lexicon_file=/export/babel/data/101-cantonese/release-babel101b-v0.4c_sub-train1/conversational/reference_materials/lexicon.sub-train1.txt
cer=1

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,92 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/101-cantonese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Cantonese_Babel101/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.3hr.list
dev2h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=20

#RADICAL DEV8H data files
dev8h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev8h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.7hr.list
dev8h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev8h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev8h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev8h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev8h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev8h_subset_ecf=true
dev8h_nj=32

#Official DEV data files
dev10h_data_dir=/export/babel/data/101-cantonese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Cantonese_Babel101/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev/babel101b-v0.4c_conv-dev.mitllfa2.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-dev.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/101-cantonese/release-current/conversational/eval
eval_data_list=/export/babel/data/splits/Cantonese_Babel101/eval.babel101b-v0.4c.list
eval_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-eval.kwlist.xml
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=/export/babel/data/101-cantonese/release-current/conversational/eval
evalpart1_data_list=/export/babel/data/splits/Cantonese_Babel101/evalpart1.babel101b-v0.4c.list
evalpart1_data_cmudb=/export/babel/data/splits/Cantonese_Babel101/uem/db-v8-utt.dat
evalpart1_stm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.stm
evalpart1_ecf_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1.ecf.xml
evalpart1_rttm_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.mitllfa3.rttm
evalpart1_kwlist_file=/export/babel/data/scoring/IndusDB/babel101b-v0.4c_conv-evalpart1/babel101b-v0.4c_conv-evalpart1.annot.kwlist.xml
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=/export/babel/data/splits/Cantonese_Babel101/cantonese.glm
lexicon_file=/export/babel/data/101-cantonese/release-babel101b-v0.4c_sub-train1/conversational/reference_materials/lexicon.sub-train1.txt
cer=1

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,72 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/102-assamese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Assamese_Babel102/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=24

#Official DEV data files
dev10h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true
use_ffv=true



lexicon_file=/export/babel/data/102-assamese/release-current/conversational/reference_materials/lexicon.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,72 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/102-assamese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Assamese_Babel102/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=24

#Official DEV data files
dev10h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=false
use_ffv=true



lexicon_file=/export/babel/data/102-assamese/release-current/conversational/reference_materials/lexicon.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,72 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/102-assamese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Assamese_Babel102/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=24

#Official DEV data files
dev10h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=false
use_ffv=false



lexicon_file=/export/babel/data/102-assamese/release-current/conversational/reference_materials/lexicon.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,72 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/102-assamese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Assamese_Babel102/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=24

#Official DEV data files
dev10h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true
use_ffv=false



lexicon_file=/export/babel/data/102-assamese/release-current/conversational/reference_materials/lexicon.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,72 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/102-assamese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Assamese_Babel102/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=24

#Official DEV data files
dev10h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true
use_ffv=true



lexicon_file=/export/babel/data/102-assamese/release-current/conversational/reference_materials/lexicon.sub-train.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,72 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/102-assamese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Assamese_Babel102/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=24

#Official DEV data files
dev10h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=false
use_ffv=true



lexicon_file=/export/babel/data/102-assamese/release-current/conversational/reference_materials/lexicon.sub-train.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,72 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/102-assamese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Assamese_Babel102/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=24

#Official DEV data files
dev10h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=false
use_ffv=false



lexicon_file=/export/babel/data/102-assamese/release-current/conversational/reference_materials/lexicon.sub-train.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,72 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/102-assamese/release-current/conversational/training
train_data_list=/export/babel/data/splits/Assamese_Babel102/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=24

#Official DEV data files
dev10h_data_dir=/export/babel/data/102-assamese/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Assamese_Babel102/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel102b-v0.4_conv-dev/babel102b-v0.4_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true
use_ffv=false



lexicon_file=/export/babel/data/102-assamese/release-current/conversational/reference_materials/lexicon.sub-train.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,72 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/103-bengali/release-current/conversational/training
train_data_list=/export/babel/data/splits/Bengali_Babel103/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/103-bengali/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Bengali_Babel103/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=12

#Official DEV data files
dev10h_data_dir=/export/babel/data/103-bengali/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Bengali_Babel103/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true
use_ffv=true



lexicon_file=/export/babel/data/103-bengali/release-current/conversational/reference_materials/lexicon.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,72 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/103-bengali/release-current/conversational/training
train_data_list=/export/babel/data/splits/Bengali_Babel103/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/103-bengali/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Bengali_Babel103/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=12

#Official DEV data files
dev10h_data_dir=/export/babel/data/103-bengali/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Bengali_Babel103/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true
use_ffv=false



lexicon_file=/export/babel/data/103-bengali/release-current/conversational/reference_materials/lexicon.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,72 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/103-bengali/release-current/conversational/training
train_data_list=/export/babel/data/splits/Bengali_Babel103/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/103-bengali/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Bengali_Babel103/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=12

#Official DEV data files
dev10h_data_dir=/export/babel/data/103-bengali/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Bengali_Babel103/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true
use_ffv=true



lexicon_file=/export/babel/data/103-bengali/release-current/conversational/reference_materials/lexicon.sub-train.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,72 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/103-bengali/release-current/conversational/training
train_data_list=/export/babel/data/splits/Bengali_Babel103/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/103-bengali/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Bengali_Babel103/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=12

#Official DEV data files
dev10h_data_dir=/export/babel/data/103-bengali/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Bengali_Babel103/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel103b-v0.3_conv-dev/babel103b-v0.3_conv-dev.annot.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files (not released yet)
#eval_data_dir=
#eval_data_list=
#eval_data_cmudb=
#eval_ecf_file=
#eval_kwlist_file=
#eval_nj=64

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000


# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true
use_ffv=false



lexicon_file=/export/babel/data/103-bengali/release-current/conversational/reference_materials/lexicon.sub-train.txt
cer=0

max_index_states=150000
word_ins_penalty=0.5

#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,76 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/104-pashto/release-current/conversational/training
train_data_list=/export/babel/data/splits/Pashto_Babel104/train.FullLP.list
train_nj=32

#RADICAL DEV2H data files
dev2h_data_dir=/export/babel/data/104-pashto/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Pashto_Babel104/dev2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.kwlist2.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/104-pashto/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Pashto_Babel104/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.kwlist2.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/104-pashto/release-current/conversational/eval/
eval_data_list=/export/babel/data/splits/Pashto_Babel104/eval.babel104b-v0.4bY.list
eval_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.kwlist2.xml
eval_nj=64

#Official (POST-)EVAL evaluation data portion
#evalpart1_data_dir=/export/babel/data/104-pashto/release-current/conversational/eval
#evalpart1_data_list=
#evalpart1_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
#evalpart1_stm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1/babel104b-v0.4bY_conv-evalpart1.stm
#evalpart1_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1.ecf.xml
#evalpart1_rttm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1/babel104b-v0.4bY_conv-evalpart1.mitllfa3.rttm
#evalpart1_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1/babel104b-v0.4bY_conv-evalpart1.annot.kwlist2.xml
#evalpart1_nj=32

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/104-pashto/release-current/conversational/reference_materials/lexicon.txt


#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,76 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/104-pashto/release-current/conversational/training
train_data_list=/export/babel/data/splits/Pashto_Babel104/train.LimitedLP.list
train_nj=16

#RADICAL DEV2H data files
dev2h_data_dir=/export/babel/data/104-pashto/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Pashto_Babel104/dev2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.kwlist2.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/104-pashto/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Pashto_Babel104/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.kwlist2.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/104-pashto/release-current/conversational/eval/
eval_data_list=/export/babel/data/splits/Pashto_Babel104/eval.babel104b-v0.4bY.list
eval_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.kwlist2.xml
eval_nj=64

#Official (POST-)EVAL evaluation data portion
#evalpart1_data_dir=/export/babel/data/104-pashto/release-current/conversational/eval
#evalpart1_data_list=
#evalpart1_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
#evalpart1_stm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1/babel104b-v0.4bY_conv-evalpart1.stm
#evalpart1_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1.ecf.xml
#evalpart1_rttm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1/babel104b-v0.4bY_conv-evalpart1.mitllfa3.rttm
#evalpart1_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1/babel104b-v0.4bY_conv-evalpart1.annot.kwlist2.xml
#evalpart1_nj=32

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/104-pashto/release-current-subtrain/conversational/reference_materials/lexicon.sub-train.txt


#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,77 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/104-pashto/release-current/conversational/training
train_data_list=/export/babel/data/splits/Pashto_Babel104/train.LimitedLP.list
train_nj=16

#RADICAL DEV2H data files
dev2h_data_dir=/export/babel/data/104-pashto/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Pashto_Babel104/dev2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.kwlist2.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/104-pashto/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Pashto_Babel104/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-dev/babel104b-v0.4bY_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.kwlist2.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/104-pashto/release-current/conversational/eval/
eval_data_list=/export/babel/data/splits/Pashto_Babel104/eval.babel104b-v0.4bY.list
eval_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-eval.kwlist2.xml
eval_nj=64

#Official (POST-)EVAL evaluation data portion
#evalpart1_data_dir=/export/babel/data/104-pashto/release-current/conversational/eval
#evalpart1_data_list=
#evalpart1_data_cmudb=/export/babel/data/splits/Pashto_Babel104/uem/db-v7_dev+eval-utt.dat
#evalpart1_stm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1/babel104b-v0.4bY_conv-evalpart1.stm
#evalpart1_ecf_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1.ecf.xml
#evalpart1_rttm_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1/babel104b-v0.4bY_conv-evalpart1.mitllfa3.rttm
#evalpart1_kwlist_file=/export/babel/data/scoring/IndusDB/babel104b-v0.4bY_conv-evalpart1/babel104b-v0.4bY_conv-evalpart1.annot.kwlist2.xml
#evalpart1_nj=32

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"
use_pitch=true

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/104-pashto/release-current-subtrain/conversational/reference_materials/lexicon.sub-train.txt


#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,76 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/training
train_data_list=/export/babel/data/splits/Turkish_Babel105/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/dev
dev2h_data_list=/export/babel/data/splits/Turkish_Babel105/dev2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Turkish_Babel105/uem/db-dev+eval-v7-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev/babel105b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev/babel105b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/dev
dev10h_data_list=/export/babel/data/splits/Turkish_Babel105/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Turkish_Babel105/uem/db-dev+eval-v7-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev/babel105b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev/babel105b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/eval
eval_data_list=/export/babel/data/splits/Turkish_Babel105/eval.babel105b-v0.4.list
eval_data_cmudb=/export/babel/data/splits/Turkish_Babel105/uem/db-dev+eval-v7-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-eval.kwlist2.xml
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

# Scoring protocols (dummy GLM file to appease the scoring script)
glmFile=./conf/glm
lexicon_file=/export/babel/data/105-turkish/release-current-b/conversational/reference_materials/lexicon.txt


#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,55 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

# System and data directories
train_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/training
#train_data_list=/export/babel/data/splits/Turkish_Babel105/train.FullLP.list
dev_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/dev
dev_data_list=/export/babel/data/splits/Turkish_Babel105/dev2hr.list

lexicon_file=/export/babel/data/105-turkish/release-current-b/conversational/reference_materials/lexicon.txt
filter_lexicon=false

eval_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/dev
#eval_data_list=/export/babel/data/splits/Turkish_Babel105/dev.list

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numLeavesSGMM=10000
numGaussSGMM=80000
numGaussUBM=800

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

# Scoring protocols (dummy GLM file to appease the scoring script)
glmFile=./conf/glm

train_nj=32
decode_nj=32

#keyword search settings
duptime=0.5
case_insensitive=false
ecf_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.ecf.xml
#Generate a subset of the ecf file according to the {dev,eval}_data_list, if present
subset_ecf=true

kwlist_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.kwlist.xml
rttm_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev/babel105b-v0.4_conv-dev.mitllfa3.rttm

#Include the links and settings of the BABEL-only software
. /export/babel/data/software/env.sh


@ -0,0 +1,77 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/training
train_data_list=/export/babel/data/splits/Turkish_Babel105/train.LimitedLP.official.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/dev
dev2h_data_list=/export/babel/data/splits/Turkish_Babel105/dev2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Turkish_Babel105/uem/db-dev+eval-v7-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev/babel105b-v0.4_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev/babel105b-v0.4_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/dev
dev10h_data_list=/export/babel/data/splits/Turkish_Babel105/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Turkish_Babel105/uem/db-dev+eval-v7-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev/babel105b-v0.4_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev/babel105b-v0.4_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-dev.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/105-turkish/release-current-b/conversational/eval
eval_data_list=/export/babel/data/splits/Turkish_Babel105/eval.babel105b-v0.4.list
eval_data_cmudb=/export/babel/data/splits/Turkish_Babel105/uem/db-dev+eval-v7-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel105b-v0.4_conv-eval.kwlist2.xml
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=600
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/105-turkish/release-babel105b-v0.4-rc1/conversational/reference_materials/lexicon.sub-train.txt
#http://demo.icu-project.org/icu-bin/translit
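# the transform below lower-cases Turkish text while preserving the
# dotted/dotless I distinction (İ -> i, I -> ı) before ::Any-Lower() runs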
icu_opt=(--use-icu true --icu-transform 'İ > i;I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)̇ > i \\\\\\\\\\\\\\\$1 ;I > ı;::Any-Lower();' )
#icu_opt=(--use-icu true --icu-transform "'\\\\\\\\İ > i;I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)̇ > i \\\\\\\\\\\\\\\$1 ;I > ı;::Any-Lower();'" )
#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,75 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/106-tagalog/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Tagalog_Babel106/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/106-tagalog/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Tagalog_Babel106/dev2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=23

#Official DEV data files
dev10h_data_dir=/export/babel/data/106-tagalog/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Tagalog_Babel106/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/106-tagalog/release-current/conversational/eval
eval_data_list=/export/babel/data/splits/Tagalog_Babel106/eval.babel106b-v0.2g.list
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-eval.kwlist2.xml
eval_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/106-tagalog/release-current/conversational/reference_materials/lexicon.txt

#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,75 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/106-tagalog/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Tagalog_Babel106/train.LimitedLP.official.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/106-tagalog/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Tagalog_Babel106/dev2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=23

#Official DEV data files
dev10h_data_dir=/export/babel/data/106-tagalog/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Tagalog_Babel106/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/106-tagalog/release-current/conversational/eval
eval_data_list=/export/babel/data/splits/Tagalog_Babel106/eval.babel106b-v0.2g.list
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-eval.kwlist2.xml
eval_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/106-tagalog/release-babel106b-v0.2g-sub-train/conversational/reference_materials/lexicon.sub-train.txt

#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,77 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/106-tagalog/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Tagalog_Babel106/train.LimitedLP.official.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/106-tagalog/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Tagalog_Babel106/dev2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=23

#Official DEV data files
dev10h_data_dir=/export/babel/data/106-tagalog/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Tagalog_Babel106/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/106-tagalog/release-current/conversational/eval
eval_data_list=/export/babel/data/splits/Tagalog_Babel106/eval.babel106b-v0.2g.list
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-eval.kwlist2.xml
eval_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

use_ffv=true
use_pitch=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/106-tagalog/release-babel106b-v0.2g-sub-train/conversational/reference_materials/lexicon.sub-train.txt

#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,83 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Vietnamese_Babel107/train.fullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Vietnamese_Babel107/dev.2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/splits/Vietnamese_Babel107/keywords.expanded.cmu.v2.xml
dev2h_subset_ecf=true
dev2h_nj=27

#Official DEV data files
dev10h_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/dev/
dev10h_data_list=/export/babel/data/splits/Vietnamese_Babel107/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/splits/Vietnamese_Babel107/keywords.expanded.cmu.v2.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/eval/
eval_data_list=/export/babel/data/splits/Vietnamese_Babel107/eval.list
eval_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB.20130424/babel107b-v0.7_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB.20130424/babel107b-v0.7_conv-eval.kwlist3.xml
eval_nj=81

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/107-vietnamese/release-current/conversational/reference_materials/lexicon.txt

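# phoneme_mapping rewrites each compound vowel symbol from the lexicon as a
# sequence of its component phones, e.g. the triphthong "i@U" becomes "i @ U"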
phoneme_mapping="i@U=i @ U;oaI=o a I;oaI:=o a I:;u@I=u @ I;uI@= u I @;1@I=1 @ I;1@U=1 @ U;
a:I=a: I; a:U=a: U; aU=a U; @U=@ U; aI=a I; @I=@ I; EU=E U; eU=e U; i@=i @; iU=i U; Oa:=O a: ; Oa=O a;
OE=O E; OI=O I; oI=o I; @:I=@: I; u@=u @; 1@=1 @; ue=u e; uI=u I; 1I=1 I; u@:=u @:; 1U=1 U; ui:=u i:"
#



#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,84 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Vietnamese_Babel107/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Vietnamese_Babel107/dev.2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/splits/Vietnamese_Babel107/keywords.expanded.cmu.v2.xml
dev2h_subset_ecf=true
dev2h_nj=27

#Official DEV data files
dev10h_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/dev/
dev10h_data_list=/export/babel/data/splits/Vietnamese_Babel107/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/splits/Vietnamese_Babel107/keywords.expanded.cmu.v2.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/eval/
eval_data_list=/export/babel/data/splits/Vietnamese_Babel107/eval.list
eval_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB.20130424/babel107b-v0.7_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB.20130424/babel107b-v0.7_conv-eval.kwlist3.xml
eval_nj=81

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true
use_ffv=true

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/107-vietnamese/release-current/conversational/reference_materials/lexicon.sub-train.txt

phoneme_mapping="i@U=i @ U;oaI=o a I;oaI:=o a I:;u@I=u @ I;uI@= u I @;1@I=1 @ I;1@U=1 @ U;
a:I=a: I; a:U=a: U; aU=a U; @U=@ U; aI=a I; @I=@ I; EU=E U; eU=e U; i@=i @; iU=i U; Oa:=O a: ; Oa=O a;
OE=O E; OI=O I; oI=o I; @:I=@: I; u@=u @; 1@=1 @; ue=u e; uI=u I; 1I=1 I; u@:=u @:; 1U=1 U; ui:=u i:"
#



#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,84 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Vietnamese_Babel107/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Vietnamese_Babel107/dev.2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/splits/Vietnamese_Babel107/keywords.expanded.cmu.v2.xml
dev2h_subset_ecf=true
dev2h_nj=27

#Official DEV data files
dev10h_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/dev/
dev10h_data_list=/export/babel/data/splits/Vietnamese_Babel107/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/splits/Vietnamese_Babel107/keywords.expanded.cmu.v2.xml
dev10h_nj=32


#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/eval/
eval_data_list=/export/babel/data/splits/Vietnamese_Babel107/eval.list
eval_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB.20130424/babel107b-v0.7_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB.20130424/babel107b-v0.7_conv-eval.kwlist3.xml
eval_nj=81

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=false
use_ffv=true

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/107-vietnamese/release-current/conversational/reference_materials/lexicon.sub-train.txt

phoneme_mapping="i@U=i @ U;oaI=o a I;oaI:=o a I:;u@I=u @ I;uI@= u I @;1@I=1 @ I;1@U=1 @ U;
a:I=a: I; a:U=a: U; aU=a U; @U=@ U; aI=a I; @I=@ I; EU=E U; eU=e U; i@=i @; iU=i U; Oa:=O a: ; Oa=O a;
OE=O E; OI=O I; oI=o I; @:I=@: I; u@=u @; 1@=1 @; ue=u e; uI=u I; 1I=1 I; u@:=u @:; 1U=1 U; ui:=u i:"
#



#keyword search settings
duptime=0.5
case_insensitive=true

@ -0,0 +1,83 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Vietnamese_Babel107/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Vietnamese_Babel107/dev.2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/splits/Vietnamese_Babel107/keywords.expanded.cmu.v2.xml
dev2h_subset_ecf=true
dev2h_nj=27

#Official DEV data files
dev10h_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/dev/
dev10h_data_list=/export/babel/data/splits/Vietnamese_Babel107/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel107b-v0.7_conv-dev/babel107b-v0.7_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/splits/Vietnamese_Babel107/keywords.expanded.cmu.v2.xml
dev10h_nj=32

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/eval/
eval_data_list=/export/babel/data/splits/Vietnamese_Babel107/eval.list
eval_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
eval_ecf_file=/export/babel/data/scoring/IndusDB.20130424/babel107b-v0.7_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB.20130424/babel107b-v0.7_conv-eval.kwlist3.xml
eval_nj=81

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true

# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/107-vietnamese/release-current/conversational/reference_materials/lexicon.sub-train.txt

phoneme_mapping="i@U=i @ U;oaI=o a I;oaI:=o a I:;u@I=u @ I;uI@= u I @;1@I=1 @ I;1@U=1 @ U;
a:I=a: I; a:U=a: U; aU=a U; @U=@ U; aI=a I; @I=@ I; EU=E U; eU=e U; i@=i @; iU=i U; Oa:=O a: ; Oa=O a;
OE=O E; OI=O I; oI=o I; @:I=@: I; u@=u @; 1@=1 @; ue=u e; uI=u I; 1I=1 I; u@:=u @:; 1U=1 U; ui:=u i:"

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,97 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/201-haitian/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Haitian_Babel201/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.ecf.xml
dev2h_rttm_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.rttm
dev2h_kwlist_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=20

#Official DEV data files
dev10h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.ecf.xml
dev10h_rttm_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.rttm
dev10h_kwlist_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.kwlist.xml
dev10h_nj=32

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Haitian_Babel201/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Haitian_Babel201/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/201-haitian/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=false
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/201-haitian/release-current/conversational/reference_materials/lexicon.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,97 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/201-haitian/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Haitian_Babel201/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.ecf.xml
dev2h_rttm_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.rttm
dev2h_kwlist_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=20

#Official DEV data files
dev10h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.ecf.xml
dev10h_rttm_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.rttm
dev10h_kwlist_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.kwlist.xml
dev10h_nj=32

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Haitian_Babel201/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Haitian_Babel201/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/201-haitian/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true
use_ffv=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/201-haitian/release-current/conversational/reference_materials/lexicon.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,76 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/201-haitian/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Haitian_Babel201/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.ecf.xml
dev2h_rttm_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.rttm
dev2h_kwlist_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=20

#Official DEV data files
dev10h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.ecf.xml
dev10h_rttm_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.rttm
dev10h_kwlist_file=/export/babel/data/splits/Haitian_Babel201/babel201-v1.0_conv-jhu10hdev.kwlist.xml
dev10h_nj=32

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/201-haitian/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

use_pitch=true
use_ffv=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/201-haitian/release-current/conversational/reference_materials/lexicon.sub-train.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,98 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/203-lao/release-current/conversational/training
train_data_list=/export/babel/data/splits/Lao_Babel203/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/203-lao/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Lao_Babel203/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=
dev2h_rttm_file=
dev2h_kwlist_file=
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/203-lao/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Lao_Babel203/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=
dev10h_rttm_file=
dev10h_kwlist_file=
dev10h_nj=32

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/203-lao/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Lao_Babel203/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/203-lao/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Lao_Babel203/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/203-lao/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"

use_pitch=false
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/203-lao/release-current/conversational/reference_materials/lexicon.fixed.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,98 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/203-lao/release-current/conversational/training
train_data_list=/export/babel/data/splits/Lao_Babel203/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/203-lao/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Lao_Babel203/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=
dev2h_rttm_file=
dev2h_kwlist_file=
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/203-lao/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Lao_Babel203/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=
dev10h_rttm_file=
dev10h_kwlist_file=
dev10h_nj=32

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/203-lao/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Lao_Babel203/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/203-lao/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Lao_Babel203/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/203-lao/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"

use_pitch=true
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/203-lao/release-current/conversational/reference_materials/lexicon.fixed.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,108 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/203-lao/release-current/conversational/training
train_data_list=/export/babel/data/splits/Lao_Babel203/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/203-lao/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Lao_Babel203/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=
dev2h_rttm_file=
dev2h_kwlist_file=
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/203-lao/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Lao_Babel203/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=
dev10h_rttm_file=
dev10h_kwlist_file=
dev10h_nj=32

#RADICAL EVAL data files (difference between TRAIN-FULL and TRAIN-LIMITED)
devtrain_data_dir=/export/babel/data/203-lao/release-current/conversational/training/
devtrain_data_list=/export/babel/data/splits/Lao_Babel203/dev.train.list
devtrain_data_cmudb=
devtrain_stm_file=
devtrain_ecf_file=
devtrain_rttm_file=
devtrain_kwlist_file=
devtrain_nj=64

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/203-lao/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Lao_Babel203/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/203-lao/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Lao_Babel203/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/203-lao/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--romanized --oov <unk>"

use_pitch=true
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/203-lao/release-current/conversational/reference_materials/lexicon.sub-train.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,103 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/206-zulu/release-current/conversational/training
train_data_list=/export/babel/data/splits/Zulu_Babel206/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev10h_nj=32

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Zulu_Babel206/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Zulu_Babel206/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/206-zulu/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

#Zulu seems to need much larger LM Weights
lmwt_plp_extra_opts=( --min-lmwt 10 --max-lmwt 17 )
lmwt_bnf_extra_opts=( --min-lmwt 13 --max-lmwt 18 )
lmwt_dnn_extra_opts=( --min-lmwt 10 --max-lmwt 17 )

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

use_pitch=false
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/206-zulu/release-current/conversational/reference_materials/lexicon.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,103 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/206-zulu/release-current/conversational/training
train_data_list=/export/babel/data/splits/Zulu_Babel206/train.FullLP.list
train_nj=32

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev10h_nj=32

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Zulu_Babel206/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Zulu_Babel206/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/206-zulu/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000

#Zulu seems to need much larger LM Weights
lmwt_plp_extra_opts=( --min-lmwt 10 --max-lmwt 17 )
lmwt_bnf_extra_opts=( --min-lmwt 13 --max-lmwt 18 )
lmwt_dnn_extra_opts=( --min-lmwt 10 --max-lmwt 17 )

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

use_pitch=true
use_ffv=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/206-zulu/release-current/conversational/reference_materials/lexicon.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,113 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Zulu_Babel206/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev10h_nj=32

#RADICAL EVAL data files (difference between TRAIN-FULL and TRAIN-LIMITED)
devtrain_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
devtrain_data_list=/export/babel/data/splits/Zulu_Babel206/dev.train.list
devtrain_data_cmudb=
devtrain_stm_file=
devtrain_ecf_file=
devtrain_rttm_file=
devtrain_kwlist_file=
devtrain_nj=64

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Zulu_Babel206/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Zulu_Babel206/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/206-zulu/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

#Zulu seems to need much larger LM Weights
lmwt_plp_extra_opts=( --min-lmwt 10 --max-lmwt 17 )
lmwt_bnf_extra_opts=( --min-lmwt 13 --max-lmwt 18 )
lmwt_dnn_extra_opts=( --min-lmwt 10 --max-lmwt 17 )

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

use_pitch=false
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/206-zulu/release-current/conversational/reference_materials/lexicon.sub-train.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,113 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Zulu_Babel206/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev10h_nj=32

#RADICAL EVAL data files (difference between TRAIN-FULL and TRAIN-LIMITED)
devtrain_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
devtrain_data_list=/export/babel/data/splits/Zulu_Babel206/dev.train.list
devtrain_data_cmudb=
devtrain_stm_file=
devtrain_ecf_file=
devtrain_rttm_file=
devtrain_kwlist_file=
devtrain_nj=64

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Zulu_Babel206/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Zulu_Babel206/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/206-zulu/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

#Zulu seems to need much larger LM Weights
lmwt_plp_extra_opts=( --min-lmwt 10 --max-lmwt 17 )
lmwt_bnf_extra_opts=( --min-lmwt 13 --max-lmwt 18 )
lmwt_dnn_extra_opts=( --min-lmwt 10 --max-lmwt 17 )

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

use_pitch=true
use_ffv=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/206-zulu/release-current/conversational/reference_materials/lexicon.sub-train.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1,113 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;

#speech corpora files location
train_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Zulu_Babel206/train.LimitedLP.list
train_nj=16

#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=18

#Official DEV data files
dev10h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.list
dev10h_data_cmudb=
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.scoring.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel206b-v0.1d_conv-dev/babel206b-v0.1d_conv-dev.annot.kwlist.xml
dev10h_nj=32

#RADICAL EVAL data files (difference between TRAIN-FULL and TRAIN-LIMITED)
devtrain_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
devtrain_data_list=/export/babel/data/splits/Zulu_Babel206/dev.train.list
devtrain_data_cmudb=
devtrain_stm_file=
devtrain_ecf_file=
devtrain_rttm_file=
devtrain_kwlist_file=
devtrain_nj=64

#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Zulu_Babel206/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32

#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Zulu_Babel206/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13

#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/206-zulu/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64

#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21

# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000

#Zulu seems to need much larger LM Weights
lmwt_plp_extra_opts=( --min-lmwt 10 --max-lmwt 17 )
lmwt_bnf_extra_opts=( --min-lmwt 13 --max-lmwt 18 )
lmwt_dnn_extra_opts=( --min-lmwt 10 --max-lmwt 17 )

# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"

use_pitch=true
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/206-zulu/release-current/conversational/reference_materials/lexicon.sub-train.txt

#keyword search settings
duptime=0.5
case_insensitive=true
@ -0,0 +1 @@
--sample-frequency=8000
@ -0,0 +1 @@
--sample-frequency=8000
@ -0,0 +1,22 @@
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>
@ -0,0 +1,141 @@
#!/bin/bash

function GETAPPROVAL {
  until false ; do
    echo "Do you want to run the command (y/n)?"
    read -n 1 WISH

    # bash functions report status numerically: 0 means success ("yes")
    if [ "$WISH" == "y" ]; then
      return 0;
    elif [ "$WISH" == "n" ]; then
      return 1;
    fi
  done
}

function ESCAPE_PARAMS {
  local out=""

  for v in "$@"; do
    if [[ "$v" == *"<"* ]]; then
      out="$out \"$v\""
    elif [[ "$v" == *">"* ]] ; then
      out="$out \"$v\""
    elif [[ "$v" == *"|"* ]] ; then
      out="$out \'$v\'"
    elif [[ "$v" == *" "* ]] ; then
      out="$out \"$v\""
    else
      out="$out $v"
    fi
  done

  echo $out
}

function CHK {
  local ID=DEFAULT
  CHECKPOINT $ID "$@"
}

function CHECKPOINT {
  COLOR_GREEN='\e[00;32m'
  COLOR_RED='\e[00;31m'
  COLOR_BLUE='\e[00;34m'
  COLOR_DEFAULT='\e[00m'

  local ID=$1; shift
  #We retrieve the counter variable we use for checkpointing.
  #Because the name of the variable is governed by the checkpoint ID,
  #we must use an indirect approach.
  local COUNTER_NAME="CHECKPOINT_${ID}_COUNTER"
  local COUNTER
  eval COUNTER=\$$COUNTER_NAME
  if [ -z "$COUNTER" ]; then
    COUNTER=0
  fi
  echo -e ${COLOR_GREEN}CHECKPOINT:$ID, COUNTER=$COUNTER $COLOR_DEFAULT >&2

  #Now the same for "LAST GOOD STATE"
  if [ "$ID" == "DEFAULT" ]; then
    local LAST_GOOD_NAME="LAST_GOOD"
  else
    local LAST_GOOD_NAME="LAST_GOOD_$ID"
  fi
  local LAST_GOOD_VALUE
  eval LAST_GOOD_VALUE=\$$LAST_GOOD_NAME

  echo -e ${COLOR_GREEN}"CHECKPOINT: $LAST_GOOD_NAME=$LAST_GOOD_VALUE"${COLOR_DEFAULT} >&2

  #The command has to be run if no checkpoint tracking is in progress
  #or we have already gone through the last problematic part.
  if [ -z "$LAST_GOOD_VALUE" ] || [ $COUNTER -ge $LAST_GOOD_VALUE ]; then
    #bash print_args.sh `ESCAPE_PARAMS $CMD`

    if [ -z "$INTERACTIVE_CHECKPOINT" ] ; then
      eval `ESCAPE_PARAMS "$@"`
    else
      if GETAPPROVAL ; then
        eval `ESCAPE_PARAMS "$@"`
      fi
    fi

    if [ $? -ne 0 ] ; then
      echo -e ${COLOR_RED}"CHECKPOINT FAILURE: The command returned non-zero status" >&2
      echo -e "  rerun the script with the parameter -c $LAST_GOOD_NAME=$COUNTER" >&2
      echo -e "COMMAND">&2
      echo -e "  " "$@" ${COLOR_RED} >&2

      exit 1
    fi
  else
    #Else, we just skip the command....
    echo -e ${COLOR_GREEN}"CHECKPOINT: SKIPPING, $LAST_GOOD_NAME=$COUNTER" >&2
    echo -e "$@"${COLOR_DEFAULT} >&2
  fi

  COUNTER=$(( $COUNTER + 1 ))
  eval export $COUNTER_NAME=$COUNTER
}

function KILLBG_JOBS {
  jobs \
    | perl -ne 'print "$1\n" if m/^\[(\d+)\][+-]? +Running/;' \
    | while read -r ; do kill %"$REPLY" ; done
}

function ONEXIT_HANDLER {
  COLOR_GREEN='\e[00;32m'
  COLOR_RED='\e[00;31m'
  COLOR_BLUE='\e[00;34m'
  COLOR_DEFAULT='\e[00m'
  counters=`set | egrep "^CHECKPOINT_[_A-Z]+_COUNTER=" | sed 's/^CHECKPOINT\(_[_A-Z][_A-Z]*\)_COUNTER=/LAST_GOOD\1=/g' | sed "s/^LAST_GOOD_DEFAULT=/LAST_GOOD=/g"`
  if [[ ! -z "$counters" ]]; then
    echo -e ${COLOR_RED}"CHECKPOINT FAILURE: The last command returned non-zero status"${COLOR_DEFAULT} >&2
    echo -e ${COLOR_RED}"look at the counters and try to rerun this script (after figuring out the issue)"${COLOR_DEFAULT} >&2
    echo -e ${COLOR_RED}"using the -c COUNTER_NAME=COUNTER_VALUE parameters;"${COLOR_DEFAULT} >&2
    echo -e ${COLOR_RED}"You can use -c \"COUNTER_NAME1=COUNTER_VALUE1;COUNTER_NAME2=COUNTER_VALUE2\" as well"${COLOR_DEFAULT} >&2
    echo -e ${COLOR_RED}"The counters: \n $counters"${COLOR_DEFAULT} >&2
  else
    echo -e ${COLOR_RED}"CHECKPOINT FAILURE: The last command returned non-zero status"${COLOR_DEFAULT} >&2
    echo -e ${COLOR_RED}"No checkpoint was found. Try to figure out the problem and "${COLOR_DEFAULT} >&2
    echo -e ${COLOR_RED}"run the script again"${COLOR_DEFAULT} >&2
  fi
}

# SIGKILL cannot actually be caught, so it is not listed here.
trap "ONEXIT_HANDLER; exit; " SIGINT SIGTERM ERR

while getopts ":c:i" opt; do
  case $opt in
    c)
      eval $OPTARG
      ;;
    i)
      INTERACTIVE_CHECKPOINT=true
  esac
done
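A minimal usage sketch of the checkpointing helpers above (the file name local/CHECKPOINT.sh and the two wrapped commands are assumptions, not part of the recipe): a driver script sources this file, wraps each expensive step in CHK, and after a failure is re-run with the -c LAST_GOOD=<counter> option suggested by the failure message, so that already-completed steps are skipped.

#!/bin/bash
# myrun.sh -- hypothetical driver built on the checkpoint helpers;
# sourcing the file also lets its getopts loop parse our -c/-i options.
. local/CHECKPOINT.sh

CHK steps/make_plp.sh data/train exp/make_plp plp       # runs with COUNTER=0
CHK steps/train_mono.sh data/train data/lang exp/mono   # runs with COUNTER=1
# If the second step fails:  ./myrun.sh -c LAST_GOOD=1  reruns it and skips step 0.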
@ -0,0 +1,124 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
|
||||
# Apache 2.0.
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Long;
|
||||
|
||||
my $Usage = <<EOU;
|
||||
Usage: annotated_kwlist_to_KWs.pl [options] <kwlist.annot.xml|-> <keywords|-> [category1 category2 ...]
 e.g.: annotated_kwlist_to_KWs.pl kwlist.annot.list keywords.list "NGram Order:2,3,4"

This script reads an annotated kwlist xml file and writes a list of keywords according
to the given categories. A "category" is a "key:value" pair from the annotated kwlist xml
file. For example:
1. "NGram Order:2,3,4"
2. "NGram Order:2"
3. "NGram Order:-"
where "NGram Order" is the category name. The first line means print the bigram, trigram
and 4-gram keywords; the second line means print only the bigram keywords; the last
line means print keywords of all possible n-gram orders.
If no "category" is specified, the script prints out the possible categories.

Allowed options:
EOU

GetOptions();

@ARGV >= 2 || die $Usage;

# Work out the input/output source.
my $kwlist_filename = shift @ARGV;
my $kws_filename = shift @ARGV;

my $source = "STDIN";
if ($kwlist_filename ne "-") {
  open(KWLIST, "<$kwlist_filename") || die "Failed to open kwlist file: $kwlist_filename\n";
  $source = "KWLIST";
}

# Process kwlist.annot.xml
my %attr;
my %attr_kws;
my $kwid="";
my $name="";
my $value="";
while (<$source>) {
  chomp;
  if (m/<kw kwid=/) {($kwid) = /kwid="(\S+)"/; next;}
  if (m/<name>/) {($name) = /<name>(.*)<\/name>/; next;}
  if (m/<value>/) {
    ($value) = /<value>(.*)<\/value>/;
    if (defined($attr{$name})) {
      $attr{"$name"}->{"$value"} = 1;
    } else {
      $attr{"$name"} = {"$value", 1};
    }
    if (defined($attr_kws{"${name}_$value"})) {
      $attr_kws{"${name}_$value"}->{"$kwid"} = 1;
    } else {
      $attr_kws{"${name}_$value"} = {"$kwid", 1};
    }
  }
}

my $output = "";
if (@ARGV == 0) {
  # If no category was provided, print out the possible categories.
  $output .= "Possible categories are:\n\n";
  foreach my $name (keys %attr) {
    $output .= "$name:";
    my $count = 0;
    foreach my $value (keys %{$attr{$name}}) {
      if ($value eq "") {$value = "\"\"";}
      if ($count == 0) {
        $output .= "$value";
        $count ++; next;
      }
      if ($count == 6) {
        $output .= ", ...";
        last;
      }
      $output .= ",$value"; $count ++;
    }
    $output .= "\n";
  }
  print STDERR $output;
  $output = "";
} else {
  my %keywords;
  while (@ARGV > 0) {
    my $category = shift @ARGV;
    my @col = split(/:/, $category);
    @col == 2 || die "Bad category \"$category\"\n";
    $name = $col[0];
    if ($col[1] eq "-") {
      foreach my $value (keys %{$attr{$name}}) {
        foreach my $kw (keys %{$attr_kws{"${name}_$value"}}) {
          $keywords{$kw} = 1;
        }
      }
    } else {
      my @col1 = split(/,/, $col[1]);
      foreach my $value (@col1) {
        foreach my $kw (keys %{$attr_kws{"${name}_$value"}}) {
          $keywords{$kw} = 1;
        }
      }
    }
  }
  foreach my $kw (keys %keywords) {
    $output .= "$kw\n";
  }
}

if ($kwlist_filename ne "-") {close(KWLIST);}
if ($kws_filename eq "-") { print $output;}
else {
  open(O, ">$kws_filename") || die "Failed to open file $kws_filename\n";
  print O $output;
  close(O);
}
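A hedged usage sketch (the annotated kwlist file name is a placeholder, not taken from this commit): run once with no category to see what is available, then select by category:

  # List the available categories (printed to stderr):
  annotated_kwlist_to_KWs.pl kwlist.annot.xml -
  # Extract all bigram keyword IDs into a file:
  annotated_kwlist_to_KWs.pl kwlist.annot.xml bigrams.list "NGram Order:2"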
@ -0,0 +1,52 @@
#!/bin/bash
# Copyright 2013 Johns Hopkins University (authors: Yenda Trmal)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Simple utility script to convert the gzipped ARPA lm into a G.fst file.

# no configuration here
# end configuration section.

echo $0 $@

[ -f ./path.sh ] && . ./path.sh
[ -f ./cmd.sh ] && . ./cmd.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 <arpa-lm-file> <lang-dir> <dest-dir>"
  exit 1;
fi

lmfile=$1
langdir=$2
destdir=$3

mkdir $destdir 2>/dev/null || true

gunzip -c $lmfile | \
  grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
  arpa2fst - | \
  fstprint | \
  utils/eps2disambig.pl | \
  utils/s2eps.pl | \
  fstcompile --isymbols=$langdir/words.txt \
    --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true

exit 0
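A hedged invocation sketch (the script path local/arpa2G.sh and the data locations are assumptions for illustration; words.txt must come from the same lang directory the LM was built against):

  # Convert a gzipped ARPA LM produced by SRILM into data/lang/G.fst:
  local/arpa2G.sh data/srilm/lm.gz data/lang data/lang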
@ -0,0 +1,53 @@
# This is as arpa2G.sh but specialized for the per-syllable setup. This is
# specific to the BABEL setup.
# The difference from arpa2G.sh is that (1) we have to change <unk> to <oov>, because
# <oov> is the name of the phone that was chosen to represent the unknown word [note:
# <unk> is special to SRILM, which is why it appears in the vocab]; and (2) we have
# a special step with fstrhocompose which we use to ensure that silence cannot appear
# twice in succession. [Silence appears in the language model, which would naturally
# allow it to appear twice in succession.]

lmfile=$1
langdir=$2
destdir=$3

mkdir -p $destdir;

# Make an FST that we compose with to disallow >1 silence in a row.
last_id=`tail -n 1 $langdir/words.txt | awk '{print $2}'` || exit 1;
[ -z $last_id ] && echo Error getting last word-id from $langdir/words.txt && exit 1;
silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
[ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
rho=$[$last_id+1]

# State 0 is the start state; state 1 is the state after we saw silence; state 2 is
# a "dead state/failure state" that is not coaccessible.
cat <<EOF | fstcompile > $destdir/rho.fst
0 1 $silence_id $silence_id
0 0 $rho $rho
1 2 $silence_id $silence_id
1 0 $rho $rho
0
1
EOF

gunzip -c $lmfile | \
  grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
  sed 's/<unk>/<oov>/g' | \
  arpa2fst - | \
  fstprint | \
  utils/eps2disambig.pl | \
  utils/s2eps.pl | \
  fstcompile --isymbols=$langdir/words.txt \
    --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
  fstrhocompose "$rho" - $destdir/rho.fst | \
  fstrmepsilon > $destdir/G.fst || exit 1

fstisstochastic $destdir/G.fst || true

rm $destdir/rho.fst

exit 0
@ -0,0 +1,109 @@
#!/usr/bin/perl -w
# Copyright 2012 Johns Hopkins University (Author: Jan Trmal)
# Apache 2.0.

# This script takes the original BABEL STM file (part of the IndusDB)
# and replaces the "Aggregated" field with a correct speaker ID.
# As a result, the scoring will be done on a per-speaker basis as well.
# As the segmentation in the segments file generally does not correspond to
# the segmentation of the original STM file, the script combines the files
# segments and utt2spk to work out the correct speaker ID for
# the reference segment.
# In case of overlap, it will either use the previous speaker or
# print out an error message.

use strict;
use warnings;

use Data::Dumper;

@ARGV == 2 || die "$0 <stm-file> <data-dir>\n";

my $warn_count = 0;
my $warn_max = 10;
my $stm_file = shift @ARGV;
my $data_dir = shift @ARGV;
my %utt2spk;
my %segments;

open(F_u, "<$data_dir/utt2spk") || die "Could not open the file $data_dir/utt2spk\n";
while(<F_u>) {
  chop;
  (my $utt, my $spk) = split;
  $utt2spk{$utt} = $spk;
}
close(F_u);

open(F_s, "<$data_dir/segments") || die "Could not open the file $data_dir/segments\n";
while(<F_s>) {
  chop;
  (my $utt, my $file, my $seg_start, my $seg_end) = split;
  push @{$segments{$file}}, [ $seg_start, $seg_end, $utt2spk{$utt}];
}
close(F_s);

open(STM, "<$stm_file") || die "Could not open the STM file $stm_file";
open(STMOUT, ">$data_dir/stm") || die "Could not open the output STM file $data_dir/stm";
open(RECO, ">$data_dir/reco2file_and_channel") or die "Could not create the output file $data_dir/reco2file_and_channel";

my $prev_filename = "";
my @timestamps;
my $i = 0;
while(<STM>) {
  chop;
  (my $filename, my $line, my $aggregated, my $seg_start, my $seg_end, my $text) = split(/\s+/, $_, 6);
  #print "$filename, $seg_start, $seg_end, $text\n";

  if (( $prev_filename ne $filename ) && ( ";;$prev_filename" ne $filename)){
    my $_filename = $filename;
    $_filename =~ s/^;;//g;
    next if not exists $segments{$_filename};
    #print $filename, "\n";
    $prev_filename = $_filename;
    @timestamps = @{$segments{$_filename}};
    #print Dumper(\@timestamps);
    $i=0;
    print RECO "$_filename $_filename 1\n";
  }

  my $max_i=@timestamps;
  while ( ($i < $max_i ) && ($seg_start > @{$timestamps[$i]}[0] ) ) {
    $i+= 1;
  }

  if (($i >= $max_i ) && ($timestamps[$i-1][1]) <= $seg_start ){
    # We are past the start of the last segment -> we assign the last speaker ID.
    if ($warn_count < $warn_max) {
      print STDERR "Warning: $prev_filename: the segment from the STM file starts after the last segment from the segments file ends\n";
      print STDERR "Warning: Additional info: STM: ($seg_start, $seg_end), segments file: ($timestamps[$i-1][0] $timestamps[$i-1][1])\n";
      $warn_count += 1;

      if ($warn_count >= $warn_max) {
        print STDERR "Warning: Maximum number of warnings reached, not warning anymore...\n"
      }
    }
    #print "$i, $filename, $timestamps[$max_i - 1][2]\n";
    print STMOUT "$filename $line $timestamps[$max_i - 1][2] $seg_start $seg_end $text\n";
  } elsif ( $i == 0 ) {
    if ($warn_count < $warn_max) {
      print STDERR "Warning: $prev_filename: the segment from the STM file starts before the first segment from the segments file\n";
      print STDERR "Warning: Additional info: STM: ($seg_start, $seg_end), segments file: ($timestamps[$i][0] $timestamps[$i][1])\n";
      $warn_count += 1;

      if ($warn_count >= $warn_max) {
        print STDERR "Warning: Maximum number of warnings reached, not warning anymore...\n"
      }
    }
    # Even the first segment's start time was higher than the stm segment's start time.
    # That means we do not really know which speaker the stm segment belongs to.
    print STMOUT "$filename $line $timestamps[$i][2] $seg_start $seg_end $text\n";
    #print "$i, $filename, $timestamps[$i][2]\n";
  } else {
    print STMOUT "$filename $line $timestamps[$i-1][2] $seg_start $seg_end $text\n";
    #print "$i, $filename, $timestamps[$i-1][2]\n";
  }
}

close(STMOUT);
close(STM);
close(RECO);
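A hedged invocation sketch (the script path and the IndusDB STM file name are placeholders; the data directory must already contain segments and utt2spk):

  # Writes data/dev10h/stm and data/dev10h/reco2file_and_channel with
  # the "Aggregated" field replaced by real speaker IDs.
  local/augment_original_stm.pl /export/babel/data/IndusDB/babel106.dev.stm data/dev10h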
@ -0,0 +1,57 @@
#!/bin/bash

targetDir=$1

echo "------------------------------------"
echo "Building an SRILM in \"$targetDir\""
echo "------------------------------------"

for f in $targetDir/vocab $targetDir/text.train $targetDir/text.dev; do
  [ ! -f $f ] && echo "$0: requires $f" && exit 1;
done

echo "-------------------"
echo "Good-Turing 3grams"
echo "-------------------"
ngram-count -lm $targetDir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort

echo "-------------------"
echo "Kneser-Ney 3grams"
echo "-------------------"
ngram-count -lm $targetDir/3gram.kn011.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/3gram.kn012.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/3gram.kn023.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort

echo "-------------------"
echo "Good-Turing 4grams"
echo "-------------------"
ngram-count -lm $targetDir/4gram.gt0111.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.gt0112.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.gt0122.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.gt0123.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.gt0113.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.gt0222.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.gt0223.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort

echo "-------------------"
echo "Kneser-Ney 4grams"
echo "-------------------"
ngram-count -lm $targetDir/4gram.kn0111.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.kn0112.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.kn0113.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.kn0122.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.kn0123.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.kn0222.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort
ngram-count -lm $targetDir/4gram.kn0223.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $targetDir/text.train -vocab $targetDir/vocab -unk -sort

echo "-------------------"
echo "Computing perplexity"
echo "-------------------"
for f in $targetDir/3gram* ; do echo $f; ngram -order 3 -lm $f -unk -ppl $targetDir/text.dev; done | tee $targetDir/perplexities.3gram
for f in $targetDir/4gram* ; do echo $f; ngram -order 4 -lm $f -unk -ppl $targetDir/text.dev; done | tee $targetDir/perplexities.4gram
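To pick the winning LM out of this grid, one can rank the logged perplexities. A rough sketch, assuming ngram prints its usual two summary lines (ending in "ppl= ... ppl1= ...") after each file name echoed by the loop above:

  # Print "<ppl> <lm-file>" pairs, best (lowest perplexity) first.
  paste - - - < $targetDir/perplexities.3gram | \
    awk '{print $(NF-2), $1}' | sort -n | head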
@ -0,0 +1,127 @@
#!/usr/bin/perl

# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.
#

use strict;
use warnings;
use Getopt::Long;

my $Usage = <<EOU;
Usage: build_edit_distance_fst.pl <phones.txt|-> <fst_out|->
Build an edit distance FST at the phone level.

Allowed options:
  --confusion-matrix  : Matrix for insertion, deletion and substitution. (string, default="")
  --ins-cost          : Insertion cost    (double, default=1)
  --del-cost          : Deletion cost     (double, default=1)
  --subs-cost         : Substitution cost (double, default=1)
  --boundary-ins-cost : Cost for insertions at word boundary (double, default=0.1)
  --boundary-off      : No insertions at word boundary (boolean, default=true)
EOU

my $confusion_matrix = "";
my $insertion_cost = 1;
my $deletion_cost = 1;
my $substitution_cost = 1;
my $boundary_ins_cost = 0.1;
my $boundary_off="true";
GetOptions('confusion-matrix=s' => \$confusion_matrix,
  'ins-cost=f' => \$insertion_cost,
  'del-cost=f' => \$deletion_cost,
  'subs-cost=f' => \$substitution_cost,
  'boundary-ins-cost=f' => \$boundary_ins_cost,
  'boundary-off=s' => \$boundary_off);

@ARGV == 2 || die $Usage;

$boundary_off eq "true" || $boundary_off eq "false" || die "$0: Bad value for option --boundary-off\n";

# Work out the input and output parameters
my $phone_in = shift @ARGV;
my $fst_out = shift @ARGV;

open(I, "<$phone_in") || die "$0: Failed to open phone list $phone_in\n";
open(O, ">$fst_out") || die "$0: Failed to write FST $fst_out\n";

# Read the confusion matrix
my %confusion;
if ($confusion_matrix ne "") {
  open(M, "<$confusion_matrix") || die "$0: Failed to open confusion matrix $confusion_matrix\n";
  while (<M>) {
    chomp;
    my @col = split();
    @col == 3 || die "$0: Bad line in confusion matrix \"$_\"\n";
    $confusion{"$col[0]_$col[1]"} = $col[2];
  }
  close(M);
}

# Start processing
my @phones;
while (<I>) {
  chomp;
  my @col = split();
  @col == 1 || die "$0: Bad number of columns in phone list \"$_\"\n";
  if ($col[0] eq "<eps>") {next;}
  push(@phones, $col[0]);
}

# Add insertions and deletions
my $fst = "";
foreach my $p (@phones) {
  if ($confusion_matrix eq "") {
    $fst .= "1 1 $p <eps> $deletion_cost\n";  # Deletions
    $fst .= "1 1 <eps> $p $insertion_cost\n"; # Insertions
    if ($boundary_off eq "false") {
      $fst .= "0 0 <eps> $p $boundary_ins_cost\n";
      $fst .= "0 1 <eps> $p $boundary_ins_cost\n";
      $fst .= "2 2 <eps> $p $boundary_ins_cost\n";
      $fst .= "1 2 <eps> $p $boundary_ins_cost\n";
    }
  } else {
    my $key = "${p}_<eps>";
    if (defined($confusion{$key})) {
      $fst .= "1 1 $p <eps> $confusion{$key}\n";
    }
    $key = "<eps>_${p}";
    if (defined($confusion{$key})) {
      $fst .= "1 1 <eps> $p $confusion{$key}\n";
      if ($boundary_off eq "false") {
        $fst .= "0 0 <eps> $p $confusion{$key}\n";
        $fst .= "0 1 <eps> $p $confusion{$key}\n";
        $fst .= "2 2 <eps> $p $confusion{$key}\n";
        $fst .= "1 2 <eps> $p $confusion{$key}\n";
      }
    }
  }
}
# Add substitutions
foreach my $p1 (@phones) {
  foreach my $p2 (@phones) {
    if ($p1 eq $p2) {
      $fst .= "1 1 $p1 $p2 0\n";
    } else {
      if ($confusion_matrix eq "") {
        $fst .= "1 1 $p1 $p2 $substitution_cost\n";
      } else {
        my $key = "${p1}_${p2}";
        if (defined($confusion{$key})) {
          $fst .= "1 1 $p1 $p2 $confusion{$key}\n";
        }
      }
    }
  }
}
if ($boundary_off eq "false") {
  $fst .= "0 1 <eps> <eps> 0\n";
  $fst .= "1 2 <eps> <eps> 0\n";
  $fst .= "2\n";
} else {
  $fst .= "1\n";
}

print O $fst;

close(I);
close(O);
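A hedged usage sketch (file names are placeholders; the script wants a one-column phone list, and the OpenFst symbol table used for compilation must contain the same phones plus <eps>):

  # Emit the edit-distance FST as text and compile it with OpenFst:
  build_edit_distance_fst.pl --boundary-off true phones.list - | \
    fstcompile --isymbols=phones.txt --osymbols=phones.txt > edit_distance.fst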
@ -0,0 +1,34 @@
#!/bin/bash

check_model () {
  model=$1
  if [ -s $model ]; then echo $model
  else
    dir=`dirname $model`
    latest_model=`ls -lt $dir/{?,??}.mdl 2>/dev/null | head -1 | awk '{print $9}'`
    echo "*$model is not there, latest is: $latest_model"
  fi
}

for model in exp/mono/final.mdl exp/tri{1,2,3}/final.mdl; do
  check_model $model
done

if [ ! -f exp/tri4/final.mdl ]; then
  echo "*exp/tri4/final.mdl is not there*"
  exit 1
fi

if [ -f exp/tri4/trans.1 ]; then # This is LimitedLP.
  models="exp/tri4/final.alimdl exp/sgmm5/final.alimdl exp/sgmm5_mmi_b0.1/final.mdl exp/tri5_nnet/final.mdl"
else
  models="exp/tri4/final.mdl exp/tri5/final.alimdl exp/sgmm5/final.alimdl exp/sgmm5_mmi_b0.1/final.mdl exp/tri6_nnet/final.mdl"
fi
models="$models exp_BNF/tri5/final.mdl exp_BNF/tri6/final.alimdl exp_BNF/sgmm7/final.alimdl"

for model in $models; do
  check_model $model
done
@ -0,0 +1,50 @@
#!/bin/bash

check_wer () {
  dir=$1
  if [ -d $dir ]; then
    seen_dir=false
    for ddir in $dir/decode*; do
      if [ -d $ddir ]; then
        seen_dir=true
        printf " % -40s " $ddir
        line=`grep Sum $ddir/score_*/*.sys 2>/dev/null | $char_command | utils/best_wer.sh`
        if [ -z "$line" ]; then echo "------"
        else echo $line | cut -c 1-65; fi
      fi
    done
    ! $seen_dir && echo "$dir ********** no decode dirs"
  fi
}

final=false
char_command="grep -v char"

for n in `seq 10`; do
  if [ "$1" == "--final" ]; then
    final=true
    shift
  fi
  if [ "$1" == "--char" ]; then
    char_command="grep char"
    shift
  fi
done

if [ $# != 0 ]; then
  echo "Usage: local/check_wers.sh [--final] [--char]"
  exit 1;
fi

if $final; then
  for dir in exp/sgmm5_mmi_b0.1 exp/tri5_nnet exp/tri6_nnet exp_BNF/sgmm7 exp_BNF/sgmm7_mmi_b0.1 exp/combine*; do
    check_wer $dir
  done
else
  for dir in exp/tri{2,3,4,5} exp/sgmm5 exp/sgmm5_mmi_b0.1 exp/tri5_nnet exp/tri6_nnet exp_BNF/* exp/combine_*; do
    check_wer $dir
  done
fi
@ -0,0 +1,119 @@
#!/bin/bash -e

# Creating a UEM decoding setup with the CMU segmentation from Florian (Feb 15, 2013).
dummy_text=true
text=
filelist=
# end of configuration

[ -f ./path.sh ] && . ./path.sh
[ -f ./cmd.sh ] && . ./cmd.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ] ; then
  echo "$0: Converts the CMU segmentation database file into a kaldi data directory for UEM decoding"
  echo ""
  echo "cmu_ume2kaldi_dir.sh <cmu-utt-database> <path-to-sph-files> <output-data-dir>"
  echo "example: cmu_ume2kaldi_dir.sh db-tag-eval-utt.dat /export/babel/data/106-tagalog/audio data/eval.uem"
  echo "Was called with: $*"
  exit 1;
fi

database=$1
audiopath=$2
datadir=$3

echo $0 $@
mkdir -p $datadir
# 1. Create the segments file:
[ ! -f $database ] && echo "Database file $1 does not exist!" && exit 1;

echo "Converting `basename $database` to kaldi directory $datadir"
cat $database | perl -pe 's:.+(BABEL):BABEL:; s:\}\s+\{FROM\s+: :; s:\}\s+\{TO\s+: :; s:\}.+::;' | \
  perl -ne '@K = split;
    $utteranceID = @K[0];
    $utteranceID =~ s:[^_]+_[^_]+_[^_]+_::;
    $utteranceID =~ s:([^_]+)_(.+)_(inLine|scripted):${1}_A_${2}:;
    $utteranceID =~ s:([^_]+)_(.+)_outLine:${1}_B_${2}:;
    $utteranceID .= sprintf ("_%06i", (100*@K[2]));
    printf("%s %s %.2f %.2f\n", $utteranceID, @K[0], @K[1], @K[2]);' | sort > $datadir/segments

if [ ! -z $filelist ] ; then
  mv $datadir/segments $datadir/segments.full
  grep -F -f $filelist $datadir/segments.full > $datadir/segments

  l=`grep -v -F -f $filelist $datadir/segments.full | cut -f 2 -d ' ' | sort -u | wc -l`
  echo "Because of the filelist, $l files were omitted"
fi

# 2. Create the utt2spk file:
echo "Creating the $datadir/utt2spk file"
cut -f1 -d' ' $datadir/segments | \
  perl -ne 'chomp; m:([^_]+_[AB]).*:; print "$_ $1\n";' | \
  sort > $datadir/utt2spk

# 3. Create the spk2utt file:
echo "Creating the $datadir/spk2utt file"
perl -ne '{chomp; @K=split; $utt{@K[1]}.=" @K[0]";}
  END{foreach $spk (sort keys %utt) {
      printf("%s%s\n", $spk, $utt{$spk});
    }
  }' < $datadir/utt2spk | sort > $datadir/spk2utt

# 4. Create the wav.scp file:
sph2pipe=`which sph2pipe || which $KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe`
if [ $? -ne 0 ] ; then
  echo "Could not find the sph2pipe binary. Add it to PATH"
  exit 1;
fi

echo "Creating the $datadir/wav.scp file"
(
  set -o pipefail
  for file in `cut -f 2 -d ' ' $datadir/segments` ; do
    if [ -f $audiopath/audio/$file.sph ] ; then
      echo "$file $sph2pipe -f wav -p -c 1 $audiopath/audio/$file.sph |"
    else
      echo "Audio file $audiopath/audio/$file.sph does not exist!" >&2
      exit 1
    fi
  done | sort -u > $datadir/wav.scp
  if [ $? -ne 0 ] ; then
    echo "Error producing the wav.scp file"
    exit 1
  fi
) || exit 1

l1=`wc -l $datadir/wav.scp | cut -f 1 -d ' '`
echo "wav.scp contains $l1 files"
if [ ! -z $filelist ] ; then
  l2=`wc -l $filelist | cut -f 1 -d ' '`
  echo "filelist `basename $filelist` contains $l2 files"

  if [ "$l1" -ne "$l2" ] ; then
    echo "Not all files from the specified fileset made their way into wav.scp"
    exit 1
  fi
fi

# 5. Create the text file:
echo "Creating the $datadir/text file"
if [ ! -z $text ] ; then
  cp $text $datadir/text || { echo "Could not copy the source text file \"$text\""; exit 1; }
elif $dummy_text ; then
  cut -f1 -d' ' $datadir/segments | \
    sed -e 's/$/ IGNORE_TIME_SEGMENT_IN_SCORING/' | \
    sort > $datadir/text
fi

# 6. Create the reco2file_and_channel file:
echo "Creating the $datadir/reco2file_and_channel file"
(for f in $( cut -f 8 -d ' ' $datadir/wav.scp ) ; do p=`basename $f .sph`; echo $p $p 1; done) > $datadir/reco2file_and_channel

echo "Everything done"
@ -0,0 +1,94 @@
#!/usr/bin/perl

# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.
#

use strict;
use warnings;
use Getopt::Long;

my $Usage = <<EOU;
Usage: count_to_logprob.pl <confusion_in|-> <confusion_out|->
This script takes in the confusion phone pair counts and converts
the counts into negated log probabilities. The counts should be in
the following format:
p1 p2 count1     // For substitution
p3 <eps> count2  // For deletion
<eps> p4 count3  // For insertion

Allowed options:
  --cutoff : Minimal count to be considered (int, default=1)
EOU

my $cutoff = 1;
GetOptions('cutoff=i' => \$cutoff);

@ARGV == 2 || die $Usage;

# Work out the input and output parameters
my $cm_in = shift @ARGV;
my $cm_out = shift @ARGV;

open(I, "<$cm_in") || die "$0: Failed to open counts file $cm_in\n";
open(O, ">$cm_out") || die "$0: Failed to write confusion matrix $cm_out\n";

# Collect counts
my %ins;
my %del;
my %subs;
my %phone_count;
my $ins_count = 0;
my $del_count = 0;
while (<I>) {
  chomp;
  my @col = split();
  @col == 3 || die "$0: Bad line in confusion matrix file: $_\n";
  my ($p1, $p2, $count) = ($col[0], $col[1], $col[2]);
  $count >= $cutoff || next;
  if ($p1 eq "<eps>" && $p2 ne "<eps>") {
    $ins{$p2} = $count;
    $ins_count += $count;
  } elsif ($p1 ne "<eps>" && $p2 eq "<eps>") {
    $del{$p1} = $count;
    $del_count += $count;
  } elsif ($p1 ne "<eps>" && $p2 ne "<eps>") {
    $p1 ne $p2 || next;  # Skip identical phone pairs
    $subs{"${p1}_$p2"} = $count;
    if (defined($phone_count{$p1})) {
      $phone_count{$p1} += $count;
    } else {
      $phone_count{$p1} = $count;
    }
  }
}

# Compute the negated log probabilities
foreach my $key (keys %ins) {
  $ins{$key} = -log($ins{$key}/$ins_count);
}
foreach my $key (keys %del) {
  $del{$key} = -log($del{$key}/$del_count);
}
foreach my $key (keys %subs) {
  my @col = split(/_/, $key);
  $subs{$key} = -log($subs{$key}/$phone_count{$col[0]});
}

# Print the results
my $output = "";
foreach my $key (keys %ins) {
  $output .= "<eps> $key $ins{$key}\n";
}
foreach my $key (keys %del) {
  $output .= "$key <eps> $del{$key}\n";
}
foreach my $key (keys %subs) {
  my @col = split(/_/, $key);
  $output .= "$col[0] $col[1] $subs{$key}\n";
}

print O $output;

close(I);
close(O);
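A tiny worked example with made-up counts: if counts.txt contains

  A B 6
  A C 2
  <eps> B 4

then substitutions of A are normalized by A's total substitution count (8), giving "A B -log(6/8)" (about 0.29) and "A C -log(2/8)" (about 1.39), while the lone insertion row normalizes to "<eps> B -log(4/4)" = 0:

  count_to_logprob.pl --cutoff 1 counts.txt confusion.txt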
@ -0,0 +1,180 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University
# Apache 2.0.

stage=0

[ -f ./path.sh ] && . ./path.sh
[ -f ./cmd.sh ] && . ./cmd.sh
[ -f /export/babel/data/software/env.sh ] && . /export/babel/data/software/env.sh

. utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "Usage: create_shadow_dataset.sh <dest-data-dir> <src-data-dir1> <src-data-dir2>"
  exit 1
fi

dest=$1
src1=$2
src2=$3

mkdir -p $dest/kws

if [ $stage -le 0 ] ; then
  utils/combine_data.sh $dest $src1 $src2 || exit 1
fi

if [ $stage -le 1 ] ; then
  # Combine the ECF files.
  echo "Combining ECF files..."
  perl -e '
    #binmode STDIN, ":utf8";
    binmode STDOUT, ":utf8";

    use XML::Simple;
    use Data::Dumper;

    use strict;
    use warnings;

    my $src1 = XMLin($ARGV[0]);
    my $src2 = XMLin($ARGV[1]);
    my $tgt={};
    my %filename_hash;

    my $expected_duration=0.0;
    my $duration=0.0;

    if ( $src1->{language} ne $src2->{language} ) {
      die "ECF languages differ in the source ecf.xml files"
    }
    $expected_duration=$src1->{source_signal_duration} + $src2->{source_signal_duration};

    $tgt->{source_signal_duration} = $expected_duration;
    $tgt->{language}=$src1->{language};
    $tgt->{version}="Generated automatically by the shadow_set.sh script";
    $tgt->{excerpt}= [];

    #print Dumper(\$src1);
    foreach my $excerpt ( @{$src1->{excerpt}} ) {
      push @{$tgt->{excerpt}}, $excerpt;
      if ( exists $filename_hash{$excerpt->{audio_filename}} ) {
        print STDERR "[WARN]: Duplicate filename $excerpt->{audio_filename} \n"
      } else {
        $duration += $excerpt->{dur} ;
        $filename_hash{$excerpt->{audio_filename}} = $excerpt;
      }
    }
    foreach my $excerpt ( @{$src2->{excerpt}} ) {
      push @{$tgt->{excerpt}}, $excerpt;
      if ( exists $filename_hash{$excerpt->{audio_filename}} ) {
        print STDERR "[WARN]: Duplicate filename $excerpt->{audio_filename} \n"
      } else {
        $duration += $excerpt->{dur} ;
        $filename_hash{$excerpt->{audio_filename}} = $excerpt;
      }
    }
    $tgt->{source_signal_duration} = $duration;

    my $tgtxml = XMLout($tgt, RootName=>"ecf");
    print $tgtxml;
  ' $src1/kws/ecf.xml $src2/kws/ecf.xml > $dest/kws/ecf.xml
fi

if [ $stage -le 2 ] ; then
  # Combine the kwlist files.
  echo "Combining the KWLIST files"
  perl -e '
    #binmode STDIN, ":utf8";
    binmode STDOUT, ":utf8";

    use XML::Simple;
    use Data::Dumper;

    use strict;
    use warnings;

    my $src1 = XMLin($ARGV[0], ForceArray => 1);
    my $src2 = XMLin($ARGV[1], ForceArray => 1);
    my $tgt={};
    my %kwid_hash;

    if ( $src1->{compareNormalize} ne $src2->{compareNormalize} ) {
      die "KWLIST compareNormalize attributes differ in the source kwlist.xml files";
    }
    if ( $src1->{language} ne $src2->{language} ) {
      die "KWLIST languages differ in the source kwlist.xml files";
    }

    $tgt->{ecf_filename} = "";
    $tgt->{language}=$src1->{language};
    $tgt->{compareNormalize}=$src1->{compareNormalize};
    $tgt->{encoding}=$src1->{encoding};
    $tgt->{version}="1";
    $tgt->{kw}= [];

    foreach my $kw ( @{$src1->{kw}} ) {
      $kw->{kwid} = $kw->{kwid} . "-A";
      if ( exists $kwid_hash{$kw->{kwid}} ) {
        print STDERR "[WARN]: Duplicate kwid $kw->{kwid}\n";
      } else {
        $kwid_hash{$kw->{kwid}} = $kw;
      }
      push @{$tgt->{kw}}, $kw;
    }
    foreach my $kw ( @{$src2->{kw}} ) {
      $kw->{kwid} = $kw->{kwid} . "-B";
      if ( exists $kwid_hash{$kw->{kwid}} ) {
        print STDERR "[WARN]: Duplicate kwid $kw->{kwid}\n";
      } else {
        $kwid_hash{$kw->{kwid}} = $kw;
      }
      push @{$tgt->{kw}}, $kw;
    }

    my $tgtxml = XMLout($tgt, RootName=>"kwlist", KeyAttr=>"");
    print $tgtxml;
  ' $src1/kws/kwlist.xml $src2/kws/kwlist.xml > $dest/kws/kwlist.xml || exit 1
fi

if [ $stage -le 3 ] ; then
  echo "Making KWLIST maps"
  perl -e '
    #binmode STDIN, ":utf8";
    binmode STDOUT, ":utf8";

    use XML::Simple;
    use Data::Dumper;

    use strict;
    use warnings;

    my $src1 = XMLin($ARGV[0], ForceArray => 1);
    open TGT_DEV, ">", $ARGV[1] or die $!;
    open TGT_TST, ">", $ARGV[2] or die $!;

    foreach my $kw ( @{$src1->{kw}} ) {
      if ( $kw->{kwid} =~ "KW.+-A\$" ) {
        my $new_kw = $kw->{kwid};
        my $old_kw = substr $new_kw, 0, -2;
        print TGT_DEV "$old_kw\t$new_kw\n";
      } elsif ( $kw->{kwid} =~ "KW.+-B\$" ) {
        my $new_kw = $kw->{kwid};
        my $old_kw = substr $new_kw, 0, -2;
        print TGT_TST "$old_kw\t$new_kw\n";
      } else {
        die "Unsupported or unknown KW ID: $kw->{kwid}\n";
      }
    }
  ' $dest/kws/kwlist.xml $dest/kws/kws_map.dev.txt $dest/kws/kws_map.test.txt || exit 1
fi

# The RTTM file is not necessary.

utils/fix_data_dir.sh $dest

exit 0
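A hedged invocation sketch (the directory names are the conventional BABEL ones, assumed rather than taken from this hunk; both source dirs must already contain kws/ecf.xml and kws/kwlist.xml):

  # Build a combined "shadow" set out of the dev and eval directories:
  local/create_shadow_dataset.sh data/shadow.uem data/dev10h data/eval.uem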
@ -0,0 +1,54 @@
#!/usr/bin/perl

# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12

# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1

# and as a command-line argument it takes the name of the WSJ disk location, e.g.
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.

$wsj_dir = $ARGV[0];

while(<STDIN>){
  if(m/^;/){ next; } # Comment. Ignore it.
  else {
    m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
    $filename = $2; # as a subdirectory of the distributed disk.
    if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }
    $filename = "$wsj_dir/$filename";
    if (-e $filename) {
      print "$filename\n";
    } else {
      print STDERR "File $filename found in the index but not on disk\n";
    }
  }
}
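A hedged usage sketch (the corpus path is the Edinburgh example quoted in the comments above):

  # Turn an .ndx index into a list of absolute .wv1 paths:
  cat tr_s_wv1.ndx | local/cstr_ndx2flist.pl /group/corpora/public/wsjcam0/data | head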
@ -0,0 +1,187 @@
#!/bin/bash
set -e

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12

if [ $# -ne 1 ]; then
  printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
  echo "The argument should be the top-level WSJ corpus directory."
  echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
  echo "within the top-level corpus directory."
  exit 1;
fi

CORPUS=$1

dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
  exit 1;
fi

cd $dir

# This version for SI-84
cat $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \
  | $local/cstr_ndx2flist.pl $CORPUS | sort \
  | grep -v wsj0/si_tr_s/401 > train_si84.flist

# This version for SI-284
cat $CORPUS/wsj1/doc/indices/si_tr_s.ndx \
  $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \
  | $local/cstr_ndx2flist.pl $CORPUS | sort \
  | grep -v wsj0/si_tr_s/401 > train_si284.flist

# Now for the test sets.
# $CORPUS/wsj1/doc/indices/readme.doc describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.

# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1, which is done in cstr_ndx2flist.pl
cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92.flist

# Nov'92 (330 utts, 5k vocab)
cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92_5k.flist

# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat $CORPUS/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93.flist

# Nov'93: (215 utts, 5k)
cat $CORPUS/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93_5k.flist

# Dev-set for Nov'93 (503 utts)
cat $CORPUS/wsj1/doc/indices/h1_p0.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93.flist

# Dev-set for Nov'93 (513 utts, 5k vocab)
cat $CORPUS/wsj1/doc/indices/h2_p0.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93_5k.flist

# Dev-set Hub 1,2 (503, 913 utterances)
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CDs with upcasing, don't know
# why (could be older versions of the disks).
find $CORPUS/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
find $CORPUS/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist

# Finding the transcript files:
find -L $CORPUS -iname '*.dot' > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
  $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
  cat ${x}_sph.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > $x.trans1
done

# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
    | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (The wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
  awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \
    > ${x}_wav.scp
done

# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
  cat ${x}_sph.scp | awk '{print $1}' \
    | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
  cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done

# In case we want to limit LMs to the most frequent words, copy the LM training word frequency list.
cp $CORPUS/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.

# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
cp $CORPUS/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz

# trigram would be:
cat $CORPUS/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
  perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' \
  | gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;

prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;

# repeat for 5k language models
cp $CORPUS/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz

# trigram would be: !only closed vocabulary here!
cp $CORPUS/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa

prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;

if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
  rm -f wsj0-train-spkrinfo.txt
  wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
    || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
         wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi

if [ ! -f wsj0-train-spkrinfo.txt ]; then
  echo "Could not get the spkrinfo.txt file from the LDC website (moved)?"
  echo "This is possibly omitted from the training disks; couldn't find it."
  echo "Everything else may have worked; we just may be missing gender info"
  echo "which is only needed for VTLN-related diagnostics anyway."
  exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.

cat $CORPUS/wsj0/doc/spkrinfo.txt \
  $CORPUS/wsj1/doc/evl_spok/spkrinfo.txt \
  $CORPUS/wsj1/doc/dev_spok/spkrinfo.txt \
  $CORPUS/wsj1/doc/train/spkrinfo.txt \
  ./wsj0-train-spkrinfo.txt | \
  perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
  awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender

echo "Data preparation succeeded"
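A quick sanity check one might run afterwards (a sketch; the expected sizes are the utterance counts quoted in the comments above, e.g. 333 for test_eval92 and 503 for test_dev93):

  wc -l data/local/data/*.flist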
@ -0,0 +1,172 @@
#!/bin/bash

# This script builds a larger word-list and dictionary
# than used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to fill in words that are in
# the LM training data but not in CMUdict. One is
# to generate special prons for possible acronyms, that
# just consist of the constituent letters. The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/

if [ $# -ne 1 ]; then
  echo "Usage: local/cstr_wsj_train_lms.sh WSJ1_doc_dir"
  exit 1
fi

export PATH=$PATH:`pwd`/local/dict/
srcdir=$1

if [ ! -d $srcdir/lng_modl ]; then
  echo "Expecting 'lng_modl' under the WSJ doc directory '$srcdir'"
  exit 1
fi

mkdir -p data/local/dict_larger
dir=data/local/dict_larger
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
  # are there; we just want to copy them as the phoneset is the same.
rm data/local/dict_larger/lexicon.txt # we don't want this.
mincount=2 # Minimum count of an OOV we will try to generate a pron for.

[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;

# Remove comments from cmudict; print the first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
  perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu

cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu

echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"

# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n". Note: we found this by looking at
# oov.counts below (before adding this rule).

touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
  echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
  gunzip -c $srcdir/lng_modl/lm_train/np_data/{87,88,89}/*.z \
    | awk '/^</{next}{print toupper($0)}' | perl -e '
    open(F, "<$ARGV[0]")||die;
    while(<F>){ chop; $isword{$_} = 1; }
    while(<STDIN>) {
      @A = split(" ", $_);
      for ($n = 0; $n < @A; $n++) {
        $a = $A[$n];
        if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
          # and have no other "." in them: treat as period.
          print "$a";
          if ($n+1 < @A) { print "\n"; }
        } else { print "$a "; }
      }
      print "\n";
    }
  ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
fi

# Get unigram counts
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
  awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams

cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
  'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
  > $dir/oov.counts

echo "Most frequent unseen unigrams are: "
head $dir/oov.counts

# Prune away singleton counts, and remove things with numbers in
# (which should have been normalized) and with no letters at all.
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
  | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist

# Automatic rule-finding...

# First make some prons for possible acronyms.
# Note: we don't do this for things like U.K or U.N,
# or A.B. (which doesn't exist anyway),
# as we consider these normalization/spelling errors.
cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms

mkdir $dir/f $dir/b # forward, backward directions of rules...
# forward is normal suffix rules, backward is reversed (prefix rules). These
# dirs contain stuff we create while making the rule-based
# extensions to the dictionary.

# Remove ; and , from words, if they are present; these
# might crash our scripts, as they are used as separators there.
filter_dict.pl $dir/dict.cmu > $dir/f/dict
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
reverse_dict.pl $dir/f/dict > $dir/b/dict
reverse_dict.pl $dir/f/oovs > $dir/b/oovs

# The next stage takes a few minutes.
# Note: the forward stage takes longer, as English is
# mostly a suffix-based language, and there are more rules
# that it finds.
for d in $dir/f $dir/b; do
  (
    cd $d
    cat dict | get_rules.pl 2>get_rules.log >rules
    get_rule_hierarchy.pl rules >hierarchy
    awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
      limit_candidate_prons.pl hierarchy | \
      score_prons.pl dict | \
      count_rules.pl >rule.counts
    # the sort command below is just for convenience of reading.
    score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
    get_candidate_prons.pl rules.with_scores dict oovs | \
      limit_candidate_prons.pl hierarchy > oovs.candidates
  ) &
done
wait

# Merge the candidates.
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
  > $dir/dict.oovs

cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged

awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled

# add_counts.pl attaches the original counts to the lists of handled/not-handled OOVs.
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts

echo "**Top OOVs we handled are:**";
head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
head $dir/oovlist.not_handled.counts

echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
  `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.

cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF

echo "Created $dir/lexicon.txt"
@ -0,0 +1,32 @@
#!/bin/bash

. ./cmd.sh

TYPE=$1
LANGDIR=$2
MODELDIR=$3
DEVDIR=$4
TRANSFORMDIR=$5

echo "$@"

if [ "$TYPE" == "SI" ]; then
  utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1
  steps/decode.sh --nj 20 --cmd "$decode_cmd" \
    $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1
elif [ "$TYPE" == "FMLLR" ]; then
  utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1
  steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
    $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1
elif [ "$TYPE" == "SGMM" ]; then
  utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1

  steps/decode_sgmm.sh --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR \
    $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1;

  steps/decode_sgmm.sh --use-fmllr true --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR \
    $MODELDIR/graph $DEVDIR $MODELDIR/decode_fmllr || exit 1;
fi
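A hedged invocation sketch (this hunk does not show the wrapper's file name, so local/run_decode.sh below is a placeholder; the exp/ and data/ paths follow the recipe's usual layout):

  # SGMM decode on the dev set, reusing fMLLR transforms from an earlier decode:
  local/run_decode.sh SGMM data/lang exp/sgmm5 data/dev exp/tri5/decode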
@ -0,0 +1,31 @@
#!/usr/bin/perl

# Add counts to an oovlist.
# Reads in counts as output by uniq -c, and
# an oovlist, and prints out the counts of the oovlist.

(@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n";

$counts = shift @ARGV;

open(C, "<$counts") || die "Opening counts file $counts";

while(<C>) {
  @A = split(" ", $_);
  @A == 2 || die "Bad line in counts file: $_";
  ($count, $word) = @A;
  $count =~ m:^\d+$: || die "Bad count $A[0]\n";
  $counts{$word} = $count;
}

while(<>) {
  chop;
  $w = $_;
  $w =~ m:\S+: || die "Bad word $w";
  defined $counts{$w} || die "Word $w not present in counts file";
  print "\t$counts{$w}\t$w\n";
}
@ -0,0 +1,44 @@
#!/usr/bin/perl

# This program takes the output of score_prons.pl and collates
# it for each (rule, destress) pair so that we get the
# counts of right/partial/wrong for each pair.

# The input is a 7-tuple on each line, like:
# word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong
#
# The output format is a 5-tuple like:
#
# rule;destress;right-count;partial-count;wrong-count
#

if (@ARGV != 0 && @ARGV != 1) {
  die "Usage: count_rules.pl < scored_candidate_prons > rule_counts";
}

while(<>) {
  chop;
  $line = $_;
  my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line);

  my $key = $rulename . ";" . $destress;

  if (!defined $counts{$key}) {
    $counts{$key} = [ 0, 0, 0 ]; # new anonymous array.
  }
  $ref = $counts{$key};
  if ($score eq "right") {
    $$ref[0]++;
  } elsif ($score eq "partial") {
    $$ref[1]++;
  } elsif ($score eq "wrong") {
    $$ref[2]++;
  } else {
    die "Bad score $score\n";
  }
}

while ( my ($key, $value) = each(%counts)) {
  print $key . ";" . join(";", @$value) . "\n";
}
@ -0,0 +1,19 @@
#!/usr/bin/perl

# This program reads and writes either a dictionary or just a list
# of words, and it removes any words containing ";" or ",", as these
# characters are used as separators in these programs. It warns about
# such words, and it will die if the pronunciations have these symbols in them.
while(<>) {
  chop;
  @A = split(" ", $_);
  $word = shift @A;

  if ($word =~ m:[;,]:) {
    print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n";
  } else {
    $_ =~ m:[;,]: && die "Phones cannot have ; or , in them.";
    print $_ . "\n";
  }
}
@ -0,0 +1,95 @@
#!/usr/bin/perl

# Reads a dictionary, and prints out a list of words that seem to be pronounced
# as acronyms (not including plurals of acronyms, just acronyms).  Uses
# the prons of the individual letters (A., B. and so on) to judge this.
# Note: this is somewhat dependent on the convention used in CMUdict, that
# the individual letters are spelled this way (e.g. "A.").

$max_length = 6; # Max length of words that might be acronyms.

while(<>) { # Read the dict.
  chop;
  @A = split(" ", $_);
  $word = shift @A;
  $pron = join(" ", @A);
  if ($word =~ m/^([A-Z])\.$/ ) {
    chop $word; # Remove trailing "." to get just the letter.
    $letter = $1;
    if (!defined $letter_prons{$letter} ) {
      $letter_prons{$letter} = [ ]; # new anonymous array
    }
    $arrayref = $letter_prons{$letter};
    push @$arrayref, $pron;
  } elsif( length($word) <= $max_length ) {
    $pronof{$word . "," . $pron} = 1;
    $isword{$word} = 1;
    #if (!defined $prons{$word} ) {
    #  $prons{$word} = [ ];
    #}
    # push @{$prons{$word}}, $pron;
  }
}

sub get_letter_prons;

foreach $word (keys %isword) {
  my @letter_prons = get_letter_prons($word);
  foreach $pron (@letter_prons) {
    if (defined $pronof{$word.",".$pron}) {
      print "$word $pron\n";
    }
  }
}


sub get_letter_prons {
  @acronym = split("", shift); # The letters in the word.
  my @prons = ( "" );

  while (@acronym > 0) {
    $l = shift @acronym;
    $n = 1; # num-repeats of letter $l.
    while (@acronym > 0 && $acronym[0] eq $l) {
      $n++;
      shift @acronym;
    }
    my $arrayref = $letter_prons{$l};
    my @prons_of_block = ();
    if ($n == 1) { # Just one repeat.
      foreach $lpron ( @$arrayref ) {
        push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
      }
    } elsif ($n == 2) { # Two repeats.  Can be "double a" or "a a".
      foreach $lpron ( @$arrayref ) {
        push @prons_of_block, "D AH1 B AH0 L " . $lpron;
        push @prons_of_block, $lpron . " " . $lpron; # note: the space keeps the
        # two letter-prons from running together into one unmatched phone.
      }
    } elsif ($n == 3) { # Can be "triple a" or "a a a".
      foreach $lpron ( @$arrayref ) {
        push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
        push @prons_of_block, "$lpron $lpron $lpron";
      }
    } elsif ($n >= 4) { # Let's say it can only be that letter repeated $n times...
      # not sure really.
      foreach $lpron ( @$arrayref ) {
        $nlpron = $lpron;
        for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; }
        push @prons_of_block, $nlpron;
      }
    }
    my @new_prons = ();
    foreach $pron (@prons) {
      foreach $pron_of_block(@prons_of_block) {
        if ($pron eq "") {
          push @new_prons, $pron_of_block;
        } else {
          push @new_prons, $pron . " " . $pron_of_block;
        }
      }
    }
    @prons = @new_prons;
  }
  return @prons;
}
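To make the block expansion concrete (a hypothetical worked example; "EY1" is the usual CMUdict-style pron of the letter A): with a doubled letter, get_letter_prons returns both the "double" reading and the letter-repeated reading.

# With $letter_prons{"A"} = [ "EY1" ], get_letter_prons("AA") returns:
#   D AH1 B AH0 L EY1    ("double a")
#   EY1 EY1              ("a a")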
@ -0,0 +1,123 @@
#!/usr/bin/perl

# Reads a dictionary (for prons of letters), and an OOV list,
# and puts out candidate pronunciations of words in that list
# that could plausibly be acronyms.
# We judge that a word can plausibly be an acronym if it is
# a sequence of just letters (no non-letter characters such
# as "'"), or something like U.K.,
# and the number of letters is four or less.
#
# If the text were not already pre-normalized, there would
# be other hints such as capitalization.

# This program appends
# the prons of the individual letters (A., B. and so on) to work out
# the pron of the acronym.
# Note: this is somewhat dependent on the convention used in CMUdict, that
# the individual letters are spelled this way (e.g. "A.").  [It seems
# to also have the separated versions.]

if (!(@ARGV == 1 || @ARGV == 2)) {
  die "Usage: get_acronym_prons.pl dict [oovlist]";
}

$max_length = 4; # Max #letters in an acronym.  (Longer
# acronyms tend to have "pseudo-pronunciations", e.g. think about UNICEF.)

$dict = shift @ARGV;
open(D, "<$dict") || die "Opening dictionary $dict";

while(<D>) { # Read the dict, to get the prons of the letters.
  chop;
  @A = split(" ", $_);
  $word = shift @A;
  $pron = join(" ", @A);
  if ($word =~ m/^([A-Z])\.$/ ) {
    chop $word; # Remove trailing "." to get just the letter.
    $letter = $1;
    if (!defined $letter_prons{$letter} ) {
      $letter_prons{$letter} = [ ]; # new anonymous array
    }
    $arrayref = $letter_prons{$letter};
    push @$arrayref, $pron;
  } elsif( length($word) <= $max_length ) {
    $pronof{$word . "," . $pron} = 1;
    $isword{$word} = 1;
    #if (!defined $prons{$word} ) {
    #  $prons{$word} = [ ];
    #}
    # push @{$prons{$word}}, $pron;
  }
}

sub get_letter_prons;

while(<>) { # Read OOVs.
  # For now, just do the simple cases without "." in
  # between... things with "." in the OOV list seem to
  # be mostly errors.
  chop;
  $word = $_;
  if ($word =~ m/^[A-Z]{1,5}$/) {
    foreach $pron ( get_letter_prons($word) ) { # E.g. UNPO
      print "$word $pron\n";
    }
  } elsif ($word =~ m:^(\w\.){1,4}\w\.?$:) { # E.g. U.K.  Make the final "." optional.
    $letters = $word;
    $letters =~ s:\.::g;
    foreach $pron ( get_letter_prons($letters) ) {
      print "$word $pron\n";
    }
  }
}

sub get_letter_prons {
  @acronym = split("", shift); # The letters in the word.
  my @prons = ( "" );

  while (@acronym > 0) {
    $l = shift @acronym;
    $n = 1; # num-repeats of letter $l.
    while (@acronym > 0 && $acronym[0] eq $l) {
      $n++;
      shift @acronym;
    }
    my $arrayref = $letter_prons{$l};
    my @prons_of_block = ();
    if ($n == 1) { # Just one repeat.
      foreach $lpron ( @$arrayref ) {
        push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
      }
    } elsif ($n == 2) { # Two repeats.  Can be "double a" or "a a".
      foreach $lpron ( @$arrayref ) {
        push @prons_of_block, "D AH1 B AH0 L " . $lpron;
        push @prons_of_block, $lpron . " " . $lpron;
      }
    } elsif ($n == 3) { # Can be "triple a" or "a a a".
      foreach $lpron ( @$arrayref ) {
        push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
        push @prons_of_block, "$lpron $lpron $lpron";
      }
    } elsif ($n >= 4) { # Let's say it can only be that letter repeated $n times...
      # not sure really.
      foreach $lpron ( @$arrayref ) {
        $nlpron = $lpron;
        for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; }
        push @prons_of_block, $nlpron;
      }
    }
    my @new_prons = ();
    foreach $pron (@prons) {
      foreach $pron_of_block(@prons_of_block) {
        if ($pron eq "") {
          push @new_prons, $pron_of_block;
        } else {
          push @new_prons, $pron . " " . $pron_of_block;
        }
      }
    }
    @prons = @new_prons;
  }
  return @prons;
}
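A minimal usage sketch (the file names here are assumptions, not from the commit):

# dict.txt is a CMUdict-style dictionary; oovlist has one OOV word per line.
get_acronym_prons.pl dict.txt oovlist > acronym_candidate_prons
# e.g., if "U." has pron "Y UW1" and "K." has pron "K EY1", the OOV entry
# "U.K." comes out as:  U.K. Y UW1 K EY1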
@ -0,0 +1,187 @@
#!/usr/bin/perl

# This script takes three command-line arguments (typically files, or "-"):
# the suffix rules (as output by get_rules.pl, possibly with scores added),
# the base dictionary that we get the prons of base-words from, and the
# words that we want prons to be generated for (one per line).

# The output consists of candidate generated pronunciations for those words,
# together with information about how we generated those pronunciations.
# This does not do pruning of the candidates using the restriction
# "you can't use a more general rule when a more specific one is applicable".
# That is done by limit_candidate_prons.pl.

# Each line of the output is a tuple, separated by ";", of the form:
# word;pron;base-word;base-pron;rule-name;destress[;rule-score]
# [the last field is only present if you supplied rules with score information].
# where:
# - "word" is the input word that we queried for, e.g. WASTED
# - "pron" is the generated pronunciation, e.g. "W EY1 S T AH0 D"
# - "base-word" is the base-word we're getting the pron from,
#   e.g. WASTING
# - "base-pron" is the pron of the base-word, e.g. "W EY1 S T IH0 NG"
# - "rule-name" is a 4-tuple separated by commas that describes the rule, e.g.
#   "STED,STING,AH0 D,IH0 NG",
# - "destress" is either "yes" or "no" and corresponds to whether we destressed the
#   base-word or not [de-stressing just corresponds to taking any 2's down to 1's,
#   although we may extend this in future]...
# - "rule-score" is a numeric score of the rule (this field is only present
#   if there was score information in your rules).


(@ARGV == 2 || @ARGV == 3) || die "Usage: get_candidate_prons.pl rules base-dict [ words ]";

$min_prefix_len = 3; # this should probably match with get_rules.pl

$rules = shift @ARGV; # Note: rules may be with destress "yes/no" indicators or without...
# if without, it's treated as if both "yes" and "no" are present.
$dict = shift @ARGV;

open(R, "<$rules") || die "Opening rules file: $rules";

sub process_word;

while(<R>) {
  chop $_;
  my ($rule, $destress, $rule_score) = split(";", $_); # We may have "destress" markings (yes|no),
  # and scores, or we may have just the rule, in which case
  # $destress and $rule_score will be undefined.

  my @R = split(",", $rule, 4); # "my" means new instance of @R each
  # time we do this loop -> important because we'll be creating
  # a reference to @R below.
  # Note: the last arg to split tells it how many fields max to get.
  # This stops it from omitting empty trailing fields.
  @R == 4 || die "Bad rule $_";
  $suffix = $R[0]; # Suffix of word we want pron for.
  if (!defined $isrule{$rule}) {
    $isrule{$rule} = 1; # make sure we do this only once for each rule
    # (don't repeat it for different stresses).
    if (!defined $suffix2rule{$suffix}) {
      # The syntax [ $x, $y, ... ] means a reference to a newly created array
      # containing $x, $y, etc.  \@R creates an array reference to @R.
      # So suffix2rule is a hash from suffix to ref to array of refs to
      # 4-element arrays.
      $suffix2rule{$suffix} = [ \@R ];
    } else {
      # Below, the syntax @{$suffix2rule{$suffix}} dereferences the array
      # reference inside the hash; \@R pushes onto that array a new array
      # reference pointing to @R.
      push @{$suffix2rule{$suffix}}, \@R;
    }
  }
  if (!defined $rule_score) { $rule_score = -1; } # -1 means we don't have the score info.

  # Now store information on which destress markings (yes|no) this rule
  # is valid for, and the associated scores (if supplied).
  # If just the rule is given (i.e. no destress marking specified),
  # assume valid for both.
  if (!defined $destress) { # treat as if both "yes" and "no" are valid.
    $rule_and_destress_to_rule_score{$rule.";yes"} = $rule_score;
    $rule_and_destress_to_rule_score{$rule.";no"} = $rule_score;
  } else {
    $rule_and_destress_to_rule_score{$rule.";".$destress} = $rule_score;
  }
}

open(D, "<$dict") || die "Opening base dictionary: $dict";
while(<D>) {
  @A = split(" ", $_);
  $word = shift @A;
  $pron = join(" ", @A);
  if (!defined $word2prons{$word}) {
    $word2prons{$word} = [ $pron ]; # Ref to new anonymous array containing just "pron".
  } else {
    push @{$word2prons{$word}}, $pron; # Push $pron onto array referred to (@$ref derefs array).
  }
}
foreach $word (keys %word2prons) {
  # Set up the hash "prefixcount", which says how many times a char-sequence
  # is a prefix (not necessarily a strict prefix) of a word in the dict.
  $len = length($word);
  for ($l = 0; $l <= $len; $l++) {
    $prefixcount{substr($word, 0, $l)}++;
  }
}

while(<>) {
  chop;
  m/^\S+$/ || die;
  process_word($_);
}

sub process_word {
  my $word = shift @_;
  $len = length($word);
  # $owncount is used in evaluating whether a particular prefix is a prefix
  # of some other word in the dict... if a word itself may be in the dict
  # (usually because we're running this on the dict itself), we need to
  # correct for this.
  if (defined $word2prons{$word}) { $owncount = 1; } else { $owncount = 0; }

  for ($prefix_len = $min_prefix_len; $prefix_len <= $len; $prefix_len++) {
    my $prefix = substr($word, 0, $prefix_len);
    my $suffix = substr($word, $prefix_len);
    if ($prefixcount{$prefix} - $owncount == 0) {
      # This prefix is not a prefix of any word in the dict, so no point
      # checking the rules below-- none of them can match.
      next;
    }
    $rules_array_ref = $suffix2rule{$suffix};
    if (defined $rules_array_ref) {
      foreach $R (@$rules_array_ref) { # @$rules_array_ref dereferences the array.
        # $R is a reference to a 4-element array, whose elements we access with
        # $$R[0], etc.
        my $base_suffix = $$R[1];
        my $base_word = $prefix . $base_suffix;
        my $base_prons_ref = $word2prons{$base_word};
        if (defined $base_prons_ref) {
          my $psuffix = $$R[2];
          my $base_psuffix = $$R[3];
          if ($base_psuffix ne "") {
            $base_psuffix = " " . $base_psuffix;
            # Include " ", the space between phones, to prevent
            # matching partial phones below.
          }
          my $base_psuffix_len = length($base_psuffix);
          foreach $base_pron (@$base_prons_ref) { # @$base_prons_ref derefs
            # that reference to an array.
            my $base_pron_prefix_len = length($base_pron) - $base_psuffix_len;
            # Note: these lengths are in characters, not phones.
            if ($base_pron_prefix_len >= 0 &&
                substr($base_pron, $base_pron_prefix_len) eq $base_psuffix) {
              # The suffix of the base_pron is what it should be.
              my $pron_prefix = substr($base_pron, 0, $base_pron_prefix_len);
              my $rule = join(",", @$R); # we'll output this..
              for ($destress = 0; $destress <= 1; $destress++) { # Two versions
                # of each rule: with destressing and without.
                if ($destress) { $pron_prefix =~ s/2/1/g; }
                my $pron; # the generated pron.
                if ($psuffix ne "") { $pron = $pron_prefix . " " . $psuffix; }
                else { $pron = $pron_prefix; }
                # Now print out the info about the generated pron.
                my $destress_mark = ($destress ? "yes" : "no");
                my $rule_score = $rule_and_destress_to_rule_score{$rule.";".$destress_mark};
                if (defined $rule_score) { # Means that the (rule,destress) combination was
                  # seen [note: this if-statement may be pointless, as currently we don't
                  # do any pruning of rules].
                  my @output = ($word, $pron, $base_word, $base_pron, $rule, $destress_mark);
                  if ($rule_score != -1) { push @output, $rule_score; } # If scores were supplied,
                  # we also output the score info.
                  print join(";", @output) . "\n";
                }
              }
            }
          }
        }
      }
    }
  }
}
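A worked illustration of one rule application (hypothetical data, in the spirit of the WASTED/WASTING example in the header comment):

# Rule (from the rules file):   TED,TING,AH0 D,IH0 NG
# Base dictionary entry:        WASTING W EY1 S T IH0 NG
# Queried word:                 WASTED  (prefix WAS + suffix TED; the prefix
#                               length of 3 satisfies $min_prefix_len)
# The base-pron suffix "IH0 NG" is replaced by "AH0 D", producing:
#   WASTED;W EY1 S T AH0 D;WASTING;W EY1 S T IH0 NG;TED,TING,AH0 D,IH0 NG;no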
@ -0,0 +1,73 @@
#!/usr/bin/perl

# This reads in rules, of the form put out by get_rules.pl, e.g.:
# ERT,,ER0 T,
# MENT,ING,M AH0 N T,IH0 NG
# S,TON,Z,T AH0 N
# ,ER,IH0 NG,IH0 NG ER0
# ,'S,M AH0 N,M AH0 N Z
# TIONS,TIVE,SH AH0 N Z,T IH0 V

# and it works out a hierarchy that says which rules are sub-cases
# of which rules: it outputs on each line a pair separated by ";", where
# each member of the pair is a rule; the first is the specialization, and
# the second is the more general rule.
# E.g.:
# RED,RE,D,;ED,E,D,
# RED,RE,D,;D,,D,
# GING,GE,IH0 NG,;ING,E,IH0 NG,
# TOR,TING,T ER0,T IH0 NG;OR,ING,ER0,IH0 NG
# ERED,ER,D,;RED,R,D,
# ERED,ER,D,;ED,,D,


while(<>) {
  chop;
  $rule = $_;
  $isrule{$rule} = 1;
  push @rules, $rule;
}

foreach my $rule (@rules) {
  # Truncate the letters and phones in the rule, while we
  # can, to get more general rules; if the more general rule
  # exists, put out the pair.
  @A = split(",", $rule);
  @suffixa = split("", $A[0]);
  @suffixb = split("", $A[1]);
  @psuffixa = split(" ", $A[2]);
  @psuffixb = split(" ", $A[3]);
  for ($common_suffix_len = 0; $common_suffix_len < @suffixa && $common_suffix_len < @suffixb;) {
    if ($suffixa[$common_suffix_len] eq $suffixb[$common_suffix_len]) {
      $common_suffix_len++;
    } else {
      last;
    }
  }
  for ($common_psuffix_len = 0; $common_psuffix_len < @psuffixa && $common_psuffix_len < @psuffixb;) {
    if ($psuffixa[$common_psuffix_len] eq $psuffixb[$common_psuffix_len]) {
      $common_psuffix_len++;
    } else {
      last;
    }
  }
  # Get all combinations of pairs of integers <= (common_suffix_len, common_psuffix_len),
  # except (0,0), and print out this rule together with the corresponding more
  # general rule (if it exists).
  for ($m = 0; $m <= $common_suffix_len; $m++) {
    $sa = join("", @suffixa[$m..$#suffixa]); # @x[a..b] is array slice notation.
    $sb = join("", @suffixb[$m..$#suffixb]);
    for ($n = 0; $n <= $common_psuffix_len; $n++) {
      if (!($m == 0 && $n == 0)) {
        $psa = join(" ", @psuffixa[$n..$#psuffixa]);
        $psb = join(" ", @psuffixb[$n..$#psuffixb]);
        $more_general_rule = join(",", ($sa, $sb, $psa, $psb));
        if (defined $isrule{$more_general_rule}) {
          print $rule . ";" . $more_general_rule . "\n";
        }
      }
    }
  }
}
@ -0,0 +1,204 @@
#!/usr/bin/perl

# This program creates suggested suffix rules from a dictionary.
# It outputs quadruples of the form:
# suffix,base-suffix,psuffix,base-psuffix
# where "suffix" is the suffix of the letters of a word, "base-suffix" is
# the suffix of the letters of the base-word, "psuffix" is the suffix of the
# pronunciation of the word (a space-separated list of phonemes), and
# "base-psuffix" is the suffix of the pronunciation of the base-word.
# As far as this program is concerned, there is no distinction between
# "word" and "base-word".  To simplify things slightly, what it does
# is return all tuples (a,b,c,d) [with a != b] such that there are
# at least $min_suffix_count instances in the dictionary of
# a (word-prefix, pron-prefix) pair where there exist (word,pron)
# pairs of the form
# ( word-prefix . a, pron-prefix . c )
# and
# ( word-prefix . b, pron-prefix . d )
# For example, if (a,b,c,d) equals (USLY,US,S L IY0,S)
# then this quadruple will be output as long as there are at least
# e.g. 30 instances of prefixes like (FAMO, F EY1 M AH0)
# where there exist (word, pron) pairs like:
# FAMOUS, F EY1 M AH0 S
# FAMOUSLY F EY1 M AH0 S L IY0
#
# There are some modifications to the picture above, for efficiency.
# If $disallow_empty_suffix != 0, this program will not output 4-tuples where
# the first element (the own-word suffix) is empty, as this would cause
# efficiency problems in get_candidate_prons.pl.  If
# $ignore_prefix_stress != 0, this program will ignore stress markings
# while evaluating whether prefixes are the same.
# The minimum count for a quadruple to be output is $min_suffix_count
# (e.g. 30).
#
# The function of this program is not to evaluate the accuracy of these rules;
# it is mostly a pruning step, where we suggest rules that have large enough
# counts to be suitable for our later procedure where we evaluate their
# accuracy in predicting prons.

$disallow_empty_suffix = 1; # Disallow rules where the suffix of the "own-word" is
# empty.  This is for efficiency in later stages (e.g. get_candidate_prons.pl).
$min_prefix_len = 3; # this must match with get_candidate_prons.pl
$ignore_prefix_stress = 1; # or 0 to take account of stress in prefix.
$min_suffix_count = 20;

# Takes in the dictionary on the standard input.

print STDERR "Reading dict\n";
while(<>) {
  @A = split(" ", $_);
  my $word = shift @A;
  my $pron = join(" ", @A);
  if (!defined $prons{$word}) {
    $prons{$word} = $pron;
    push @words, $word;
  } else {
    $prons{$word} = $prons{$word} . ";" . $pron;
  }
}

# Get common suffixes (those with count >= $min_suffix_count).  Include the
# empty suffix.

print STDERR "Getting common suffix counts.\n";
{
  foreach $word (@words) {
    $len = length($word);
    for ($x = $min_prefix_len; $x <= $len; $x++) {
      $suffix_count{substr($word, $x)}++;
    }
  }

  foreach $suffix (keys %suffix_count) {
    if ($suffix_count{$suffix} >= $min_suffix_count) {
      $newsuffix_count{$suffix} = $suffix_count{$suffix};
    }
  }
  %suffix_count = %newsuffix_count;
  undef %newsuffix_count;

  foreach $suffix ( sort { $suffix_count{$b} <=> $suffix_count{$a} } keys %suffix_count ) {
    print STDERR "$suffix_count{$suffix} $suffix\n";
  }
}

print STDERR "Getting common suffix pairs.\n";

{
  print STDERR " Getting map from prefix -> suffix-set.\n";

  # Create map from prefix -> suffix-set.
  foreach $word (@words) {
    $len = length($word);
    for ($x = $min_prefix_len; $x <= $len; $x++) {
      $prefix = substr($word, 0, $x);
      $suffix = substr($word, $x);
      if (defined $suffix_count{$suffix}) { # Suffix is common...
        if (!defined $suffixes_of{$prefix}) {
          $suffixes_of{$prefix} = [ $suffix ]; # Create a reference to a new array with
          # one element.
        } else {
          push @{$suffixes_of{$prefix}}, $suffix; # Push $suffix onto the array that the
          # hash member is a reference to.
        }
      }
    }
  }
  my %suffix_set_count;
  print STDERR " Getting map from suffix-set -> count.\n";
  while ( my ($key, $value) = each(%suffixes_of) ) {
    my @suffixes = sort ( @$value );
    $suffix_set_count{join(";", @suffixes)}++;
  }
  print STDERR " Getting counts for suffix pairs.\n";
  while ( my ($suffix_set, $count) = each (%suffix_set_count) ) {
    my @suffixes = split(";", $suffix_set);
    # Consider pairs to be ordered.  This is more convenient
    # later on.
    foreach $suffix_a (@suffixes) {
      foreach $suffix_b (@suffixes) {
        if ($suffix_a ne $suffix_b) {
          $suffix_pair = $suffix_a . "," . $suffix_b;
          $suffix_pair_count{$suffix_pair} += $count;
        }
      }
    }
  }

  # To save memory, only keep pairs above threshold in the hash.
  while ( my ($suffix_pair, $count) = each (%suffix_pair_count) ) {
    if ($count >= $min_suffix_count) {
      $new_hash{$suffix_pair} = $count;
    }
  }
  %suffix_pair_count = %new_hash;
  undef %new_hash;

  # Print out the suffix pairs so the user can see.
  foreach $suffix_pair (
      sort { $suffix_pair_count{$b} <=> $suffix_pair_count{$a} } keys %suffix_pair_count ) {
    print STDERR "$suffix_pair_count{$suffix_pair} $suffix_pair\n";
  }
}

print STDERR "Getting common suffix/suffix/psuffix/psuffix quadruples\n";

{
  while ( my ($prefix, $suffixes_ref) = each(%suffixes_of) ) {
    # Note: $suffixes_ref is a reference to an array.  We dereference with
    # @$suffixes_ref.
    # Consider each pair of suffixes (in each order).
    foreach my $suffix_a ( @$suffixes_ref ) {
      foreach my $suffix_b ( @$suffixes_ref ) {
        # could have just used "defined" in the next line, but this is for clarity.
        $suffix_pair = $suffix_a.",".$suffix_b;
        if ( $suffix_pair_count{$suffix_pair} >= $min_suffix_count ) {
          foreach $pron_a_str (split(";", $prons{$prefix.$suffix_a})) {
            @pron_a = split(" ", $pron_a_str);
            foreach $pron_b_str (split(";", $prons{$prefix.$suffix_b})) {
              @pron_b = split(" ", $pron_b_str);
              $len_a = @pron_a; # evaluating an array as a scalar automatically gives its length.
              $len_b = @pron_b;
              for (my $pos = 0; $pos <= $len_a && $pos <= $len_b; $pos++) {
                # $pos is the starting position of the psuffix-pair.
                $psuffix_a = join(" ", @pron_a[$pos..$#pron_a]);
                $psuffix_b = join(" ", @pron_b[$pos..$#pron_b]);
                $quadruple = $suffix_pair . "," . $psuffix_a . "," . $psuffix_b;
                $quadruple_count{$quadruple}++;

                last if ($pos == $len_a || $pos == $len_b); # no phones left to compare.
                my $pron_a_pos = $pron_a[$pos];
                my $pron_b_pos = $pron_b[$pos];
                if ($ignore_prefix_stress) {
                  $pron_a_pos =~ s/\d//; # e.g. convert IH0 to IH.  Only affects
                  $pron_b_pos =~ s/\d//; # whether we exit the loop below.
                }
                if ($pron_a_pos ne $pron_b_pos) {
                  # This is important: we don't consider a pron suffix-pair to be
                  # valid unless the pron prefix is the same.
                  last;
                }
              }
            }
          }
        }
      }
    }
  }
  # To save memory, only keep quadruples above threshold in the hash.
  while ( my ($quadruple, $count) = each (%quadruple_count) ) {
    if ($count >= $min_suffix_count) {
      $new_hash{$quadruple} = $count;
    }
  }
  %quadruple_count = %new_hash;
  undef %new_hash;

  # Print out the quadruples for diagnostics.
  foreach $quadruple (
      sort { $quadruple_count{$b} <=> $quadruple_count{$a} } keys %quadruple_count ) {
    print STDERR "$quadruple_count{$quadruple} $quadruple\n";
  }
}
# Now print out the quadruples; these are the output of this program.
foreach $quadruple (keys %quadruple_count) {
  print $quadruple."\n";
}
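A minimal usage sketch (file names and the count shown are assumptions): the dictionary goes in on stdin, the quadruples come out on stdout, and the count diagnostics go to stderr.

get_rules.pl < dict.txt > rules 2> rule_counts.log
# a high-count diagnostic line in rule_counts.log might look like:
#   1024 USLY,US,S L IY0,S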
@ -0,0 +1,103 @@
#!/usr/bin/perl

# This program enforces the rule that
# if a "more specific" rule applies, we cannot use the more general rule.
# It takes in tuples generated by get_candidate_prons.pl (one per line, separated
# by ";"), of the form:
# word;pron;base-word;base-pron;rule-name;de-stress[;rule-score]
# [note: we mean that the last element, the numeric score of the rule, is optional]
# and it outputs a (generally shorter) list
# of the same form.

# For each word:
#   For each (base-word, base-pron):
#     Eliminate "more-general" rules as follows:
#       For each pair of rules applying to this (base-word, base-pron)
#       with the same destress marking:
#         If the pair is in the hierarchy, disallow the more general rule.
#     Print out the input lines whose rules survived this elimination.

if (@ARGV != 1 && @ARGV != 2) {
  die "Usage: limit_candidate_prons.pl rule_hierarchy [candidate_prons] > limited_candidate_prons";
}

$hierarchy = shift @ARGV;
open(H, "<$hierarchy") || die "Opening rule hierarchy $hierarchy";

while(<H>) {
  chop;
  m:.+;.+: || die "Bad rule-hierarchy line $_";
  $hierarchy{$_} = 1; # Format is: if $rule1 is the string form of the more specific rule
  # and $rule2 is the string form of the more general rule, then $hierarchy{$rule1.";".$rule2}
  # is defined, else undefined.
}


sub process_word;

undef $cur_word;
@cur_lines = ();

while(<>) {
  # input and output are:
  # word;pron;base-word;base-pron;rule-name;destress[;score]
  chop;
  m:^([^;]+);: || die "Unexpected input: $_";
  $word = $1;
  if (!defined $cur_word || $word eq $cur_word) {
    if (!defined $cur_word) { $cur_word = $word; }
    push @cur_lines, $_;
  } else {
    process_word(@cur_lines); # Process a series of suggested prons
    # for a particular word.
    $cur_word = $word;
    @cur_lines = ( $_ );
  }
}
process_word(@cur_lines);

sub process_word {
  my %pair2rule_list; # hash from $baseword.";".$basepron to ref
  # to array of [ line1, line2, ... ].
  my @cur_lines = @_;
  foreach my $line (@cur_lines) {
    my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
    my $key = $baseword.";".$basepron;
    if (defined $pair2rule_list{$key}) {
      push @{$pair2rule_list{$key}}, $line; # @{...} derefs the array pointed to
      # by the array ref inside {}.
    } else {
      $pair2rule_list{$key} = [ $line ]; # [ $x ] is a new anonymous array with 1 elem ($x).
    }
  }
  while ( my ($key, $value) = each(%pair2rule_list) ) {
    my @lines = @$value; # array of lines that are for this (baseword,basepron).
    my (@stress, @rules); # Arrays of stress markers and rule names, indexed by the
    # same index that indexes @lines.
    for (my $n = 0; $n < @lines; $n++) {
      my $line = $lines[$n];
      my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
      $stress[$n] = $destress;
      $rules[$n] = $rulename;
    }
    for (my $m = 0; $m < @lines; $m++) {
      my $ok = 1; # if this stays 1, the line is OK.
      for (my $n = 0; $n < @lines; $n++) {
        if ($m != $n && $stress[$m] eq $stress[$n]) {
          if (defined $hierarchy{$rules[$n].";".$rules[$m]}) {
            # Note: this "hierarchy" entry is defined if $rules[$n] is a more
            # specific instance of $rules[$m], thus invalidating $rules[$m].
            $ok = 0;
            last; # no point iterating further.
          }
        }
      }
      if ($ok != 0) {
        print $lines[$m] . "\n";
      }
    }
  }
}
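A hypothetical illustration of the pruning: suppose the hierarchy file contains the line "RED,RE,D,;ED,E,D," (i.e. "RED,RE,D," is a specialization of "ED,E,D,"), and the input has two candidates for the same word.

# Input:
#   SCORED;S K AO1 R D;SCORE;S K AO1 R;RED,RE,D,;no
#   SCORED;S K AO1 R D;SCORE;S K AO1 R;ED,E,D,;no
# Both lines share (base-word, base-pron) and the destress marking, so the
# more general rule is disallowed and only the first line is printed.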
@ -0,0 +1,50 @@
#!/usr/bin/perl

# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl,
# which is 7-tuples, one per line, of the form:
#
# word;pron;base-word;base-pron;rule-name;de-stress;rule-score
# (rule-score is sometimes described as optional, but this
# program does expect it, since we don't anticipate the script being used
# without it).
# This program assumes that all the words and prons and rules have
# come from a reversed dictionary (see reverse_dict.pl) where the order
# of the characters in the words, and the phones in the prons, have
# been reversed, and it un-reverses them.  That is, the characters
# in "word" and "base-word", and the phones in "pron" and "base-pron"
# are reversed; and the rule ("rule-name") is parsed as a 4-tuple,
# like:
# suffix,base-suffix,psuffix,base-psuffix
# so this program reverses the characters in "suffix" and "base-suffix"
# and the phones (separated by spaces) in "psuffix" and "base-psuffix".

sub reverse_str {
  my $str = shift;
  return join("", reverse(split("", $str)));
}
sub reverse_pron {
  my $str = shift;
  return join(" ", reverse(split(" ", $str)));
}

while(<>){
  chop;
  @A = split(";", $_);
  @A == 7 || die "Bad input line $_: found " . scalar(@A) . " fields, expected 7.";

  ($word,$pron,$baseword,$basepron,$rule,$destress,$score) = @A;
  $word = reverse_str($word);
  $pron = reverse_pron($pron);
  $baseword = reverse_str($baseword);
  $basepron = reverse_pron($basepron);
  @R = split(",", $rule, 4);
  @R == 4 || die "Bad rule $rule";

  $R[0] = reverse_str($R[0]); # suffix.
  $R[1] = reverse_str($R[1]); # base-suffix.
  $R[2] = reverse_pron($R[2]); # psuffix.
  $R[3] = reverse_pron($R[3]); # base-psuffix.
  $rule = join(",", @R);
  @A = ($word,$pron,$baseword,$basepron,$rule,$destress,$score);
  print join(";", @A) . "\n";
}
@ -0,0 +1,14 @@
#!/usr/bin/perl

# Used in conjunction with get_rules.pl.
# Example input line:  XANTHE Z AE1 N DH
# Example output line: EHTNAX DH N AE1 Z

while(<>){
  @A = split(" ", $_);
  $word = shift @A;
  $word = join("", reverse(split("", $word))); # Reverse the letters of the word.
  @A = reverse(@A); # Reverse the phones in the pron.
  unshift @A, $word;
  print join(" ", @A) . "\n";
}
@ -0,0 +1,50 @@
#!/usr/bin/perl

# This program takes candidate prons from "get_candidate_prons.pl" or
# "limit_candidate_prons.pl", and a reference dictionary covering those words,
# and outputs the same format but with scoring information added (so we go from
# 6 to 7 fields).  The scoring information says, for each generated pron,
# whether we have a match, a partial match, or no match, to some word in the
# dictionary.  A partial match means it's correct except for stress.

# The input is a 6-tuple on each line, like:
# word;pron;base-word;base-pron;rule-name;de-stress
#
# The output is the same except with one more field, the score,
# which may be "right", "wrong", or "partial".

if (@ARGV != 1 && @ARGV != 2) {
  die "Usage: score_prons.pl reference_dict [candidate_prons] > scored_candidate_prons";
}

$dict = shift @ARGV;
open(D, "<$dict") || die "Opening dictionary $dict";

while(<D>) { # Set up some hashes that tell us when
  # a (word,pron) pair is correct (and the same for
  # prons with stress information removed).
  chop;
  @A = split(" ", $_);
  $word = shift @A;
  $pron = join(" ", @A);
  $pron_nostress = $pron;
  $pron_nostress =~ s:\d::g;
  $word_and_pron{$word.";".$pron} = 1;
  $word_and_pron_nostress{$word.";".$pron_nostress} = 1;
}

while(<>) {
  chop;
  $line = $_;
  my ($word, $pron, $baseword, $basepron, $rulename, $destress) = split(";", $line);
  $pron_nostress = $pron;
  $pron_nostress =~ s:\d::g;
  if (defined $word_and_pron{$word.";".$pron}) {
    $score = "right";
  } elsif (defined $word_and_pron_nostress{$word.";".$pron_nostress}) {
    $score = "partial";
  } else {
    $score = "wrong";
  }
  print $line.";".$score."\n";
}
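A hypothetical illustration of the scoring: with the reference dict entry "WASTED W EY1 S T AH0 D", the candidate line below gains ";right"; a generated pron differing only in the stress digits would gain ";partial", and anything else ";wrong".

# Input:
#   WASTED;W EY1 S T AH0 D;WASTING;W EY1 S T IH0 NG;TED,TING,AH0 D,IH0 NG;no
# Output:
#   WASTED;W EY1 S T AH0 D;WASTING;W EY1 S T IH0 NG;TED,TING,AH0 D,IH0 NG;no;right
# A generated pron "W EY1 S T AH2 D" would instead score ";partial".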
@ -0,0 +1,52 @@
#!/usr/bin/perl

# This program takes the output of count_rules.pl, which is tuples
# of the form
#
# rule;destress;right-count;partial-count;wrong-count
#
# and outputs lines of the form
#
# rule;de-stress;score
#
# where the score, between 0 and 1 (1 is better), is equal to:
#
# ((#correct) + $partial_score * (#partial)) / (#correct + #partial + #wrong + $ballast)
#
# where $partial_score (e.g. 0.8) is the score we assign to a "partial" match,
# and $ballast is a small number, e.g. 1, that is treated like "extra" wrong scores,
# to penalize rules with few observations.
#
# It outputs a scored line for every (rule, destress) pair in the input.

$ballast = 1;
$partial_score = 0.8;
$destress_penalty = 1.0e-05; # Give destressed rules a small
# penalty vs. their no-destress counterparts, so if we
# have to choose arbitrarily we won't destress (seems safer).

for ($n = 1; $n <= 4; $n++) {
  if ($ARGV[0] eq "--ballast") {
    shift @ARGV;
    $ballast = shift @ARGV;
  }
  if ($ARGV[0] eq "--partial-score") {
    shift @ARGV;
    $partial_score = shift @ARGV;
    ($partial_score >= 0.0 && $partial_score <= 1.0) || die "Invalid partial_score: $partial_score";
  }
}

(@ARGV == 0 || @ARGV == 1) || die "Usage: score_rules.pl [--ballast ballast-count] [--partial-score partial-score] [input from count_rules.pl]";

while(<>) {
  @A = split(";", $_);
  @A == 5 || die "Bad input line: $_";
  ($rule,$destress,$right_count,$partial_count,$wrong_count) = @A;
  $rule_score = ($right_count + $partial_score*$partial_count) /
    ($right_count+$partial_count+$wrong_count+$ballast);
  if ($destress eq "yes") { $rule_score -= $destress_penalty; }
  print join(";", $rule, $destress, sprintf("%.5f", $rule_score)) . "\n";
}
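A worked example of the formula with the default $ballast = 1 and $partial_score = 0.8 (the counts are hypothetical):

# Input:
#   TED,TING,AH0 D,IH0 NG;no;10;5;3
# Score: (10 + 0.8*5) / (10 + 5 + 3 + 1) = 14/19 = 0.73684, so the output is:
#   TED,TING,AH0 D,IH0 NG;no;0.73684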
@ -0,0 +1,84 @@
#!/usr/bin/perl

# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl
# or reverse_candidates.pl, which is 7-tuples, one per line, of the form:
#
# word;pron;base-word;base-pron;rule-name;de-stress;rule-score
#
# and selects the most likely prons for the words based on the rule
# scores.  It outputs in the same format as the input (thus, it is
# similar to limit_candidate_prons.pl in its input and output format,
# except it has a different way of selecting the prons to put out).
#
# This script will select the $max_prons best pronunciations for
# each candidate word, subject to the constraint that no pron should
# have a rule score worse than $min_rule_score.
# It first merges the candidates: if there are multiple candidates
# generating the same pron, it selects the candidate that had the
# best associated score.  It then sorts the prons on score and
# selects the n best prons (but doesn't print out candidates with
# score beneath the threshold).


$max_prons = 4;
$min_rule_score = 0.35;


for ($n = 1; $n <= 3; $n++) {
  if ($ARGV[0] eq "--max-prons") {
    shift @ARGV;
    $max_prons = shift @ARGV;
  }
  if ($ARGV[0] eq "--min-rule-score") {
    shift @ARGV;
    $min_rule_score = shift @ARGV;
  }
}

if (@ARGV != 0 && @ARGV != 1) {
  die "Usage: select_candidate_prons.pl [candidate_prons] > selected_candidate_prons";
}

sub process_word;

undef $cur_word;
@cur_lines = ();

while(<>) {
  # input and output are:
  # word;pron;base-word;base-pron;rule-name;destress;score
  chop;
  m:^([^;]+);: || die "Unexpected input: $_";
  $word = $1;
  if (!defined $cur_word || $word eq $cur_word) {
    if (!defined $cur_word) { $cur_word = $word; }
    push @cur_lines, $_;
  } else {
    process_word(@cur_lines); # Process a series of suggested prons
    # for a particular word.
    $cur_word = $word;
    @cur_lines = ( $_ );
  }
}
process_word(@cur_lines);


sub process_word {
  my %pron2rule_score; # hash from generated pron to the best rule score for that pron.
  my %pron2line; # hash from generated pron to the best line for that pron.
  my @cur_lines = @_;
  foreach my $line (@cur_lines) {
    my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
    if (!defined $pron2rule_score{$pron} ||
        $rule_score > $pron2rule_score{$pron}) {
      $pron2rule_score{$pron} = $rule_score;
      $pron2line{$pron} = $line;
    }
  }
  my @prons = sort { $pron2rule_score{$b} <=> $pron2rule_score{$a} } keys %pron2rule_score;
  for (my $n = 0; $n < @prons && $n < $max_prons &&
       $pron2rule_score{$prons[$n]} >= $min_rule_score; $n++) {
    print $pron2line{$prons[$n]} . "\n";
  }
}
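A minimal usage sketch (the defaults are shown explicitly; the file names are assumptions):

select_candidate_prons.pl --max-prons 4 --min-rule-score 0.35 \
  scored_candidate_prons > selected_candidate_prons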
@ -0,0 +1,4 @@
KWSEval -e ecf.xml -r rttm -t keyword_outvocab.xml -s kwslist.xml -c -o -b -d -f ./kws/outvocab
KWSEval -e ecf.xml -r rttm -t keyword_invocab.xml -s kwslist.xml -c -o -b -d -f ./kws/invocab
KWSEval -e ecf.xml -r rttm -t kws.xml -s kwslist.xml -c -o -b -d -f ./kws/fullvocab
@ -0,0 +1,70 @@
#!/usr/bin/perl
# Copyright 2012  Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0.

# Based on sym2int.pl: this variant reads a symbol table and input
# transcriptions, and prints out (one per line) the symbols in the
# selected fields that are not present in the symbol table.

use Data::Dumper;
$Data::Dumper::Indent = 1;

binmode STDOUT, ":utf8";
binmode STDIN, ":utf8";

$ignore_oov = 0;
$ignore_first_field = 0;
for($x = 0; $x < 2; $x++) {
  if ($ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1; # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1; # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

$symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ] [-f <field-range> ]\n" .
    "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
  exit 1;
}


open(F, "<:encoding(UTF-8)", $symtab) || die "Error opening symbol table file $symtab";
while(<F>) {
  @A = split(" ", $_);
  @A == 2 || die "bad line in symbol table file: $_";

  if ( not defined( $sym2int{$A[0]} ) ) {
    $sym2int{$A[0]} = [];
  }
  push @{ $sym2int{$A[0]} }, $A[1] + 0;
}


$lines = 0;
while (<>) {
  @A = split(" ", $_);
  @B = ();
  for ($n = 0; $n < @A; $n++) {
    if ( (!defined $field_begin || $n >= $field_begin)
         && (!defined $field_end || $n <= $field_end)) {
      $a = $A[$n];
      $i = $sym2int{$a};
      if (!defined ($i)) {
        print $a . "\n"; # symbol not in the table: print it as an OOV.
      }
    }
  }
}
@ -0,0 +1,68 @@
#!/usr/bin/perl

use strict;
use warnings;
use Encode;

my $Usage = <<EOU;
Usage: filter_keywords.pl <dictin> <queryin|-> <queryout|->

EOU

if (@ARGV != 3) {
  die $Usage;
}

# Get parameters
my $dictin = shift @ARGV;
my $filein = shift @ARGV;
my $fileout = shift @ARGV;

# Open dictionary
if (!open(D, "<$dictin")) {print "Failed to open dictionary: $dictin\n"; exit 1;}

# Get input source
my $source = "";
if ($filein eq "-") {
  $source = "STDIN";
} else {
  if (!open(I, "<$filein")) {print "Failed to open input file: $filein\n"; exit 1;}
  $source = "I";
}

# Open output file
my $sourceout = "";
if ($fileout ne "-") {
  if (!open(O, ">$fileout")) {print "Failed to open output file: $fileout\n"; exit 1;}
  $sourceout = "O";
}

# Read in the dictionary; map each uppercased word to its original spelling.
my %dict = ();
while (<D>) {
  chomp;
  my @col = split(" ", $_);
  my $word = shift @col;
  my $original_w = $word;
  $word =~ tr/a-z/A-Z/;
  $dict{$word} = $original_w;
}

# Process the queries.  The eval is used so that the same "print" statement
# works whether $sourceout names the filehandle O or is empty (standard output).
my $word;
while (<$source>) {
  chomp;
  my @col = split(" ", $_);
  foreach $word (@col) {
    if (defined($dict{$word})) {
      eval "print $sourceout \"$dict{$word} \"";
    } else {
      eval "print $sourceout \"$word \"";
    }
  }
  eval "print $sourceout \"\n\"";
}

close(D);
if ($filein ne "-") {close(I);}
if ($fileout ne "-") {close(O);}
@ -0,0 +1,55 @@
#!/usr/bin/env perl

# Copyright 2012  Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;
use XML::Simple;

# Filters out duplicate hits: entries for the same keyword that occur in the
# same file and channel at about the same time (within $duptime seconds).
# Relies on the input being sorted from largest to smallest score, so the
# highest-scoring hit in each cluster is the one that is kept.

my $data = XMLin(\*STDIN);
my $duptime = $ARGV[0];
defined $duptime || die "Usage: $0 <duptime> < kwslist.xml > deduped_kwslist.xml";

#print Dumper($data);

foreach my $kwentry (@{$data->{detected_kwlist}}) {
  #print "$kwentry->{kwid}\n";

  if (ref($kwentry->{kw}) eq 'ARRAY') {
    my @arr = @{$kwentry->{kw}};
    my @newarray = ();

    push @newarray, $arr[0];
    #print $arr[0]->{tbeg} . "\n";
    for (my $i = 1; $i < scalar(@arr); $i += 1) {

      my $found = 0;
      foreach my $kw (@newarray) {
        if (( abs($arr[$i]->{tbeg} - $kw->{tbeg}) < $duptime ) &&
            ( $arr[$i]->{channel} == $kw->{channel}) &&
            ( $arr[$i]->{file} eq $kw->{file}) ) {

          $found = 1;

          #print $arr[$i]->{tbeg} . "\n";
        }
      }
      if ( $found == 0 ) {
        push @newarray, $arr[$i];
      }
    }

    $kwentry->{kw} = \@newarray;
  } else {
    #print $kwentry->{kw}->{tbeg} . "\n";
  }
  # print "$kwentry->{kwid}\t$kwentry->{kwtext}\n";
}
my $xml = XMLout($data, RootName => "kwslist", NoSort=>1);
print $xml;
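A hedged usage sketch (the script's installed filename is not shown in this hunk, so the name below is hypothetical, and 0.5 seconds is an arbitrary example window):

./filter_duplicate_kws.pl 0.5 < kwslist.xml > kwslist.dedup.xml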
@ -0,0 +1,64 @@
#!/usr/bin/perl
# Copyright 2010-2011  Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.



# This program takes on its standard input a list of utterance
# id's, one for each line (e.g. 4k0c030a is an utterance id).
# It takes as its argument a file list of .dot files, and extracts from
# those dot files the transcripts for the given
# dataset (represented by the file list).
#

@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
$dot_flist = shift @ARGV;

open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
while(<L>){
  chop;
  m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
  $spk = $1;
  $spk2dot{$spk} = $_;
}



$curspk = "";
while(<STDIN>){
  chop;
  $uttid = $_;
  $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
  $spk = $1;
  if($spk ne $curspk) {
    %utt2trans = (); # Don't keep all the transcripts in memory...
    $curspk = $spk;
    $dotfile = $spk2dot{$spk};
    defined $dotfile || die "No dot file for speaker $spk\n";
    open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
    while(<F>) {
      $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
      $trans = $1;
      $utt = $2;
      $utt2trans{$utt} = $trans;
    }
  }
  if(!defined $utt2trans{$uttid}) {
    print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
  } else {
    print "$uttid $utt2trans{$uttid}\n";
  }
}
@ -0,0 +1,89 @@
#!/usr/bin/perl

# Copyright 2012  Johns Hopkins University (Author: Guoguo Chen, Jan Trmal)
# Apache 2.0.
#

use strict;
use warnings;
use Getopt::Long;
use XML::Simple;
use Data::Dumper;
use File::Basename;

sub mysort {
  if ($a->{kwid} =~ m/[0-9]+$/ and $b->{kwid} =~ m/[0-9]+$/) {
    ($a->{kwid} =~ /([0-9]*)$/)[0] <=> ($b->{kwid} =~ /([0-9]*)$/)[0]
  } else {
    $a->{kwid} cmp $b->{kwid};
  }
}

my $Usage = <<EOU;
Usage: fix_kwslist.pl [options] <kwlist_in> <kwslist_in|-> <fixed_kwslist_out|->
 e.g.: fix_kwslist.pl --kwlist-filename=kwlist.xml kwlist.xml kwslist.xml fixed_kwslist.xml

Allowed options:
  --kwlist-filename : Kwlist filename with version info (string, default = "")

EOU

my $kwlist_filename = "";
GetOptions('kwlist-filename=s' => \$kwlist_filename);

if (@ARGV != 3) {
  die $Usage;
}

# Work out the input/output sources
my $kwlist_in = shift @ARGV;
my $kwslist_in = shift @ARGV;
my $fixed_kwslist_out = shift @ARGV;

my $KW = XMLin($kwlist_in);
my $KWS = XMLin($kwslist_in);

# Extract keywords from kwlist.xml
my %kwlist;
my $language = $KW->{language};
foreach my $kwentry (@{$KW->{kw}}) {
  $kwlist{$kwentry->{kwid}} = 1;
}

# Now work on the kwslist
$KWS->{language} = $language;
if ($kwlist_filename ne "") {
  $KWS->{kwlist_filename} = basename($kwlist_filename);
} elsif ($KWS->{kwlist_filename} eq "") {
  $KWS->{kwlist_filename} = basename($kwlist_in);
}
foreach my $kwentry (@{$KWS->{detected_kwlist}}) {
  if (defined($kwlist{$kwentry->{kwid}})) {
    delete $kwlist{$kwentry->{kwid}};
  }
}

# Add empty entries for keywords that had no detections...
foreach my $kw (keys %kwlist) {
  my %empty;
  $empty{search_time} = 1;
  $empty{kwid} = $kw;
  $empty{oov_count} = 0;
  push(@{$KWS->{detected_kwlist}}, \%empty);
}

my @sorted = sort mysort @{$KWS->{detected_kwlist}};
$KWS->{detected_kwlist} = \@sorted;

my $xml = XMLout($KWS, RootName => "kwslist", NoSort=>0);
if ($fixed_kwslist_out eq "-") {
  print $xml;
} else {
  if (!open(O, ">$fixed_kwslist_out")) {
    print "Failed to open output file: $fixed_kwslist_out\n";
    exit 1;
  }
  print O $xml;
  close(O);
}
@ -0,0 +1,31 @@
#!/usr/bin/perl
# Copyright 2010-2011  Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Takes in a file list with lines like
# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# and outputs an scp in kaldi format with lines like
# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# (the first field is the utterance-id, which is the same as the basename of the file.)


while(<>){
  m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_";
  $id = $1;
  $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames)
  print "$id $_";
}
@ -0,0 +1,110 @@
#!/bin/bash

# Copyright 2012  Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.


if [ $# -ne 2 ]; then
  echo "Usage: local/generate_example_kws.sh <data-dir> <kws-data-dir>"
  echo " e.g.: local/generate_example_kws.sh data/test_eval92/ data/kws"
  exit 1;
fi

datadir=$1;
kwsdatadir=$2;
text=$datadir/text;

mkdir -p $kwsdatadir;

# Generate keywords; we generate 20 unigram keywords with at least 20 counts,
# 20 bigram keywords with at least 4 counts, and 10 trigram keywords with at
# least 3 counts (each loop below starts at that minimum count and scans
# upwards until enough keywords have been found).
cat $text | perl -e '
  %unigram = ();
  %bigram = ();
  %trigram = ();
  while(<>) {
    chomp;
    @col=split(" ", $_);
    shift @col;
    for($i = 0; $i < @col; $i++) {
      # unigram case
      if (!defined($unigram{$col[$i]})) {
        $unigram{$col[$i]} = 0;
      }
      $unigram{$col[$i]}++;

      # bigram case
      if ($i < @col-1) {
        $word = $col[$i] . " " . $col[$i+1];
        if (!defined($bigram{$word})) {
          $bigram{$word} = 0;
        }
        $bigram{$word}++;
      }

      # trigram case
      if ($i < @col-2) {
        $word = $col[$i] . " " . $col[$i+1] . " " . $col[$i+2];
        if (!defined($trigram{$word})) {
          $trigram{$word} = 0;
        }
        $trigram{$word}++;
      }
    }
  }

  $max_count = 100;
  $total = 20;
  $current = 0;
  $min_count = 20;
  while ($current < $total && $min_count <= $max_count) {
    foreach $x (keys %unigram) {
      if ($unigram{$x} == $min_count) {
        print "$x\n";
        $unigram{$x} = 0;
        $current++;
      }
      if ($current == $total) {
        last;
      }
    }
    $min_count++;
  }

  $total = 20;
  $current = 0;
  $min_count = 4;
  while ($current < $total && $min_count <= $max_count) {
    foreach $x (keys %bigram) {
      if ($bigram{$x} == $min_count) {
        print "$x\n";
        $bigram{$x} = 0;
        $current++;
      }
      if ($current == $total) {
        last;
      }
    }
    $min_count++;
  }

  $total = 10;
  $current = 0;
  $min_count = 3;
  while ($current < $total && $min_count <= $max_count) {
    foreach $x (keys %trigram) {
      if ($trigram{$x} == $min_count) {
        print "$x\n";
        $trigram{$x} = 0;
        $current++;
      }
      if ($current == $total) {
        last;
      }
    }
    $min_count++;
  }
  ' > $kwsdatadir/raw_keywords.txt

echo "Keyword generation succeeded"
@ -0,0 +1,150 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
|
||||
# Apache 2.0.
|
||||
|
||||
nj=32
|
||||
cmd=run.pl
|
||||
beam=5 # Beam for proxy FST; usually used together with the nbest option
|
||||
nbest=100 # First n best proxy keywords
|
||||
phone_cutoff=5 # We don't generate proxy keywords for OOV keywords that have less phones
|
||||
# than the specified cutoff; they may introduce more false alarms
|
||||
count_cutoff=1 # Cutoff for the phone confusion pair counts
|
||||
confusion_matrix=
|
||||
|
||||
[ -f ./path.sh ] && . ./path.sh; # source the path.
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
if [ $# -ne 4 ]; then
|
||||
echo "Generate proxy keywords for OOV keywords. You may apply the confusion matrix. If you"
|
||||
echo "are going to use the confusion matrix, please use the following format for the file"
|
||||
echo "\$confusion_matrix:"
|
||||
echo " p1 p2 count1 // For substitution"
|
||||
echo " p3 <eps> count2 // For deletion"
|
||||
echo " <eps> p4 count3 // For insertion"
|
||||
echo ""
|
||||
echo "Usage: local/generate_example_kws.sh <kws-data-dir> <oov-lexicon>"
|
||||
echo " <lexicon> <symbol-table>"
|
||||
echo " e.g.: local/generate_example_kws.sh data/kws oov_lexicon.txt"
|
||||
echo " data/local/lexicon.txt data/lang/words.txt"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
# Parameters
|
||||
kwsdatadir=$1
|
||||
oov_lexicon=$2
|
||||
original_lexicon=$3
|
||||
original_symtab=$4
|
||||
|
||||
mkdir -p $kwsdatadir/tmp
|
||||
|
||||
# You may modify the lexicon here; For example, I removed the stress marks for the
|
||||
# Tagalog lexicon
|
||||
cat $oov_lexicon |\
|
||||
sed 's/_[%|"]//g' | awk '{if(NF>=2) {print $0}}' > $kwsdatadir/tmp/oov.lex
|
||||
cat $original_lexicon |\
|
||||
sed 's/_[%|"]//g' | awk '{if(NF>=2) {print $0}}' > $kwsdatadir/tmp/original.lex
|
||||
|
||||
# Get OOV keywords, and remove the short OOV keywords. Generate proxy keywords based
|
||||
# on the phone confusion for the short OOV keywords may introduce a lot of false alarms,
|
||||
# therefore we provide the cutoff option.
|
||||
cat $kwsdatadir/kwlist_outvocab.xml | \
  grep -o -P "(?<=kwid=\").*(?=\")" |\
  paste - <(cat $kwsdatadir/kwlist_outvocab.xml | grep -o -P "(?<=<kwtext>).*(?=</kwtext>)") \
  > $kwsdatadir/tmp/oov_all.txt
cat $kwsdatadir/tmp/oov_all.txt | perl -e '
  open(W, "<'$kwsdatadir/tmp/oov.lex'") || die "Fail to open OOV lexicon: '$kwsdatadir/tmp/oov.lex'\n";
  my %lexicon;
  while (<W>) {
    chomp;
    my @col = split();
    @col >= 2 || die "Bad line in lexicon: $_\n";
    $lexicon{$col[0]} = scalar(@col)-1;
  }
  while (<>) {
    chomp;
    my $line = $_;
    my @col = split();
    @col >= 2 || die "Bad line in keywords file: $_\n";
    my $len = 0;
    for (my $i = 1; $i < scalar(@col); $i ++) {
      if (defined($lexicon{$col[$i]})) {
        $len += $lexicon{$col[$i]};
      } else {
        print STDERR "No pronunciation found for word: $col[$i]\n";
      }
    }
    if ($len >= '$phone_cutoff') {
      print "$line\n";
    }
  }' > $kwsdatadir/tmp/oov.txt

# Get phone symbols
cat $kwsdatadir/tmp/oov.lex $kwsdatadir/tmp/original.lex |\
  awk '{for(i=2; i <= NF; i++) {print $i;}}' | sort -u |\
  sed '1i\<eps>' | awk 'BEGIN{x=0} {print $0"\t"x; x++;}' > $kwsdatadir/tmp/phones.txt
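
# The result is an OpenFst-style symbol table: "<eps>" gets id 0 and each phone a
# consecutive integer, e.g. (with made-up phone names):
#   <eps>  0
#   a      1
#   b      2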

# Get word symbols; we append new words to the original word symbol table.
max_id=`cat $original_symtab | awk '{print $2}' | sort -n | tail -1`;
cat $kwsdatadir/tmp/oov.txt |\
  awk '{for(i=2; i <= NF; i++) {print $i;}}' |\
  cat - <(cat $kwsdatadir/tmp/oov.lex | awk '{print $1;}') |\
  cat - <(cat $kwsdatadir/tmp/original.lex | awk '{print $1}') | sort -u |\
  grep -F -v -x -f <(cat $original_symtab | awk '{print $1;}') |\
  awk 'BEGIN{x='$max_id'+1}{print $0"\t"x; x++;}' |\
  cat $original_symtab - > $kwsdatadir/tmp/words.txt

# Compile the lexicons into FSTs
cat $kwsdatadir/tmp/oov.lex | utils/make_lexicon_fst.pl - |\
  fstcompile --isymbols=$kwsdatadir/tmp/phones.txt --osymbols=$kwsdatadir/tmp/words.txt - |\
  fstinvert | fstarcsort --sort_type=olabel > $kwsdatadir/tmp/oov_lexicon.fst
cat $kwsdatadir/tmp/original.lex | utils/make_lexicon_fst.pl - |\
  fstcompile --isymbols=$kwsdatadir/tmp/phones.txt --osymbols=$kwsdatadir/tmp/words.txt - |\
  fstarcsort --sort_type=ilabel > $kwsdatadir/tmp/original_lexicon.fst

# Compile E.fst (the edit-distance FST)
if [ -z $confusion_matrix ]; then
  cat $kwsdatadir/tmp/phones.txt |\
    grep -v -E "<.*>" | grep -v "SIL" | awk '{print $1;}' |\
    local/build_edit_distance_fst.pl --boundary-off=false - - |\
    fstcompile --isymbols=$kwsdatadir/tmp/phones.txt --osymbols=$kwsdatadir/tmp/phones.txt - $kwsdatadir/tmp/Edit.fst
else
  echo "$0: Using confusion matrix."
  local/count_to_logprob.pl --cutoff $count_cutoff $confusion_matrix $kwsdatadir/tmp/confusion.txt
  cat $kwsdatadir/tmp/phones.txt |\
    grep -v -E "<.*>" | grep -v "SIL" | awk '{print $1;}' |\
    local/build_edit_distance_fst.pl --boundary-off=false \
      --confusion-matrix=$kwsdatadir/tmp/confusion.txt - - |\
    fstcompile --isymbols=$kwsdatadir/tmp/phones.txt --osymbols=$kwsdatadir/tmp/phones.txt - $kwsdatadir/tmp/Edit.fst
fi

# Pre-compose L2 and E, for the sake of efficiency
fstcompose $kwsdatadir/tmp/oov_lexicon.fst $kwsdatadir/tmp/Edit.fst |\
  fstarcsort --sort_type=olabel > $kwsdatadir/tmp/L2xE.fst

# Prepare for parallelization
mkdir -p $kwsdatadir/tmp/split/
cat $kwsdatadir/tmp/oov.txt | utils/sym2int.pl -f 2- $kwsdatadir/tmp/words.txt > $kwsdatadir/tmp/oov.int
if [ $nj -gt `cat $kwsdatadir/tmp/oov.int | wc -l` ]; then
  nj=`cat $kwsdatadir/tmp/oov.int | wc -l`
  echo "$0: Requested too many jobs, using $nj instead"
fi
for j in `seq 1 $nj`; do
  let "id=$j-1";
  utils/split_scp.pl -j $nj $id $kwsdatadir/tmp/oov.int $kwsdatadir/tmp/split/$j.int
done

# Generate the proxy keywords
$cmd JOB=1:$nj $kwsdatadir/tmp/split/JOB.log \
  generate-proxy-keywords --verbose=1 \
    --cost-threshold=$beam --nBest=$nbest \
    $kwsdatadir/tmp/L2xE.fst $kwsdatadir/tmp/original_lexicon.fst \
    ark:$kwsdatadir/tmp/split/JOB.int ark:$kwsdatadir/tmp/split/JOB.fsts
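
# (For orientation: each OOV keyword, as a word sequence, is mapped to phones by
# the inverted OOV lexicon inside L2xE.fst, distorted by the edit-distance FST E,
# and mapped back to in-vocabulary words by the original lexicon; the cheapest
# resulting paths, within the beam/nbest limits, become the proxy keywords.)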

# Post process
if [ ! -f $kwsdatadir/keywords_invocab.fsts ]; then
  cp -f $kwsdatadir/keywords.fsts $kwsdatadir/keywords_invocab.fsts
fi
cat $kwsdatadir/tmp/split/*.fsts > $kwsdatadir/keywords_outvocab.fsts
cat $kwsdatadir/keywords_invocab.fsts $kwsdatadir/keywords_outvocab.fsts \
  > $kwsdatadir/keywords.fsts
@ -0,0 +1,77 @@
#!/bin/bash

# Copyright Johns Hopkins University 2013 (author: Daniel Povey)
# Apache 2.0.

if [ $# -ne 7 ]; then
  echo "Usage: get_syllable_text.sh <data> <lang> <syllable-lang-nopos> <word2syllable-fst> <ali-dir> <tempdir> <tgt-data>"
  echo "e.g.: get_syllable_text.sh data/train data/lang ../s5-vietnamese-limited-syllables/data/lang_nopos \\"
  echo "  ../s5-vietnamese-limited-syllables/data/local/syllables/word2syllable_lexicon_unweighted.fst \\"
  echo "  exp/tri5h_ali exp/tri5_align_syllables ../s5-vietnamese-limited-syllables/data/train"
  echo "This script copies the data-directory <data> to <tgt-data> but converts the text into syllable-level text."
  echo "The inputs are as follows (those that are not self-explanatory):"
  echo " <syllable-lang-nopos> is the syllable-level lang/ directory that has been built without"
  echo "   word-position dependency (we'll strip the suffixes from phones and expect them to be compatible with this)"
  echo " <word2syllable-fst> is a kind of lexicon FST that describes words as syllable sequences."
  echo " <ali-dir> contains a word-level alignment of the data in <data>"
  echo " <tempdir> will be used to put temporary files and logs (make it somewhere in exp/)"
  echo " <tgt-data> is a data directory to put the syllable-level data; transcripts go to <tgt-data>/text"
  exit 1;
fi

[ -f path.sh ] && . ./path.sh

set -o pipefail  # so the "! ( ... ) | tee ..." error checks below reflect failures
                 # inside the pipelines, not just the exit status of tee

data=$1
lang=$2
lang_nopos=$3
word2syllable_fst=$4
alidir=$5
dir=$6
tgtdata=$7

for f in $data/text $lang/L.fst $lang_nopos/L.fst $word2syllable_fst $alidir/ali.1.gz \
    $alidir/final.mdl $alidir/num_jobs; do
  if [ ! -f $f ]; then
    echo "Expected file $f to exist"
    exit 1;
  fi
done

mkdir -p $dir/log
nj=`cat $alidir/num_jobs` || exit 1;
sil=`cat $lang/phones/optional_silence.txt` || exit 1

! ( ( for n in `seq $nj`; do gunzip -c $alidir/ali.$n.gz; done ) | \
    ali-to-phones $alidir/final.mdl ark:- ark,t:- | \
    utils/int2sym.pl -f 2- $lang/phones.txt - | \
    sed -E 's/_I( |$)/ /g' | sed -E 's/_E( |$)/ /g' | sed -E 's/_B( |$)/ /g' | sed -E 's/_S( |$)/ /g' | \
    utils/sym2int.pl -f 2- $lang_nopos/phones.txt | \
    gzip -c > $dir/phones.ark.gz ) 2>&1 | tee $dir/log/align.log \
  && echo "Error getting phone-level (non-word-position-dependent) alignments" && exit 1;

# Get an archive of syllable-level acceptors corresponding to the training-data
# transcripts. We don't have an fstproject program for archives, so we use a line of awk.
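# (In the text format, each arc line of an FST is "src dst ilabel olabel [weight]";
# lines with fewer than 4 fields are final states. The awk below prints $3 twice,
# copying the input label over the output label, which amounts to projecting each
# FST onto its input -- i.e. the syllable -- side.)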

! ( cat $data/text | utils/sym2int.pl --map-oov `cat $lang/oov.int` -f 2- $lang/words.txt | \
    transcripts-to-fsts ark:- ark:- | \
    fsttablecompose $word2syllable_fst ark:- ark,t:- | \
    awk '{if (NF < 4) { print; } else { print $1, $2, $3, $3, $5; }}' | \
    gzip -c > $dir/syllables.ark.gz ) 2>&1 | tee $dir/log/get_syllable_fsts.log && \
  echo "Error getting syllable FSTs" && exit 1;

cp -rT $data $tgtdata || exit 1;
rm -rf $tgtdata/split*

# From the phone-level transcripts and the syllable-level acceptors, work out
# the syllable sequence for each utterance. Remove consecutive silences.
! ( fsttablecompose $lang_nopos/L.fst "ark:gunzip -c $dir/syllables.ark.gz|" ark:- | \
    fsttablecompose "ark:gunzip -c $dir/phones.ark.gz | transcripts-to-fsts ark:- ark:- |" \
      ark,s,cs:- ark,t:- | fsts-to-transcripts ark:- ark,t:- | int2sym.pl -f 2- $lang_nopos/words.txt | \
    sed "s/$sil $sil/$sil/g" > $tgtdata/text ) && echo "Error getting text data" && exit 1;

! utils/fix_data_dir.sh $tgtdata/ && echo "Error fixing data dir" && exit 1;

exit 0;
@ -0,0 +1,179 @@
#! /usr/bin/perl

use warnings;
use strict;
use List::Util qw(reduce);
use Data::Dumper;

sub cartesian_product {
  reduce {
    [ map {
        my $item = $_;
        map [ @$_, $item ], @$a
      } @$b ]
  } [[]], @_
}
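
# For orientation: cartesian_product([1,2], ["a","b"]) returns the single array ref
# [[1,"a"], [2,"a"], [1,"b"], [2,"b"]] -- every combination with one element taken
# from each input list.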

sub gen_range {
  (my $begin, my $step, my $end) = split(':', $_[0]);

  my @range = ();
  for (my $i=$begin; $i <= $end; $i += $step) {
    push @range, $i;
  }

  return @range;
}

sub gen_sequence {
  my $name=$_[0];

  my @steps = split(',', $_[1]);
  my @seq=();

  foreach my $step (@steps) {
    if ($step =~ /\d+:\d+:\d/) {
      push @seq, gen_range($step);
    } elsif ($step =~ /\d+/ ) {
      push @seq, $step;
    } else {
      die "Unsupported range atom $step in range spec $name";
    }
  }

  return ($name, @seq);
}
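
# For orientation: a command-line switch "-var LMWT=8:1:10,15" is split on "=" and
# passed here as ("LMWT", "8:1:10,15"), yielding the name "LMWT" and the value
# sequence (8, 9, 10, 15).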

sub gen_combinations {
  my @combinations = ();

  foreach my $i ( @{$_[0]} ) {
    foreach my $j ( @{$_[1]} ) {
      push @combinations, [$i, $j];
    }
  }
  return @combinations;
}

sub substitute {
  my @cmd_proto = @{$_[0]};
  my %valhash = %{$_[1]};

  my @cmd_out;

  foreach my $elem (@cmd_proto) {
    foreach my $key (keys %valhash) {
      #print $elem . "($key, " . $valhash{$key}. ")->";
      $elem =~ s/$key/$valhash{$key}/g;
      #print $elem . "\n";
    }
    push @cmd_out, $elem;
  }

  return @cmd_out;
}

sub escape {
  my @cmd_in = @{$_[0]};
  my @cmd = ();
  foreach my $x (@cmd_in) {
    if ($x =~ m/^\S+$/) { push @cmd, $x; }              # no whitespace -> take as-is
    elsif ($x =~ m:\":) { push @cmd, "'\''$x'\'' "; }   # contains double quotes -> wrap in single quotes
    else { push @cmd, "\"$x\" "; }                      # otherwise wrap in double quotes
  }
  return @cmd;
}

my %VARIABLES=();
my @cmd = ();
my $cmdid = undef;
my @traincmd = ();
my @evalcmd = ();
my @scorecmd = ();

my @known_switches = ("-train", "-eval", "-score");
my %found_switches = ();

for (my $i=0; $i < scalar(@ARGV); $i++) {
  if ($ARGV[$i] eq "-var") {
    $i++;
    (my $name, my @range) = gen_sequence(split('=', $ARGV[$i]));
    $VARIABLES{$name} = \@range;
  } elsif ($ARGV[$i] eq "-train") {
    if ( $cmdid ) {
      if ( $cmdid eq "-eval" ) {
        @evalcmd = @cmd;
      } elsif ( $cmdid eq "-train" ) {
        @traincmd = @cmd;
      }
    }
    $cmdid = $ARGV[$i];
    @cmd = ();
  } elsif ($ARGV[$i] eq "-eval") {
    if ( $cmdid ) {
      if ( $cmdid eq "-eval" ) {
        @evalcmd = @cmd;
      } elsif ( $cmdid eq "-train" ) {
        @traincmd = @cmd;
      }
    }
    $cmdid = "$ARGV[$i]";
    @cmd = ();
  } else {
    if ( $cmdid ) {
      push @cmd, $ARGV[$i];
    } else {
      die "Unknown option or switch '$ARGV[$i]' \n";
    }
  }
}

if ( $cmdid ) {
  if ( $cmdid eq "-eval" ) {
    @evalcmd = @cmd;
  } elsif ( $cmdid eq "-train" ) {
    @traincmd = @cmd;
  }
}

my @combs;
@combs = cartesian_product( values %VARIABLES );
@combs = @{$combs[0]};
#print Dumper(@{$combs[0]});

#@combs = gen_combinations(values %VARIABLES);
#print Dumper(\@combs);
#@traincmd = escape(\@traincmd);
#@evalcmd = escape(\@evalcmd);

foreach my $comb (@combs) {
  my %params;
  @params{keys %VARIABLES} = @{$comb};

  my @out;
  @out = substitute(\@traincmd, \%params);
  print "Running train:\n" . join(" ", @out) . "\n";
  system(@out) == 0 or die "system @out failed: exit code $?";

  @out = substitute(\@evalcmd, \%params);
  print "Running eval:\n" . join(" ", @out) . "\n";
  system(@out) == 0 or die "system @out failed: exit code $?";
}
@ -0,0 +1,147 @@
#! /usr/bin/perl

use warnings;
use strict;

use Data::Dump qw(pp dumpf);
use List::Util qw(reduce);

sub cartesian_product {
  reduce {
    [ map {
        my $item = $_;
        map [ @$_, $item ], @$a
      } @$b ]
  } [[]], @_
}

sub gen_range {
  (my $begin, my $step, my $end) = split(':', $_[0]);

  my @range = ();
  for (my $i=$begin; $i <= $end; $i += $step) {
    push @range, $i;
  }

  return @range;
}

sub gen_sequence {
  my $name=$_[0];

  my @steps = split(',', $_[1]);
  my @seq=();

  foreach my $step (@steps) {
    if ($step =~ /\d+:\d+:\d/) {
      push @seq, gen_range($step);
    } elsif ($step =~ /\d+/ ) {
      push @seq, $step;
    } else {
      die "Unsupported range atom $step in range spec $name";
    }
  }

  return ($name, @seq);
}

sub gen_combinations {
  my @combinations = ();

  foreach my $i ( @{$_[0]} ) {
    foreach my $j ( @{$_[1]} ) {
      push @combinations, [$i, $j];
    }
  }
  return @combinations;
}

sub substitute {
  my @cmd_proto = @{$_[0]};
  my %valhash = %{$_[1]};

  my @cmd_out;

  foreach my $elem (@cmd_proto) {
    foreach my $key (keys %valhash) {
      #print $elem . "($key, " . $valhash{$key}. ")->";
      $elem =~ s/$key/$valhash{$key}/g;
      #print $elem . "\n";
    }
    push @cmd_out, $elem;
  }

  return @cmd_out;
}

my %VARIABLES=();
my @cmd = ();
my $cmdid = undef;
my @traincmd = ();
my @evalcmd = ();
my @scorecmd = ();

my @known_switches = ("-train", "-eval", "-score");
my %found_switches = ();

for (my $i=0; $i < scalar(@ARGV); $i++) {
  if ($ARGV[$i] eq "-var") {
    $i++;
    (my $name, my @range) = gen_sequence(split('=', $ARGV[$i]));
    $VARIABLES{$name} = \@range;
  } elsif (grep {$_ eq $ARGV[$i]} @known_switches) {
    if ($cmdid) {
      print "CMD: $cmdid\n";
      my @tmp = @cmd;
      $found_switches{$cmdid} = \@tmp;
      pp(%found_switches);
    }

    $cmdid = "$ARGV[$i]";
    @cmd = ();
  } else {
    if ( $cmdid ) {
      push @cmd, $ARGV[$i];
    } else {
      die "Unknown option or switch '$ARGV[$i]' \n";
    }
  }
}

if ($cmdid) {
  print "CMD: $cmdid\n";
  my @tmp = @cmd;
  $found_switches{$cmdid} = \@tmp;
}

pp(%VARIABLES);
pp(%found_switches);

my @combs = gen_combinations(values %VARIABLES);

# Copy the collected command prototypes out of %found_switches before substituting;
# up to this point @traincmd and @evalcmd are only declarations.
@traincmd = @{$found_switches{"-train"}} if exists $found_switches{"-train"};
@evalcmd  = @{$found_switches{"-eval"}}  if exists $found_switches{"-eval"};

foreach my $comb (@combs) {
  my %params;
  @params{keys %VARIABLES} = @{$comb};

  my @out;
  @out = substitute(\@traincmd, \%params);
  system(@out) == 0 or die "system @out failed: exit code $?";

  @out = substitute(\@evalcmd, \%params);
  system(@out) == 0 or die "system @out failed: exit code $?";
}
@ -0,0 +1,123 @@
#!/usr/bin/perl
# Copyright 2012 Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0.

use Data::Dumper;
$Data::Dumper::Indent = 1;

binmode STDOUT, ":utf8";
binmode STDIN, ":utf8";

sub permute {
  my $last = pop @_;

  unless(@_) {
    return map([$_], @$last);
  }

  return map {
    my $left = $_;
    map([@$left, $_], @$last)
  } permute(@_);
}
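
# For orientation: permute([1,2],[3,4]) returns ([1,3],[1,4],[2,3],[2,4]) -- every
# way of choosing one integer id per word. It is used below to expand words that
# have multiple entries in the symbol table.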

$oov_count = 0;
$num_warning = 0;
$max_warning = 20;  # print at most this many OOV-replacement warnings below

$ignore_oov = 0;
$ignore_first_field = 0;
for($x = 0; $x < 2; $x++) {
  if ($ARGV[0] eq "--map-oov") {
    shift @ARGV; $map_oov = shift @ARGV;
  }
  if ($ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1; # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1; # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

$symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ] [-f <field-range> ]\n" .
    "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
  exit 1;
}
open(F, "<:encoding(UTF-8)", $symtab) || die "Error opening symbol table file $symtab";
while(<F>) {
  @A = split(" ", $_);
  @A == 2 || die "bad line in symbol table file: $_";

  if ( not defined( $sym2int{$A[0]} ) ) {
    $sym2int{$A[0]} = [];
  }
  push @{ $sym2int{$A[0]} }, $A[1] + 0;
}
#print Dumper(\%sym2int);

if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric -> look it up
  if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
  $map_oov = $sym2int{$map_oov}->[0]; # take the first integer id; $map_oov is used as a scalar below
}

$lines=0;
while (<>) {
  @A = split(" ", $_);
  @B = ();
  $lines = $lines + 1;
  $undefined_words = 0;
  for ($n = 1; $n < @A; $n++) {
    $a = $A[$n];
    $i = $sym2int{$a};
    if (!defined ($i)) {
      if (defined $map_oov) {
        if ($num_warning++ < $max_warning) {
          print STDERR "sym2int.pl: replacing $a with $map_oov\n";
          if ($num_warning == $max_warning) {
            print STDERR "sym2int.pl: not warning for OOVs any more times\n";
          }
        }
        $i = [ $map_oov ];
      } else {
        $pos = $n+1;
        die "sym2int.pl: undefined symbol $a (in position $pos)\n";
      }
      $undefined_words = $undefined_words + 1;
    }
    $a = $i;
    push @B, $a;
  }
  #if ( defined $sym2int{$A[$n]} ) {
  #  push @B, $sym2int{$A[$n]};
  #} else {
  #  push @B, [0];
  #}
  if ($undefined_words > 0) {
    $oov_count = $oov_count + 1;
  }
  @C = permute @B;
  #print Dumper(\@B);
  #print Dumper(\@C);
  foreach $phrase ( @C ) {
    print "$A[0] ";
    print join(" ", @{$phrase});
    print "\n";
  }
}

print STDERR "Remapped/ignored $oov_count phrases...\n";
@ -0,0 +1,136 @@
#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.

# Begin configuration section.
case_insensitive=true
use_icu=true
icu_transform="Any-Lower"
silence_word=  # Optional silence word to insert (once) between words of the transcript.
# End configuration section.

echo $0 "$@"

help_message="
Usage: local/kws_data_prep.sh <lang-dir> <data-dir> <kws-data-dir>
 e.g.: local/kws_data_prep.sh data/lang/ data/eval/ data/kws/
Input is in <kws-data-dir>: kwlist.xml, ecf.xml (rttm file not needed).
Output is in <kws-data-dir>: keywords.txt, keywords_all.int, kwlist_invocab.xml,
                             kwlist_outvocab.xml, keywords.fsts
Note: the most important output is keywords.fsts
allowed switches:
  --case-insensitive <true|false>  # Should the processing be case-insensitive?
                                   # Please note that case (in)sensitivity depends
                                   # on the shell locale!
  --use-icu <true|false>           # Use the ICU uconv binary to normalize casing
  --icu-transform <string>         # When using ICU, use this transliteration
"

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  printf "FATAL: invalid number of arguments.\n\n"
  printf "$help_message\n"
  exit 1;
fi

set -u
set -e
set -o pipefail

langdir=$1;
datadir=$2;
kwsdatadir=$3;
keywords=$kwsdatadir/kwlist.xml

mkdir -p $kwsdatadir;

cat $keywords | perl -e '
  #binmode STDIN, ":utf8";
  binmode STDOUT, ":utf8";

  use XML::Simple;
  use Data::Dumper;

  my $data = XMLin(\*STDIN);

  #print Dumper($data->{kw});
  foreach $kwentry (@{$data->{kw}}) {
    #print Dumper($kwentry);
    print "$kwentry->{kwid}\t$kwentry->{kwtext}\n";
  }
' > $kwsdatadir/keywords.txt
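
# keywords.txt now holds one tab-separated "kwid<TAB>keyword text" pair per line,
# e.g. (hypothetical ids and keywords):
#   KW204-00001	good morning
#   KW204-00002	thank you very much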

# Map the keywords to integers; note that we remove the keywords that
# are not in our $langdir/words.txt, as we won't find them anyway...
#cat $kwsdatadir/keywords.txt | babel/filter_keywords.pl $langdir/words.txt - - | \
#  sym2int.pl --map-oov 0 -f 2- $langdir/words.txt | \
if $case_insensitive && ! $use_icu ; then
  echo "$0: Running case insensitive processing"
  cat $langdir/words.txt | tr '[:lower:]' '[:upper:]' > $kwsdatadir/words.txt
  [ `cut -f 1 -d ' ' $kwsdatadir/words.txt | sort -u | wc -l` -ne `cat $kwsdatadir/words.txt | wc -l` ] && \
    echo "$0: Warning, multiple words in dictionary differ only in case:"

  cat $kwsdatadir/keywords.txt | tr '[:lower:]' '[:upper:]' | \
    sym2int.pl --map-oov 0 -f 2- $kwsdatadir/words.txt > $kwsdatadir/keywords_all.int
elif $case_insensitive && $use_icu ; then
  echo "$0: Running case insensitive processing (using ICU with transform \"$icu_transform\")"
  cat $langdir/words.txt | uconv -f utf8 -t utf8 -x "${icu_transform}" > $kwsdatadir/words.txt
  [ `cut -f 1 -d ' ' $kwsdatadir/words.txt | sort -u | wc -l` -ne `cat $kwsdatadir/words.txt | wc -l` ] && \
    echo "$0: Warning, multiple words in dictionary differ only in case:"

  paste <(cut -f 1 $kwsdatadir/keywords.txt ) \
        <(cut -f 2 $kwsdatadir/keywords.txt | uconv -f utf8 -t utf8 -x "${icu_transform}" ) |\
    local/kwords2indices.pl --map-oov 0 $kwsdatadir/words.txt > $kwsdatadir/keywords_all.int
else
  cp $langdir/words.txt $kwsdatadir/words.txt
  cat $kwsdatadir/keywords.txt | \
    sym2int.pl --map-oov 0 -f 2- $kwsdatadir/words.txt > $kwsdatadir/keywords_all.int
fi

(cat $kwsdatadir/keywords_all.int | \
  grep -v " 0 " | grep -v " 0$" > $kwsdatadir/keywords.int ) || true

(cut -f 1 -d ' ' $kwsdatadir/keywords.int | \
  local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_invocab.xml) || true

(cat $kwsdatadir/keywords_all.int | \
  egrep " 0 | 0$" | cut -f 1 -d ' ' | \
  local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_outvocab.xml) || true

# Compile keywords into FSTs
if [ -z $silence_word ]; then
  transcripts-to-fsts ark:$kwsdatadir/keywords.int ark,t:$kwsdatadir/keywords.fsts
else
  silence_int=`grep -w $silence_word $langdir/words.txt | awk '{print $2}'`
  [ -z $silence_int ] && \
    echo "$0: Error: could not find integer representation of silence word $silence_word" && exit 1;
  transcripts-to-fsts ark:$kwsdatadir/keywords.int ark,t:- | \
    awk -v 'OFS=\t' -v silint=$silence_int '{if (NF == 4 && $1 != 0) { print $1, $1, silint, silint; } print; }' \
    > $kwsdatadir/keywords.fsts
fi

# Create an integer utterance id for each utterance
cat $datadir/segments | \
  awk '{print $1}' | \
  sort | uniq | perl -e '
  $idx=1;
  while(<>) {
    chomp;
    print "$_ $idx\n";
    $idx++;
  }' > $kwsdatadir/utter_id

# Map utterances to the names that will appear in the rttm file. You may have
# to modify the commands below according to your rttm file.
cat $datadir/segments | awk '{print $1" "$2}' | sort | uniq > $kwsdatadir/utter_map;

echo "$0: Kws data preparation succeeded"
@ -0,0 +1,144 @@
#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.

# Begin configuration section.
silence_word=          # Optional silence word to insert (once) between words of the transcript.
case_insensitive=true  # Referenced below; set to false to keep the original casing.
# End configuration section.

echo $0 "$@"

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 4 ]; then
  echo "Usage: local/kws_data_prep_syllables.sh [options] <lang-dir> <data-dir> <syllable-lexicon> <kws-data-dir>"
  echo " e.g.: local/kws_data_prep_syllables.sh --silence-word SIL data/lang/ data/dev10h/ <syllable-lexicon> data/kws/"
  echo "Input is in <kws-data-dir>: kwlist.xml, ecf.xml (rttm file not needed)."
  echo "The lang directory is expected to be syllable-level. The syllable-lexicon"
  echo "is a text file with lines of the form:"
  echo "word syllable1 syllable2"
  echo "This script is like kws_data_prep.sh, except that the output keywords.fsts"
  echo "contains the various alternative syllable-level pronunciations of the input"
  echo "words."
  echo "Output is in <kws-data-dir>: keywords.txt, kwlist_invocab.xml,"
  echo " kwlist_outvocab.xml, keywords.fsts; note that the only syllable-level"
  echo " output (and the only one that really matters) is keywords.fsts"
  echo " Options:"
  echo "  --silence-word <silence-word>  # Note, this is required. It is a word, e.g. SIL,"
  echo "                                 # in the syllable lexicon, that's optional."
  exit 1;
fi

langdir=$1;
datadir=$2;
syllable_lexicon=$3
kwsdatadir=$4
keywords=$kwsdatadir/kwlist.xml

[ -z $silence_word ] && echo "--silence-word option is required" && exit 1;

mkdir -p $kwsdatadir;

cat $keywords | perl -e '
  #binmode STDIN, ":utf8";
  binmode STDOUT, ":utf8";

  use XML::Simple;
  use Data::Dumper;

  my $data = XMLin(\*STDIN);

  #print Dumper($data->{kw});
  foreach $kwentry (@{$data->{kw}}) {
    #print Dumper($kwentry);
    print "$kwentry->{kwid}\t$kwentry->{kwtext}\n";
  }
' > $kwsdatadir/keywords.txt

[ ! -s "$syllable_lexicon" ] && echo "No such file '$syllable_lexicon' (syllable lexicon), or empty file." && exit 1;

# The word symbols in the first field of $syllable_lexicon will be given a symbol-table
# file. We just use this symbol table in this script; the values will never appear
# elsewhere.

mkdir -p $kwsdatadir/temp

# Remove any lines with symbols we don't have in our symbol vocabulary.
temp_syllable_lexicon=$kwsdatadir/temp/syllable_lexicon.in
cat $syllable_lexicon | sym2int.pl --map-oov 123456789 -f 2- $langdir/words.txt | grep -v -w 123456789 | \
  int2sym.pl -f 2- $langdir/words.txt > $temp_syllable_lexicon

n1=`cat $syllable_lexicon | wc -l`
n2=`cat $temp_syllable_lexicon | wc -l`
echo "After removing OOV symbols from the word-to-syllable lexicon, #lines changed from $n1 to $n2"

if $case_insensitive; then
  echo "Running case insensitive processing"
  # We turn the first element of each line of $temp_syllable_lexicon into upper case.
  tr '[:lower:]' '[:upper:]' < $temp_syllable_lexicon | awk '{print $1}' | \
    paste - <(awk '{for(n=2;n<=NF;n++) { printf("%s ", $n); } print ""; }' <$temp_syllable_lexicon) \
    > $kwsdatadir/temp/syllable_lexicon.txt || exit 1;

  # We turn all but the first element of each line in $kwsdatadir/keywords.txt
  # into upper case.
  tr '[:lower:]' '[:upper:]' < $kwsdatadir/keywords.txt | \
    awk '{for(n=2;n<=NF;n++) { printf("%s ", $n); } print ""; }' | \
    paste <(awk '{print $1}' <$kwsdatadir/keywords.txt) - \
    > $kwsdatadir/temp/keywords.txt || exit 1;
else
  cp $temp_syllable_lexicon $kwsdatadir/temp/syllable_lexicon.txt || exit 1;
  cp $kwsdatadir/keywords.txt $kwsdatadir/temp/ || exit 1;
fi

cat $kwsdatadir/temp/syllable_lexicon.txt | awk '{print $1}' | sort | uniq | \
  awk 'BEGIN{print "<eps> 0";} {print $1, NR;}' > $kwsdatadir/temp/words.txt

sym2int.pl --map-oov 0 -f 2- $kwsdatadir/temp/words.txt < $kwsdatadir/temp/keywords.txt \
  > $kwsdatadir/temp/keywords_all.int

cat $kwsdatadir/temp/keywords_all.int | \
  grep -v " 0 " | grep -v " 0$" > $kwsdatadir/keywords.int

cut -f 1 -d ' ' $kwsdatadir/keywords.int | \
  local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_invocab.xml

cat $kwsdatadir/temp/keywords_all.int | \
  egrep " 0 | 0$" | cut -f 1 -d ' ' | \
  local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_outvocab.xml

local/make_lexicon_fst_special.pl $kwsdatadir/temp/syllable_lexicon.txt $silence_word | \
  sym2int.pl -f 4 $kwsdatadir/temp/words.txt | \
  sym2int.pl -f 3 $langdir/words.txt | \
  fstcompile | \
  fstarcsort --sort_type=olabel > $kwsdatadir/temp/L.fst || exit 1;

# Compile keywords into FSTs, compose with the lexicon to get syllables
# and project on the input (keeping only syllable labels),
# before writing to keywords.fsts.
transcripts-to-fsts ark:$kwsdatadir/keywords.int ark:- | \
  fsttablecompose $kwsdatadir/temp/L.fst ark:- ark,t:- | \
  awk '{if (NF < 4) { print; } else { print $1, $2, $3, $3, $5; }}' > \
  $kwsdatadir/keywords.fsts

# Create an integer utterance id for each utterance
cat $datadir/segments | \
  awk '{print $1}' | \
  sort | uniq | perl -e '
  $idx=1;
  while(<>) {
    chomp;
    print "$_ $idx\n";
    $idx++;
  }' > $kwsdatadir/utter_id

# Map utterances to the names that will appear in the rttm file. You may have
# to modify the commands below according to your rttm file.
cat $datadir/segments | awk '{print $1" "$2}' | sort | uniq > $kwsdatadir/utter_map;

echo "Kws data preparation succeeded"
@ -0,0 +1,56 @@
#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.

# Begin configuration section.
cmd=run.pl
duptime=0.5
model=final.mdl
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 [options] <lang-dir> <data-dir> <decode-dir>"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo ""
  exit 1;
fi

lang=$1;
data=$2;
decodedir=$3;

kwsdatadir=$data/kws
oracledir=$decodedir/kws_oracle
mkdir -p $oracledir/log

for filename in $lang/words.txt $decodedir/num_jobs \
    $data/text $decodedir/lat.1.gz \
    $decodedir/../$model ; do
  if [[ ! -f $filename ]] ; then
    echo "FATAL: File $filename does not exist!"
    exit 1;
  fi
done

nj=`cat $decodedir/num_jobs`

(cd $decodedir; ln -s ../$model final.mdl )
(cd $oracledir; echo "$nj" > num_jobs )
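
# lattice-oracle finds, for each utterance, the path through the decoded lattice
# that is closest (in edit distance) to the reference transcript; --write-lattices
# saves those oracle paths. Indexing and searching them gives an upper bound on
# what lattice-based keyword search could achieve.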
$cmd LAT=1:$nj $oracledir/log/lat.LAT.log \
  cat $data/text \| \
  sed 's/- / /g' \| \
  sym2int.pl --map-oov '"<unk>"' -f 2- $lang/words.txt \| \
  lattice-oracle --word-symbol-table=$lang/words.txt \
    --write-lattices="ark:|gzip -c > $oracledir/lat.LAT.gz" \
    "ark:gzip -cdf $decodedir/lat.LAT.gz|" ark:- ark,t:$oracledir/lat.LAT.tra;
@ -0,0 +1,136 @@
#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen, Jan Trmal)
#           2013 Johns Hopkins University
# Apache 2.0.

. ./path.sh
. ./cmd.sh

# Begin configuration section.
cmd=run.pl
acwt=0.09091 # Acoustic weight -- should not be necessary for oracle lattices
duptime=0.6  # Max time difference within which occurrences of the same KW will be seen as duplicates
text=        # An alternative reference text to use. When not specified, <data-dir>/text will be used
model=       # Acoustic model to use
extraid=     # KWS setup extra ID (the kws task was set up using kws_setup.sh --extraid <id>)
stage=0      # To resume the computation from a different stage
# End configuration section.

set -e
set -o pipefail

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 [options] <lang-dir> <data-dir> <decode-dir>"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --text <text-file>  # An alternative text file in the format SEGMENT W1 W2 W3...,"
  echo "                      # the default text file is taken from <data-dir>/text"
  echo ""
  exit 1;
fi

lang=$1;
data=$2;
decodedir=$3;

if [ -z $text ] ; then
  text=$data/text
fi

if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
  srcdir=`dirname $decodedir`; # The model directory is one level up from the decoding directory.
  model=$srcdir/final.mdl;
fi

if [ -z $extraid ] ; then # the same logic as with kws_setup.sh
  kwsdatadir=$data/kws
else
  kwsdatadir=$data/${extraid}_kws
fi

nj=`cat $decodedir/num_jobs`;

oracledir=$decodedir/kws_oracle
mkdir -p $oracledir
mkdir -p $oracledir/log

if [ $stage -le 0 ] ; then
  echo "$nj" > $oracledir/num_jobs
  $cmd LAT=1:$nj $oracledir/log/oracle_lat.LAT.log \
    cat $text \| \
    sed 's/- / /g' \| \
    sym2int.pl --map-oov '"<unk>"' -f 2- $lang/words.txt \| \
    lattice-oracle --word-symbol-table=$lang/words.txt \
      --write-lattices="ark:|gzip -c > $oracledir/lat.LAT.gz" \
      "ark:gzip -cdf $decodedir/lat.LAT.gz|" ark:- ark,t:$oracledir/lat.LAT.tra;
fi

if [ $stage -le 1 ] ; then
  steps/make_index.sh --cmd "$cmd" --acwt $acwt --model $model \
    $kwsdatadir $lang $oracledir $oracledir
fi

if [ $stage -le 2 ] ; then
  steps/search_index.sh --cmd "$cmd" $kwsdatadir $oracledir
fi

if [ $stage -le 3 ]; then
  #TODO: this stage should probably be moved into a single script file
  #      and used across all the KW search scripts
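  # The total duration parsed from ecf.xml is halved below, presumably because the
  # conversational audio is two-channel, so a single side covers half of the ECF
  # total; this mirrors the convention in the other KWS scoring scripts here.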
  duration=`head -1 $kwsdatadir/ecf.xml |\
    grep -o -E "duration=\"[0-9]*[ \.]*[0-9]*\"" |\
    grep -o -E "[0-9]*[\.]*[0-9]*" |\
    perl -e 'while(<>) {print $_/2;}'`

  cat $oracledir/result.* | \
    utils/write_kwslist.pl --flen=0.01 --duration=$duration \
      --segments=$data/segments --normalize=true --duptime=$duptime \
      --map-utter=$kwsdatadir/utter_map --remove-dup=true \
      - $oracledir/kwslist_orig.xml

  # This does not do much -- it just adds empty entries for keywords for which
  # not even one occurrence has been found.
  local/fix_kwslist.pl $kwsdatadir/kwlist.xml $oracledir/kwslist_orig.xml $oracledir/kwslist.xml
fi

if [ $stage -le 4 ]; then
  # As F4DE is missing the functionality to score subsets of the original keyword
  # set, let's keep this commented out.
  # Alternatively, TODO: write a filter_kwslist.pl script
  # that will produce a kwslist on the basis of a given kwlist.xml subset.

  local/kws_score_f4de.sh `dirname $kwsdatadir` $oracledir
  #-local/kws_score_f4de.sh --kwlist $kwsdatadir/kwlist_outvocab.xml \
  #-  --f4de-prefix outvocab `dirname $kwsdatadir` $oracledir || exit 1
  #-local/kws_score_f4de.sh --kwlist $kwsdatadir/kwlist_invocab.xml \
  #-  --f4de-prefix invocab `dirname $kwsdatadir` $oracledir || exit 1

  echo "======================================================="
  (
  echo -n "ATWV-full "
  grep Occurrence $oracledir/sum.txt | cut -d '|' -f 13
  )

  #-(
  #-echo -n "ATWV-invocab "
  #-grep Occurrence $oracledir/invocab.sum.txt | cut -d '|' -f 13
  #-) || echo "Error occurred getting the invocab results"

  #-(
  #-echo -n "ATWV-outvocab "
  #-grep Occurrence $oracledir/outvocab.sum.txt | cut -d '|' -f 13
  #-) || echo "Error occurred getting the outvocab results"

  echo "======================================================="
fi