sandbox/akirkedal: Refactored data preparation scripts; the lexicon is now downloaded from openslr.org

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/akirkedal@4269 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Author: Andreas Soeborg Kirkedal, 2014-08-06 23:40:01 +00:00
Parent: cdfaba14c6
Commit: 7e6af54b2d
8 changed files: 132 additions and 65837 deletions

View file

@@ -22,48 +22,16 @@ exproot=$(pwd)
 dir=data/local/dict
 mkdir -p $dir
 # Dictionary preparation:
-# Normalise transcripts and create a transcript file
-# Removes '.,:;?' and removes '\' before '\Komma' (dictated ',')
-# outputs a normalised transcript without utterance ids and a list of utterance ids
-echo "Normalising"
-trainsrc=data/local/trainsrc
-rm -rf $trainsrc
-mkdir $trainsrc
-mv data/train/text1 $trainsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $trainsrc/text1 $trainsrc/onlyids $dir/transcripts.tmp
-# Additional normalisation, uppercasing, writing numbers etc.
-# and recombine with
-local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am
-cp $dir/transcripts.am $trainsrc/onlytext
-paste -d ' ' $trainsrc/onlyids $trainsrc/onlytext > data/train/text
-utils/validate_data_dir.sh --no-feat data/train || exit 1;
-# lmsents is output by sprak_data_prep.sh and contains
-# sentences that are disjoint from the test and dev set
-python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm
-wait
-# Create wordlist from the AM transcripts
-cat $dir/transcripts.am | tr [:blank:] '\n' | sort -u > $dir/wlist.txt &
-# Because training data is read aloud, there are many occurences of the same
-# sentence and bias towards the domain. Make a version where
-# the sentences are unique to reduce bias.
-local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt
-sort -u $dir/transcripts.txt > $dir/transcripts.uniq
+# This lexicon was created using eSpeak.
+# To extend the setup, see local/dict_prep.sh
 # Copy pre-made phone table
 cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
 # Copy pre-made lexicon
-cp local/dictsrc/lexicon.txt $dir/lexicon.txt
+wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=data/local/data/download
+tar -xzf data/local/data/download/lexicon-da.tar.gz -C $dir
 # silence phones, one per line.
@@ -72,30 +40,7 @@ echo SIL > $dir/optional_silence.txt
 touch $dir/extra_questions.txt
-# Repeat text preparation on test set, but do not add to dictionary
-testsrc=data/local/testsrc
-rm -rf $testsrc
-mkdir $testsrc
-mv data/test/text1 $testsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $testsrc/text1 $testsrc/onlyids $testsrc/transcripts.am
-local/norm_dk/format_text.sh am $testsrc/transcripts.am > $testsrc/onlytext
-paste -d ' ' $testsrc/onlyids $testsrc/onlytext > data/test/text
-utils/validate_data_dir.sh --no-feat data/test || exit 1;
-# Repeat text preparation on dev set, but do not add to dictionary
-devsrc=data/local/devsrc
-rm -rf $devsrc
-mkdir $devsrc
-mv data/dev/text1 $devsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $devsrc/text1 $devsrc/onlyids $devsrc/transcripts.tmp
-local/norm_dk/format_text.sh am $devsrc/transcripts.tmp > $devsrc/onlytext
-paste -d ' ' $devsrc/onlyids $devsrc/onlytext > data/dev/text
-# Also create a file that can be used for reranking using LMs
-local/norm_dk/format_text.sh lm $devsrc/transcripts.tmp > data/dev/transcripts.txt
-sort -u data/dev/transcripts.txt > data/dev/transcripts.uniq
-utils/validate_data_dir.sh --no-feat data/dev || exit 1;
+wait
 ## TODO: add cleanup commands
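In isolation, the new lexicon step boils down to the following standalone sketch (not part of the commit; the cache-before-download guard is an added assumption, while the URL and directories are taken from the diff above):

    # Fetch the Danish lexicon from OpenSLR unless a cached tarball exists,
    # then unpack it into the dict dir.
    dir=data/local/dict
    download=data/local/data/download
    mkdir -p $dir $download
    [ -f $download/lexicon-da.tar.gz ] || \
      wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=$download
    tar -xzf $download/lexicon-da.tar.gz -C $dir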

View file

@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# != 2 ]; then
+  echo "Usage: create_dataset.sh <src-data-dir> <dest-dir> "
+  exit 1
+fi
+
+src=$1
+dest=$2
+
+mkdir $dest
+
+python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
+local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
+paste -d ' ' $src/onlyids $src/onlytext > $dest/text
+
+for f in wav.scp utt2spk; do
+  cp $src/$f $dest/$f
+done
+
+utils/utt2spk_to_spk2utt.pl $dest/utt2spk > $dest/spk2utt
+utils/validate_data_dir.sh --no-feats $dest || exit 1;
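As wired up in sprak_data_prep.sh below, this script is invoked once per data set, e.g. local/create_datasets.sh data/local/testsrc data/test; the source dir must already contain text.unnormalised, wav.scp and utt2spk. The utt2spk_to_spk2utt.pl step inverts the standard Kaldi utterance-to-speaker map. With illustrative IDs (not from the corpus):

    # utt2spk (one "utt spk" pair per line):    # spk2utt (speaker, then its utterances):
    #   s1_u001 s1                              #   s1 s1_u001 s1_u002
    #   s1_u002 s1                              #   s2 s2_u001
    #   s2_u001 s2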

View file

@@ -131,7 +131,7 @@ if __name__ == '__main__':
     else:
         traindata = create_parallel_kaldi(flist, "")
-    textout = codecs.open(os.path.join(outpath, "text1"), "w", "utf8")
+    textout = codecs.open(os.path.join(outpath, "text.unnormalised"), "w", "utf8")
     wavout = codecs.open(os.path.join(outpath, "wav.scp"), "w")
     utt2spkout = codecs.open(os.path.join(outpath, "utt2spk"), "w")
     textout.writelines(traindata[0])
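The wav.scp this script writes uses Kaldi's piped-command form, since the corpus audio is SPH: each line is an utterance ID followed by an sph2pipe command ending in "|". An illustrative entry (ID and paths made up):

    u0001 /path/to/kaldi/tools/sph2pipe_v2.5/sph2pipe -f wav /corpus/0565-1/u0001.sph |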

File diff suppressed because it is too large.

View file

@@ -21,14 +21,16 @@
 mode=$1
+tmp="$(mktemp -d)"
 dir=$(pwd)/local/norm_dk
-src=$dir/src.tmp
-abbr=$dir/anot.tmp
-rem=$dir/rem.tmp
-line=$dir/line.tmp
-num=$dir/num.tmp
-nonum=$dir/nonum.tmp
+src=$tmp/src.tmp
+abbr=$tmp/anot.tmp
+rem=$tmp/rem.tmp
+line=$tmp/line.tmp
+num=$tmp/num.tmp
+nonum=$tmp/nonum.tmp
 cat $2 | tr -d '\r' > $src
@@ -50,4 +52,4 @@ PERLIO=:utf8 perl -pe '$_=uc'
 # Comment this line for debugging
 wait
-rm -f $abbr $rem $line
+rm -rf $tmp
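Moving the intermediate .tmp files into a mktemp -d directory keeps them out of the source tree and makes concurrent runs safe. A common hardening of the same pattern (not what the committed script does) is to register the cleanup in a trap, so the directory is removed even on early exit:

    tmp="$(mktemp -d)"
    trap 'rm -rf "$tmp"' EXIT   # also fires on error or interrupt
    src=$tmp/src.tmp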

View file

@@ -6,10 +6,11 @@
 dir=`pwd`/data/local/data
-lmdir=`pwd`/data/local/arpa_lm
-traindir=`pwd`/data/train
-testdir=`pwd`/data/test
-devdir=`pwd`/data/dev
+lmdir=`pwd`/data/local/transcript_lm
+traindir=`pwd`/data/local/trainsrc
+testdir=`pwd`/data/local/testsrc
+devdir=`pwd`/data/local/devsrc
+rm -rf $lmdir $traindir $testdir $devdir
 mkdir -p $dir $lmdir $traindir $testdir $devdir
 local=`pwd`/local
 utils=`pwd`/utils
@@ -18,7 +19,7 @@ utils=`pwd`/utils
 # Checks if python3 is available on the system and install python3 in userspace if not
 # This recipe currently relies on version 3 because python3 uses utf8 as internal
-# representation
+# string representation
 if ! which python3 >&/dev/null; then
   echo "Installing python3 since not on your path."
@@ -60,7 +61,7 @@ if [ ! -d $dir/download/0611 ]; then
   echo "Corpus unpacked succesfully."
 fi
+. ./path.sh # Needed for KALDI_ROOT
 sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
 if [ ! -x $sph2pipe ]; then
   echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
@@ -76,62 +77,73 @@ mkdir -p $dir/corpus_processed/training/0565-1 $dir/corpus_processed/training/05
 # Create parallel file lists and text files, but keep sound files in the same location to save disk space
 # Writes the lists to data/local/data (~ 310h)
+echo "Creating parallel data for training data."
 python3 $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 & # ~130h
 python3 $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 & # ~115h
 python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 & # ~51h
+(
 # Ditto dev set (~ 16h)
+echo "Creating parallel data for test data."
 rm -rf $dir/corpus_processed/dev03
 mkdir -p $dir/corpus_processed/dev03
 python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 &
+) &
+(
 # Ditto test set (about 9 hours)
+echo "Creating parallel data for development data."
 rm -rf $dir/corpus_processed/test06
 mkdir -p $dir/corpus_processed/test06
 python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1;
+) &
 wait
+# Create the LM training data
+# Test and dev data is disjoint from training data, so we use those transcripts)
+# Because training data is read aloud, there are many occurences of the same
+# sentence and bias towards the domain. Make a version where
+# the sentences are unique to reduce bias.
+(
+echo "Writing the LM text to file and normalising."
+cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist | while read l; do cat $l; done > $lmdir/lmsents
+python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl $lmdir/lmsents $lmdir/lmsents.norm
+local/norm_dk/format_text.sh lm $lmdir/lmsents.norm > $lmdir/transcripts.txt
+sort -u $lmdir/transcripts.txt > $lmdir/transcripts.uniq
+) &
 # Combine training file lists
 echo "Combine file lists."
 cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist $dir/corpus_processed/training/0611_Stasjon05/txtlist > $dir/traintxtfiles
 cat $dir/corpus_processed/training/0565-1/sndlist $dir/corpus_processed/training/0565-2/sndlist $dir/corpus_processed/training/0611_Stasjon05/sndlist > $dir/trainsndfiles
-# LM training files (test data is disjoint from training data)
-echo "Write file list with LM text files. (This will take a while)"
-cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist > $dir/lmtxtfiles
-cat $dir/lmtxtfiles | while read l; do cat $l; done > $dir/lmsents &
 # Move test file lists to the right location
-mv $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
-mv $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
+cp $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
+cp $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
 # Move test file lists to the right location
-mv $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
-mv $dir/corpus_processed/test06/sndlist $dir/testsndfiles
+cp $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
+cp $dir/corpus_processed/test06/sndlist $dir/testsndfiles
-# Write wav.scp, utt2spk and text1 for train, test and dev sets with
+# Write wav.scp, utt2spk and text.unnormalised for train, test and dev sets with
 # Use sph2pipe because the wav files are actually sph files
-echo "Creating wav.scp, utt2spk and text1 for train, test and dev dirs."
+echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev"
 python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe &
 python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe &
 python3 $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe &
 wait
-# Create spk2utt file
-utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt &
-utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt &
-utils/utt2spk_to_spk2utt.pl data/dev/utt2spk > data/dev/spk2utt
+# Create the main data sets
+local/create_datasets.sh $testdir data/test &
+local/create_datasets.sh $devdir data/dev &
+local/create_datasets.sh $traindir data/train &
 wait
+for d in train test dev; do
+  utils/validate_data_dir.sh --no-feats --no-text data/$d || exit 1;
+done
 ## TODO
 # Extract gender from spl files
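The prep stages above now follow one pattern throughout: group related commands in a subshell, background the group with ( ... ) &, and join with wait. A runnable skeleton of the pattern (placeholder commands only, not from the recipe):

    #!/bin/bash
    (
      sleep 2; echo "dev prep done"    # stands in for the dev03 block
    ) &
    (
      sleep 1; echo "test prep done"   # stands in for the test06 block
    ) &
    wait                               # block until both groups finish
    echo "all prep done"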

View file

@@ -8,15 +8,13 @@
 # Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh;
 # this takes out the "symmetric SGMM" part which is not always helpful.
-# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
-# training, but this shouldn't have much effect.
 test=$1
-if [ ! -d xxp/tri4b_ali ]; then
-steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
-  data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
-fi
+steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
+  data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
 steps/train_ubm.sh --cmd "$train_cmd" \
   400 data/train data/lang exp/tri4b_ali exp/ubm5a || exit 1;
@@ -30,9 +28,9 @@ test=$1
   exp/sgmm2_5a/graph_3g data/${test} exp/sgmm2_5a/decode_3g_${test}
 ) &
-steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
+steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
   --use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5a exp/sgmm2_5a_ali || exit 1;
-steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
+steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
   data/train data/lang exp/sgmm2_5a_ali exp/sgmm2_5a_denlats
 wait
@@ -104,10 +102,10 @@ test=$1
 wait
-steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
+steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
   --use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5b exp/sgmm2_5b_ali
-steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
+steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
   data/train data/lang exp/sgmm2_5b_ali exp/sgmm2_5b_denlats
 wait
@@ -136,8 +134,6 @@ wait
   done
 done
 wait
-
-
 # Examples of combining some of the best decodings: SGMM+MMI with
@@ -149,14 +145,3 @@ local/score_combine.sh data/${test} \
   exp/sgmm2_5b_mmi_b0.1/decode_4g_${test}_it3 \
   exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_4g_${test}_it8_3
-# %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11
-# %WER 3.85 [ 217 / 5643, 35 ins, 11 del, 171 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3/wer_10
-# combined to:
-# %WER 3.76 [ 212 / 5643, 32 ins, 12 del, 168 sub ] exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3/wer_12
-# Checking MBR decode of baseline:
-cp -r -T exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3{,.mbr}
-local/score_mbr.sh data/test_eval92 data/lang_test_bd_tgpr exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr
-# MBR decoding did not seem to help (baseline was 3.85). I think this is normal at such low WERs.
-%WER 3.86 [ 218 / 5643, 35 ins, 11 del, 172 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr/wer_10

View file

@@ -13,6 +13,7 @@
 local/sprak_data_prep.sh || exit 1;
 # Perform text normalisation, prepare dict folder and LM data transcriptions
+# This setup uses previsously prepared data. eSpeak must be installed and in PATH to use dict_prep.sh
 #local/dict_prep.sh || exit 1;
 local/copy_dict.sh || exit 1;
@@ -29,33 +30,40 @@ mfccdir=mfcc
 # p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
 # wave files are corrupt
 # Will return a warning message because of the corrupt audio files, but compute them anyway
-steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/train exp/make_mfcc/train mfcc
-steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/test exp/make_mfcc/test mfcc
+# If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
+steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/test exp/make_mfcc/test mfcc &
+steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/dev exp/make_mfcc/dev mfcc &
+steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/train exp/make_mfcc/train mfcc || exit 1;
+wait
 # Compute cepstral mean and variance normalisation
-steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc && \
-steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc
+steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc &
+steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev mfcc &
+steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc
+wait
 # Repair data set (remove corrupt data points with corrupt audio)
-utils/fix_data_dir.sh data/train && utils/fix_data_dir.sh data/test
-utils/fix_data_dir.sh data/dev
+utils/fix_data_dir.sh data/test &
+utils/fix_data_dir.sh data/dev &
+utils/fix_data_dir.sh data/train
+wait
 # Train LM with CMUCLMTK
+# This setup uses IRSTLM
 #local/sprak_train_lm.sh &> data/local/cmuclmtk/lm.log
 # Train LM with irstlm
-local/train_irstlm.sh data/local/dict/transcripts.txt 3 "b3g" data/lang data/local/trainb3_lm &> data/local/b3g.log &
-local/train_irstlm.sh data/local/dict/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
-#local/train_irstlm.sh data/local/dict/transcripts.txt b4 "b4g" data/lang data/local/trainb4_lm &> data/local/b4g.log &
-#local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log &
+local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
+local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
 # Make subset with 1k utterances for rapid testing
 # Randomly selects 980 utterances from 7 speakers
 utils/subset_data_dir.sh --per-spk data/test 140 data/test1k &
-# Now make subset with the shortest 120k utterances.
+# Now make subset of the training data with the shortest 120k utterances.
 utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1;
 # Train monophone model on short utterances
@@ -66,24 +74,14 @@ steps/train_mono.sh --nj 30 --cmd "$train_cmd" \
 wait
 utils/mkgraph.sh --mono data/lang_test_3g exp/mono0a exp/mono0a/graph_3g &
-#utils/mkgraph.sh --mono data/lang_test_b3g exp/mono0a exp/mono0a/graph_b3g &
-#utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
-#utils/mkgraph.sh --mono data/lang_test_b4g exp/mono0a exp/mono0a/graph_b4g
+utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
 # Ensure that all graphs are constructed
 wait
-#(
-#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-#  exp/mono0a/graph_b3g data/test1k exp/mono0a/decode_b3g_test1k
-#) &
 steps/decode.sh --nj 7 --cmd "$decode_cmd" \
   exp/mono0a/graph_3g data/test1k exp/mono0a/decode_3g_test1k
-exit 0;
 # steps/align_si.sh --boost-silence 1.25 --nj 42 --cmd "$train_cmd" \
 steps/align_si.sh --nj 30 --cmd "$train_cmd" \
   data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
@@ -96,19 +94,19 @@ wait
 utils/mkgraph.sh data/lang_test_3g exp/tri1 exp/tri1/graph_3g &
-utils/mkgraph.sh data/lang_test_b3g exp/tri1 exp/tri1/graph_b3g || exit 1;
+utils/mkgraph.sh data/lang_test_4g exp/tri1 exp/tri1/graph_4g || exit 1;
-#(
-#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-#  exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
-#) &
+(
+steps/decode.sh --nj 7 --cmd "$decode_cmd" \
+  exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
+) &
 (
 steps/decode.sh --nj 7 --cmd "$decode_cmd" \
   exp/tri1/graph_3g data/test1k exp/tri1/decode_3g_test1k || exit 1;
 ) &
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri1/graph_b3g data/test1k exp/tri1/decode_b3g_test1k || exit 1;
+wait
 steps/align_si.sh --nj 30 --cmd "$train_cmd" \
   data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
@@ -120,14 +118,12 @@ steps/train_deltas.sh --cmd "$train_cmd" \
 utils/mkgraph.sh data/lang_test_3g exp/tri2a exp/tri2a/graph_3g || exit 1;
-#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-#  exp/tri2a/graph_b3g data/test1k exp/tri2a/decode_b3g_test1k || exit 1;
 steps/decode.sh --nj 7 --cmd "$decode_cmd" \
   exp/tri2a/graph_3g data/test1k exp/tri2a/decode_3g_test1k || exit 1;
 steps/train_lda_mllt.sh --cmd "$train_cmd" \
-  --splice-opts "--left-context=3 --right-context=3" \
+  --splice-opts "--left-context=5 --right-context=5" \
   2500 15000 data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
 utils/mkgraph.sh data/lang_test_3g exp/tri2b exp/tri2b/graph_3g || exit 1;
@@ -135,7 +131,6 @@ steps/decode.sh --nj 7 --cmd "$decode_cmd" \
   exp/tri2b/graph_3g data/test1k exp/tri2b/decode_3g_test1k || exit 1;
-# Align tri2b system with si84 data.
 steps/align_si.sh --nj 30 --cmd "$train_cmd" \
   --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
@@ -151,18 +146,17 @@ steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
 # Trying 4-gram language model
-local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
 utils/mkgraph.sh data/lang_test_4g exp/tri3b exp/tri3b/graph_4g || exit 1;
 steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 \
   exp/tri3b/graph_4g data/test1k exp/tri3b/decode_4g_test1k || exit 1;
 # Train RNN for reranking
 local/sprak_train_rnnlms.sh data/local/dict data/dev/transcripts.uniq data/local/rnnlms/g_c380_d1k_h100_v130k
 # Consumes a lot of memory! Do not run in parallel
 local/sprak_run_rnnlms_tri3b.sh data/lang_test_3g data/local/rnnlms/g_c380_d1k_h100_v130k data/test1k exp/tri3b/decode_3g_test1k
 # From 3b system
 steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
   data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
@@ -175,9 +169,6 @@ steps/train_sat.sh --cmd "$train_cmd" \
 utils/mkgraph.sh data/lang_test_3g exp/tri4a exp/tri4a/graph_3g || exit 1;
 steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
   exp/tri4a/graph_3g data/test1k exp/tri4a/decode_3g_test1k || exit 1;
-# steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-#  exp/tri4a/graph_tgpr data/test_eval92 exp/tri4a/decode_tgpr_eval92 || exit 1;
 steps/train_quick.sh --cmd "$train_cmd" \
@@ -195,9 +186,7 @@ steps/train_quick.sh --cmd "$train_cmd" \
 wait
-# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
-# all the data). Use 30 jobs.
+# alignment used to train nnets and sgmms
 steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
   data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
@@ -207,9 +196,6 @@ local/sprak_run_nnet_cpu.sh 3g test1k
 ## Works
 local/sprak_run_sgmm2.sh test1k
-# You probably want to run the hybrid recipe as it is complementary:
-#local/run_hybrid.sh
 # Getting results [see RESULTS file]
 for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
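For each decode directory, grep WER collects the per-LMWT scoring lines and utils/best_wer.sh keeps only the best one; each output line has the shape of the (removed) WSJ examples in the SGMM script above, e.g.:

    %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11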