sandbox/akirkedal: Refactored the data preparation scripts; the lexicon is now downloaded from openslr.org

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/akirkedal@4269 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Andreas Soeborg Kirkedal 2014-08-06 23:40:01 +00:00
Parent cdfaba14c6
Commit 7e6af54b2d
8 changed files: 132 additions and 65837 deletions

View file

@@ -22,48 +22,16 @@ exproot=$(pwd)
dir=data/local/dict
mkdir -p $dir
# Dictionary preparation:
# Normalise transcripts and create a transcript file
# Removes '.,:;?' and strips '\' before '\Komma' (dictated ',')
# Outputs a normalised transcript without utterance ids and a list of utterance ids
echo "Normalising"
trainsrc=data/local/trainsrc
rm -rf $trainsrc
mkdir $trainsrc
mv data/train/text1 $trainsrc/text1
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $trainsrc/text1 $trainsrc/onlyids $dir/transcripts.tmp
# Additional normalisation: uppercasing, writing out numbers etc.,
# and recombining with the utterance ids below
local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am
cp $dir/transcripts.am $trainsrc/onlytext
paste -d ' ' $trainsrc/onlyids $trainsrc/onlytext > data/train/text
utils/validate_data_dir.sh --no-feats data/train || exit 1;
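As a concrete illustration of the two normalisation stages above, a minimal sketch (the sample line and file names are hypothetical, not part of the recipe):
# Sketch: one dictated line through both stages.
echo 'u001 Han sagde: ja\Komma tak.' > text1.sample
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl \
  text1.sample ids.sample norm.sample    # strips '.,:;?', handles '\Komma', splits off the id
local/norm_dk/format_text.sh am norm.sample > am.sample    # uppercases, writes out numbers
paste -d ' ' ids.sample am.sample    # back to Kaldi's "<utt-id> <text>" format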
# lmsents is output by sprak_data_prep.sh and contains
# sentences that are disjoint from the test and dev set
python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm
wait
# Create wordlist from the AM transcripts
cat $dir/transcripts.am | tr [:blank:] '\n' | sort -u > $dir/wlist.txt &
# Because the training data is read aloud, there are many occurrences of the same
# sentence and a bias towards the domain. Make a version where
# the sentences are unique to reduce bias.
local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt
sort -u $dir/transcripts.txt > $dir/transcripts.uniq
# This lexicon was created using eSpeak.
# To extend the setup, see local/dict_prep.sh
# Copy pre-made phone table
cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
# Copy pre-made lexicon
cp local/dictsrc/lexicon.txt $dir/lexicon.txt
wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=data/local/data/download
tar -xzf data/local/data/download/lexicon-da.tar.gz -C $dir
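Because the recipe may be re-run, a hedged sketch of an idempotent variant of this download step (it assumes, as the surrounding code does, that the archive unpacks lexicon.txt into $dir):
archive=data/local/data/download/lexicon-da.tar.gz
if [ ! -f $archive ]; then
  wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=data/local/data/download
fi
tar -xzf $archive -C $dir
[ -f $dir/lexicon.txt ] || { echo "lexicon-da.tar.gz did not yield lexicon.txt"; exit 1; }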
# silence phones, one per line.
@@ -72,30 +40,7 @@ echo SIL > $dir/optional_silence.txt
touch $dir/extra_questions.txt
# Repeat text preparation on test set, but do not add to dictionary
testsrc=data/local/testsrc
rm -rf $testsrc
mkdir $testsrc
mv data/test/text1 $testsrc/text1
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $testsrc/text1 $testsrc/onlyids $testsrc/transcripts.am
local/norm_dk/format_text.sh am $testsrc/transcripts.am > $testsrc/onlytext
paste -d ' ' $testsrc/onlyids $testsrc/onlytext > data/test/text
utils/validate_data_dir.sh --no-feats data/test || exit 1;
# Repeat text preparation on dev set, but do not add to dictionary
devsrc=data/local/devsrc
rm -rf $devsrc
mkdir $devsrc
mv data/dev/text1 $devsrc/text1
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $devsrc/text1 $devsrc/onlyids $devsrc/transcripts.tmp
local/norm_dk/format_text.sh am $devsrc/transcripts.tmp > $devsrc/onlytext
paste -d ' ' $devsrc/onlyids $devsrc/onlytext > data/dev/text
# Also create a file that can be used for reranking using LMs
local/norm_dk/format_text.sh lm $devsrc/transcripts.tmp > data/dev/transcripts.txt
sort -u data/dev/transcripts.txt > data/dev/transcripts.uniq
utils/validate_data_dir.sh --no-feats data/dev || exit 1;
wait
## TODO: add cleanup commands
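One possible shape for that cleanup, sketched under the assumption that the intermediate files are no longer needed once data/{train,test,dev}/text exist:
# rm -rf $trainsrc $testsrc $devsrc    # normalisation staging dirs
# rm -f $dir/transcripts.tmp $dir/lmsents.norm    # intermediate transcripts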

View file

@@ -0,0 +1,34 @@
#!/bin/bash
# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
if [ $# != 2 ]; then
echo "Usage: create_dataset.sh <src-data-dir> <dest-dir> "
exit 1
fi
src=$1
dest=$2
mkdir -p $dest
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
paste -d ' ' $src/onlyids $src/onlytext > $dest/text
for f in wav.scp utt2spk; do
cp $src/$f $dest/$f
done
utils/utt2spk_to_spk2utt.pl $dest/utt2spk > $dest/spk2utt
utils/validate_data_dir.sh --no-feats $dest || exit 1;
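A sketch of the intended invocation; sprak_data_prep.sh (further down) calls this script as local/create_datasets.sh:
local/create_datasets.sh data/local/testsrc data/test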

View file

@@ -131,7 +131,7 @@ if __name__ == '__main__':
else:
traindata = create_parallel_kaldi(flist, "")
textout = codecs.open(os.path.join(outpath, "text1"), "w", "utf8")
textout = codecs.open(os.path.join(outpath, "text.unnormalised"), "w", "utf8")
wavout = codecs.open(os.path.join(outpath, "wav.scp"), "w")
utt2spkout = codecs.open(os.path.join(outpath, "utt2spk"), "w")
textout.writelines(traindata[0])
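A quick way to eyeball the three files written here, as a shell sketch ($outpath stands for the output directory given to the script):
head -n 1 $outpath/text.unnormalised $outpath/wav.scp $outpath/utt2spk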

File diff suppressed because it is too large

View file

@@ -21,14 +21,16 @@
mode=$1
tmp="$(mktemp -d)"
dir=$(pwd)/local/norm_dk
src=$dir/src.tmp
abbr=$dir/anot.tmp
rem=$dir/rem.tmp
line=$dir/line.tmp
num=$dir/num.tmp
nonum=$dir/nonum.tmp
src=$tmp/src.tmp
abbr=$tmp/anot.tmp
rem=$tmp/rem.tmp
line=$tmp/line.tmp
num=$tmp/num.tmp
nonum=$tmp/nonum.tmp
cat $2 | tr -d '\r' > $src
@@ -50,4 +52,4 @@ PERLIO=:utf8 perl -pe '$_=uc'
# Comment this line for debugging
wait
rm -f $abbr $rem $line
rm -rf $tmp
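A common hardening of this mktemp pattern, as a sketch that is not part of the commit: trap EXIT right after creating the directory so it is removed even if the script dies before the final rm:
tmp="$(mktemp -d)"
trap 'rm -rf "$tmp"' EXIT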

View file

@@ -6,10 +6,11 @@
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/arpa_lm
traindir=`pwd`/data/train
testdir=`pwd`/data/test
devdir=`pwd`/data/dev
lmdir=`pwd`/data/local/transcript_lm
traindir=`pwd`/data/local/trainsrc
testdir=`pwd`/data/local/testsrc
devdir=`pwd`/data/local/devsrc
rm -rf $lmdir $traindir $testdir $devdir
mkdir -p $dir $lmdir $traindir $testdir $devdir
local=`pwd`/local
utils=`pwd`/utils
@@ -18,7 +19,7 @@ utils=`pwd`/utils
# Checks if python3 is available on the system and installs it in userspace if not
# This recipe currently relies on version 3 because python3 uses utf8 as its internal
# string representation
if ! which python3 >&/dev/null; then
echo "Installing python3 since not on your path."
@@ -60,7 +61,7 @@ if [ ! -d $dir/download/0611 ]; then
echo "Corpus unpacked succesfully."
fi
. ./path.sh # Needed for KALDI_ROOT
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
@@ -76,62 +77,73 @@ mkdir -p $dir/corpus_processed/training/0565-1 $dir/corpus_processed/training/05
# Create parallel file lists and text files, but keep sound files in the same location to save disk space
# Writes the lists to data/local/data (~ 310h)
echo "Creating parallel data for training data."
python3 $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 & # ~130h
python3 $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 & # ~115h
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 & # ~51h
(
# Ditto dev set (~ 16h)
echo "Creating parallel data for test data."
rm -rf $dir/corpus_processed/dev03
mkdir -p $dir/corpus_processed/dev03
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 &
) &
(
# Ditto test set (about 9 hours)
echo "Creating parallel data for development data."
rm -rf $dir/corpus_processed/test06
mkdir -p $dir/corpus_processed/test06
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1;
) &
wait
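A bare wait discards the subshells' exit statuses; a stricter sketch that would abort the recipe if any background data-prep job failed (replacing the wait above):
for pid in $(jobs -p); do
  wait $pid || { echo "a sprak2kaldi.py job failed"; exit 1; }
done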
# Create the LM training data
# Test and dev data are disjoint from the training data, so we use those transcripts
# Because the training data is read aloud, there are many occurrences of the same
# sentence and a bias towards the domain. Make a version where
# the sentences are unique to reduce bias.
(
echo "Writing the LM text to file and normalising."
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist | while read l; do cat $l; done > $lmdir/lmsents
python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl $lmdir/lmsents $lmdir/lmsents.norm
local/norm_dk/format_text.sh lm $lmdir/lmsents.norm > $lmdir/transcripts.txt
sort -u $lmdir/transcripts.txt > $lmdir/transcripts.uniq
) &
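The transcripts.uniq written here becomes the LM training text; run.sh (below) consumes it like this:
local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "3g" data/lang data/local/train3_lm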
# Combine training file lists
echo "Combine file lists."
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist $dir/corpus_processed/training/0611_Stasjon05/txtlist > $dir/traintxtfiles
cat $dir/corpus_processed/training/0565-1/sndlist $dir/corpus_processed/training/0565-2/sndlist $dir/corpus_processed/training/0611_Stasjon05/sndlist > $dir/trainsndfiles
# LM training files (test data is disjoint from training data)
echo "Write file list with LM text files. (This will take a while)"
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist > $dir/lmtxtfiles
cat $dir/lmtxtfiles | while read l; do cat $l; done > $dir/lmsents &
# Move dev file lists to the right location
cp $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
cp $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
# Move dev file lists to the right location
mv $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
mv $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
cp $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
cp $dir/corpus_processed/test06/sndlist $dir/testsndfiles
# Move test file lists to the right location
mv $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
mv $dir/corpus_processed/test06/sndlist $dir/testsndfiles
# Write wav.scp, utt2spk and text1 for train, test and dev sets
# Write wav.scp, utt2spk and text.unnormalised for train, test and dev sets
# Use sph2pipe because the wav files are actually sph files
echo "Creating wav.scp, utt2spk and text1 for train, test and dev dirs."
echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev"
python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe &
python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe &
python3 $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe &
wait
# Create spk2utt file
utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt &
utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt &
utils/utt2spk_to_spk2utt.pl data/dev/utt2spk > data/dev/spk2utt
# Create the main data sets
local/create_datasets.sh $testdir data/test &
local/create_datasets.sh $devdir data/dev &
local/create_datasets.sh $traindir data/train &
wait
for d in train test dev; do
utils/validate_data_dir.sh --no-feats --no-text data/$d || exit 1;
done
## TODO
# Extract gender from spl files

View file

@@ -8,15 +8,13 @@
# Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh;
# this takes out the "symmetric SGMM" part which is not always helpful.
# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
# training, but this shouldn't have much effect.
test=$1
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
if [ ! -d exp/tri4b_ali ]; then
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
fi
steps/train_ubm.sh --cmd "$train_cmd" \
400 data/train data/lang exp/tri4b_ali exp/ubm5a || exit 1;
@@ -30,9 +28,9 @@ test=$1
exp/sgmm2_5a/graph_3g data/${test} exp/sgmm2_5a/decode_3g_${test}
) &
steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
--use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5a exp/sgmm2_5a_ali || exit 1;
steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
data/train data/lang exp/sgmm2_5a_ali exp/sgmm2_5a_denlats
wait
@@ -104,10 +102,10 @@ test=$1
wait
steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
--use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5b exp/sgmm2_5b_ali
steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
data/train data/lang exp/sgmm2_5b_ali exp/sgmm2_5b_denlats
wait
@@ -136,8 +134,6 @@ wait
done
done
wait
# Examples of combining some of the best decodings: SGMM+MMI with
@@ -149,14 +145,3 @@ local/score_combine.sh data/${test} \
exp/sgmm2_5b_mmi_b0.1/decode_4g_${test}_it3 \
exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_4g_${test}_it8_3
# %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11
# %WER 3.85 [ 217 / 5643, 35 ins, 11 del, 171 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3/wer_10
# combined to:
# %WER 3.76 [ 212 / 5643, 32 ins, 12 del, 168 sub ] exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3/wer_12
# Checking MBR decode of baseline:
cp -r -T exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3{,.mbr}
local/score_mbr.sh data/test_eval92 data/lang_test_bd_tgpr exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr
# MBR decoding did not seem to help (baseline was 3.85). I think this is normal at such low WERs.
# %WER 3.86 [ 218 / 5643, 35 ins, 11 del, 172 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr/wer_10
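This script takes the test-set name as its single argument; run.sh (below) invokes it as:
local/sprak_run_sgmm2.sh test1k    # $1=test1k selects data/test1k in the decode steps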

View file

@@ -13,6 +13,7 @@
local/sprak_data_prep.sh || exit 1;
# Perform text normalisation, prepare dict folder and LM data transcriptions
# This setup uses previously prepared data. eSpeak must be installed and on PATH to use dict_prep.sh
#local/dict_prep.sh || exit 1;
local/copy_dict.sh || exit 1;
@@ -29,33 +30,40 @@ mfccdir=mfcc
# p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
# wave files are corrupt
# A warning will be printed for the corrupt audio files, but features are computed for the rest anyway
steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/train exp/make_mfcc/train mfcc
steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/test exp/make_mfcc/test mfcc
# If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/test exp/make_mfcc/test mfcc &
steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/dev exp/make_mfcc/dev mfcc &
steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/train exp/make_mfcc/train mfcc || exit 1;
wait
# Compute cepstral mean and variance normalisation
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc && \
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc &
steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev mfcc &
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc
wait
# Repair data set (remove data points with corrupt audio)
utils/fix_data_dir.sh data/train && utils/fix_data_dir.sh data/test
utils/fix_data_dir.sh data/dev
utils/fix_data_dir.sh data/test &
utils/fix_data_dir.sh data/dev &
utils/fix_data_dir.sh data/train
wait
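An optional sanity check after the repair, as a sketch: report how many utterances survived fix_data_dir.sh (one line per utterance in each text file):
for d in train test dev; do
  echo "data/$d: $(wc -l < data/$d/text) utterances after fixing"
done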
# Train LM with CMUCLMTK
# This setup uses IRSTLM
#local/sprak_train_lm.sh &> data/local/cmuclmtk/lm.log
# Train LM with irstlm
local/train_irstlm.sh data/local/dict/transcripts.txt 3 "b3g" data/lang data/local/trainb3_lm &> data/local/b3g.log &
local/train_irstlm.sh data/local/dict/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
#local/train_irstlm.sh data/local/dict/transcripts.txt b4 "b4g" data/lang data/local/trainb4_lm &> data/local/b4g.log &
#local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log &
local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
# Make subset with 1k utterances for rapid testing
# Randomly selects 980 utterances from 7 speakers
utils/subset_data_dir.sh --per-spk data/test 140 data/test1k &
# Now make subset with the shortest 120k utterances.
# Now make subset of the training data with the shortest 120k utterances.
utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1;
# Train monophone model on short utterances
@@ -66,24 +74,14 @@ steps/train_mono.sh --nj 30 --cmd "$train_cmd" \
wait
utils/mkgraph.sh --mono data/lang_test_3g exp/mono0a exp/mono0a/graph_3g &
#utils/mkgraph.sh --mono data/lang_test_b3g exp/mono0a exp/mono0a/graph_b3g &
#utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
#utils/mkgraph.sh --mono data/lang_test_b4g exp/mono0a exp/mono0a/graph_b4g
utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
# Ensure that all graphs are constructed
wait
#(
#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
# exp/mono0a/graph_b3g data/test1k exp/mono0a/decode_b3g_test1k
#) &
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/mono0a/graph_3g data/test1k exp/mono0a/decode_3g_test1k
exit 0;  # NB: the recipe currently stops here; remove this line to run the triphone and SGMM stages below
# steps/align_si.sh --boost-silence 1.25 --nj 42 --cmd "$train_cmd" \
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
@@ -96,19 +94,19 @@ wait
utils/mkgraph.sh data/lang_test_3g exp/tri1 exp/tri1/graph_3g &
utils/mkgraph.sh data/lang_test_b3g exp/tri1 exp/tri1/graph_b3g || exit 1;
utils/mkgraph.sh data/lang_test_4g exp/tri1 exp/tri1/graph_4g || exit 1;
#(
#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
# exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
#) &
(
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
) &
(
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri1/graph_3g data/test1k exp/tri1/decode_3g_test1k || exit 1;
) &
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri1/graph_b3g data/test1k exp/tri1/decode_b3g_test1k || exit 1;
wait
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
@@ -120,14 +118,12 @@ steps/train_deltas.sh --cmd "$train_cmd" \
utils/mkgraph.sh data/lang_test_3g exp/tri2a exp/tri2a/graph_3g || exit 1;
#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
# exp/tri2a/graph_b3g data/test1k exp/tri2a/decode_b3g_test1k || exit 1;
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri2a/graph_3g data/test1k exp/tri2a/decode_3g_test1k || exit 1;
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
--splice-opts "--left-context=5 --right-context=5" \
2500 15000 data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
utils/mkgraph.sh data/lang_test_3g exp/tri2b exp/tri2b/graph_3g || exit 1;
@@ -135,7 +131,6 @@ steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri2b/graph_3g data/test1k exp/tri2b/decode_3g_test1k || exit 1;
# Align tri2b system with the training data.
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
--use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
@@ -151,18 +146,17 @@ steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
# Trying 4-gram language model
local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
utils/mkgraph.sh data/lang_test_4g exp/tri3b exp/tri3b/graph_4g || exit 1;
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 \
exp/tri3b/graph_4g data/test1k exp/tri3b/decode_4g_test1k || exit 1;
# Train RNN for reranking
local/sprak_train_rnnlms.sh data/local/dict data/dev/transcripts.uniq data/local/rnnlms/g_c380_d1k_h100_v130k
# Consumes a lot of memory! Do not run in parallel
local/sprak_run_rnnlms_tri3b.sh data/lang_test_3g data/local/rnnlms/g_c380_d1k_h100_v130k data/test1k exp/tri3b/decode_3g_test1k
# From 3b system
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
@@ -175,9 +169,6 @@ steps/train_sat.sh --cmd "$train_cmd" \
utils/mkgraph.sh data/lang_test_3g exp/tri4a exp/tri4a/graph_3g || exit 1;
steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
exp/tri4a/graph_3g data/test1k exp/tri4a/decode_3g_test1k || exit 1;
# steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
# exp/tri4a/graph_tgpr data/test_eval92 exp/tri4a/decode_tgpr_eval92 || exit 1;
steps/train_quick.sh --cmd "$train_cmd" \
@@ -195,9 +186,7 @@ steps/train_quick.sh --cmd "$train_cmd" \
wait
# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
# all the data). Use 30 jobs.
# alignment used to train nnets and sgmms
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
@@ -207,9 +196,6 @@ local/sprak_run_nnet_cpu.sh 3g test1k
## Works
local/sprak_run_sgmm2.sh test1k
# You probably want to run the hybrid recipe as it is complementary:
#local/run_hybrid.sh
# Getting results [see RESULTS file]
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done