sandbox/akirkedal: Refactored the data preparation scripts; the lexicon is now downloaded from openslr.org

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/akirkedal@4269 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Andreas Soeborg Kirkedal 2014-08-06 23:40:01 +00:00
Parent cdfaba14c6
Commit 7e6af54b2d
8 changed files: 132 additions and 65837 deletions

View file

@@ -22,48 +22,16 @@ exproot=$(pwd)
dir=data/local/dict
mkdir -p $dir
# Dictionary preparation:
# Normalise transcripts and create a transcript file
# Removes '.,:;?' and strips '\' before '\Komma' (dictated ',')
# Outputs a normalised transcript without utterance ids and a list of utterance ids
echo "Normalising"
trainsrc=data/local/trainsrc
rm -rf $trainsrc
mkdir $trainsrc
mv data/train/text1 $trainsrc/text1
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $trainsrc/text1 $trainsrc/onlyids $dir/transcripts.tmp
# Additional normalisation: uppercasing, writing out numbers etc.,
# and recombining with the utterance ids below
local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am
cp $dir/transcripts.am $trainsrc/onlytext
paste -d ' ' $trainsrc/onlyids $trainsrc/onlytext > data/train/text
utils/validate_data_dir.sh --no-feats data/train || exit 1;
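As a concrete illustration of the two normalisation stages above, a minimal sketch (the sample line and file names are hypothetical, not part of the recipe):
# Sketch: one dictated line through both stages.
echo 'u001 Han sagde: ja\Komma tak.' > text1.sample
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl \
  text1.sample ids.sample norm.sample    # strips '.,:;?', handles '\Komma', splits off the id
local/norm_dk/format_text.sh am norm.sample > am.sample    # uppercases, writes out numbers
paste -d ' ' ids.sample am.sample    # back to Kaldi's "<utt-id> <text>" format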
# lmsents is output by sprak_data_prep.sh and contains
# sentences that are disjoint from the test and dev set
python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm
wait
# Create wordlist from the AM transcripts
cat $dir/transcripts.am | tr [:blank:] '\n' | sort -u > $dir/wlist.txt &
# Because the training data is read aloud, there are many occurrences of the same
# sentence and a bias towards the domain. Make a version where
# the sentences are unique to reduce bias.
local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt
sort -u $dir/transcripts.txt > $dir/transcripts.uniq
# This lexicon was created using eSpeak.
# To extend the setup, see local/dict_prep.sh
# Copy pre-made phone table
cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
# Copy pre-made lexicon
cp local/dictsrc/lexicon.txt $dir/lexicon.txt
wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=data/local/data/download
tar -xzf data/local/data/download/lexicon-da.tar.gz -C $dir
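Because the recipe may be re-run, a hedged sketch of an idempotent variant of this download step (it assumes, as the surrounding code does, that the archive unpacks lexicon.txt into $dir):
archive=data/local/data/download/lexicon-da.tar.gz
if [ ! -f $archive ]; then
  wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=data/local/data/download
fi
tar -xzf $archive -C $dir
[ -f $dir/lexicon.txt ] || { echo "lexicon-da.tar.gz did not yield lexicon.txt"; exit 1; }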
# silence phones, one per line.
@@ -72,30 +40,7 @@ echo SIL > $dir/optional_silence.txt
touch $dir/extra_questions.txt
# Repeat text preparation on test set, but do not add to dictionary
testsrc=data/local/testsrc
rm -rf $testsrc
mkdir $testsrc
mv data/test/text1 $testsrc/text1
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $testsrc/text1 $testsrc/onlyids $testsrc/transcripts.am
local/norm_dk/format_text.sh am $testsrc/transcripts.am > $testsrc/onlytext
paste -d ' ' $testsrc/onlyids $testsrc/onlytext > data/test/text
utils/validate_data_dir.sh --no-feats data/test || exit 1;
# Repeat text preparation on dev set, but do not add to dictionary
devsrc=data/local/devsrc
rm -rf $devsrc
mkdir $devsrc
mv data/dev/text1 $devsrc/text1
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $devsrc/text1 $devsrc/onlyids $devsrc/transcripts.tmp
local/norm_dk/format_text.sh am $devsrc/transcripts.tmp > $devsrc/onlytext
paste -d ' ' $devsrc/onlyids $devsrc/onlytext > data/dev/text
# Also create a file that can be used for reranking using LMs
local/norm_dk/format_text.sh lm $devsrc/transcripts.tmp > data/dev/transcripts.txt
sort -u data/dev/transcripts.txt > data/dev/transcripts.uniq
utils/validate_data_dir.sh --no-feats data/dev || exit 1;
wait
## TODO: add cleanup commands
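One possible shape for that cleanup, sketched under the assumption that the intermediate files are no longer needed once data/{train,test,dev}/text exist:
# rm -rf $trainsrc $testsrc $devsrc    # normalisation staging dirs
# rm -f $dir/transcripts.tmp $dir/lmsents.norm    # intermediate transcripts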

View file

@@ -0,0 +1,34 @@
#!/bin/bash
# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
if [ $# != 2 ]; then
echo "Usage: create_dataset.sh <src-data-dir> <dest-dir> "
exit 1
fi
src=$1
dest=$2
mkdir -p $dest
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
paste -d ' ' $src/onlyids $src/onlytext > $dest/text
for f in wav.scp utt2spk; do
cp $src/$f $dest/$f
done
utils/utt2spk_to_spk2utt.pl $dest/utt2spk > $dest/spk2utt
utils/validate_data_dir.sh --no-feats $dest || exit 1;
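A sketch of the intended invocation; sprak_data_prep.sh (further down) calls this script as local/create_datasets.sh:
local/create_datasets.sh data/local/testsrc data/test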

View file

@@ -131,7 +131,7 @@ if __name__ == '__main__':
else:
traindata = create_parallel_kaldi(flist, "")
textout = codecs.open(os.path.join(outpath, "text1"), "w", "utf8")
textout = codecs.open(os.path.join(outpath, "text.unnormalised"), "w", "utf8")
wavout = codecs.open(os.path.join(outpath, "wav.scp"), "w")
utt2spkout = codecs.open(os.path.join(outpath, "utt2spk"), "w")
textout.writelines(traindata[0])
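A quick way to eyeball the three files written here, as a shell sketch ($outpath stands for the output directory given to the script):
head -n 1 $outpath/text.unnormalised $outpath/wav.scp $outpath/utt2spk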

File diff suppressed because it is too large

View file

@@ -21,14 +21,16 @@
mode=$1
tmp="$(mktemp -d)"
dir=$(pwd)/local/norm_dk
src=$dir/src.tmp
abbr=$dir/anot.tmp
rem=$dir/rem.tmp
line=$dir/line.tmp
num=$dir/num.tmp
nonum=$dir/nonum.tmp
src=$tmp/src.tmp
abbr=$tmp/anot.tmp
rem=$tmp/rem.tmp
line=$tmp/line.tmp
num=$tmp/num.tmp
nonum=$tmp/nonum.tmp
cat $2 | tr -d '\r' > $src
@@ -50,4 +52,4 @@ PERLIO=:utf8 perl -pe '$_=uc'
# Comment this line for debugging
wait
rm -f $abbr $rem $line
rm -rf $tmp
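A common hardening of this mktemp pattern, as a sketch that is not part of the commit: trap EXIT right after creating the directory so it is removed even if the script dies before the final rm:
tmp="$(mktemp -d)"
trap 'rm -rf "$tmp"' EXIT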

View file

@@ -6,10 +6,11 @@
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/arpa_lm
traindir=`pwd`/data/train
testdir=`pwd`/data/test
devdir=`pwd`/data/dev
lmdir=`pwd`/data/local/transcript_lm
traindir=`pwd`/data/local/trainsrc
testdir=`pwd`/data/local/testsrc
devdir=`pwd`/data/local/devsrc
rm -rf $lmdir $traindir $testdir $devdir
mkdir -p $dir $lmdir $traindir $testdir $devdir
local=`pwd`/local
utils=`pwd`/utils
@@ -18,7 +19,7 @@ utils=`pwd`/utils
# Checks if python3 is available on the system and installs it in userspace if not
# This recipe currently relies on version 3 because python3 uses utf8 as its internal
# string representation
if ! which python3 >&/dev/null; then
echo "Installing python3 since not on your path."
@@ -60,7 +61,7 @@ if [ ! -d $dir/download/0611 ]; then
echo "Corpus unpacked succesfully."
fi
. ./path.sh # Needed for KALDI_ROOT
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
@@ -76,62 +77,73 @@ mkdir -p $dir/corpus_processed/training/0565-1 $dir/corpus_processed/training/05
# Create parallel file lists and text files, but keep sound files in the same location to save disk space
# Writes the lists to data/local/data (~ 310h)
echo "Creating parallel data for training data."
python3 $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 & # ~130h
python3 $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 & # ~115h
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 & # ~51h
(
# Ditto dev set (~ 16h)
echo "Creating parallel data for test data."
rm -rf $dir/corpus_processed/dev03
mkdir -p $dir/corpus_processed/dev03
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 &
) &
(
# Ditto test set (about 9 hours)
echo "Creating parallel data for development data."
rm -rf $dir/corpus_processed/test06
mkdir -p $dir/corpus_processed/test06
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1;
) &
wait
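A bare wait discards the subshells' exit statuses; a stricter sketch that would abort the recipe if any background data-prep job failed (replacing the wait above):
for pid in $(jobs -p); do
  wait $pid || { echo "a sprak2kaldi.py job failed"; exit 1; }
done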
# Create the LM training data
# Test and dev data are disjoint from the training data, so we use those transcripts
# Because the training data is read aloud, there are many occurrences of the same
# sentence and a bias towards the domain. Make a version where
# the sentences are unique to reduce bias.
(
echo "Writing the LM text to file and normalising."
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist | while read l; do cat $l; done > $lmdir/lmsents
python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl $lmdir/lmsents $lmdir/lmsents.norm
local/norm_dk/format_text.sh lm $lmdir/lmsents.norm > $lmdir/transcripts.txt
sort -u $lmdir/transcripts.txt > $lmdir/transcripts.uniq
) &
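The transcripts.uniq written here becomes the LM training text; run.sh (below) consumes it like this:
local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "3g" data/lang data/local/train3_lm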
# Combine training file lists
echo "Combine file lists."
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist $dir/corpus_processed/training/0611_Stasjon05/txtlist > $dir/traintxtfiles
cat $dir/corpus_processed/training/0565-1/sndlist $dir/corpus_processed/training/0565-2/sndlist $dir/corpus_processed/training/0611_Stasjon05/sndlist > $dir/trainsndfiles
# LM training files (test data is disjoint from training data)
echo "Write file list with LM text files. (This will take a while)"
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist > $dir/lmtxtfiles
cat $dir/lmtxtfiles | while read l; do cat $l; done > $dir/lmsents &
# Move dev file lists to the right location
cp $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
cp $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
# Move dev file lists to the right location
mv $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
mv $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
cp $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
cp $dir/corpus_processed/test06/sndlist $dir/testsndfiles
# Move test file lists to the right location
mv $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
mv $dir/corpus_processed/test06/sndlist $dir/testsndfiles
# Write wav.scp, utt2spk and text1 for train, test and dev sets
# Write wav.scp, utt2spk and text.unnormalised for train, test and dev sets
# Use sph2pipe because the wav files are actually sph files
echo "Creating wav.scp, utt2spk and text1 for train, test and dev dirs."
echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev"
python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe &
python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe &
python3 $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe &
wait
# Create spk2utt file
utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt &
utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt &
utils/utt2spk_to_spk2utt.pl data/dev/utt2spk > data/dev/spk2utt
# Create the main data sets
local/create_datasets.sh $testdir data/test &
local/create_datasets.sh $devdir data/dev &
local/create_datasets.sh $traindir data/train &
wait
for d in train test dev; do
utils/validate_data_dir.sh --no-feats --no-text data/$d || exit 1;
done
## TODO
# Extract gender from spl files

View file

@@ -8,15 +8,13 @@
# Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh;
# this takes out the "symmetric SGMM" part which is not always helpful.
# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
# training, but this shouldn't have much effect.
test=$1
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
if [ ! -d exp/tri4b_ali ]; then
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
fi
steps/train_ubm.sh --cmd "$train_cmd" \
400 data/train data/lang exp/tri4b_ali exp/ubm5a || exit 1;
@@ -30,9 +28,9 @@ test=$1
exp/sgmm2_5a/graph_3g data/${test} exp/sgmm2_5a/decode_3g_${test}
) &
steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
--use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5a exp/sgmm2_5a_ali || exit 1;
steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
data/train data/lang exp/sgmm2_5a_ali exp/sgmm2_5a_denlats
wait
@@ -104,10 +102,10 @@ test=$1
wait
steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
--use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5b exp/sgmm2_5b_ali
steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
data/train data/lang exp/sgmm2_5b_ali exp/sgmm2_5b_denlats
wait
@@ -136,8 +134,6 @@ wait
done
done
wait
# Examples of combining some of the best decodings: SGMM+MMI with
@@ -149,14 +145,3 @@ local/score_combine.sh data/${test} \
exp/sgmm2_5b_mmi_b0.1/decode_4g_${test}_it3 \
exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_4g_${test}_it8_3
# %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11
# %WER 3.85 [ 217 / 5643, 35 ins, 11 del, 171 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3/wer_10
# combined to:
# %WER 3.76 [ 212 / 5643, 32 ins, 12 del, 168 sub ] exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3/wer_12
# Checking MBR decode of baseline:
cp -r -T exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3{,.mbr}
local/score_mbr.sh data/test_eval92 data/lang_test_bd_tgpr exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr
# MBR decoding did not seem to help (baseline was 3.85). I think this is normal at such low WERs.
# %WER 3.86 [ 218 / 5643, 35 ins, 11 del, 172 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr/wer_10
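This script takes the test-set name as its single argument; run.sh (below) invokes it as:
local/sprak_run_sgmm2.sh test1k    # $1=test1k selects data/test1k in the decode steps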

View file

@@ -13,6 +13,7 @@
local/sprak_data_prep.sh || exit 1;
# Perform text normalisation, prepare dict folder and LM data transcriptions
# This setup uses previously prepared data. eSpeak must be installed and on PATH to use dict_prep.sh
#local/dict_prep.sh || exit 1;
local/copy_dict.sh || exit 1;
@@ -29,33 +30,40 @@ mfccdir=mfcc
# p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
# wave files are corrupt
# A warning will be printed for the corrupt audio files, but features are computed for the rest anyway
steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/train exp/make_mfcc/train mfcc
steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/test exp/make_mfcc/test mfcc
# If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/test exp/make_mfcc/test mfcc &
steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/dev exp/make_mfcc/dev mfcc &
steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/train exp/make_mfcc/train mfcc || exit 1;
wait
# Compute cepstral mean and variance normalisation
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc && \
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc &
steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev mfcc &
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc
wait
# Repair data set (remove data points with corrupt audio)
utils/fix_data_dir.sh data/train && utils/fix_data_dir.sh data/test
utils/fix_data_dir.sh data/dev
utils/fix_data_dir.sh data/test &
utils/fix_data_dir.sh data/dev &
utils/fix_data_dir.sh data/train
wait
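An optional sanity check after the repair, as a sketch: report how many utterances survived fix_data_dir.sh (one line per utterance in each text file):
for d in train test dev; do
  echo "data/$d: $(wc -l < data/$d/text) utterances after fixing"
done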
# Train LM with CMUCLMTK
# This setup uses IRSTLM
#local/sprak_train_lm.sh &> data/local/cmuclmtk/lm.log
# Train LM with irstlm
local/train_irstlm.sh data/local/dict/transcripts.txt 3 "b3g" data/lang data/local/trainb3_lm &> data/local/b3g.log &
local/train_irstlm.sh data/local/dict/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
#local/train_irstlm.sh data/local/dict/transcripts.txt b4 "b4g" data/lang data/local/trainb4_lm &> data/local/b4g.log &
#local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log &
local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
# Make subset with 1k utterances for rapid testing
# Randomly selects 980 utterances from 7 speakers
utils/subset_data_dir.sh --per-spk data/test 140 data/test1k &
# Now make subset with the shortest 120k utterances.
# Now make subset of the training data with the shortest 120k utterances.
utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1;
# Train monophone model on short utterances
@@ -66,24 +74,14 @@ steps/train_mono.sh --nj 30 --cmd "$train_cmd" \
wait
utils/mkgraph.sh --mono data/lang_test_3g exp/mono0a exp/mono0a/graph_3g &
#utils/mkgraph.sh --mono data/lang_test_b3g exp/mono0a exp/mono0a/graph_b3g &
#utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
#utils/mkgraph.sh --mono data/lang_test_b4g exp/mono0a exp/mono0a/graph_b4g
utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
# Ensure that all graphs are constructed
wait
#(
#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
# exp/mono0a/graph_b3g data/test1k exp/mono0a/decode_b3g_test1k
#) &
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/mono0a/graph_3g data/test1k exp/mono0a/decode_3g_test1k
exit 0;  # NB: the recipe currently stops here; remove this line to run the triphone and SGMM stages below
# steps/align_si.sh --boost-silence 1.25 --nj 42 --cmd "$train_cmd" \
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
@@ -96,19 +94,19 @@ wait
utils/mkgraph.sh data/lang_test_3g exp/tri1 exp/tri1/graph_3g &
utils/mkgraph.sh data/lang_test_b3g exp/tri1 exp/tri1/graph_b3g || exit 1;
utils/mkgraph.sh data/lang_test_4g exp/tri1 exp/tri1/graph_4g || exit 1;
#(
#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
# exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
#) &
(
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
) &
(
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri1/graph_3g data/test1k exp/tri1/decode_3g_test1k || exit 1;
) &
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri1/graph_b3g data/test1k exp/tri1/decode_b3g_test1k || exit 1;
wait
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
@@ -120,14 +118,12 @@ steps/train_deltas.sh --cmd "$train_cmd" \
utils/mkgraph.sh data/lang_test_3g exp/tri2a exp/tri2a/graph_3g || exit 1;
#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
# exp/tri2a/graph_b3g data/test1k exp/tri2a/decode_b3g_test1k || exit 1;
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri2a/graph_3g data/test1k exp/tri2a/decode_3g_test1k || exit 1;
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
--splice-opts "--left-context=5 --right-context=5" \
2500 15000 data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
utils/mkgraph.sh data/lang_test_3g exp/tri2b exp/tri2b/graph_3g || exit 1;
@@ -135,7 +131,6 @@ steps/decode.sh --nj 7 --cmd "$decode_cmd" \
exp/tri2b/graph_3g data/test1k exp/tri2b/decode_3g_test1k || exit 1;
# Align tri2b system with the training data.
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
--use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
@@ -151,18 +146,17 @@ steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
# Trying 4-gram language model
local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
utils/mkgraph.sh data/lang_test_4g exp/tri3b exp/tri3b/graph_4g || exit 1;
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 \
exp/tri3b/graph_4g data/test1k exp/tri3b/decode_4g_test1k || exit 1;
# Train RNN for reranking
local/sprak_train_rnnlms.sh data/local/dict data/dev/transcripts.uniq data/local/rnnlms/g_c380_d1k_h100_v130k
# Consumes a lot of memory! Do not run in parallel
local/sprak_run_rnnlms_tri3b.sh data/lang_test_3g data/local/rnnlms/g_c380_d1k_h100_v130k data/test1k exp/tri3b/decode_3g_test1k
# From 3b system
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
@@ -175,9 +169,6 @@ steps/train_sat.sh --cmd "$train_cmd" \
utils/mkgraph.sh data/lang_test_3g exp/tri4a exp/tri4a/graph_3g || exit 1;
steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
exp/tri4a/graph_3g data/test1k exp/tri4a/decode_3g_test1k || exit 1;
# steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
# exp/tri4a/graph_tgpr data/test_eval92 exp/tri4a/decode_tgpr_eval92 || exit 1;
steps/train_quick.sh --cmd "$train_cmd" \
@@ -195,9 +186,7 @@ steps/train_quick.sh --cmd "$train_cmd" \
wait
# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
# all the data). Use 30 jobs.
# alignment used to train nnets and sgmms
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
@@ -207,9 +196,6 @@ local/sprak_run_nnet_cpu.sh 3g test1k
## Works
local/sprak_run_sgmm2.sh test1k
# You probably want to run the hybrid recipe as it is complementary:
#local/run_hybrid.sh
# Getting results [see RESULTS file]
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done