sandbox/akirkedal: Refactored data preparation scripts; the lexicon is now downloaded from openslr.org

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/akirkedal@4269 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Author: Andreas Soeborg Kirkedal, 2014-08-06 23:40:01 +00:00
Parent: cdfaba14c6
Commit: 7e6af54b2d
8 changed files: 132 additions and 65837 deletions

View file

@@ -22,48 +22,16 @@ exproot=$(pwd)
 dir=data/local/dict
 mkdir -p $dir
 # Dictionary preparation:
-# Normalise transcripts and create a transcript file
-# Removes '.,:;?' and removes '\' before '\Komma' (dictated ',')
-# outputs a normalised transcript without utterance ids and a list of utterance ids
-echo "Normalising"
-trainsrc=data/local/trainsrc
-rm -rf $trainsrc
-mkdir $trainsrc
-mv data/train/text1 $trainsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $trainsrc/text1 $trainsrc/onlyids $dir/transcripts.tmp
-# Additional normalisation, uppercasing, writing numbers etc.
-# and recombine with
-local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am
-cp $dir/transcripts.am $trainsrc/onlytext
-paste -d ' ' $trainsrc/onlyids $trainsrc/onlytext > data/train/text
-utils/validate_data_dir.sh --no-feat data/train || exit 1;
-# lmsents is output by sprak_data_prep.sh and contains
-# sentences that are disjoint from the test and dev set
-python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm
-wait
-# Create wordlist from the AM transcripts
-cat $dir/transcripts.am | tr [:blank:] '\n' | sort -u > $dir/wlist.txt &
-# Because training data is read aloud, there are many occurences of the same
-# sentence and bias towards the domain. Make a version where
-# the sentences are unique to reduce bias.
-local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt
-sort -u $dir/transcripts.txt > $dir/transcripts.uniq
+# This lexicon was created using eSpeak.
+# To extend the setup, see local/dict_prep.sh
 # Copy pre-made phone table
 cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
 # Copy pre-made lexicon
-cp local/dictsrc/lexicon.txt $dir/lexicon.txt
+wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=data/local/data/download
+tar -xzf data/local/data/download/lexicon-da.tar.gz -C $dir
 # silence phones, one per line.
@@ -72,30 +40,7 @@ echo SIL > $dir/optional_silence.txt
 touch $dir/extra_questions.txt
-# Repeat text preparation on test set, but do not add to dictionary
-testsrc=data/local/testsrc
-rm -rf $testsrc
-mkdir $testsrc
-mv data/test/text1 $testsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $testsrc/text1 $testsrc/onlyids $testsrc/transcripts.am
-local/norm_dk/format_text.sh am $testsrc/transcripts.am > $testsrc/onlytext
-paste -d ' ' $testsrc/onlyids $testsrc/onlytext > data/test/text
-utils/validate_data_dir.sh --no-feat data/test || exit 1;
-# Repeat text preparation on dev set, but do not add to dictionary
-devsrc=data/local/devsrc
-rm -rf $devsrc
-mkdir $devsrc
-mv data/dev/text1 $devsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $devsrc/text1 $devsrc/onlyids $devsrc/transcripts.tmp
-local/norm_dk/format_text.sh am $devsrc/transcripts.tmp > $devsrc/onlytext
-paste -d ' ' $devsrc/onlyids $devsrc/onlytext > data/dev/text
-# Also create a file that can be used for reranking using LMs
-local/norm_dk/format_text.sh lm $devsrc/transcripts.tmp > data/dev/transcripts.txt
-sort -u data/dev/transcripts.txt > data/dev/transcripts.uniq
-utils/validate_data_dir.sh --no-feat data/dev || exit 1;
+wait
 ## TODO: add cleanup commands
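In isolation, the new lexicon step boils down to the following standalone sketch (not part of the commit; the cache-before-download guard is an added assumption, while the URL and directories are taken from the diff above):

    # Fetch the Danish lexicon from OpenSLR unless a cached tarball exists,
    # then unpack it into the dict dir.
    dir=data/local/dict
    download=data/local/data/download
    mkdir -p $dir $download
    [ -f $download/lexicon-da.tar.gz ] || \
      wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=$download
    tar -xzf $download/lexicon-da.tar.gz -C $dir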

View file

@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# != 2 ]; then
+  echo "Usage: create_dataset.sh <src-data-dir> <dest-dir> "
+  exit 1
+fi
+
+src=$1
+dest=$2
+
+mkdir $dest
+
+python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
+local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
+paste -d ' ' $src/onlyids $src/onlytext > $dest/text
+
+for f in wav.scp utt2spk; do
+  cp $src/$f $dest/$f
+done
+
+utils/utt2spk_to_spk2utt.pl $dest/utt2spk > $dest/spk2utt
+utils/validate_data_dir.sh --no-feats $dest || exit 1;
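As wired up in sprak_data_prep.sh below, this script is invoked once per data set, e.g. local/create_datasets.sh data/local/testsrc data/test; the source dir must already contain text.unnormalised, wav.scp and utt2spk. The utt2spk_to_spk2utt.pl step inverts the standard Kaldi utterance-to-speaker map. With illustrative IDs (not from the corpus):

    # utt2spk (one "utt spk" pair per line):    # spk2utt (speaker, then its utterances):
    #   s1_u001 s1                              #   s1 s1_u001 s1_u002
    #   s1_u002 s1                              #   s2 s2_u001
    #   s2_u001 s2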

View file

@@ -131,7 +131,7 @@ if __name__ == '__main__':
     else:
         traindata = create_parallel_kaldi(flist, "")
-    textout = codecs.open(os.path.join(outpath, "text1"), "w", "utf8")
+    textout = codecs.open(os.path.join(outpath, "text.unnormalised"), "w", "utf8")
     wavout = codecs.open(os.path.join(outpath, "wav.scp"), "w")
     utt2spkout = codecs.open(os.path.join(outpath, "utt2spk"), "w")
     textout.writelines(traindata[0])
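The wav.scp this script writes uses Kaldi's piped-command form, since the corpus audio is SPH: each line is an utterance ID followed by an sph2pipe command ending in "|". An illustrative entry (ID and paths made up):

    u0001 /path/to/kaldi/tools/sph2pipe_v2.5/sph2pipe -f wav /corpus/0565-1/u0001.sph |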

File diff suppressed because it is too large.

View file

@@ -21,14 +21,16 @@
 mode=$1
+tmp="$(mktemp -d)"
 dir=$(pwd)/local/norm_dk
-src=$dir/src.tmp
-abbr=$dir/anot.tmp
-rem=$dir/rem.tmp
-line=$dir/line.tmp
-num=$dir/num.tmp
-nonum=$dir/nonum.tmp
+src=$tmp/src.tmp
+abbr=$tmp/anot.tmp
+rem=$tmp/rem.tmp
+line=$tmp/line.tmp
+num=$tmp/num.tmp
+nonum=$tmp/nonum.tmp
 cat $2 | tr -d '\r' > $src
@@ -50,4 +52,4 @@ PERLIO=:utf8 perl -pe '$_=uc'
 # Comment this line for debugging
 wait
-rm -f $abbr $rem $line
+rm -rf $tmp
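Moving the intermediate .tmp files into a mktemp -d directory keeps them out of the source tree and makes concurrent runs safe. A common hardening of the same pattern (not what the committed script does) is to register the cleanup in a trap, so the directory is removed even on early exit:

    tmp="$(mktemp -d)"
    trap 'rm -rf "$tmp"' EXIT   # also fires on error or interrupt
    src=$tmp/src.tmp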

View file

@@ -6,10 +6,11 @@
 dir=`pwd`/data/local/data
-lmdir=`pwd`/data/local/arpa_lm
-traindir=`pwd`/data/train
-testdir=`pwd`/data/test
-devdir=`pwd`/data/dev
+lmdir=`pwd`/data/local/transcript_lm
+traindir=`pwd`/data/local/trainsrc
+testdir=`pwd`/data/local/testsrc
+devdir=`pwd`/data/local/devsrc
+rm -rf $lmdir $traindir $testdir $devdir
 mkdir -p $dir $lmdir $traindir $testdir $devdir
 local=`pwd`/local
 utils=`pwd`/utils
@@ -18,7 +19,7 @@ utils=`pwd`/utils
 # Checks if python3 is available on the system and install python3 in userspace if not
 # This recipe currently relies on version 3 because python3 uses utf8 as internal
-# representation
+# string representation
 if ! which python3 >&/dev/null; then
   echo "Installing python3 since not on your path."
@@ -60,7 +61,7 @@ if [ ! -d $dir/download/0611 ]; then
   echo "Corpus unpacked succesfully."
 fi
+. ./path.sh # Needed for KALDI_ROOT
 sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
 if [ ! -x $sph2pipe ]; then
   echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
@@ -76,62 +77,73 @@ mkdir -p $dir/corpus_processed/training/0565-1 $dir/corpus_processed/training/05
 # Create parallel file lists and text files, but keep sound files in the same location to save disk space
 # Writes the lists to data/local/data (~ 310h)
+echo "Creating parallel data for training data."
 python3 $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 & # ~130h
 python3 $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 & # ~115h
 python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 & # ~51h
+(
 # Ditto dev set (~ 16h)
+echo "Creating parallel data for test data."
 rm -rf $dir/corpus_processed/dev03
 mkdir -p $dir/corpus_processed/dev03
 python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 &
+) &
+(
 # Ditto test set (about 9 hours)
+echo "Creating parallel data for development data."
 rm -rf $dir/corpus_processed/test06
 mkdir -p $dir/corpus_processed/test06
 python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1;
+) &
 wait
+# Create the LM training data
+# Test and dev data is disjoint from training data, so we use those transcripts)
+# Because training data is read aloud, there are many occurences of the same
+# sentence and bias towards the domain. Make a version where
+# the sentences are unique to reduce bias.
+(
+echo "Writing the LM text to file and normalising."
+cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist | while read l; do cat $l; done > $lmdir/lmsents
+python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl $lmdir/lmsents $lmdir/lmsents.norm
+local/norm_dk/format_text.sh lm $lmdir/lmsents.norm > $lmdir/transcripts.txt
+sort -u $lmdir/transcripts.txt > $lmdir/transcripts.uniq
+) &
 # Combine training file lists
 echo "Combine file lists."
 cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist $dir/corpus_processed/training/0611_Stasjon05/txtlist > $dir/traintxtfiles
 cat $dir/corpus_processed/training/0565-1/sndlist $dir/corpus_processed/training/0565-2/sndlist $dir/corpus_processed/training/0611_Stasjon05/sndlist > $dir/trainsndfiles
-# LM training files (test data is disjoint from training data)
-echo "Write file list with LM text files. (This will take a while)"
-cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist > $dir/lmtxtfiles
-cat $dir/lmtxtfiles | while read l; do cat $l; done > $dir/lmsents &
 # Move test file lists to the right location
-mv $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
-mv $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
+cp $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
+cp $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
 # Move test file lists to the right location
-mv $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
-mv $dir/corpus_processed/test06/sndlist $dir/testsndfiles
+cp $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
+cp $dir/corpus_processed/test06/sndlist $dir/testsndfiles
-# Write wav.scp, utt2spk and text1 for train, test and dev sets with
+# Write wav.scp, utt2spk and text.unnormalised for train, test and dev sets with
 # Use sph2pipe because the wav files are actually sph files
-echo "Creating wav.scp, utt2spk and text1 for train, test and dev dirs."
+echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev"
 python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe &
 python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe &
 python3 $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe &
 wait
-# Create spk2utt file
-utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt &
-utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt &
-utils/utt2spk_to_spk2utt.pl data/dev/utt2spk > data/dev/spk2utt
+# Create the main data sets
+local/create_datasets.sh $testdir data/test &
+local/create_datasets.sh $devdir data/dev &
+local/create_datasets.sh $traindir data/train &
 wait
+for d in train test dev; do
+  utils/validate_data_dir.sh --no-feats --no-text data/$d || exit 1;
+done
 ## TODO
 # Extract gender from spl files
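The prep stages above now follow one pattern throughout: group related commands in a subshell, background the group with ( ... ) &, and join with wait. A runnable skeleton of the pattern (placeholder commands only, not from the recipe):

    #!/bin/bash
    (
      sleep 2; echo "dev prep done"    # stands in for the dev03 block
    ) &
    (
      sleep 1; echo "test prep done"   # stands in for the test06 block
    ) &
    wait                               # block until both groups finish
    echo "all prep done"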

View file

@@ -8,15 +8,13 @@
 # Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh;
 # this takes out the "symmetric SGMM" part which is not always helpful.
-# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
-# training, but this shouldn't have much effect.
 test=$1
-if [ ! -d xxp/tri4b_ali ]; then
-steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
-  data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
-fi
+steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
+  data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
 steps/train_ubm.sh --cmd "$train_cmd" \
   400 data/train data/lang exp/tri4b_ali exp/ubm5a || exit 1;
@@ -30,9 +28,9 @@ test=$1
   exp/sgmm2_5a/graph_3g data/${test} exp/sgmm2_5a/decode_3g_${test}
 ) &
-steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
+steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
   --use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5a exp/sgmm2_5a_ali || exit 1;
-steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
+steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
   data/train data/lang exp/sgmm2_5a_ali exp/sgmm2_5a_denlats
 wait
@@ -104,10 +102,10 @@ test=$1
 wait
-steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
+steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
   --use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5b exp/sgmm2_5b_ali
-steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
+steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
   data/train data/lang exp/sgmm2_5b_ali exp/sgmm2_5b_denlats
 wait
@@ -136,8 +134,6 @@ wait
   done
 done
 wait
-
-
 # Examples of combining some of the best decodings: SGMM+MMI with
@@ -149,14 +145,3 @@ local/score_combine.sh data/${test} \
   exp/sgmm2_5b_mmi_b0.1/decode_4g_${test}_it3 \
   exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_4g_${test}_it8_3
-# %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11
-# %WER 3.85 [ 217 / 5643, 35 ins, 11 del, 171 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3/wer_10
-# combined to:
-# %WER 3.76 [ 212 / 5643, 32 ins, 12 del, 168 sub ] exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3/wer_12
-# Checking MBR decode of baseline:
-cp -r -T exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3{,.mbr}
-local/score_mbr.sh data/test_eval92 data/lang_test_bd_tgpr exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr
-# MBR decoding did not seem to help (baseline was 3.85). I think this is normal at such low WERs.
-%WER 3.86 [ 218 / 5643, 35 ins, 11 del, 172 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr/wer_10

View file

@@ -13,6 +13,7 @@
 local/sprak_data_prep.sh || exit 1;
 # Perform text normalisation, prepare dict folder and LM data transcriptions
+# This setup uses previsously prepared data. eSpeak must be installed and in PATH to use dict_prep.sh
 #local/dict_prep.sh || exit 1;
 local/copy_dict.sh || exit 1;
@@ -29,33 +30,40 @@ mfccdir=mfcc
 # p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
 # wave files are corrupt
 # Will return a warning message because of the corrupt audio files, but compute them anyway
-steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/train exp/make_mfcc/train mfcc
-steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/test exp/make_mfcc/test mfcc
+# If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
+steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/test exp/make_mfcc/test mfcc &
+steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/dev exp/make_mfcc/dev mfcc &
+steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/train exp/make_mfcc/train mfcc || exit 1;
+wait
 # Compute cepstral mean and variance normalisation
-steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc && \
-steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc
+steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc &
+steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev mfcc &
+steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc
+wait
 # Repair data set (remove corrupt data points with corrupt audio)
-utils/fix_data_dir.sh data/train && utils/fix_data_dir.sh data/test
-utils/fix_data_dir.sh data/dev
+utils/fix_data_dir.sh data/test &
+utils/fix_data_dir.sh data/dev &
+utils/fix_data_dir.sh data/train
+wait
 # Train LM with CMUCLMTK
+# This setup uses IRSTLM
 #local/sprak_train_lm.sh &> data/local/cmuclmtk/lm.log
 # Train LM with irstlm
-local/train_irstlm.sh data/local/dict/transcripts.txt 3 "b3g" data/lang data/local/trainb3_lm &> data/local/b3g.log &
-local/train_irstlm.sh data/local/dict/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
-#local/train_irstlm.sh data/local/dict/transcripts.txt b4 "b4g" data/lang data/local/trainb4_lm &> data/local/b4g.log &
-#local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log &
+local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
+local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
 # Make subset with 1k utterances for rapid testing
 # Randomly selects 980 utterances from 7 speakers
 utils/subset_data_dir.sh --per-spk data/test 140 data/test1k &
-# Now make subset with the shortest 120k utterances.
+# Now make subset of the training data with the shortest 120k utterances.
 utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1;
 # Train monophone model on short utterances
@@ -66,24 +74,14 @@ steps/train_mono.sh --nj 30 --cmd "$train_cmd" \
 wait
 utils/mkgraph.sh --mono data/lang_test_3g exp/mono0a exp/mono0a/graph_3g &
-#utils/mkgraph.sh --mono data/lang_test_b3g exp/mono0a exp/mono0a/graph_b3g &
-#utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
-#utils/mkgraph.sh --mono data/lang_test_b4g exp/mono0a exp/mono0a/graph_b4g
+utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
 # Ensure that all graphs are constructed
 wait
-#(
-#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-#  exp/mono0a/graph_b3g data/test1k exp/mono0a/decode_b3g_test1k
-#) &
 steps/decode.sh --nj 7 --cmd "$decode_cmd" \
   exp/mono0a/graph_3g data/test1k exp/mono0a/decode_3g_test1k
-exit 0;
 # steps/align_si.sh --boost-silence 1.25 --nj 42 --cmd "$train_cmd" \
 steps/align_si.sh --nj 30 --cmd "$train_cmd" \
   data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
@@ -96,19 +94,19 @@ wait
 utils/mkgraph.sh data/lang_test_3g exp/tri1 exp/tri1/graph_3g &
-utils/mkgraph.sh data/lang_test_b3g exp/tri1 exp/tri1/graph_b3g || exit 1;
+utils/mkgraph.sh data/lang_test_4g exp/tri1 exp/tri1/graph_4g || exit 1;
-#(
-#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-#  exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
-#) &
+(
+steps/decode.sh --nj 7 --cmd "$decode_cmd" \
+  exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
+) &
 (
 steps/decode.sh --nj 7 --cmd "$decode_cmd" \
   exp/tri1/graph_3g data/test1k exp/tri1/decode_3g_test1k || exit 1;
 ) &
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri1/graph_b3g data/test1k exp/tri1/decode_b3g_test1k || exit 1;
+wait
 steps/align_si.sh --nj 30 --cmd "$train_cmd" \
   data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
@@ -120,14 +118,12 @@ steps/train_deltas.sh --cmd "$train_cmd" \
 utils/mkgraph.sh data/lang_test_3g exp/tri2a exp/tri2a/graph_3g || exit 1;
-#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-#  exp/tri2a/graph_b3g data/test1k exp/tri2a/decode_b3g_test1k || exit 1;
 steps/decode.sh --nj 7 --cmd "$decode_cmd" \
   exp/tri2a/graph_3g data/test1k exp/tri2a/decode_3g_test1k || exit 1;
 steps/train_lda_mllt.sh --cmd "$train_cmd" \
-  --splice-opts "--left-context=3 --right-context=3" \
+  --splice-opts "--left-context=5 --right-context=5" \
   2500 15000 data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
 utils/mkgraph.sh data/lang_test_3g exp/tri2b exp/tri2b/graph_3g || exit 1;
@@ -135,7 +131,6 @@ steps/decode.sh --nj 7 --cmd "$decode_cmd" \
   exp/tri2b/graph_3g data/test1k exp/tri2b/decode_3g_test1k || exit 1;
-# Align tri2b system with si84 data.
 steps/align_si.sh --nj 30 --cmd "$train_cmd" \
   --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
@@ -151,18 +146,17 @@ steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
 # Trying 4-gram language model
-local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
 utils/mkgraph.sh data/lang_test_4g exp/tri3b exp/tri3b/graph_4g || exit 1;
 steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 \
   exp/tri3b/graph_4g data/test1k exp/tri3b/decode_4g_test1k || exit 1;
 # Train RNN for reranking
 local/sprak_train_rnnlms.sh data/local/dict data/dev/transcripts.uniq data/local/rnnlms/g_c380_d1k_h100_v130k
 # Consumes a lot of memory! Do not run in parallel
 local/sprak_run_rnnlms_tri3b.sh data/lang_test_3g data/local/rnnlms/g_c380_d1k_h100_v130k data/test1k exp/tri3b/decode_3g_test1k
 # From 3b system
 steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
   data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
@@ -175,9 +169,6 @@ steps/train_sat.sh --cmd "$train_cmd" \
 utils/mkgraph.sh data/lang_test_3g exp/tri4a exp/tri4a/graph_3g || exit 1;
 steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
   exp/tri4a/graph_3g data/test1k exp/tri4a/decode_3g_test1k || exit 1;
-# steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-#  exp/tri4a/graph_tgpr data/test_eval92 exp/tri4a/decode_tgpr_eval92 || exit 1;
 steps/train_quick.sh --cmd "$train_cmd" \
@@ -195,9 +186,7 @@ steps/train_quick.sh --cmd "$train_cmd" \
 wait
-# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
-# all the data). Use 30 jobs.
+# alignment used to train nnets and sgmms
 steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
   data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
@@ -207,9 +196,6 @@ local/sprak_run_nnet_cpu.sh 3g test1k
 ## Works
 local/sprak_run_sgmm2.sh test1k
-# You probably want to run the hybrid recipe as it is complementary:
-#local/run_hybrid.sh
 # Getting results [see RESULTS file]
 for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
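For each decode directory, grep WER collects the per-LMWT scoring lines and utils/best_wer.sh keeps only the best one; each output line has the shape of the (removed) WSJ examples in the SGMM script above, e.g.:

    %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11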