From 8dc30c3b6bb5208597daf6e058ef18fadf271669 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Fri, 2 Jan 2015 03:38:04 +0000 Subject: [PATCH] Updates to various data preparation scripts so validation checks on 'lang' directories will pass. It's possible some of these changes will break some setups, but it's not feasible to fully test this right now. git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4739 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8 --- egs/aurora4/s5/local/aurora4_format_data.sh | 29 +-- egs/aurora4/s5/local/wsj_data_prep.sh | 201 ----------------- egs/aurora4/s5/local/wsj_extend_dict.sh | 173 --------------- egs/aurora4/s5/local/wsj_format_data.sh | 86 -------- egs/aurora4/s5/local/wsj_format_local_lms.sh | 52 ----- egs/aurora4/s5/local/wsj_train_lms.sh | 202 ------------------ egs/aurora4/s5/local/wsj_train_rnnlms.sh | 153 ------------- egs/babel/s5/local/wsj_data_prep.sh | 201 ----------------- egs/babel/s5/local/wsj_extend_dict.sh | 172 --------------- egs/babel/s5/local/wsj_format_data.sh | 86 -------- egs/babel/s5/local/wsj_format_local_lms.sh | 54 ----- egs/babel/s5/local/wsj_prepare_dict.sh | 83 ------- egs/babel/s5/local/wsj_train_lms.sh | 202 ------------------ egs/babel/s5/local/wsj_train_rnnlms.sh | 153 ------------- egs/chime_wsj0/s5/local/chime_format_data.sh | 27 +-- egs/chime_wsj0/s5/run.sh | 3 - egs/librispeech/s5/local/format_data.sh | 22 +- egs/librispeech/s5/local/format_lms.sh | 71 +++--- egs/sprakbanken/s5/local/sprak_train_cmulm.sh | 29 +-- .../s5/local/sprak_train_irstlm.sh | 33 +-- egs/sprakbanken/s5/local/train_irstlm.sh | 28 +-- egs/swbd/s5b/local/swbd1_prepare_dict.sh | 2 +- egs/wsj/s5/local/wsj_format_data.sh | 28 +-- egs/wsj/s5/local/wsj_prepare_dict.sh | 7 +- 24 files changed, 68 insertions(+), 2029 deletions(-) delete mode 100755 egs/aurora4/s5/local/wsj_data_prep.sh delete mode 100755 egs/aurora4/s5/local/wsj_extend_dict.sh delete mode 100755 egs/aurora4/s5/local/wsj_format_data.sh delete mode 100755 egs/aurora4/s5/local/wsj_format_local_lms.sh delete mode 100755 egs/aurora4/s5/local/wsj_train_lms.sh delete mode 100755 egs/aurora4/s5/local/wsj_train_rnnlms.sh delete mode 100755 egs/babel/s5/local/wsj_data_prep.sh delete mode 100755 egs/babel/s5/local/wsj_extend_dict.sh delete mode 100755 egs/babel/s5/local/wsj_format_data.sh delete mode 100755 egs/babel/s5/local/wsj_format_local_lms.sh delete mode 100755 egs/babel/s5/local/wsj_prepare_dict.sh delete mode 100755 egs/babel/s5/local/wsj_train_lms.sh delete mode 100755 egs/babel/s5/local/wsj_train_rnnlms.sh diff --git a/egs/aurora4/s5/local/aurora4_format_data.sh b/egs/aurora4/s5/local/aurora4_format_data.sh index 94494e61f..87e55c566 100755 --- a/egs/aurora4/s5/local/aurora4_format_data.sh +++ b/egs/aurora4/s5/local/aurora4_format_data.sh @@ -38,11 +38,8 @@ echo Preparing language models for test for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do test=data/lang_test_${lm_suffix} - mkdir -p $test - for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ - phones/; do - cp -r data/lang/$f $test - done + cp -rT data/lang $test + gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt @@ -60,26 +57,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 
-0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. - - # Everything below is only for diagnostic. - # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. - mkdir -p $tmpdir/g - awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lexicon" >$tmpdir/g/select_empty.fst.txt - fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 - rm -r $tmpdir/g + + utils/validate_lang.pl --skip-determinizability-check $test || exit 1; done echo "Succeeded in formatting data." diff --git a/egs/aurora4/s5/local/wsj_data_prep.sh b/egs/aurora4/s5/local/wsj_data_prep.sh deleted file mode 100755 index 685b57aa7..000000000 --- a/egs/aurora4/s5/local/wsj_data_prep.sh +++ /dev/null @@ -1,201 +0,0 @@ -#!/bin/bash - -# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - - -if [ $# -le 3 ]; then - echo "Arguments should be a list of WSJ directories, see ../run.sh for example." - exit 1; -fi - - -dir=`pwd`/data/local/data -lmdir=`pwd`/data/local/nist_lm -mkdir -p $dir $lmdir -local=`pwd`/local -utils=`pwd`/utils - -. ./path.sh # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi - -cd $dir - -# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command -# line arguments being absolute pathnames. -rm -r links/ 2>/dev/null -mkdir links/ -ln -s $* links - -# Do some basic checks that we have what we expected. -if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then - echo "wsj_data_prep.sh: Spot check of command line arguments failed" - echo "Command line arguments must be absolute pathnames to WSJ directories" - echo "with names like 11-13.1." - exit 1; -fi - -# This version for SI-84 - -cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \ - $local/ndx2flist.pl $* | sort | \ - grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist - -nl=`cat train_si84.flist | wc -l` -[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl" - -# This version for SI-284 -cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \ - links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \ - $local/ndx2flist.pl $* | sort | \ - grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist - -nl=`cat train_si284.flist | wc -l` -[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl" - -# Now for the test sets. -# links/13-34.1/wsj1/doc/indices/readme.doc -# describes all the different test sets. -# Note: each test-set seems to come in multiple versions depending -# on different vocabulary sizes, verbalized vs. non-verbalized -# pronunciations, etc. We use the largest vocab and non-verbalized -# pronunciations. -# The most normal one seems to be the "baseline 60k test set", which -# is h1_p0. 
- -# Nov'92 (333 utts) -# These index files have a slightly different format; -# have to add .wv1 -cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \ - $local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \ - sort > test_eval92.flist - -# Nov'92 (330 utts, 5k vocab) -cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \ - $local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \ - sort > test_eval92_5k.flist - -# Nov'93: (213 utts) -# Have to replace a wrong disk-id. -cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \ - sed s/13_32_1/13_33_1/ | \ - $local/ndx2flist.pl $* | sort > test_eval93.flist - -# Nov'93: (213 utts, 5k) -cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \ - sed s/13_32_1/13_33_1/ | \ - $local/ndx2flist.pl $* | sort > test_eval93_5k.flist - -# Dev-set for Nov'93 (503 utts) -cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \ - $local/ndx2flist.pl $* | sort > test_dev93.flist - -# Dev-set for Nov'93 (513 utts, 5k vocab) -cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \ - $local/ndx2flist.pl $* | sort > test_dev93_5k.flist - - -# Dev-set Hub 1,2 (503, 913 utterances) - -# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. -# Sometimes this gets copied from the CD's with upcasing, don't know -# why (could be older versions of the disks). -find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist -find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist - - -# Finding the transcript files: -for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist - -# Convert the transcripts into our format (no normalization yet) -for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp - cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1 -done - -# Do some basic normalization steps. At this point we don't remove OOVs-- -# that will be done inside the training scripts, as we'd like to make the -# data-preparation stage independent of the specific lexicon used. -noiseword=""; -for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1; -done - -# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) -for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp -done - -# Make the utt2spk and spk2utt files. -for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk - cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; -done - - -#in case we want to limit lm's on most frequent words, copy lm training word frequency list -cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir -chmod u+w $lmdir/*.lst # had weird permissions on source. - -# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without -# verbalized pronunciations. This is the most common test setup, I understand. 
- -cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1; -chmod u+w $lmdir/lm_bg.arpa.gz - -# trigram would be: -cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \ - perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \ - gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1; - -prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1; -gzip -f $lmdir/lm_tgpr.arpa || exit 1; - -# repeat for 5k language models -cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1; -chmod u+w $lmdir/lm_bg_5k.arpa.gz - -# trigram would be: !only closed vocabulary here! -cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1; -chmod u+w $lmdir/lm_tg_5k.arpa.gz -gunzip $lmdir/lm_tg_5k.arpa.gz -tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz -rm $lmdir/lm_tg_5k.arpa - -prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1; -gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1; - - -if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then - rm wsj0-train-spkrinfo.txt - ! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \ - echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ - wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt -fi - -if [ ! -f wsj0-train-spkrinfo.txt ]; then - echo "Could not get the spkrinfo.txt file from LDC website (moved)?" - echo "This is possibly omitted from the training disks; couldn't find it." - echo "Everything else may have worked; we just may be missing gender info" - echo "which is only needed for VTLN-related diagnostics anyway." - exit 1 -fi -# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the -# LDC put it on the web. Perhaps it was accidentally omitted from the -# disks. - -cat links/11-13.1/wsj0/doc/spkrinfo.txt \ - links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \ - links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \ - links/13-34.1/wsj1/doc/train/spkrinfo.txt \ - ./wsj0-train-spkrinfo.txt | \ - perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \ - awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender - - -echo "Data preparation succeeded" diff --git a/egs/aurora4/s5/local/wsj_extend_dict.sh b/egs/aurora4/s5/local/wsj_extend_dict.sh deleted file mode 100755 index 38a06bb48..000000000 --- a/egs/aurora4/s5/local/wsj_extend_dict.sh +++ /dev/null @@ -1,173 +0,0 @@ -#!/bin/bash - -# This script builds a larger word-list and dictionary -# than used for the LMs supplied with the WSJ corpus. -# It uses a couple of strategies to fill-in words in -# the LM training data but not in CMUdict. One is -# to generate special prons for possible acronyms, that -# just consist of the constituent letters. The other -# is designed to handle derivatives of known words -# (e.g. deriving the pron of a plural from the pron of -# the base-word), but in a more general, learned-from-data -# way. -# It makes use of scripts in local/dict/ - -if [ $# -ne 1 ]; then - echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/" - exit 1 -fi -if [ "`basename $1`" != 13-32.1 ]; then - echo "Expecting the argument to this script to end in 13-32.1" - exit 1 -fi - -# e.g. 
-#srcdir=/mnt/matylda2/data/WSJ1/13-32.1 -export PATH=$PATH:`pwd`/local/dict/ -srcdir=$1 -mkdir -p data/local/dict_larger -dir=data/local/dict_larger -cp data/local/dict/* data/local/dict_larger # Various files describing phones etc. - # are there; we just want to copy them as the phoneset is the same. -rm data/local/dict_larger/lexicon.txt # we don't want this. -rm data/local/dict_larger/lexiconp.txt # we don't want this either. -mincount=2 # Minimum count of an OOV we will try to generate a pron for. - -[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1; - -# Remove comments from cmudict; print first field; remove -# words like FOO(1) which are alternate prons: our dict format won't -# include these markers. -grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a | - perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu - -cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu - -echo "Getting training data [this should take at least a few seconds; if not, there's a problem]" - -# Convert to uppercase, remove XML-like markings. -# For words ending in "." that are not in CMUdict, we assume that these -# are periods that somehow remained in the data during data preparation, -# and we we replace the "." with "\n". Note: we found this by looking at -# oov.counts below (before adding this rule). - -touch $dir/cleaned.gz -if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then - echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]"; -else - gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \ - | awk '/^){ chop; $isword{$_} = 1; } - while() { - @A = split(" ", $_); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "." - # and have no other "." in them: treat as period. - print "$a"; - if ($n+1 < @A) { print "\n"; } - } else { print "$a "; } - } - print "\n"; - } - ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz -fi - -# get unigram counts -echo "Getting unigram counts" -gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \ - awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams - -cat $dir/unigrams | awk -v dict=$dir/dict.cmu \ - 'BEGIN{while(getline $dir/oov.counts - -echo "Most frequent unseen unigrams are: " -head $dir/oov.counts - -# Prune away singleton counts, and remove things with numbers in -# (which should have been normalized) and with no letters at all. - - -cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \ - | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist - -# Automatic rule-finding... - -# First make some prons for possible acronyms. -# Note: we don't do this for things like U.K or U.N, -# or A.B. (which doesn't exist anyway), -# as we consider this normalization/spelling errors. - -cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms - -mkdir $dir/f $dir/b # forward, backward directions of rules... - # forward is normal suffix - # rules, backward is reversed (prefix rules). These - # dirs contain stuff we create while making the rule-based - # extensions to the dictionary. - -# Remove ; and , from words, if they are present; these -# might crash our scripts, as they are used as separators there. 
-filter_dict.pl $dir/dict.cmu > $dir/f/dict -cat $dir/oovlist | filter_dict.pl > $dir/f/oovs -reverse_dict.pl $dir/f/dict > $dir/b/dict -reverse_dict.pl $dir/f/oovs > $dir/b/oovs - -# The next stage takes a few minutes. -# Note: the forward stage takes longer, as English is -# mostly a suffix-based language, and there are more rules -# that it finds. -for d in $dir/f $dir/b; do - ( - cd $d - cat dict | get_rules.pl 2>get_rules.log >rules - get_rule_hierarchy.pl rules >hierarchy - awk '{print $1}' dict | get_candidate_prons.pl rules dict | \ - limit_candidate_prons.pl hierarchy | \ - score_prons.pl dict | \ - count_rules.pl >rule.counts - # the sort command below is just for convenience of reading. - score_rules.pl rules.with_scores - get_candidate_prons.pl rules.with_scores dict oovs | \ - limit_candidate_prons.pl hierarchy > oovs.candidates - ) & -done -wait - -# Merge the candidates. -reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates -select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \ - > $dir/dict.oovs - -cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged - -awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled -sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled - - -# add_counts.pl attaches to original counts to the list of handled/not-handled OOVs -add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts -add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts - -echo "**Top OOVs we handled are:**"; -head $dir/oovlist.handled.counts -echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; -head $dir/oovlist.not_handled.counts - - -echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`" -echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`" -echo "Count of OOVs we didn't handle due to low count is" \ - `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts` -# The two files created above are for humans to look at, as diagnostics. - -cat < $dir/lexicon.txt -!SIL SIL - SPN - SPN - NSN -EOF - -echo "Created $dir/lexicon.txt" diff --git a/egs/aurora4/s5/local/wsj_format_data.sh b/egs/aurora4/s5/local/wsj_format_data.sh deleted file mode 100755 index 80bd8596a..000000000 --- a/egs/aurora4/s5/local/wsj_format_data.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -# This script takes data prepared in a corpus-dependent way -# in data/local/, and converts it into the "canonical" form, -# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, -# data/train_si284, data/train_si84, etc. - -# Don't bother doing train_si84 separately (although we have the file lists -# in data/local/) because it's just the first 7138 utterances in train_si284. -# We'll create train_si84 after doing the feature extraction. - -. 
./path.sh || exit 1; - -echo "Preparing train and test data" -srcdir=data/local/data -lmdir=data/local/nist_lm -tmpdir=data/local/lm_tmp -lexicon=data/local/lang_tmp/lexiconp.txt -mkdir -p $tmpdir - -for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - mkdir -p data/$x - cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; - cp $srcdir/$x.txt data/$x/text || exit 1; - cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1; - cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1; - utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1; -done - - -# Next, for each type of language model, create the corresponding FST -# and the corresponding lang_test_* directory. - -echo Preparing language models for test - -for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do - test=data/lang_test_${lm_suffix} - mkdir -p $test - for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ - phones/; do - cp -r data/lang/$f $test - done - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. - - # Everything below is only for diagnostic. - # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. - mkdir -p $tmpdir/g - awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lexicon" >$tmpdir/g/select_empty.fst.txt - fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 - rm -r $tmpdir/g -done - -echo "Succeeded in formatting data." -rm -r $tmpdir diff --git a/egs/aurora4/s5/local/wsj_format_local_lms.sh b/egs/aurora4/s5/local/wsj_format_local_lms.sh deleted file mode 100755 index 606f94e6d..000000000 --- a/egs/aurora4/s5/local/wsj_format_local_lms.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -# Copyright Johns Hopkins University (Author: Daniel Povey) 2012 - -. ./path.sh - -[ ! -d data/lang_bd ] && echo "Expect data/local/lang_bd to exist" && exit 1; - -lm_srcdir_3g=data/local/local_lm/3gram-mincount -lm_srcdir_4g=data/local/local_lm/4gram-mincount - -[ ! 
-d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1; -[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1; - -for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do - rm -r $d 2>/dev/null - cp -r data/lang_bd $d -done - -lang=data/lang_bd - -# Be careful: this time we dispense with the grep -v ' ' so this might -# not work for LMs generated from all toolkits. -gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1; - fstisstochastic data/lang_test_bd_tgpr/G.fst - -gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1; - fstisstochastic data/lang_test_bd_tg/G.fst - -gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1; - fstisstochastic data/lang_test_bd_fg/G.fst - -gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1; - fstisstochastic data/lang_test_bd_fgpr/G.fst - -exit 0; diff --git a/egs/aurora4/s5/local/wsj_train_lms.sh b/egs/aurora4/s5/local/wsj_train_lms.sh deleted file mode 100755 index 060f387f2..000000000 --- a/egs/aurora4/s5/local/wsj_train_lms.sh +++ /dev/null @@ -1,202 +0,0 @@ -#!/bin/bash - -# This script trains LMs on the WSJ LM-training data. -# It requires that you have already run wsj_extend_dict.sh, -# to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily -# derive pronunciations for. - -# This script takes no command-line arguments - -dir=data/local/local_lm -srcdir=data/local/dict_larger -mkdir -p $dir -. ./path.sh || exit 1; # for KALDI_ROOT -export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH -( # First make sure the kaldi_lm toolkit is installed. - cd $KALDI_ROOT/tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - - - -if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then - echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist"; - echo "You need to run local/wsj_extend_dict.sh before running this script." - exit 1; -fi - -# Get a wordlist-- keep everything but silence, which should not appear in -# the LM. -awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt - -# Get training data with OOV words (w.r.t. our current vocab) replaced with . 
-echo "Getting training data with OOV words replaced with (train_nounk.gz)" -gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \ - 'BEGIN{while((getline0) v[$1]=1;} - {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ - | gzip -c > $dir/train_nounk.gz - -# Get unigram counts (without bos/eos, but this doens't matter here, it's -# only to get the word-map, which treats them specially & doesn't need their -# counts). -# Add a 1-count for each word in word-list by including that in the data, -# so all words appear. -gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \ - awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ - sort -nr > $dir/unigram.counts - -# Get "mapped" words-- a character encoding of the words that makes the common words very short. -cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map - -gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=1;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz - -# To save disk space, remove the un-mapped training data. We could -# easily generate it again if needed. -rm $dir/train_nounk.gz - -train_lm.sh --arpa --lmtype 3gram-mincount $dir -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 -# 7.8 million N-grams. - -prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ -# 1.45 million N-grams. -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 - -train_lm.sh --arpa --lmtype 4gram-mincount $dir -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 -# 10.3 million N-grams. - -prune_lm.sh --arpa 7.0 $dir/4gram-mincount -# 1.50 million N-grams -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 - - -exit 0 - -### Below here, this script is showing various commands that -## were run during LM tuning. - -train_lm.sh --arpa --lmtype 3gram-mincount $dir -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 -# 7.8 million N-grams. - -prune_lm.sh --arpa 3.0 $dir/3gram-mincount/ -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740 -# 2.5 million N-grams. - -prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ -# 1.45 million N-grams. -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 - -train_lm.sh --arpa --lmtype 4gram-mincount $dir -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 -# 10.3 million N-grams. - -prune_lm.sh --arpa 3.0 $dir/4gram-mincount -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294 -# 2.6 million N-grams. - -prune_lm.sh --arpa 4.0 $dir/4gram-mincount -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717 -# 2.15 million N-grams. - -prune_lm.sh --arpa 5.0 $dir/4gram-mincount -# 1.86 million N-grams -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023 - -prune_lm.sh --arpa 7.0 $dir/4gram-mincount -# 1.50 million N-grams -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 - -train_lm.sh --arpa --lmtype 3gram $dir -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866 -# 20.0 million N-grams - -! which ngram-count \ - && echo "SRILM tools not installed so not doing the comparison" && exit 1; - -################# -# You could finish the script here if you wanted. -# Below is to show how to do baselines with SRILM. 
-# You'd have to install the SRILM toolkit first. - -heldout_sent=10000 # Don't change this if you want result to be comparable with - # kaldi_lm results -sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. -mkdir -p $sdir -gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout -gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train -(echo ""; echo "" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s - -# 3-gram: -ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ - -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz -ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2 -#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs -#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437 - -# Trying 4-gram: -ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ - -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz -ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout -#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs -#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822 - -#3-gram with pruning: -ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ - -prune 0.0000001 -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz -ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout -#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs -#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616 -# Around 2.25M N-grams. -# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/" -# above, which gave 2.5 million N-grams and a perplexity of 156. - -# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams. -# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to -# the kaldi_lm experiments above without "-mincount". - -## From here is how to train with -# IRSTLM. This is not really working at the moment. -export IRSTLM=$KALDI_ROOT/tools/irstlm/ - -idir=$dir/irstlm -mkdir $idir -gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \ - gzip -c > $idir/train.gz - -$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no - cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\ -{print $0;}}' > vocab.irstlm.20k - - -$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \ - -n 3 -s improved-kneser-ney -b yes -# Testing perplexity with SRILM tools: -ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout -#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for in closed-vocabulary LM -#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs -#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599 - -# Perplexity is very bad (should be ~141, since we used -p option, -# not 175), -# but adding -debug 3 to the command line shows that -# the IRSTLM LM does not seem to sum to one properly, so it seems that -# it produces an LM that isn't interpretable in the normal way as an ARPA -# LM. 
- - - diff --git a/egs/aurora4/s5/local/wsj_train_rnnlms.sh b/egs/aurora4/s5/local/wsj_train_rnnlms.sh deleted file mode 100755 index c0d1afaf6..000000000 --- a/egs/aurora4/s5/local/wsj_train_rnnlms.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson - -# This script trains LMs on the WSJ LM-training data. -# It requires that you have already run wsj_extend_dict.sh, -# to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily -# derive pronunciations for. - -# This script takes no command-line arguments but takes the --cmd option. - -# Begin configuration section. -rand_seed=0 -cmd=run.pl -nwords=10000 # This is how many words we're putting in the vocab of the RNNLM. -hidden=30 -class=200 # Num-classes... should be somewhat larger than sqrt of nwords. -direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections. -rnnlm_ver=rnnlm-0.3e # version of RNNLM to use -# End configuration section. - -[ -f ./path.sh ] && . ./path.sh -. utils/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: local/wsj_train_rnnlms.sh [options] " - echo "For options, see top of script file" - exit 1; -fi - -dir=$1 -srcdir=data/local/dict_larger -mkdir -p $dir - -export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH - - -( # First make sure the kaldi_lm toolkit is installed. - # Note: this didn't work out of the box for me, I had to - # change the g++ version to just "g++" (no cross-compilation - # needed for me as I ran on a machine that had been setup - # as 64 bit by default. - cd $KALDI_ROOT/tools || exit 1; - if [ -d $rnnlm_ver ]; then - echo Not installing the rnnlm toolkit since it is already there. - else - echo Downloading and installing the rnnlm tools - # http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz - if [ ! -f $rnnlm_ver.tgz ]; then - wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1; - fi - mkdir $rnnlm_ver - cd $rnnlm_ver - tar -xvzf ../$rnnlm_ver.tgz || exit 1; - make CC=g++ || exit 1; - echo Done making the rnnlm tools - fi -) || exit 1; - - -if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then - echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist"; - echo "You need to run local/wsj_extend_dict.sh before running this script." - exit 1; -fi - -cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all - -# Get training data with OOV words (w.r.t. our current vocab) replaced with . -echo "Getting training data with OOV words replaced with (train_nounk.gz)" -gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \ - 'BEGIN{while((getline0) v[$1]=1;} - {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ - | gzip -c > $dir/all.gz - -echo "Splitting data into train and validation sets." -heldout_sent=10000 -gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data -gunzip -c $dir/all.gz | tail -n +$heldout_sent | \ - perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \ - > $dir/train.in # training data - - - # The rest will consist of a word-class represented by , that - # maps (with probabilities) to a whole class of words. - -# Get unigram counts from our training data, and use this to select word-list -# for RNNLM training; e.g. 10k most frequent words. 
Rest will go in a class -# that we (manually, at the shell level) assign probabilities for words that -# are in that class. Note: this word-list doesn't need to include ; this -# automatically gets added inside the rnnlm program. -# Note: by concatenating with $dir/wordlist.all, we are doing add-one -# smoothing of the counts. - -cat $dir/train.in $dir/wordlist.all | grep -v '' | grep -v '' | \ - awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ - sort -nr > $dir/unigram.counts - -head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn - -tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts - -tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts` -awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs - - -for type in train valid; do - cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \ - 'BEGIN{while((getline0) v[$1]=1;} - {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ - > $dir/$type -done -rm $dir/train.in # no longer needed-- and big. - -# Now randomize the order of the training data. -cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \ - sort | cut -f 2 > $dir/foo -mv $dir/foo $dir/train - -# OK we'll train the RNNLM on this data. - -# todo: change 100 to 320. -# using 100 classes as square root of 10k. -echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)" -#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \ -# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \ -# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log & - -$cmd $dir/rnnlm.log \ - $KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \ - -rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \ - -direct-order 4 -direct $direct -binary || exit 1; - - -# make it like a Kaldi table format, with fake utterance-ids. -cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids - -utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \ - $dir/valid.scores -nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes utterance-ids which - # is one per word, to account for the at the end of each sentence; this is the - # correct number to normalize buy. -p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores` -echo Perplexity is $p | tee $dir/perplexity.log - -rm $dir/train $dir/all.gz - -# This is a better setup, but takes a long time to train: -#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)" -#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \ -# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \ -# -direct-order 4 -direct 2000 -binary diff --git a/egs/babel/s5/local/wsj_data_prep.sh b/egs/babel/s5/local/wsj_data_prep.sh deleted file mode 100755 index 65143694f..000000000 --- a/egs/babel/s5/local/wsj_data_prep.sh +++ /dev/null @@ -1,201 +0,0 @@ -#!/bin/bash - -# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - - -if [ $# -le 3 ]; then - echo "Arguments should be a list of WSJ directories, see ../run.sh for example." - exit 1; -fi - - -dir=`pwd`/data/local/data -lmdir=`pwd`/data/local/nist_lm -mkdir -p $dir $lmdir -local=`pwd`/local -utils=`pwd`/utils - -. 
./path.sh # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi - -cd $dir - -# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command -# line arguments being absolute pathnames. -rm -r links/ 2>/dev/null -mkdir links/ -ln -s $* links - -# Do some basic checks that we have what we expected. -if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then - echo "wsj_data_prep.sh: Spot check of command line arguments failed" - echo "Command line arguments must be absolute pathnames to WSJ directories" - echo "with names like 11-13.1." - exit 1; -fi - -# This version for SI-84 - -cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \ - $local/ndx2flist.pl $* | sort | \ - grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist - -nl=`cat train_si84.flist | wc -l` -[ "$nl" -eq 7138 ] || echo "Warning: expected 37416 lines in train_si84.flist, got $nl" - -# This version for SI-284 -cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \ - links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \ - $local/ndx2flist.pl $* | sort | \ - grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist - -nl=`cat train_si284.flist | wc -l` -[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl" - -# Now for the test sets. -# links/13-34.1/wsj1/doc/indices/readme.doc -# describes all the different test sets. -# Note: each test-set seems to come in multiple versions depending -# on different vocabulary sizes, verbalized vs. non-verbalized -# pronunciations, etc. We use the largest vocab and non-verbalized -# pronunciations. -# The most normal one seems to be the "baseline 60k test set", which -# is h1_p0. - -# Nov'92 (333 utts) -# These index files have a slightly different format; -# have to add .wv1 -cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \ - $local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \ - sort > test_eval92.flist - -# Nov'92 (330 utts, 5k vocab) -cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \ - $local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \ - sort > test_eval92_5k.flist - -# Nov'93: (213 utts) -# Have to replace a wrong disk-id. -cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \ - sed s/13_32_1/13_33_1/ | \ - $local/ndx2flist.pl $* | sort > test_eval93.flist - -# Nov'93: (213 utts, 5k) -cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \ - sed s/13_32_1/13_33_1/ | \ - $local/ndx2flist.pl $* | sort > test_eval93_5k.flist - -# Dev-set for Nov'93 (503 utts) -cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \ - $local/ndx2flist.pl $* | sort > test_dev93.flist - -# Dev-set for Nov'93 (513 utts, 5k vocab) -cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \ - $local/ndx2flist.pl $* | sort > test_dev93_5k.flist - - -# Dev-set Hub 1,2 (503, 913 utterances) - -# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. -# Sometimes this gets copied from the CD's with upcasing, don't know -# why (could be older versions of the disks). 
-find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist -find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist - - -# Finding the transcript files: -for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist - -# Convert the transcripts into our format (no normalization yet) -for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp - cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1 -done - -# Do some basic normalization steps. At this point we don't remove OOVs-- -# that will be done inside the training scripts, as we'd like to make the -# data-preparation stage independent of the specific lexicon used. -noiseword=""; -for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1; -done - -# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) -for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp -done - -# Make the utt2spk and spk2utt files. -for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk - cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; -done - - -#in case we want to limit lm's on most frequent words, copy lm training word frequency list -cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir -chmod u+w $lmdir/*.lst # had weird permissions on source. - -# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without -# verbalized pronunciations. This is the most common test setup, I understand. - -cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1; -chmod u+w $lmdir/lm_bg.arpa.gz - -# trigram would be: -cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \ - perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \ - gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1; - -prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1; -gzip -f $lmdir/lm_tgpr.arpa || exit 1; - -# repeat for 5k language models -cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1; -chmod u+w $lmdir/lm_bg_5k.arpa.gz - -# trigram would be: !only closed vocabulary here! -cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1; -chmod u+w $lmdir/lm_tg_5k.arpa.gz -gunzip $lmdir/lm_tg_5k.arpa.gz -tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz -rm $lmdir/lm_tg_5k.arpa - -prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1; -gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1; - - -if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then - rm wsj0-train-spkrinfo.txt - ! 
wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \ - echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ - wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt -fi - -if [ ! -f wsj0-train-spkrinfo.txt ]; then - echo "Could not get the spkrinfo.txt file from LDC website (moved)?" - echo "This is possibly omitted from the training disks; couldn't find it." - echo "Everything else may have worked; we just may be missing gender info" - echo "which is only needed for VTLN-related diagnostics anyway." - exit 1 -fi -# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the -# LDC put it on the web. Perhaps it was accidentally omitted from the -# disks. - -cat links/11-13.1/wsj0/doc/spkrinfo.txt \ - links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \ - links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \ - links/13-34.1/wsj1/doc/train/spkrinfo.txt \ - ./wsj0-train-spkrinfo.txt | \ - perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \ - awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender - - -echo "Data preparation succeeded" diff --git a/egs/babel/s5/local/wsj_extend_dict.sh b/egs/babel/s5/local/wsj_extend_dict.sh deleted file mode 100755 index 4658767a0..000000000 --- a/egs/babel/s5/local/wsj_extend_dict.sh +++ /dev/null @@ -1,172 +0,0 @@ -#!/bin/bash - -# This script builds a larger word-list and dictionary -# than used for the LMs supplied with the WSJ corpus. -# It uses a couple of strategies to fill-in words in -# the LM training data but not in CMUdict. One is -# to generate special prons for possible acronyms, that -# just consist of the constituent letters. The other -# is designed to handle derivatives of known words -# (e.g. deriving the pron of a plural from the pron of -# the base-word), but in a more general, learned-from-data -# way. -# It makes use of scripts in local/dict/ - -if [ $# -ne 1 ]; then - echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/" - exit 1 -fi -if [ "`basename $1`" != 13-32.1 ]; then - echo "Expecting the argument to this script to end in 13-32.1" - exit 1 -fi - -# e.g. -#srcdir=/mnt/matylda2/data/WSJ1/13-32.1 -export PATH=$PATH:`pwd`/local/dict/ -srcdir=$1 -mkdir -p data/local/dict_larger -dir=data/local/dict_larger -cp data/local/dict/* data/local/dict_larger # Various files describing phones etc. - # are there; we just want to copy them as the phoneset is the same. -rm data/local/dict_larger/lexicon.txt # we don't want this. -mincount=2 # Minimum count of an OOV we will try to generate a pron for. - -[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1; - -# Remove comments from cmudict; print first field; remove -# words like FOO(1) which are alternate prons: our dict format won't -# include these markers. -grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a | - perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu - -cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu - -echo "Getting training data [this should take at least a few seconds; if not, there's a problem]" - -# Convert to uppercase, remove XML-like markings. -# For words ending in "." that are not in CMUdict, we assume that these -# are periods that somehow remained in the data during data preparation, -# and we we replace the "." with "\n". Note: we found this by looking at -# oov.counts below (before adding this rule). 
- -touch $dir/cleaned.gz -if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then - echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]"; -else - gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \ - | awk '/^){ chop; $isword{$_} = 1; } - while() { - @A = split(" ", $_); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "." - # and have no other "." in them: treat as period. - print "$a"; - if ($n+1 < @A) { print "\n"; } - } else { print "$a "; } - } - print "\n"; - } - ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz -fi - -# get unigram counts -echo "Getting unigram counts" -gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \ - awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams - -cat $dir/unigrams | awk -v dict=$dir/dict.cmu \ - 'BEGIN{while(getline $dir/oov.counts - -echo "Most frequent unseen unigrams are: " -head $dir/oov.counts - -# Prune away singleton counts, and remove things with numbers in -# (which should have been normalized) and with no letters at all. - - -cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \ - | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist - -# Automatic rule-finding... - -# First make some prons for possible acronyms. -# Note: we don't do this for things like U.K or U.N, -# or A.B. (which doesn't exist anyway), -# as we consider this normalization/spelling errors. - -cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms - -mkdir $dir/f $dir/b # forward, backward directions of rules... - # forward is normal suffix - # rules, backward is reversed (prefix rules). These - # dirs contain stuff we create while making the rule-based - # extensions to the dictionary. - -# Remove ; and , from words, if they are present; these -# might crash our scripts, as they are used as separators there. -filter_dict.pl $dir/dict.cmu > $dir/f/dict -cat $dir/oovlist | filter_dict.pl > $dir/f/oovs -reverse_dict.pl $dir/f/dict > $dir/b/dict -reverse_dict.pl $dir/f/oovs > $dir/b/oovs - -# The next stage takes a few minutes. -# Note: the forward stage takes longer, as English is -# mostly a suffix-based language, and there are more rules -# that it finds. -for d in $dir/f $dir/b; do - ( - cd $d - cat dict | get_rules.pl 2>get_rules.log >rules - get_rule_hierarchy.pl rules >hierarchy - awk '{print $1}' dict | get_candidate_prons.pl rules dict | \ - limit_candidate_prons.pl hierarchy | \ - score_prons.pl dict | \ - count_rules.pl >rule.counts - # the sort command below is just for convenience of reading. - score_rules.pl rules.with_scores - get_candidate_prons.pl rules.with_scores dict oovs | \ - limit_candidate_prons.pl hierarchy > oovs.candidates - ) & -done -wait - -# Merge the candidates. 
-reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates -select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \ - > $dir/dict.oovs - -cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged - -awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled -sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled - - -# add_counts.pl attaches to original counts to the list of handled/not-handled OOVs -add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts -add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts - -echo "**Top OOVs we handled are:**"; -head $dir/oovlist.handled.counts -echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; -head $dir/oovlist.not_handled.counts - - -echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`" -echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`" -echo "Count of OOVs we didn't handle due to low count is" \ - `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts` -# The two files created above are for humans to look at, as diagnostics. - -cat < $dir/lexicon.txt -!SIL SIL - SPN - SPN - NSN -EOF - -echo "Created $dir/lexicon.txt" diff --git a/egs/babel/s5/local/wsj_format_data.sh b/egs/babel/s5/local/wsj_format_data.sh deleted file mode 100755 index 88fcc7ed8..000000000 --- a/egs/babel/s5/local/wsj_format_data.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -# This script takes data prepared in a corpus-dependent way -# in data/local/, and converts it into the "canonical" form, -# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, -# data/train_si284, data/train_si84, etc. - -# Don't bother doing train_si84 separately (although we have the file lists -# in data/local/) because it's just the first 7138 utterances in train_si284. -# We'll create train_si84 after doing the feature extraction. - -. ./path.sh || exit 1; - -echo "Preparing train and test data" -srcdir=data/local/data -lmdir=data/local/nist_lm -tmpdir=data/local/lm_tmp -lexicon=data/local/lang_tmp/lexicon.txt -mkdir -p $tmpdir - -for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do - mkdir -p data/$x - cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; - cp $srcdir/$x.txt data/$x/text || exit 1; - cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1; - cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1; - utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1; -done - - -# Next, for each type of language model, create the corresponding FST -# and the corresponding lang_test_* directory. - -echo Preparing language models for test - -for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do - test=data/lang_test_${lm_suffix} - mkdir -p $test - for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ - phones/; do - cp -r data/lang/$f $test - done - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. 
Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. - - # Everything below is only for diagnostic. - # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. - mkdir -p $tmpdir/g - awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lexicon" >$tmpdir/g/select_empty.fst.txt - fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 - rm -r $tmpdir/g -done - -echo "Succeeded in formatting data." -rm -r $tmpdir diff --git a/egs/babel/s5/local/wsj_format_local_lms.sh b/egs/babel/s5/local/wsj_format_local_lms.sh deleted file mode 100755 index 2696f0fae..000000000 --- a/egs/babel/s5/local/wsj_format_local_lms.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Copyright Johns Hopkins University (Author: Daniel Povey) 2012 - -. ./path.sh - -[ ! -d data/lang_bd ] && echo "Expect data/local/lang_bd to exist" && exit 1; - -lm_srcdir_3g=data/local/local_lm/3gram-mincount -lm_srcdir_4g=data/local/local_lm/4gram-mincount - -[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1; -[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1; - -for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do - rm -r $d 2>/dev/null - cp -r data/lang_bd $d -done - -lang=data/lang_bd - -exit - -# Be careful: this time we dispense with the grep -v ' ' so this might -# not work for LMs generated from all toolkits. 
-gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1; - fstisstochastic data/lang_test_bd_tgpr/G.fst - -gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1; - fstisstochastic data/lang_test_bd_tg/G.fst - -gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1; - fstisstochastic data/lang_test_bd_fg/G.fst - -gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1; - fstisstochastic data/lang_test_bd_fgpr/G.fst - -exit 0; diff --git a/egs/babel/s5/local/wsj_prepare_dict.sh b/egs/babel/s5/local/wsj_prepare_dict.sh deleted file mode 100755 index ef2eee8f6..000000000 --- a/egs/babel/s5/local/wsj_prepare_dict.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash - -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# Call this script from one level above, e.g. from the s3/ directory. It puts -# its output in data/local/. - -# The parts of the output of this that will be needed are -# [in data/local/dict/ ] -# lexicon.txt -# extra_questions.txt -# nonsilence_phones.txt -# optional_silence.txt -# silence_phones.txt - -# run this from ../ -dir=data/local/dict -mkdir -p $dir - - -# (1) Get the CMU dictionary -svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict \ - $dir/cmudict || exit 1; - -# can add -r 10966 for strict compatibility. - - -#(2) Dictionary preparation: - - -# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point). -# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones. - -# silence phones, one per line. -(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt -echo SIL > $dir/optional_silence.txt - -# nonsilence phones; on each line is a list of phones that correspond -# really to the same base phone. 
-cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \ - perl -e 'while(<>){ - chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; - $phones_of{$1} .= "$_ "; } - foreach $list (values %phones_of) {print $list . "\n"; } ' \ - > $dir/nonsilence_phones.txt || exit 1; - -# A few extra questions that will be added to those obtained by automatically clustering -# the "real" phones. These ask about stress; there's also one for silence. -cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; -cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { - $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ - >> $dir/extra_questions.txt || exit 1; - -grep -v ';;;' $dir/cmudict/cmudict.0.7a | \ - perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ - > $dir/lexicon1_raw_nosil.txt || exit 1; - -# Add to cmudict the silences, noises etc. - -(echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; echo ' NSN'; ) | \ - cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; - - -# lexicon.txt is without the _B, _E, _S, _I markers. -# This is the input to wsj_format_data.sh -cp $dir/lexicon2_raw.txt $dir/lexicon.txt - - -echo "Dictionary preparation succeeded" - diff --git a/egs/babel/s5/local/wsj_train_lms.sh b/egs/babel/s5/local/wsj_train_lms.sh deleted file mode 100755 index 34c3b7b99..000000000 --- a/egs/babel/s5/local/wsj_train_lms.sh +++ /dev/null @@ -1,202 +0,0 @@ -#!/bin/bash - -# This script trains LMs on the WSJ LM-training data. -# It requires that you have already run wsj_extend_dict.sh, -# to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily -# derive pronunciations for. - -# This script takes no command-line arguments - -dir=data/local/local_lm -srcdir=data/local/dict_larger -mkdir -p $dir -. ./path.sh || exit 1; # for KALDI_ROOT -export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH -( # First make sure the kaldi_lm toolkit is installed. - cd $KALDI_ROOT/tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - - - -if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then - echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist"; - echo "You need to run local/wsj_extend_dict.sh before running this script." - exit 1; -fi - -# Get a wordlist-- keep everything but silence, which should not appear in -# the LM. -awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt - -# Get training data with OOV words (w.r.t. our current vocab) replaced with . -echo "Getting training data with OOV words replaced with (train_nounk.gz)" -gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \ - 'BEGIN{while((getline0) v[$1]=1;} - {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ - | gzip -c > $dir/train_nounk.gz - -# Get unigram counts (without bos/eos, but this doens't matter here, it's -# only to get the word-map, which treats them specially & doesn't need their -# counts). 
-# Add a 1-count for each word in word-list by including that in the data, -# so all words appear. -gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \ - awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ - sort -nr > $dir/unigram.counts - -# Get "mapped" words-- a character encoding of the words that makes the common words very short. -cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map - -gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=1;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz - -# To save disk space, remove the un-mapped training data. We could -# easily generate it again if needed. -rm $dir/train_nounk.gz - -train_lm.sh --arpa --lmtype 3gram-mincount $dir -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 -# 7.8 million N-grams. - -prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ -# 1.45 million N-grams. -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 - -train_lm.sh --arpa --lmtype 4gram-mincount $dir -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 -# 10.3 million N-grams. - -prune_lm.sh --arpa 7.0 $dir/4gram-mincount -# 1.50 million N-grams -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 - - -exit 0 - -### Below here, this script is showing various commands that -## were run during LM tuning. - -train_lm.sh --arpa --lmtype 3gram-mincount $dir -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 -# 7.8 million N-grams. - -prune_lm.sh --arpa 3.0 $dir/3gram-mincount/ -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740 -# 2.5 million N-grams. - -prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ -# 1.45 million N-grams. -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 - -train_lm.sh --arpa --lmtype 4gram-mincount $dir -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 -# 10.3 million N-grams. - -prune_lm.sh --arpa 3.0 $dir/4gram-mincount -#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294 -# 2.6 million N-grams. - -prune_lm.sh --arpa 4.0 $dir/4gram-mincount -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717 -# 2.15 million N-grams. - -prune_lm.sh --arpa 5.0 $dir/4gram-mincount -# 1.86 million N-grams -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023 - -prune_lm.sh --arpa 7.0 $dir/4gram-mincount -# 1.50 million N-grams -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 - -train_lm.sh --arpa --lmtype 3gram $dir -# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866 -# 20.0 million N-grams - -! which ngram-count \ - && echo "SRILM tools not installed so not doing the comparison" && exit 1; - -################# -# You could finish the script here if you wanted. -# Below is to show how to do baselines with SRILM. -# You'd have to install the SRILM toolkit first. - -heldout_sent=10000 # Don't change this if you want result to be comparable with - # kaldi_lm results -sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. 
-mkdir -p $sdir -gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout -gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train -(echo ""; echo "" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s - -# 3-gram: -ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ - -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz -ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2 -#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs -#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437 - -# Trying 4-gram: -ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ - -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz -ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout -#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs -#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822 - -#3-gram with pruning: -ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ - -prune 0.0000001 -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz -ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout -#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs -#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616 -# Around 2.25M N-grams. -# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/" -# above, which gave 2.5 million N-grams and a perplexity of 156. - -# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams. -# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to -# the kaldi_lm experiments above without "-mincount". - -## From here is how to train with -# IRSTLM. This is not really working at the moment. -export IRSTLM=$KALDI_ROOT/tools/irstlm/ - -idir=$dir/irstlm -mkdir $idir -gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \ - gzip -c > $idir/train.gz - -$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no - cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\ -{print $0;}}' > vocab.irstlm.20k - - -$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \ - -n 3 -s improved-kneser-ney -b yes -# Testing perplexity with SRILM tools: -ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout -#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for in closed-vocabulary LM -#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs -#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599 - -# Perplexity is very bad (should be ~141, since we used -p option, -# not 175), -# but adding -debug 3 to the command line shows that -# the IRSTLM LM does not seem to sum to one properly, so it seems that -# it produces an LM that isn't interpretable in the normal way as an ARPA -# LM. - - - diff --git a/egs/babel/s5/local/wsj_train_rnnlms.sh b/egs/babel/s5/local/wsj_train_rnnlms.sh deleted file mode 100755 index c0d1afaf6..000000000 --- a/egs/babel/s5/local/wsj_train_rnnlms.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson - -# This script trains LMs on the WSJ LM-training data. 
-# It requires that you have already run wsj_extend_dict.sh, -# to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily -# derive pronunciations for. - -# This script takes no command-line arguments but takes the --cmd option. - -# Begin configuration section. -rand_seed=0 -cmd=run.pl -nwords=10000 # This is how many words we're putting in the vocab of the RNNLM. -hidden=30 -class=200 # Num-classes... should be somewhat larger than sqrt of nwords. -direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections. -rnnlm_ver=rnnlm-0.3e # version of RNNLM to use -# End configuration section. - -[ -f ./path.sh ] && . ./path.sh -. utils/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: local/wsj_train_rnnlms.sh [options] " - echo "For options, see top of script file" - exit 1; -fi - -dir=$1 -srcdir=data/local/dict_larger -mkdir -p $dir - -export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH - - -( # First make sure the kaldi_lm toolkit is installed. - # Note: this didn't work out of the box for me, I had to - # change the g++ version to just "g++" (no cross-compilation - # needed for me as I ran on a machine that had been setup - # as 64 bit by default. - cd $KALDI_ROOT/tools || exit 1; - if [ -d $rnnlm_ver ]; then - echo Not installing the rnnlm toolkit since it is already there. - else - echo Downloading and installing the rnnlm tools - # http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz - if [ ! -f $rnnlm_ver.tgz ]; then - wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1; - fi - mkdir $rnnlm_ver - cd $rnnlm_ver - tar -xvzf ../$rnnlm_ver.tgz || exit 1; - make CC=g++ || exit 1; - echo Done making the rnnlm tools - fi -) || exit 1; - - -if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then - echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist"; - echo "You need to run local/wsj_extend_dict.sh before running this script." - exit 1; -fi - -cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all - -# Get training data with OOV words (w.r.t. our current vocab) replaced with . -echo "Getting training data with OOV words replaced with (train_nounk.gz)" -gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \ - 'BEGIN{while((getline0) v[$1]=1;} - {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ - | gzip -c > $dir/all.gz - -echo "Splitting data into train and validation sets." -heldout_sent=10000 -gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data -gunzip -c $dir/all.gz | tail -n +$heldout_sent | \ - perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \ - > $dir/train.in # training data - - - # The rest will consist of a word-class represented by , that - # maps (with probabilities) to a whole class of words. - -# Get unigram counts from our training data, and use this to select word-list -# for RNNLM training; e.g. 10k most frequent words. Rest will go in a class -# that we (manually, at the shell level) assign probabilities for words that -# are in that class. Note: this word-list doesn't need to include ; this -# automatically gets added inside the rnnlm program. -# Note: by concatenating with $dir/wordlist.all, we are doing add-one -# smoothing of the counts. 
- -cat $dir/train.in $dir/wordlist.all | grep -v '' | grep -v '' | \ - awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ - sort -nr > $dir/unigram.counts - -head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn - -tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts - -tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts` -awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs - - -for type in train valid; do - cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \ - 'BEGIN{while((getline0) v[$1]=1;} - {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ - > $dir/$type -done -rm $dir/train.in # no longer needed-- and big. - -# Now randomize the order of the training data. -cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \ - sort | cut -f 2 > $dir/foo -mv $dir/foo $dir/train - -# OK we'll train the RNNLM on this data. - -# todo: change 100 to 320. -# using 100 classes as square root of 10k. -echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)" -#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \ -# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \ -# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log & - -$cmd $dir/rnnlm.log \ - $KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \ - -rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \ - -direct-order 4 -direct $direct -binary || exit 1; - - -# make it like a Kaldi table format, with fake utterance-ids. -cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids - -utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \ - $dir/valid.scores -nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes utterance-ids which - # is one per word, to account for the at the end of each sentence; this is the - # correct number to normalize buy. -p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores` -echo Perplexity is $p | tee $dir/perplexity.log - -rm $dir/train $dir/all.gz - -# This is a better setup, but takes a long time to train: -#echo "Training RNNLM (note: this uses a lot of memory! 
Run it on a big machine.)" -#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \ -# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \ -# -direct-order 4 -direct 2000 -binary diff --git a/egs/chime_wsj0/s5/local/chime_format_data.sh b/egs/chime_wsj0/s5/local/chime_format_data.sh index 8ca2be63b..177cd0f4a 100755 --- a/egs/chime_wsj0/s5/local/chime_format_data.sh +++ b/egs/chime_wsj0/s5/local/chime_format_data.sh @@ -38,11 +38,8 @@ echo Preparing language models for test for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do test=data/lang_test_${lm_suffix} - mkdir -p $test - for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ - phones/; do - cp -r data/lang/$f $test - done + cp -rT data/lang $test + gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt @@ -60,26 +57,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. - # Everything below is only for diagnostic. - # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. - mkdir -p $tmpdir/g - awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lexicon" >$tmpdir/g/select_empty.fst.txt - fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 - rm -r $tmpdir/g + utils/validate_lang.pl $test || exit 1; done echo "Succeeded in formatting data." 
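The `cp -rT data/lang $test` idiom introduced in the hunk above deserves a brief note; the sketch below is illustrative only and assumes GNU coreutils cp (the target directory name is made up):

# Sketch, assuming GNU cp: -T (--no-target-directory) copies the *contents* of
# data/lang onto the destination, so an existing data/lang_test_bg is refreshed
# in place rather than gaining a nested data/lang_test_bg/lang subdirectory.
test=data/lang_test_bg      # illustrative name
rm -rf $test
cp -rT data/lang $test      # roughly: mkdir -p $test && cp -r data/lang/. $test/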
diff --git a/egs/chime_wsj0/s5/run.sh b/egs/chime_wsj0/s5/run.sh index 0bccc1719..54f69cea4 100755 --- a/egs/chime_wsj0/s5/run.sh +++ b/egs/chime_wsj0/s5/run.sh @@ -256,6 +256,3 @@ $cuda_cmd $dir/_train_nnet.log \ utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri7a_dnn exp/tri7a_dnn/graph_tgpr_5k || exit 1; steps/nnet/decode.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \ exp/tri7a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1; - - - diff --git a/egs/librispeech/s5/local/format_data.sh b/egs/librispeech/s5/local/format_data.sh index b13883279..52159f5e5 100755 --- a/egs/librispeech/s5/local/format_data.sh +++ b/egs/librispeech/s5/local/format_data.sh @@ -27,7 +27,7 @@ mkdir -p $tmpdir for lm_suffix in tgpr; do test=data/lang_test_${lm_suffix} mkdir -p $test - for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones/; do + for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones oov.txt oov.int; do cp -r data/lang/$f $test done gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\ @@ -47,26 +47,8 @@ for lm_suffix in tgpr; do utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. - # Everything below is only for diagnostic. - # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. - mkdir -p $tmpdir/g - awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lexicon" >$tmpdir/g/select_empty.fst.txt - fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 - rm -r $tmpdir/g + utils/validate_lang.pl $test || exit 1; done echo "Succeeded in formatting data." diff --git a/egs/librispeech/s5/local/format_lms.sh b/egs/librispeech/s5/local/format_lms.sh index d2b671d37..cda4c933c 100755 --- a/egs/librispeech/s5/local/format_lms.sh +++ b/egs/librispeech/s5/local/format_lms.sh @@ -6,7 +6,15 @@ # Prepares the test time language model(G) transducers # (adapted from wsj/s5/local/wsj_format_data.sh) -. path.sh +. ./path.sh || exit 1; + +# begin configuration section +src_dict=data/local/dict/lexicon.txt # only needed for diagnostics, to identify empty words. +src_dir=data/lang +# end configuration section + +. utils/parse_options.sh || exit 1; + set -e if [ $# -ne 1 ]; then @@ -14,29 +22,41 @@ if [ $# -ne 1 ]; then echo "e.g.: $0 /export/a15/vpanayotov/data/lm" echo ", where:" echo " is the directory in which the language model is stored/downloaded" + echo "Options:" + echo " --src-dir # source lang directory, default data/lang" exit 1 fi lm_dir=$1 -tmpdir=data/local/lm_tmp -lexicon=data/local/lang_tmp/lexiconp.txt +if [ ! -d $lm_dir ]; then + echo "$0: expected source LM directory $lm_dir to exist" + exit 1; +fi +if [ ! 
-f $src_dir/words.txt ]; then + echo "$0: expected $src_dir/words.txt to exist." + exit 1; +fi + + +tmpdir=data/local/lm_tmp.$$ +trap "rm -r $tmpdir" EXIT + mkdir -p $tmpdir -for lm_suffix in tgsmall tgmed tglarge; do - test=data/lang_test_${lm_suffix} - mkdir -p $test - for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones/; do - cp -r data/lang/$f $test - done +for lm_suffix in tgsmall tgmed; do + # tglarge is prepared by a separate command, called from run.sh; we don't + # want to compile G.fst for tglarge, as it takes a while. + test=${src_dir}_test_${lm_suffix} + cp -rT ${src_dir} $test gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\ utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1 # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. + # stuff in it with multiple 's in the history. Encountered some other + # similar things in a LM from Geoff. Removing all "illegal" combinations of + # and , which are supposed to occur only at being/end of utt. These + # can cause determinization failures of CLG [ends up being epsilon cycles]. gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ grep -v ' ' | \ grep -v ' ' | \ @@ -44,31 +64,12 @@ for lm_suffix in tgsmall tgmed tglarge; do arpa2fst - | fstprint | \ utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - fstisstochastic $test/G.fst || true - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. + --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - # Everything below is only for diagnostic. - # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. - mkdir -p $tmpdir/g - awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lexicon" >$tmpdir/g/select_empty.fst.txt - fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 - rm -r $tmpdir/g + utils/validate_lang.pl --skip-determinization-check $test || exit 1; done echo "Succeeded in formatting data." 
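For orientation, a hypothetical invocation of the reworked librispeech format_lms.sh; the LM path is the example its own usage message cites, and --src-dir simply overrides the data/lang default:

# Hypothetical usage (paths illustrative); the two forms should be equivalent
# since --src-dir defaults to data/lang.
local/format_lms.sh /export/a15/vpanayotov/data/lm
local/format_lms.sh --src-dir data/lang /export/a15/vpanayotov/data/lm
# G.fst is then written to data/lang_test_tgsmall and data/lang_test_tgmed;
# tglarge is handled separately, as the comment in the hunk above notes.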
-rm -r $tmpdir exit 0 diff --git a/egs/sprakbanken/s5/local/sprak_train_cmulm.sh b/egs/sprakbanken/s5/local/sprak_train_cmulm.sh index 4595d9d9e..db4c60dde 100755 --- a/egs/sprakbanken/s5/local/sprak_train_cmulm.sh +++ b/egs/sprakbanken/s5/local/sprak_train_cmulm.sh @@ -50,10 +50,7 @@ idngram2lm -linear -idngram $lmdir/sprak.idngram -vocab \ test=data/lang_test_${lm_suffix} mkdir -p $test -for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ - phones/; do - cp -r data/lang/$f $test -done +cp -rT data/lang $test cat $lmdir/sprak.arpa | \ utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt @@ -72,25 +69,9 @@ cat $lmdir/sprak.arpa | \ utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst -fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. - # Everything below is only for diagnostic. - # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. -mkdir -p $tmpdir -awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lexicon" >$tmpdir/select_empty.fst.txt -fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst -fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 -echo "Succeeded in formatting data." -rm -r $tmpdir +utils/validate_lang.pl $test || exit 1; + +exit 0; + diff --git a/egs/sprakbanken/s5/local/sprak_train_irstlm.sh b/egs/sprakbanken/s5/local/sprak_train_irstlm.sh index ea3d8ed56..9f765d290 100755 --- a/egs/sprakbanken/s5/local/sprak_train_irstlm.sh +++ b/egs/sprakbanken/s5/local/sprak_train_irstlm.sh @@ -17,7 +17,6 @@ lm_suffix=$3 N=$4 lmdir=$5 extdict=${srcdict}_$lm_suffix -tmpdir=data/local/lm_tmp lang_tmp=data/local/lang_tmp extlang=data/lang_$lm_suffix @@ -137,10 +136,8 @@ tlm -tr=$lmdir/extra4.ngt -n=$N -lm=wb -o=$lmdir/extra${N}$lm_suffix test=data/lang_test_${N}${lm_suffix} mkdir -p $test -for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ - phones/; do - cp -r $extlang/$f $test -done + +cp -r $extlang $test cat $lmdir/extra${N}$lm_suffix | \ utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt @@ -159,29 +156,7 @@ cat $lmdir/extra${N}$lm_suffix | \ utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst -fstisstochastic $test/G.fst -echo "Succeeded in formatting data." - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. +utils/validate_lang.pl $test || exit 1; - # Everything below is only for diagnostic. 
- # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. -echo "Running diagnostics. Investigate if the LM has cycles." - -mkdir -p $tmpdir -awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lmdir/text.filt" >$tmpdir/select_empty.fst.txt -fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst -fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 - - -rm -rf $tmpdir +exit 0; diff --git a/egs/sprakbanken/s5/local/train_irstlm.sh b/egs/sprakbanken/s5/local/train_irstlm.sh index 07db0b943..0185ae469 100755 --- a/egs/sprakbanken/s5/local/train_irstlm.sh +++ b/egs/sprakbanken/s5/local/train_irstlm.sh @@ -51,13 +51,10 @@ wait test=data/lang_test_${lm_suffix} mkdir -p $test -for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ - phones/; do - cp -r $srcdir/$f $test -done +cp -rT $srcdir $test cat $lmdir/train${ngram}.arpa | \ -utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt + utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt # grep -v ' ' because the LM seems to have some strange and useless # stuff in it with multiple 's in the history. Encountered some other similar @@ -73,27 +70,10 @@ cat $lmdir/train${ngram}.arpa | \ utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst -fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. - # Everything below is only for diagnostic. - # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. - - -awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lmdir/lm_input" >$tmpdir/select_empty.fst.txt -fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst -fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 +utils/validate_lang.pl $test || exit 1; echo "Succeeded in formatting data." 
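As in the other recipes, the hand-rolled diagnostics give way to utils/validate_lang.pl here. A rough sketch of the stochasticity check the deleted lines used to run by hand (the directory name is illustrative, not from the patch):

# fstisstochastic prints the min/max deviation from sum-to-one (in log space)
# and returns non-zero when the deviation exceeds its tolerance -- expected for
# pruned/backoff LMs, so its exit status is informational only.
lang=data/lang_test_3g        # illustrative name
fstisstochastic $lang/G.fst || echo "note: G.fst not stochastic (normal for pruned LMs)"
utils/validate_lang.pl $lang || exit 1   # the check these scripts now rely on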
+exit 0; #rm -rf $tmpdir #rm -f $ccs \ No newline at end of file diff --git a/egs/swbd/s5b/local/swbd1_prepare_dict.sh b/egs/swbd/s5b/local/swbd1_prepare_dict.sh index 8ca49df51..d860e5e0c 100755 --- a/egs/swbd/s5b/local/swbd1_prepare_dict.sh +++ b/egs/swbd/s5b/local/swbd1_prepare_dict.sh @@ -22,7 +22,7 @@ cp $srcdict $dir/lexicon0.txt || exit 1; patch $dir/lexicon1.txt || exit 1; diff --git a/egs/wsj/s5/local/wsj_format_data.sh b/egs/wsj/s5/local/wsj_format_data.sh index 80bd8596a..6b7ebd545 100755 --- a/egs/wsj/s5/local/wsj_format_data.sh +++ b/egs/wsj/s5/local/wsj_format_data.sh @@ -38,11 +38,9 @@ echo Preparing language models for test for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do test=data/lang_test_${lm_suffix} - mkdir -p $test - for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ - phones/; do - cp -r data/lang/$f $test - done + + cp -rT data/lang $test || exit 1; + gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt @@ -60,26 +58,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. - # Everything below is only for diagnostic. - # Checking that G has no cycles with empty words on them (e.g. , ); - # this might cause determinization failure of CLG. - # #0 is treated as an empty word. - mkdir -p $tmpdir/g - awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ - < "$lexicon" >$tmpdir/g/select_empty.fst.txt - fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ - fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && - echo "Language model has cycles with empty words" && exit 1 - rm -r $tmpdir/g + utils/validate_lang.pl --skip-determinization-check $test || exit 1; done echo "Succeeded in formatting data." diff --git a/egs/wsj/s5/local/wsj_prepare_dict.sh b/egs/wsj/s5/local/wsj_prepare_dict.sh index 82ba8ad94..bab8fd712 100755 --- a/egs/wsj/s5/local/wsj_prepare_dict.sh +++ b/egs/wsj/s5/local/wsj_prepare_dict.sh @@ -1,6 +1,7 @@ #!/bin/bash -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Copyright 2010-2012 Microsoft Corporation +# 2012-2014 Johns Hopkins University (Author: Daniel Povey) # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -70,14 +71,16 @@ grep -v ';;;' $dir/cmudict/cmudict.0.7a | \ # Add to cmudict the silences, noises etc. +# the sort | uniq is to remove a duplicated pron from cmudict. (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; echo ' NSN'; ) | \ - cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; + cat - $dir/lexicon1_raw_nosil.txt | sort | uniq > $dir/lexicon2_raw.txt || exit 1; # lexicon.txt is without the _B, _E, _S, _I markers. 
 # This is the input to wsj_format_data.sh
 cp $dir/lexicon2_raw.txt $dir/lexicon.txt
 
+rm $dir/lexiconp.txt 2>/dev/null
 
 echo "Dictionary preparation succeeded"
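A final note on the added `rm $dir/lexiconp.txt`: my understanding (an assumption, not stated in the patch) is that utils/prepare_lang.sh prefers lexiconp.txt over lexicon.txt when both exist, so a stale probability lexicon would silently override the regenerated lexicon.txt. A guarded sketch of that cleanup:

# Sketch only; assumes prepare_lang.sh picks lexiconp.txt over lexicon.txt.
dict=data/local/dict
if [ -f $dict/lexiconp.txt ]; then
  echo "removing stale $dict/lexiconp.txt so the fresh lexicon.txt is used"
  rm $dict/lexiconp.txt
fi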