Updates to various data preparation scripts so validation checks on 'lang' directories will pass. It's possible some of these changes will break some setups, but it's not feasible to fully test this right now.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4739 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2015-01-02 03:38:04 +00:00
Parent 2610d3ba84
Commit 8dc30c3b6b
24 changed files: 68 additions and 2029 deletions
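The substance of the change: each data/lang_test_* directory is now populated with a full copy of data/lang (cp -rT) and then checked with utils/validate_lang.pl. A minimal sketch of that check, assuming the usual egs/wsj/s5 layout (the loop below is illustrative, not part of the commit):

for test in data/lang_test_*; do
  utils/validate_lang.pl --skip-determinizability-check $test || exit 1;
done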


@@ -38,11 +38,8 @@ echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
cp -rT data/lang $test
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
@@ -60,26 +57,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostics.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
utils/validate_lang.pl --skip-determinizability-check $test || exit 1;
done
echo "Succeeded in formatting data."


@@ -1,201 +0,0 @@
#!/bin/bash
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
if [ $# -le 3 ]; then
echo "Arguments should be a list of WSJ directories, see ../run.sh for example."
exit 1;
fi
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command
# line arguments being absolute pathnames.
rm -r links/ 2>/dev/null
mkdir links/
ln -s $* links
# Do some basic checks that we have what we expected.
if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then
echo "wsj_data_prep.sh: Spot check of command line arguments failed"
echo "Command line arguments must be absolute pathnames to WSJ directories"
echo "with names like 11-13.1."
exit 1;
fi
# This version for SI-84
cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist
nl=`cat train_si84.flist | wc -l`
[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl"
# This version for SI-284
cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \
links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist
nl=`cat train_si284.flist | wc -l`
[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl"
# Now for the test sets.
# links/13-34.1/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.
# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92.flist
# Nov'92 (330 utts, 5k vocab)
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92_5k.flist
# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93.flist
# Nov'93: (213 utts, 5k)
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93_5k.flist
# Dev-set for Nov'93 (503 utts)
cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93.flist
# Dev-set for Nov'93 (513 utts, 5k vocab)
cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93_5k.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CDs with upcasing; we don't know
# why (it could be older versions of the disks).
find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist
# Finding the transcript files:
for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
done
# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
# In case we want to limit LMs to the most frequent words, copy the LM training word-frequency list.
cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz
# trigram would be:
cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \
gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;
prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;
# repeat for 5k language models
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz
# trigram would be: !only closed vocabulary here!
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa
prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
rm wsj0-train-spkrinfo.txt
! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \
echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt
fi
if [ ! -f wsj0-train-spkrinfo.txt ]; then
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
echo "This is possibly omitted from the training disks; couldn't find it."
echo "Everything else may have worked; we just may be missing gender info"
echo "which is only needed for VTLN-related diagnostics anyway."
exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.
cat links/11-13.1/wsj0/doc/spkrinfo.txt \
links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/train/spkrinfo.txt \
./wsj0-train-spkrinfo.txt | \
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
echo "Data preparation succeeded"


@@ -1,173 +0,0 @@
#!/bin/bash
# This script builds a larger word-list and dictionary
# than used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to fill-in words in
# the LM training data but not in CMUdict. One is
# to generate special prons for possible acronyms, that
# just consist of the constituent letters. The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/
if [ $# -ne 1 ]; then
echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/"
exit 1
fi
if [ "`basename $1`" != 13-32.1 ]; then
echo "Expecting the argument to this script to end in 13-32.1"
exit 1
fi
# e.g.
#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
export PATH=$PATH:`pwd`/local/dict/
srcdir=$1
mkdir -p data/local/dict_larger
dir=data/local/dict_larger
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
# are there; we just want to copy them as the phoneset is the same.
rm data/local/dict_larger/lexicon.txt # we don't want this.
rm data/local/dict_larger/lexiconp.txt # we don't want this either.
mincount=2 # Minimum count of an OOV we will try to generate a pron for.
[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
# Remove comments from cmudict; print first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n". Note: we found this by looking at
# oov.counts below (before adding this rule).
touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
| awk '/^</{next}{print toupper($0)}' | perl -e '
open(F, "<$ARGV[0]")||die;
while(<F>){ chop; $isword{$_} = 1; }
while(<STDIN>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
# and have no other "." in them: treat as period.
print "$a";
if ($n+1 < @A) { print "\n"; }
} else { print "$a "; }
}
print "\n";
}
' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
fi
# get unigram counts
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
> $dir/oov.counts
echo "Most frequent unseen unigrams are: "
head $dir/oov.counts
# Prune away singleton counts, and remove things with numbers in
# (which should have been normalized) and with no letters at all.
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
| awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
# Automatic rule-finding...
# First make some prons for possible acronyms.
# Note: we don't do this for things like U.K or U.N,
# or A.B. (which doesn't exist anyway),
# as we consider this normalization/spelling errors.
cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms
mkdir $dir/f $dir/b # forward, backward directions of rules...
# forward is normal suffix
# rules, backward is reversed (prefix rules). These
# dirs contain stuff we create while making the rule-based
# extensions to the dictionary.
# Remove ; and , from words, if they are present; these
# might crash our scripts, as they are used as separators there.
filter_dict.pl $dir/dict.cmu > $dir/f/dict
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
reverse_dict.pl $dir/f/dict > $dir/b/dict
reverse_dict.pl $dir/f/oovs > $dir/b/oovs
# The next stage takes a few minutes.
# Note: the forward stage takes longer, as English is
# mostly a suffix-based language, and there are more rules
# that it finds.
for d in $dir/f $dir/b; do
(
cd $d
cat dict | get_rules.pl 2>get_rules.log >rules
get_rule_hierarchy.pl rules >hierarchy
awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
limit_candidate_prons.pl hierarchy | \
score_prons.pl dict | \
count_rules.pl >rule.counts
# the sort command below is just for convenience of reading.
score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
get_candidate_prons.pl rules.with_scores dict oovs | \
limit_candidate_prons.pl hierarchy > oovs.candidates
) &
done
wait
# Merge the candidates.
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
> $dir/dict.oovs
cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
# add_counts.pl attaches the original counts to the list of handled/not-handled OOVs
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
echo "**Top OOVs we handled are:**";
head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
head $dir/oovlist.not_handled.counts
echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
`awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.
cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF
echo "Created $dir/lexicon.txt"


@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si284, data/train_si84, etc.
# Don't bother doing train_si84 separately (although we have the file lists
# in data/local/) because it's just the first 7138 utterances in train_si284.
# We'll create train_si84 after doing the feature extraction.
. ./path.sh || exit 1;
echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
done
# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.
echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at beginning/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostics.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done
echo "Succeeded in formatting data."
rm -r $tmpdir
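About the empty-word diagnostic in the script above: the awk command builds a one-state text FST that loops on every lexicon entry with an empty pronunciation, plus the #0 backoff symbol; composing it with G.fst and asking fstinfo whether the result is cyclic catches LMs that could make CLG non-determinizable. An illustrative select_empty.fst.txt, assuming (hypothetically) that <s> and </s> appeared in the lexicon with empty pronunciations:

0 0 <s> <s>
0 0 </s> </s>
0 0 #0 #0
0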


@@ -1,52 +0,0 @@
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
. ./path.sh
[ ! -d data/lang_bd ] && echo "Expect data/lang_bd to exist" && exit 1;
lm_srcdir_3g=data/local/local_lm/3gram-mincount
lm_srcdir_4g=data/local/local_lm/4gram-mincount
[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1;
[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1;
for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do
rm -r $d 2>/dev/null
cp -r data/lang_bd $d
done
lang=data/lang_bd
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst
exit 0;
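Each of the four pipelines above is the same ARPA-to-G.fst conversion applied to a different LM; a compact equivalent would be the loop below (a sketch only, assuming the same file names and variables as in the script):

for pair in "$lm_srcdir_3g/lm_pr6.0.gz tgpr" "$lm_srcdir_3g/lm_unpruned.gz tg" \
            "$lm_srcdir_4g/lm_unpruned.gz fg" "$lm_srcdir_4g/lm_pr7.0.gz fgpr"; do
  set -- $pair; lm=$1; suffix=$2
  gunzip -c $lm | arpa2fst - | fstprint | \
    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
    --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
    fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_${suffix}/G.fst || exit 1;
  fstisstochastic data/lang_test_bd_${suffix}/G.fst
done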


@@ -1,202 +0,0 @@
#!/bin/bash
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes no command-line arguments
dir=data/local/local_lm
srcdir=data/local/dict_larger
mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH
( # First make sure the kaldi_lm toolkit is installed.
cd $KALDI_ROOT/tools || exit 1;
if [ -d kaldi_lm ]; then
echo Not installing the kaldi_lm toolkit since it is already there.
else
echo Downloading and installing the kaldi_lm tools
if [ ! -f kaldi_lm.tar.gz ]; then
wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;
cd kaldi_lm
make || exit 1;
echo Done making the kaldi_lm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
# Get a wordlist-- keep everything but silence, which should not appear in
# the LM.
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/train_nounk.gz
# Get unigram counts (without bos/eos, but this doesn't matter here, it's
# only to get the word-map, which treats them specially & doesn't need their
# counts).
# Add a 1-count for each word in word-list by including that in the data,
# so all words appear.
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
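# An illustrative aside (not part of the original script) on the add-one trick above: because
# wordlist.txt is concatenated with the training text, a word that never occurs in the data
# still ends up with a count of 1, so every vocabulary word appears in unigram.counts, e.g.:
#   printf 'A B A\n' | cat - <(printf 'A\nB\nC\n') | \
#     awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | sort -nr
# prints "3 A", "2 B", "1 C" -- the unseen word C still shows up with count 1.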
# Get "mapped" words-- a character encoding of the words that makes the common words very short.
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map
gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
{ for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz
# To save disk space, remove the un-mapped training data. We could
# easily generate it again if needed.
rm $dir/train_nounk.gz
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
exit 0
### Below here, this script shows various commands that
## were run during LM tuning.
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
# 2.5 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 3.0 $dir/4gram-mincount
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
# 2.6 million N-grams.
prune_lm.sh --arpa 4.0 $dir/4gram-mincount
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
# 2.15 million N-grams.
prune_lm.sh --arpa 5.0 $dir/4gram-mincount
# 1.86 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
train_lm.sh --arpa --lmtype 3gram $dir
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
# 20.0 million N-grams
! which ngram-count \
&& echo "SRILM tools not installed so not doing the comparison" && exit 1;
#################
# You could finish the script here if you wanted.
# Below is to show how to do baselines with SRILM.
# You'd have to install the SRILM toolkit first.
heldout_sent=10000 # Don't change this if you want result to be comparable with
# kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s
# 3-gram:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
# Trying 4-gram:
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
#3-gram with pruning:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
# Around 2.25M N-grams.
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
# above, which gave 2.5 million N-grams and a perplexity of 156.
# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
# the kaldi_lm experiments above without "-mincount").
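# An illustrative aside (not part of the original script): the SRILM perplexities above follow
# ppl = 10^(-logprob / (words - OOVs + sentences)), so for the unpruned 3-gram,
# 10^(491456 / (218996 - 478 + 10000)) = 10^(491456/228518) is about 141.5, matching the
# reported ppl= 141.457 to within rounding of the printed logprob. Checked e.g. with:
#   awk 'BEGIN{ print exp((491456/228518)*log(10)); }'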
## From here is how to train with
# IRSTLM. This is not really working at the moment.
export IRSTLM=$KALDI_ROOT/tools/irstlm/
idir=$dir/irstlm
mkdir $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
gzip -c > $idir/train.gz
$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
{print $0;}}' > vocab.irstlm.20k
$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
-n 3 -s improved-kneser-ney -b yes
# Testing perplexity with SRILM tools:
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
# Perplexity is very bad (175, where we would expect ~141 since we used the -p option),
# but adding -debug 3 to the command line shows that
# the IRSTLM LM does not seem to sum to one properly, so it seems that
# it produces an LM that isn't interpretable in the normal way as an ARPA
# LM.


@@ -1,153 +0,0 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes no command-line arguments but takes the --cmd option.
# Begin configuration section.
rand_seed=0
cmd=run.pl
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
hidden=30
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
# End configuration section.
[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh
if [ $# != 1 ]; then
echo "Usage: local/wsj_train_rnnlms.sh [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi
dir=$1
srcdir=data/local/dict_larger
mkdir -p $dir
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH
( # First make sure the rnnlm toolkit is installed.
# Note: this didn't work out of the box for me, I had to
# change the g++ version to just "g++" (no cross-compilation
# needed for me as I ran on a machine that had been setup
# as 64 bit by default).
cd $KALDI_ROOT/tools || exit 1;
if [ -d $rnnlm_ver ]; then
echo Not installing the rnnlm toolkit since it is already there.
else
echo Downloading and installing the rnnlm tools
# http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz
if [ ! -f $rnnlm_ver.tgz ]; then
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1;
fi
mkdir $rnnlm_ver
cd $rnnlm_ver
tar -xvzf ../$rnnlm_ver.tgz || exit 1;
make CC=g++ || exit 1;
echo Done making the rnnlm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/all.gz
echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
> $dir/train.in # training data
# The rest will consist of a word-class represented by <RNN_UNK>, that
# maps (with probabilities) to a whole class of words.
# Get unigram counts from our training data, and use this to select word-list
# for RNNLM training; e.g. the 10k most frequent words. The rest will go into a class
# for which we (manually, at the shell level) assign probabilities to the words
# it contains. Note: this word-list doesn't need to include </s>; this
# automatically gets added inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn
tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts
tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs
for type in train valid; do
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
> $dir/$type
done
rm $dir/train.in # no longer needed-- and big.
# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train
# OK we'll train the RNNLM on this data.
# todo: change 100 to 320.
# using 100 classes as square root of 10k.
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &
$cmd $dir/rnnlm.log \
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \
-direct-order 4 -direct $direct -binary || exit 1;
# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids
utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \
$dir/valid.scores
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes the utterance-ids, which
# add one token per sentence to account for the </s> at the end of each sentence; this is the
# correct number to normalize by.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
echo Perplexity is $p | tee $dir/perplexity.log
rm $dir/train $dir/all.gz
# This is a better setup, but takes a long time to train:
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 2000 -binary
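The perplexity computed near the end of this script is exp(total score / token count), where the token count deliberately includes one utterance-id per sentence to stand in for the </s> tokens. A toy worked example, assuming the second field of valid.scores holds per-sentence negative log-probabilities (natural log):

awk -v nw=10 '{x=x+$2} END{print exp(x/nw);}' <<'EOF'
uttid-1 4.0
uttid-2 6.0
EOF
# with 10 tokens in valid.with_ids this prints exp((4.0+6.0)/10) = exp(1), about 2.718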


@@ -1,201 +0,0 @@
#!/bin/bash
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
if [ $# -le 3 ]; then
echo "Arguments should be a list of WSJ directories, see ../run.sh for example."
exit 1;
fi
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command
# line arguments being absolute pathnames.
rm -r links/ 2>/dev/null
mkdir links/
ln -s $* links
# Do some basic checks that we have what we expected.
if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then
echo "wsj_data_prep.sh: Spot check of command line arguments failed"
echo "Command line arguments must be absolute pathnames to WSJ directories"
echo "with names like 11-13.1."
exit 1;
fi
# This version for SI-84
cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist
nl=`cat train_si84.flist | wc -l`
[ "$nl" -eq 7138 ] || echo "Warning: expected 37416 lines in train_si84.flist, got $nl"
# This version for SI-284
cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \
links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist
nl=`cat train_si284.flist | wc -l`
[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl"
# Now for the test sets.
# links/13-34.1/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.
# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92.flist
# Nov'92 (330 utts, 5k vocab)
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92_5k.flist
# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93.flist
# Nov'93: (213 utts, 5k)
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93_5k.flist
# Dev-set for Nov'93 (503 utts)
cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93.flist
# Dev-set for Nov'93 (513 utts, 5k vocab)
cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93_5k.flist
# Dev-set Hub 1,2 (503, 913 utterances)
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CDs with upcasing; we don't know
# why (it could be older versions of the disks).
find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist
# Finding the transcript files:
for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
done
# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
# In case we want to limit LMs to the most frequent words, copy the LM training word-frequency list.
cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz
# trigram would be:
cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \
gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;
prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;
# repeat for 5k language models
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz
# trigram would be: !only closed vocabulary here!
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa
prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
rm wsj0-train-spkrinfo.txt
! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \
echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt
fi
if [ ! -f wsj0-train-spkrinfo.txt ]; then
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
echo "This is possibly omitted from the training disks; couldn't find it."
echo "Everything else may have worked; we just may be missing gender info"
echo "which is only needed for VTLN-related diagnostics anyway."
exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.
cat links/11-13.1/wsj0/doc/spkrinfo.txt \
links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/train/spkrinfo.txt \
./wsj0-train-spkrinfo.txt | \
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
echo "Data preparation succeeded"


@@ -1,172 +0,0 @@
#!/bin/bash
# This script builds a larger word-list and dictionary
# than used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to fill-in words in
# the LM training data but not in CMUdict. One is
# to generate special prons for possible acronyms, that
# just consist of the constituent letters. The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/
if [ $# -ne 1 ]; then
echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/"
exit 1
fi
if [ "`basename $1`" != 13-32.1 ]; then
echo "Expecting the argument to this script to end in 13-32.1"
exit 1
fi
# e.g.
#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
export PATH=$PATH:`pwd`/local/dict/
srcdir=$1
mkdir -p data/local/dict_larger
dir=data/local/dict_larger
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
# are there; we just want to copy them as the phoneset is the same.
rm data/local/dict_larger/lexicon.txt # we don't want this.
mincount=2 # Minimum count of an OOV we will try to generate a pron for.
[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
# Remove comments from cmudict; print first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n". Note: we found this by looking at
# oov.counts below (before adding this rule).
touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
| awk '/^</{next}{print toupper($0)}' | perl -e '
open(F, "<$ARGV[0]")||die;
while(<F>){ chop; $isword{$_} = 1; }
while(<STDIN>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
# and have no other "." in them: treat as period.
print "$a";
if ($n+1 < @A) { print "\n"; }
} else { print "$a "; }
}
print "\n";
}
' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
fi
# get unigram counts
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
> $dir/oov.counts
echo "Most frequent unseen unigrams are: "
head $dir/oov.counts
# Prune away singleton counts, and remove things with numbers in
# (which should have been normalized) and with no letters at all.
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
| awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
# Automatic rule-finding...
# First make some prons for possible acronyms.
# Note: we don't do this for things like U.K or U.N,
# or A.B. (which doesn't exist anyway),
# as we consider this normalization/spelling errors.
cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms
mkdir $dir/f $dir/b # forward, backward directions of rules...
# forward is normal suffix
# rules, backward is reversed (prefix rules). These
# dirs contain stuff we create while making the rule-based
# extensions to the dictionary.
# Remove ; and , from words, if they are present; these
# might crash our scripts, as they are used as separators there.
filter_dict.pl $dir/dict.cmu > $dir/f/dict
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
reverse_dict.pl $dir/f/dict > $dir/b/dict
reverse_dict.pl $dir/f/oovs > $dir/b/oovs
# The next stage takes a few minutes.
# Note: the forward stage takes longer, as English is
# mostly a suffix-based language, and there are more rules
# that it finds.
for d in $dir/f $dir/b; do
(
cd $d
cat dict | get_rules.pl 2>get_rules.log >rules
get_rule_hierarchy.pl rules >hierarchy
awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
limit_candidate_prons.pl hierarchy | \
score_prons.pl dict | \
count_rules.pl >rule.counts
# the sort command below is just for convenience of reading.
score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
get_candidate_prons.pl rules.with_scores dict oovs | \
limit_candidate_prons.pl hierarchy > oovs.candidates
) &
done
wait
# Merge the candidates.
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
> $dir/dict.oovs
cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
# add_counts.pl attaches the original counts to the list of handled/not-handled OOVs
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
echo "**Top OOVs we handled are:**";
head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
head $dir/oovlist.not_handled.counts
echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
`awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.
cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF
echo "Created $dir/lexicon.txt"


@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si284, data/train_si84, etc.
# Don't bother doing train_si84 separately (although we have the file lists
# in data/local/) because it's just the first 7138 utterances in train_si284.
# We'll create train_si84 after doing the feature extraction.
. ./path.sh || exit 1;
echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexicon.txt
mkdir -p $tmpdir
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
done
# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.
echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at beginning/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostics.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done
echo "Succeeded in formatting data."
rm -r $tmpdir


@@ -1,54 +0,0 @@
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
. ./path.sh
[ ! -d data/lang_bd ] && echo "Expect data/lang_bd to exist" && exit 1;
lm_srcdir_3g=data/local/local_lm/3gram-mincount
lm_srcdir_4g=data/local/local_lm/4gram-mincount
[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1;
[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1;
for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do
rm -r $d 2>/dev/null
cp -r data/lang_bd $d
done
lang=data/lang_bd
exit
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst
exit 0;


@@ -1,83 +0,0 @@
#!/bin/bash
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Call this script from one level above, e.g. from the s3/ directory. It puts
# its output in data/local/.
# The parts of the output of this that will be needed are
# [in data/local/dict/ ]
# lexicon.txt
# extra_questions.txt
# nonsilence_phones.txt
# optional_silence.txt
# silence_phones.txt
# run this from ../
dir=data/local/dict
mkdir -p $dir
# (1) Get the CMU dictionary
svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict \
$dir/cmudict || exit 1;
# can add -r 10966 for strict compatibility.
#(2) Dictionary preparation:
# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
# silence phones, one per line.
(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt
echo SIL > $dir/optional_silence.txt
# nonsilence phones; on each line is a list of phones that correspond
# really to the same base phone.
cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
perl -e 'while(<>){
chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
$phones_of{$1} .= "$_ "; }
foreach $list (values %phones_of) {print $list . "\n"; } ' \
> $dir/nonsilence_phones.txt || exit 1;
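# (Illustrative, not in the original: each line of nonsilence_phones.txt groups
# a base phone with its stress-marked variants, e.g. "AA AA0 AA1 AA2".)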
# A few extra questions that will be added to those obtained by automatically clustering
# the "real" phones. These ask about stress; there's also one for silence.
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
>> $dir/extra_questions.txt || exit 1;
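# (Illustrative, not in the original: extra_questions.txt thus gets one line
# listing the silence phones, "SIL SPN NSN", plus one line per stress marker,
# e.g. a line containing all the stress-1 phones, "AA1 AE1 AH1 ...".)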
grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
> $dir/lexicon1_raw_nosil.txt || exit 1;
# Add to cmudict the silences, noises etc.
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
# lexicon.txt is without the _B, _E, _S, _I markers.
# This is the input to wsj_format_data.sh
cp $dir/lexicon2_raw.txt $dir/lexicon.txt
echo "Dictionary preparation succeeded"

View file

@ -1,202 +0,0 @@
#!/bin/bash
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes no command-line arguments
dir=data/local/local_lm
srcdir=data/local/dict_larger
mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH
( # First make sure the kaldi_lm toolkit is installed.
cd $KALDI_ROOT/tools || exit 1;
if [ -d kaldi_lm ]; then
echo Not installing the kaldi_lm toolkit since it is already there.
else
echo Downloading and installing the kaldi_lm tools
if [ ! -f kaldi_lm.tar.gz ]; then
wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;
cd kaldi_lm
make || exit 1;
echo Done making the kaldi_lm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
# Get a wordlist-- keep everything but silence, which should not appear in
# the LM.
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/train_nounk.gz
# Get unigram counts (without bos/eos, but this doesn't matter here, it's
# only to get the word-map, which treats them specially & doesn't need their
# counts).
# Add a 1-count for each word in word-list by including that in the data,
# so all words appear.
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
# Get "mapped" words-- a character encoding of the words that makes the common words very short.
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map
gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
{ for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz
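# A hedged illustration (not in the original): word_map contains lines of the
# form "<word> <short-code>", and the awk above simply substitutes the codes for
# the words. With a hypothetical two-entry map the effect would be:
#   printf 'THE a\nOF b\n' > word_map_example
#   echo "THE OF THE" | awk -v wmap=word_map_example \
#     'BEGIN{while((getline<wmap)>0)map[$1]=$2;} {for(n=1;n<=NF;n++){printf map[$n]; if(n<NF){printf " ";}else{print "";}}}'
#   # prints: a b a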
# To save disk space, remove the un-mapped training data. We could
# easily generate it again if needed.
rm $dir/train_nounk.gz
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
exit 0
### Below here, this script shows various commands that were run during LM tuning.
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
# 2.5 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
prune_lm.sh --arpa 3.0 $dir/4gram-mincount
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
# 2.6 million N-grams.
prune_lm.sh --arpa 4.0 $dir/4gram-mincount
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
# 2.15 million N-grams.
prune_lm.sh --arpa 5.0 $dir/4gram-mincount
# 1.86 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
train_lm.sh --arpa --lmtype 3gram $dir
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
# 20.0 million N-grams
! which ngram-count \
&& echo "SRILM tools not installed so not doing the comparison" && exit 1;
#################
# You could finish the script here if you wanted.
# Below is to show how to do baselines with SRILM.
# You'd have to install the SRILM toolkit first.
heldout_sent=10000 # Don't change this if you want the results to be comparable
# with the kaldi_lm results above.
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s
# 3-gram:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
# Trying 4-gram:
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
#3-gram with pruning:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
# Around 2.25M N-grams.
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
# above, which gave 2.5 million N-grams and a perplexity of 156.
# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
# the kaldi_lm experiments above without "-mincount").
## From here on is how to train with IRSTLM.
# This is not really working at the moment.
export IRSTLM=$KALDI_ROOT/tools/irstlm/
idir=$dir/irstlm
mkdir $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
gzip -c > $idir/train.gz
$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
{print $0;}}' > vocab.irstlm.20k
$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
-n 3 -s improved-kneser-ney -b yes
# Testing perplexity with SRILM tools:
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
# Perplexity is very bad: 175 rather than the ~141 we would expect given that we
# used the -p option. Adding -debug 3 to the command line shows that the IRSTLM
# LM does not seem to sum to one properly, so it seems it produces an LM that
# isn't interpretable in the normal way as an ARPA LM.

View file

@ -1,153 +0,0 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes one command-line argument (the destination directory) plus options such as --cmd.
# Begin configuration section.
rand_seed=0
cmd=run.pl
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
hidden=30
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
# End configuration section.
[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh
if [ $# != 1 ]; then
echo "Usage: local/wsj_train_rnnlms.sh [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi
dir=$1
srcdir=data/local/dict_larger
mkdir -p $dir
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH
( # First make sure the rnnlm toolkit is installed.
# Note: this didn't work out of the box for me; I had to
# change the g++ command to just "g++" (no cross-compilation
# was needed for me, as I ran on a machine that had been set up
# as 64-bit by default).
cd $KALDI_ROOT/tools || exit 1;
if [ -d $rnnlm_ver ]; then
echo Not installing the rnnlm toolkit since it is already there.
else
echo Downloading and installing the rnnlm tools
# http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz
if [ ! -f $rnnlm_ver.tgz ]; then
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1;
fi
mkdir $rnnlm_ver
cd $rnnlm_ver
tar -xvzf ../$rnnlm_ver.tgz || exit 1;
make CC=g++ || exit 1;
echo Done making the rnnlm tools
fi
) || exit 1;
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi
cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/all.gz
echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
> $dir/train.in # training data
# The rest will consist of a word class represented by <RNN_UNK>, which
# maps (with probabilities) to a whole class of words.
# Get unigram counts from our training data, and use them to select a word-list
# for RNNLM training, e.g. the 10k most frequent words. The rest will go into a
# class for which we (manually, at the shell level) assign probabilities to the
# words in that class. Note: this word-list doesn't need to include </s>; that
# gets added automatically inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts
head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn
tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts
tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs
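# A hedged note (not in the original): unk.probs gives each out-of-shortlist
# word its share of the <RNN_UNK> probability mass, so downstream scoring can
# presumably recover P(word) as roughly P(<RNN_UNK>) * unk.probs[word].
# To inspect one (hypothetical) word's entry:
#   awk -v w=SOMEWORD '$1 == w {print $2}' $dir/unk.probs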
for type in train valid; do
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
> $dir/$type
done
rm $dir/train.in # no longer needed-- and big.
# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train
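# Note (not in the original): the awk | sort | cut pipeline above is a
# reproducible shuffle -- each line gets a random numeric prefix seeded by
# rand_seed, the lines are sorted on that prefix, and the prefix is cut away.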
# OK we'll train the RNNLM on this data.
# todo: change 100 to 320.
# using 100 classes as square root of 10k.
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &
$cmd $dir/rnnlm.log \
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \
-direct-order 4 -direct $direct -binary || exit 1;
# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids
utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \
$dir/valid.scores
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes the utterance-ids,
# which add one extra word per sentence; this accounts for the </s> at the end of
# each sentence, so this is the correct number to normalize by.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
echo Perplexity is $p | tee $dir/perplexity.log
rm $dir/train $dir/all.gz
# This is a better setup, but takes a long time to train:
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 2000 -binary

View file

@ -38,11 +38,8 @@ echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
cp -rT data/lang $test
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
@ -60,26 +57,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
utils/validate_lang.pl $test || exit 1;
done
echo "Succeeded in formatting data."

View file

@ -256,6 +256,3 @@ $cuda_cmd $dir/_train_nnet.log \
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri7a_dnn exp/tri7a_dnn/graph_tgpr_5k || exit 1;
steps/nnet/decode.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
exp/tri7a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1;

View file

@ -27,7 +27,7 @@ mkdir -p $tmpdir
for lm_suffix in tgpr; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones/; do
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones oov.txt oov.int; do
cp -r data/lang/$f $test
done
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
@ -47,26 +47,8 @@ for lm_suffix in tgpr; do
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
utils/validate_lang.pl $test || exit 1;
done
echo "Succeeded in formatting data."

View file

@ -6,7 +6,15 @@
# Prepares the test time language model(G) transducers
# (adapted from wsj/s5/local/wsj_format_data.sh)
. path.sh
. ./path.sh || exit 1;
# begin configuration section
src_dict=data/local/dict/lexicon.txt # only needed for diagnostics, to identify empty words.
src_dir=data/lang
# end configuration section
. utils/parse_options.sh || exit 1;
set -e
if [ $# -ne 1 ]; then
@ -14,29 +22,41 @@ if [ $# -ne 1 ]; then
echo "e.g.: $0 /export/a15/vpanayotov/data/lm"
echo ", where:"
echo " <lm-dir> is the directory in which the language model is stored/downloaded"
echo "Options:"
echo " --src-dir <dir> # source lang directory, default data/lang"
exit 1
fi
lm_dir=$1
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
if [ ! -d $lm_dir ]; then
echo "$0: expected source LM directory $lm_dir to exist"
exit 1;
fi
if [ ! -f $src_dir/words.txt ]; then
echo "$0: expected $src_dir/words.txt to exist."
exit 1;
fi
tmpdir=data/local/lm_tmp.$$
trap "rm -r $tmpdir" EXIT
mkdir -p $tmpdir
for lm_suffix in tgsmall tgmed tglarge; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones/; do
cp -r data/lang/$f $test
done
for lm_suffix in tgsmall tgmed; do
# tglarge is prepared by a separate command, called from run.sh; we don't
# want to compile G.fst for tglarge, as it takes a while.
test=${src_dir}_test_${lm_suffix}
cp -rT ${src_dir} $test
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at begin/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# stuff in it with multiple <s>'s in the history. Encountered some other
# similar things in a LM from Geoff. Removing all "illegal" combinations of
# <s> and </s>, which are supposed to occur only at begin/end of utt. These
# can cause determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
@ -44,31 +64,12 @@ for lm_suffix in tgsmall tgmed tglarge; do
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst || true
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
done
echo "Succeeded in formatting data."
rm -r $tmpdir
exit 0

View file

@ -50,10 +50,7 @@ idngram2lm -linear -idngram $lmdir/sprak.idngram -vocab \
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
cp -rT data/lang $test
cat $lmdir/sprak.arpa | \
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
@ -72,25 +69,9 @@ cat $lmdir/sprak.arpa | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst
fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
echo "Succeeded in formatting data."
rm -r $tmpdir
utils/validate_lang.pl $test || exit 1;
exit 0;

View file

@ -17,7 +17,6 @@ lm_suffix=$3
N=$4
lmdir=$5
extdict=${srcdict}_$lm_suffix
tmpdir=data/local/lm_tmp
lang_tmp=data/local/lang_tmp
extlang=data/lang_$lm_suffix
@ -137,10 +136,8 @@ tlm -tr=$lmdir/extra4.ngt -n=$N -lm=wb -o=$lmdir/extra${N}$lm_suffix
test=data/lang_test_${N}${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r $extlang/$f $test
done
cp -r $extlang $test
cat $lmdir/extra${N}$lm_suffix | \
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
@ -159,29 +156,7 @@ cat $lmdir/extra${N}$lm_suffix | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
echo "Succeeded in formatting data."
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
utils/validate_lang.pl $test || exit 1;
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
echo "Running diagnostics. Investigate if the LM has cycles."
mkdir -p $tmpdir
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lmdir/text.filt" >$tmpdir/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst
fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -rf $tmpdir
exit 0;

View file

@ -51,13 +51,10 @@ wait
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r $srcdir/$f $test
done
cp -rT $srcdir $test
cat $lmdir/train${ngram}.arpa | \
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
@ -73,27 +70,10 @@ cat $lmdir/train${ngram}.arpa | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lmdir/lm_input" >$tmpdir/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst
fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
utils/validate_lang.pl $test || exit 1;
echo "Succeeded in formatting data."
exit 0;
#rm -rf $tmpdir
#rm -f $ccs

View file

@ -22,7 +22,7 @@ cp $srcdict $dir/lexicon0.txt || exit 1;
patch <local/dict.patch $dir/lexicon0.txt || exit 1;
#(2a) Dictionary preparation:
# Pre-processing (Upper-case, remove comments)
# Pre-processing (lower-case, remove comments)
awk 'BEGIN{getline}($0 !~ /^#/) {$0=tolower($0); print}' \
$srcdict | sort | awk '($0 !~ /^[[:space:]]*$/) {print}' \
> $dir/lexicon1.txt || exit 1;
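# Note (not in the original): the BEGIN{getline} above consumes the first line
# of $srcdict (presumably a header line) before lower-casing and dropping
# comment lines.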

View file

@ -38,11 +38,9 @@ echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
cp -rT data/lang $test || exit 1;
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
@ -60,26 +58,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
done
echo "Succeeded in formatting data."

View file

@ -1,6 +1,7 @@
#!/bin/bash
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Copyright 2010-2012 Microsoft Corporation
# 2012-2014 Johns Hopkins University (Author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -70,14 +71,16 @@ grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
# Add to cmudict the silences, noises etc.
# the sort | uniq is to remove a duplicated pron from cmudict.
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
cat - $dir/lexicon1_raw_nosil.txt | sort | uniq > $dir/lexicon2_raw.txt || exit 1;
# lexicon.txt is without the _B, _E, _S, _I markers.
# This is the input to wsj_format_data.sh
cp $dir/lexicon2_raw.txt $dir/lexicon.txt
rm $dir/lexiconp.txt 2>/dev/null
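# Note (not in the original): any stale lexiconp.txt is removed, presumably so
# that downstream scripts will regenerate it from the updated lexicon.txt.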
echo "Dictionary preparation succeeded"