Mirror of https://github.com/mozilla/kaldi.git
Updates to various data preparation scripts so validation checks on 'lang' directories will pass. It's possible some of these changes will break some setups, but it's not feasible to fully test this right now.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4739 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Parent
2610d3ba84
Commit
8dc30c3b6b
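For context, the "validation checks" mentioned in the commit message are the ones run by utils/validate_lang.pl, which this change wires into the test-LM formatting loop shown in the first hunk below. A minimal sketch of running the same check by hand on one of the directories the scripts produce (the directory name here is just an example):

    # validate a formatted lang directory; the flag matches the call added in this commit
    utils/validate_lang.pl --skip-determinizability-check data/lang_test_bg || exit 1;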
@@ -38,11 +38,8 @@ echo Preparing language models for test
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
cp -rT data/lang $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt

@@ -60,26 +57,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
utils/validate_lang.pl --skip-determinizability-check $test || exit 1;
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.

# Everything below is only for diagnostics.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done

echo "Succeeded in formatting data."
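A brief aside on the cp -rT line in the hunk above: with GNU cp, -T ("no target directory") copies the contents of data/lang directly into $test rather than creating a data/lang subdirectory inside it, so $test ends up with words.txt, phones.txt, phones/, L.fst and L_disambig.fst at its top level. For illustration only (the directory name is just an example):

    cp -rT data/lang data/lang_test_bg
    ls data/lang_test_bg   # words.txt phones.txt phones/ L.fst L_disambig.fst ...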
@@ -1,201 +0,0 @@
#!/bin/bash

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.


if [ $# -le 3 ]; then
echo "Arguments should be a list of WSJ directories, see ../run.sh for example."
exit 1;
fi


dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi

cd $dir

# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command
# line arguments being absolute pathnames.
rm -r links/ 2>/dev/null
mkdir links/
ln -s $* links

# Do some basic checks that we have what we expected.
if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then
echo "wsj_data_prep.sh: Spot check of command line arguments failed"
echo "Command line arguments must be absolute pathnames to WSJ directories"
echo "with names like 11-13.1."
exit 1;
fi

# This version for SI-84

cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist

nl=`cat train_si84.flist | wc -l`
[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl"

# This version for SI-284
cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \
links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
$local/ndx2flist.pl $* | sort | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist

nl=`cat train_si284.flist | wc -l`
[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl"

# Now for the test sets.
# links/13-34.1/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.

# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92.flist

# Nov'92 (330 utts, 5k vocab)
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
sort > test_eval92_5k.flist

# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93.flist

# Nov'93: (213 utts, 5k)
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
sed s/13_32_1/13_33_1/ | \
$local/ndx2flist.pl $* | sort > test_eval93_5k.flist

# Dev-set for Nov'93 (503 utts)
cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93.flist

# Dev-set for Nov'93 (513 utts, 5k vocab)
cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \
$local/ndx2flist.pl $* | sort > test_dev93_5k.flist


# Dev-set Hub 1,2 (503, 913 utterances)

# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist


# Finding the transcript files:
for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
$local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done

# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
done

# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done


# in case we want to limit lm's on most frequent words, copy lm training word frequency list
cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.

# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.

cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz

# trigram would be:
cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \
gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;

prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;

# repeat for 5k language models
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz

# trigram would be: !only closed vocabulary here!
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa

prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;


if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
rm wsj0-train-spkrinfo.txt
! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \
echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt
fi

if [ ! -f wsj0-train-spkrinfo.txt ]; then
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
echo "This is possibly omitted from the training disks; couldn't find it."
echo "Everything else may have worked; we just may be missing gender info"
echo "which is only needed for VTLN-related diagnostics anyway."
exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.

cat links/11-13.1/wsj0/doc/spkrinfo.txt \
links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \
links/13-34.1/wsj1/doc/train/spkrinfo.txt \
./wsj0-train-spkrinfo.txt | \
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender


echo "Data preparation succeeded"
@@ -1,173 +0,0 @@
#!/bin/bash

# This script builds a larger word-list and dictionary
# than used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to fill-in words in
# the LM training data but not in CMUdict. One is
# to generate special prons for possible acronyms, that
# just consist of the constituent letters. The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/

if [ $# -ne 1 ]; then
echo "Usage: local/wsj_extend_dict.sh /foo/bar/WSJ/13-32.1/"
exit 1
fi
if [ "`basename $1`" != 13-32.1 ]; then
echo "Expecting the argument to this script to end in 13-32.1"
exit 1
fi

# e.g.
#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
export PATH=$PATH:`pwd`/local/dict/
srcdir=$1
mkdir -p data/local/dict_larger
dir=data/local/dict_larger
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
# are there; we just want to copy them as the phoneset is the same.
rm data/local/dict_larger/lexicon.txt # we don't want this.
rm data/local/dict_larger/lexiconp.txt # we don't want this either.
mincount=2 # Minimum count of an OOV we will try to generate a pron for.

[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;

# Remove comments from cmudict; print first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu

cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu

echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"

# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n". Note: we found this by looking at
# oov.counts below (before adding this rule).

touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
| awk '/^</{next}{print toupper($0)}' | perl -e '
open(F, "<$ARGV[0]")||die;
while(<F>){ chop; $isword{$_} = 1; }
while(<STDIN>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
# and have no other "." in them: treat as period.
print "$a";
if ($n+1 < @A) { print "\n"; }
} else { print "$a "; }
}
print "\n";
}
' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
fi

# get unigram counts
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams

cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
> $dir/oov.counts

echo "Most frequent unseen unigrams are: "
head $dir/oov.counts

# Prune away singleton counts, and remove things with numbers in
# (which should have been normalized) and with no letters at all.

cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
| awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist

# Automatic rule-finding...

# First make some prons for possible acronyms.
# Note: we don't do this for things like U.K or U.N,
# or A.B. (which doesn't exist anyway),
# as we consider this normalization/spelling errors.

cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms

mkdir $dir/f $dir/b # forward, backward directions of rules...
# forward is normal suffix
# rules, backward is reversed (prefix rules). These
# dirs contain stuff we create while making the rule-based
# extensions to the dictionary.

# Remove ; and , from words, if they are present; these
# might crash our scripts, as they are used as separators there.
filter_dict.pl $dir/dict.cmu > $dir/f/dict
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
reverse_dict.pl $dir/f/dict > $dir/b/dict
reverse_dict.pl $dir/f/oovs > $dir/b/oovs

# The next stage takes a few minutes.
# Note: the forward stage takes longer, as English is
# mostly a suffix-based language, and there are more rules
# that it finds.
for d in $dir/f $dir/b; do
(
cd $d
cat dict | get_rules.pl 2>get_rules.log >rules
get_rule_hierarchy.pl rules >hierarchy
awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
limit_candidate_prons.pl hierarchy | \
score_prons.pl dict | \
count_rules.pl >rule.counts
# the sort command below is just for convenience of reading.
score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
get_candidate_prons.pl rules.with_scores dict oovs | \
limit_candidate_prons.pl hierarchy > oovs.candidates
) &
done
wait

# Merge the candidates.
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
> $dir/dict.oovs

cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged

awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled


# add_counts.pl attaches the original counts to the list of handled/not-handled OOVs
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts

echo "**Top OOVs we handled are:**";
head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
head $dir/oovlist.not_handled.counts


echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
`awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.

cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF

echo "Created $dir/lexicon.txt"
@@ -1,86 +0,0 @@
#!/bin/bash

# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si284, data/train_si84, etc.

# Don't bother doing train_si84 separately (although we have the file lists
# in data/local/) because it's just the first 7138 utterances in train_si284.
# We'll create train_si84 after doing the feature extraction.

. ./path.sh || exit 1;

echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir

for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
done


# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.

echo Preparing language models for test

for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
phones/; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt

# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at begin/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.

# Everything below is only for diagnostics.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done

echo "Succeeded in formatting data."
rm -r $tmpdir
@@ -1,52 +0,0 @@
#!/bin/bash

# Copyright Johns Hopkins University (Author: Daniel Povey) 2012

. ./path.sh

[ ! -d data/lang_bd ] && echo "Expect data/lang_bd to exist" && exit 1;

lm_srcdir_3g=data/local/local_lm/3gram-mincount
lm_srcdir_4g=data/local/local_lm/4gram-mincount

[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1;
[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1;

for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do
rm -r $d 2>/dev/null
cp -r data/lang_bd $d
done

lang=data/lang_bd

# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst

gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst

gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst

gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst

exit 0;
@@ -1,202 +0,0 @@
#!/bin/bash

# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.

# This script takes no command-line arguments

dir=data/local/local_lm
srcdir=data/local/dict_larger
mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH
( # First make sure the kaldi_lm toolkit is installed.
cd $KALDI_ROOT/tools || exit 1;
if [ -d kaldi_lm ]; then
echo Not installing the kaldi_lm toolkit since it is already there.
else
echo Downloading and installing the kaldi_lm tools
if [ ! -f kaldi_lm.tar.gz ]; then
wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;
cd kaldi_lm
make || exit 1;
echo Done making the kaldi_lm tools
fi
) || exit 1;


if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi

# Get a wordlist-- keep everything but silence, which should not appear in
# the LM.
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt

# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/train_nounk.gz

# Get unigram counts (without bos/eos, but this doesn't matter here, it's
# only to get the word-map, which treats them specially & doesn't need their
# counts).
# Add a 1-count for each word in word-list by including that in the data,
# so all words appear.
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts

# Get "mapped" words-- a character encoding of the words that makes the common words very short.
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map

gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
{ for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz

# To save disk space, remove the un-mapped training data. We could
# easily generate it again if needed.
rm $dir/train_nounk.gz

train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.

prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139

train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.

prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757


exit 0

### Below here, this script is showing various commands that
## were run during LM tuning.

train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.

prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
# 2.5 million N-grams.

prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139

train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.

prune_lm.sh --arpa 3.0 $dir/4gram-mincount
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
# 2.6 million N-grams.

prune_lm.sh --arpa 4.0 $dir/4gram-mincount
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
# 2.15 million N-grams.

prune_lm.sh --arpa 5.0 $dir/4gram-mincount
# 1.86 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023

prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757

train_lm.sh --arpa --lmtype 3gram $dir
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
# 20.0 million N-grams

! which ngram-count \
&& echo "SRILM tools not installed so not doing the comparison" && exit 1;

#################
# You could finish the script here if you wanted.
# Below is to show how to do baselines with SRILM.
# You'd have to install the SRILM toolkit first.

heldout_sent=10000 # Don't change this if you want result to be comparable with
# kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s

# 3-gram:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437

# Trying 4-gram:
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822

#3-gram with pruning:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
-prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
# Around 2.25M N-grams.
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
# above, which gave 2.5 million N-grams and a perplexity of 156.

# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
# the kaldi_lm experiments above without "-mincount").

## From here is how to train with
# IRSTLM. This is not really working at the moment.
export IRSTLM=$KALDI_ROOT/tools/irstlm/

idir=$dir/irstlm
mkdir $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
gzip -c > $idir/train.gz

$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
{print $0;}}' > vocab.irstlm.20k


$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
-n 3 -s improved-kneser-ney -b yes
# Testing perplexity with SRILM tools:
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599

# Perplexity is very bad (should be ~141, since we used -p option,
# not 175),
# but adding -debug 3 to the command line shows that
# the IRSTLM LM does not seem to sum to one properly, so it seems that
# it produces an LM that isn't interpretable in the normal way as an ARPA
# LM.
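As a quick sanity check on the perplexity figures quoted in the comments above (assuming SRILM's usual convention that ppl = 10^(-logprob / (words - OOVs + sentences))): for the 3-gram result, logprob = -491456 over 218996 words, 478 OOVs and 10000 held-out sentences gives

    10^(491456 / (218996 - 478 + 10000)) = 10^(491456 / 228518) ≈ 141.5

which matches both the "ppl= 141.457" line and the 228518-word denominator reported by train_lm.sh.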
@@ -1,153 +0,0 @@
#!/bin/bash

# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson

# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.

# This script takes one argument (the destination directory) plus the --cmd option.

# Begin configuration section.
rand_seed=0
cmd=run.pl
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
hidden=30
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
# End configuration section.

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# != 1 ]; then
echo "Usage: local/wsj_train_rnnlms.sh [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi

dir=$1
srcdir=data/local/dict_larger
mkdir -p $dir

export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH


( # First make sure the rnnlm toolkit is installed.
# Note: this didn't work out of the box for me, I had to
# change the g++ version to just "g++" (no cross-compilation
# needed for me as I ran on a machine that had been set up
# as 64 bit by default).
cd $KALDI_ROOT/tools || exit 1;
if [ -d $rnnlm_ver ]; then
echo Not installing the rnnlm toolkit since it is already there.
else
echo Downloading and installing the rnnlm tools
# http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz
if [ ! -f $rnnlm_ver.tgz ]; then
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1;
fi
mkdir $rnnlm_ver
cd $rnnlm_ver
tar -xvzf ../$rnnlm_ver.tgz || exit 1;
make CC=g++ || exit 1;
echo Done making the rnnlm tools
fi
) || exit 1;


if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
echo "You need to run local/wsj_extend_dict.sh before running this script."
exit 1;
fi

cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all

# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/all.gz

echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
> $dir/train.in # training data


# The rest will consist of a word-class represented by <RNN_UNK>, that
# maps (with probabilities) to a whole class of words.

# Get unigram counts from our training data, and use this to select word-list
# for RNNLM training; e.g. 10k most frequent words. Rest will go in a class
# that we (manually, at the shell level) assign probabilities for words that
# are in that class. Note: this word-list doesn't need to include </s>; this
# automatically gets added inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.

cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts

head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn

tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts

tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs


for type in train valid; do
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
> $dir/$type
done
rm $dir/train.in # no longer needed-- and big.

# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train

# OK we'll train the RNNLM on this data.

# todo: change 100 to 320.
# using 100 classes as square root of 10k.
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &

$cmd $dir/rnnlm.log \
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \
-direct-order 4 -direct $direct -binary || exit 1;


# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids

utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \
$dir/valid.scores
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes the utterance-ids, which
# adds one word per sentence, to account for the </s> at the end of each sentence; this is the
# correct number to normalize by.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
echo Perplexity is $p | tee $dir/perplexity.log

rm $dir/train $dir/all.gz

# This is a better setup, but takes a long time to train:
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 2000 -binary
@ -1,172 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script builds a larger word-list and dictionary
|
|
||||||
# than used for the LMs supplied with the WSJ corpus.
|
|
||||||
# It uses a couple of strategies to fill-in words in
|
|
||||||
# the LM training data but not in CMUdict. One is
|
|
||||||
# to generate special prons for possible acronyms, that
|
|
||||||
# just consist of the constituent letters. The other
|
|
||||||
# is designed to handle derivatives of known words
|
|
||||||
# (e.g. deriving the pron of a plural from the pron of
|
|
||||||
# the base-word), but in a more general, learned-from-data
|
|
||||||
# way.
|
|
||||||
# It makes use of scripts in local/dict/
|
|
||||||
|
|
||||||
if [ $# -ne 1 ]; then
|
|
||||||
echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
if [ "`basename $1`" != 13-32.1 ]; then
|
|
||||||
echo "Expecting the argument to this script to end in 13-32.1"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# e.g.
|
|
||||||
#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
|
|
||||||
export PATH=$PATH:`pwd`/local/dict/
|
|
||||||
srcdir=$1
|
|
||||||
mkdir -p data/local/dict_larger
|
|
||||||
dir=data/local/dict_larger
|
|
||||||
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
|
|
||||||
# are there; we just want to copy them as the phoneset is the same.
|
|
||||||
rm data/local/dict_larger/lexicon.txt # we don't want this.
|
|
||||||
mincount=2 # Minimum count of an OOV we will try to generate a pron for.
|
|
||||||
|
|
||||||
[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
|
|
||||||
|
|
||||||
# Remove comments from cmudict; print first field; remove
|
|
||||||
# words like FOO(1) which are alternate prons: our dict format won't
|
|
||||||
# include these markers.
|
|
||||||
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
|
|
||||||
perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
|
|
||||||
|
|
||||||
cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
|
|
||||||
|
|
||||||
echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
|
|
||||||
|
|
||||||
# Convert to uppercase, remove XML-like markings.
|
|
||||||
# For words ending in "." that are not in CMUdict, we assume that these
|
|
||||||
# are periods that somehow remained in the data during data preparation,
|
|
||||||
# and we replace the "." with "\n". Note: we found this by looking at
|
|
||||||
# oov.counts below (before adding this rule).
|
|
||||||
|
|
||||||
touch $dir/cleaned.gz
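# The "touch" above just ensures the du test below does not fail on a missing file;
# 73 MB is presumably the size of a fully-built cleaned.gz, so a file of that size
# is taken as a sign that this (slow) cleaning step can be skipped.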
|
|
||||||
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
|
|
||||||
echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
|
|
||||||
else
|
|
||||||
gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
|
|
||||||
| awk '/^</{next}{print toupper($0)}' | perl -e '
|
|
||||||
open(F, "<$ARGV[0]")||die;
|
|
||||||
while(<F>){ chop; $isword{$_} = 1; }
|
|
||||||
while(<STDIN>) {
|
|
||||||
@A = split(" ", $_);
|
|
||||||
for ($n = 0; $n < @A; $n++) {
|
|
||||||
$a = $A[$n];
|
|
||||||
if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
|
|
||||||
# and have no other "." in them: treat as period.
|
|
||||||
print "$a";
|
|
||||||
if ($n+1 < @A) { print "\n"; }
|
|
||||||
} else { print "$a "; }
|
|
||||||
}
|
|
||||||
print "\n";
|
|
||||||
}
|
|
||||||
' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
|
|
||||||
fi
|
|
||||||
|
|
||||||
# get unigram counts
|
|
||||||
echo "Getting unigram counts"
|
|
||||||
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
|
|
||||||
awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
|
|
||||||
|
|
||||||
cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
|
|
||||||
'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
|
|
||||||
> $dir/oov.counts
|
|
||||||
|
|
||||||
echo "Most frequent unseen unigrams are: "
|
|
||||||
head $dir/oov.counts
|
|
||||||
|
|
||||||
# Prune away singleton counts, and remove things with numbers in
|
|
||||||
# (which should have been normalized) and with no letters at all.
|
|
||||||
|
|
||||||
|
|
||||||
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
|
|
||||||
| awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
|
|
||||||
|
|
||||||
# Automatic rule-finding...
|
|
||||||
|
|
||||||
# First make some prons for possible acronyms.
|
|
||||||
# Note: we don't do this for things like U.K or U.N,
|
|
||||||
# or A.B. (which doesn't exist anyway),
|
|
||||||
# as we consider this normalization/spelling errors.
|
|
||||||
|
|
||||||
cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms
|
|
||||||
|
|
||||||
mkdir $dir/f $dir/b # forward, backward directions of rules...
|
|
||||||
# forward is normal suffix
|
|
||||||
# rules, backward is reversed (prefix rules). These
|
|
||||||
# dirs contain stuff we create while making the rule-based
|
|
||||||
# extensions to the dictionary.
|
|
||||||
|
|
||||||
# Remove ; and , from words, if they are present; these
|
|
||||||
# might crash our scripts, as they are used as separators there.
|
|
||||||
filter_dict.pl $dir/dict.cmu > $dir/f/dict
|
|
||||||
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
|
|
||||||
reverse_dict.pl $dir/f/dict > $dir/b/dict
|
|
||||||
reverse_dict.pl $dir/f/oovs > $dir/b/oovs
|
|
||||||
|
|
||||||
# The next stage takes a few minutes.
|
|
||||||
# Note: the forward stage takes longer, as English is
|
|
||||||
# mostly a suffix-based language, and there are more rules
|
|
||||||
# that it finds.
|
|
||||||
for d in $dir/f $dir/b; do
|
|
||||||
(
|
|
||||||
cd $d
|
|
||||||
cat dict | get_rules.pl 2>get_rules.log >rules
|
|
||||||
get_rule_hierarchy.pl rules >hierarchy
|
|
||||||
awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
|
|
||||||
limit_candidate_prons.pl hierarchy | \
|
|
||||||
score_prons.pl dict | \
|
|
||||||
count_rules.pl >rule.counts
|
|
||||||
# the sort command below is just for convenience of reading.
|
|
||||||
score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
|
|
||||||
get_candidate_prons.pl rules.with_scores dict oovs | \
|
|
||||||
limit_candidate_prons.pl hierarchy > oovs.candidates
|
|
||||||
) &
|
|
||||||
done
|
|
||||||
wait
|
|
||||||
|
|
||||||
# Merge the candidates.
|
|
||||||
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
|
|
||||||
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
|
|
||||||
> $dir/dict.oovs
|
|
||||||
|
|
||||||
cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
|
|
||||||
|
|
||||||
awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
|
|
||||||
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
|
|
||||||
|
|
||||||
|
|
||||||
# add_counts.pl attaches the original counts to the list of handled/not-handled OOVs
|
|
||||||
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
|
|
||||||
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
|
|
||||||
|
|
||||||
echo "**Top OOVs we handled are:**";
|
|
||||||
head $dir/oovlist.handled.counts
|
|
||||||
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
|
|
||||||
head $dir/oovlist.not_handled.counts
|
|
||||||
|
|
||||||
|
|
||||||
echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
|
|
||||||
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
|
|
||||||
echo "Count of OOVs we didn't handle due to low count is" \
|
|
||||||
`awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
|
|
||||||
# The two files created above are for humans to look at, as diagnostics.
|
|
||||||
|
|
||||||
cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
|
|
||||||
!SIL SIL
|
|
||||||
<SPOKEN_NOISE> SPN
|
|
||||||
<UNK> SPN
|
|
||||||
<NOISE> NSN
|
|
||||||
EOF
|
|
||||||
|
|
||||||
echo "Created $dir/lexicon.txt"
|
|
|
@ -1,86 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
|
|
||||||
# Apache 2.0
|
|
||||||
|
|
||||||
# This script takes data prepared in a corpus-dependent way
|
|
||||||
# in data/local/, and converts it into the "canonical" form,
|
|
||||||
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
|
|
||||||
# data/train_si284, data/train_si84, etc.
|
|
||||||
|
|
||||||
# Don't bother doing train_si84 separately (although we have the file lists
|
|
||||||
# in data/local/) because it's just the first 7138 utterances in train_si284.
|
|
||||||
# We'll create train_si84 after doing the feature extraction.
|
|
||||||
|
|
||||||
. ./path.sh || exit 1;
|
|
||||||
|
|
||||||
echo "Preparing train and test data"
|
|
||||||
srcdir=data/local/data
|
|
||||||
lmdir=data/local/nist_lm
|
|
||||||
tmpdir=data/local/lm_tmp
|
|
||||||
lexicon=data/local/lang_tmp/lexicon.txt
|
|
||||||
mkdir -p $tmpdir
|
|
||||||
|
|
||||||
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
|
|
||||||
mkdir -p data/$x
|
|
||||||
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
|
|
||||||
cp $srcdir/$x.txt data/$x/text || exit 1;
|
|
||||||
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
|
|
||||||
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
|
|
||||||
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
|
|
||||||
done
|
|
||||||
|
|
||||||
|
|
||||||
# Next, for each type of language model, create the corresponding FST
|
|
||||||
# and the corresponding lang_test_* directory.
|
|
||||||
|
|
||||||
echo Preparing language models for test
|
|
||||||
|
|
||||||
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
|
|
||||||
test=data/lang_test_${lm_suffix}
|
|
||||||
mkdir -p $test
|
|
||||||
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
|
|
||||||
phones/; do
|
|
||||||
cp -r data/lang/$f $test
|
|
||||||
done
|
|
||||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
|
||||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
|
||||||
|
|
||||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
|
||||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
|
||||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
|
||||||
# which are supposed to occur only at the beginning/end of the utterance. These can cause
|
|
||||||
# determinization failures of CLG [ends up being epsilon cycles].
|
|
||||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
|
||||||
grep -v '<s> <s>' | \
|
|
||||||
grep -v '</s> <s>' | \
|
|
||||||
grep -v '</s> </s>' | \
|
|
||||||
arpa2fst - | fstprint | \
|
|
||||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
|
||||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
|
||||||
fstisstochastic $test/G.fst
|
|
||||||
# The output is like:
|
|
||||||
# 9.14233e-05 -0.259833
|
|
||||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
|
||||||
# nonzero because the backoff weights make the states sum to >1).
|
|
||||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
|
||||||
# as close to zero as it could be.
|
|
||||||
|
|
||||||
# Everything below is only for diagnostics.
|
|
||||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
|
||||||
# this might cause determinization failure of CLG.
|
|
||||||
# #0 is treated as an empty word.
|
|
||||||
mkdir -p $tmpdir/g
|
|
||||||
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
|
|
||||||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
|
||||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
|
||||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
|
||||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
|
||||||
echo "Language model has cycles with empty words" && exit 1
|
|
||||||
rm -r $tmpdir/g
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "Succeeded in formatting data."
|
|
||||||
rm -r $tmpdir
|
|
|
@ -1,54 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
|
|
||||||
|
|
||||||
. ./path.sh
|
|
||||||
|
|
||||||
[ ! -d data/lang_bd ] && echo "Expected data/lang_bd to exist" && exit 1;
|
|
||||||
|
|
||||||
lm_srcdir_3g=data/local/local_lm/3gram-mincount
|
|
||||||
lm_srcdir_4g=data/local/local_lm/4gram-mincount
|
|
||||||
|
|
||||||
[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1;
|
|
||||||
[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1;
|
|
||||||
|
|
||||||
for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do
|
|
||||||
rm -r $d 2>/dev/null
|
|
||||||
cp -r data/lang_bd $d
|
|
||||||
done
|
|
||||||
|
|
||||||
lang=data/lang_bd
|
|
||||||
|
|
||||||
exit
|
|
||||||
|
|
||||||
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
|
|
||||||
# not work for LMs generated from all toolkits.
|
|
||||||
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
|
|
||||||
arpa2fst - | fstprint | \
|
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
|
||||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
|
|
||||||
fstisstochastic data/lang_test_bd_tgpr/G.fst
|
|
||||||
|
|
||||||
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
|
|
||||||
arpa2fst - | fstprint | \
|
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
|
||||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
|
|
||||||
fstisstochastic data/lang_test_bd_tg/G.fst
|
|
||||||
|
|
||||||
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
|
|
||||||
arpa2fst - | fstprint | \
|
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
|
||||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
|
|
||||||
fstisstochastic data/lang_test_bd_fg/G.fst
|
|
||||||
|
|
||||||
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
|
|
||||||
arpa2fst - | fstprint | \
|
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
|
||||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
|
|
||||||
fstisstochastic data/lang_test_bd_fgpr/G.fst
|
|
||||||
|
|
||||||
exit 0;
|
|
|
@ -1,83 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
|
|
||||||
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
|
||||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
|
||||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
|
||||||
# See the Apache 2 License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
# Call this script from one level above, e.g. from the s3/ directory. It puts
|
|
||||||
# its output in data/local/.
|
|
||||||
|
|
||||||
# The parts of the output of this that will be needed are
|
|
||||||
# [in data/local/dict/ ]
|
|
||||||
# lexicon.txt
|
|
||||||
# extra_questions.txt
|
|
||||||
# nonsilence_phones.txt
|
|
||||||
# optional_silence.txt
|
|
||||||
# silence_phones.txt
|
|
||||||
|
|
||||||
# run this from ../
|
|
||||||
dir=data/local/dict
|
|
||||||
mkdir -p $dir
|
|
||||||
|
|
||||||
|
|
||||||
# (1) Get the CMU dictionary
|
|
||||||
svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict \
|
|
||||||
$dir/cmudict || exit 1;
|
|
||||||
|
|
||||||
# can add -r 10966 for strict compatibility.
|
|
||||||
|
|
||||||
|
|
||||||
#(2) Dictionary preparation:
|
|
||||||
|
|
||||||
|
|
||||||
# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
|
|
||||||
# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
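# (For illustration only -- the suffixing itself is not done in this script; as
# noted further down, lexicon.txt is written without the _B, _E, _I, _S markers
# and they are added at a later stage.  A lexicon entry such as "BUT B AH T"
# would then become "B_B AH_I T_E", and a one-phone word like "A AH" would
# become "AH_S".)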
|
|
||||||
|
|
||||||
# silence phones, one per line.
|
|
||||||
(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt
|
|
||||||
echo SIL > $dir/optional_silence.txt
|
|
||||||
|
|
||||||
# nonsilence phones; on each line is a list of phones that correspond
|
|
||||||
# really to the same base phone.
|
|
||||||
cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
|
|
||||||
perl -e 'while(<>){
|
|
||||||
chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
|
|
||||||
$phones_of{$1} .= "$_ "; }
|
|
||||||
foreach $list (values %phones_of) {print $list . "\n"; } ' \
|
|
||||||
> $dir/nonsilence_phones.txt || exit 1;
|
|
||||||
|
|
||||||
# A few extra questions that will be added to those obtained by automatically clustering
|
|
||||||
# the "real" phones. These ask about stress; there's also one for silence.
|
|
||||||
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
|
|
||||||
cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
|
|
||||||
$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
|
|
||||||
>> $dir/extra_questions.txt || exit 1;
|
|
||||||
|
|
||||||
grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
|
|
||||||
perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
|
|
||||||
> $dir/lexicon1_raw_nosil.txt || exit 1;
|
|
||||||
|
|
||||||
# Add to cmudict the silences, noises etc.
|
|
||||||
|
|
||||||
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
|
|
||||||
cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
|
|
||||||
|
|
||||||
|
|
||||||
# lexicon.txt is without the _B, _E, _S, _I markers.
|
|
||||||
# This is the input to wsj_format_data.sh
|
|
||||||
cp $dir/lexicon2_raw.txt $dir/lexicon.txt
|
|
||||||
|
|
||||||
|
|
||||||
echo "Dictionary preparation succeeded"
|
|
||||||
|
|
|
@ -1,202 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script trains LMs on the WSJ LM-training data.
|
|
||||||
# It requires that you have already run wsj_extend_dict.sh,
|
|
||||||
# to get the larger-size dictionary including all of CMUdict
|
|
||||||
# plus any OOVs and possible acronyms that we could easily
|
|
||||||
# derive pronunciations for.
|
|
||||||
|
|
||||||
# This script takes no command-line arguments
|
|
||||||
|
|
||||||
dir=data/local/local_lm
|
|
||||||
srcdir=data/local/dict_larger
|
|
||||||
mkdir -p $dir
|
|
||||||
. ./path.sh || exit 1; # for KALDI_ROOT
|
|
||||||
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH
|
|
||||||
( # First make sure the kaldi_lm toolkit is installed.
|
|
||||||
cd $KALDI_ROOT/tools || exit 1;
|
|
||||||
if [ -d kaldi_lm ]; then
|
|
||||||
echo Not installing the kaldi_lm toolkit since it is already there.
|
|
||||||
else
|
|
||||||
echo Downloading and installing the kaldi_lm tools
|
|
||||||
if [ ! -f kaldi_lm.tar.gz ]; then
|
|
||||||
wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
|
|
||||||
fi
|
|
||||||
tar -xvzf kaldi_lm.tar.gz || exit 1;
|
|
||||||
cd kaldi_lm
|
|
||||||
make || exit 1;
|
|
||||||
echo Done making the kaldi_lm tools
|
|
||||||
fi
|
|
||||||
) || exit 1;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
|
|
||||||
echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
|
|
||||||
echo "You need to run local/wsj_extend_dict.sh before running this script."
|
|
||||||
exit 1;
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Get a wordlist-- keep everything but silence, which should not appear in
|
|
||||||
# the LM.
|
|
||||||
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt
|
|
||||||
|
|
||||||
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
|
|
||||||
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
|
|
||||||
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
|
|
||||||
'BEGIN{while((getline<w)>0) v[$1]=1;}
|
|
||||||
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
|
|
||||||
| gzip -c > $dir/train_nounk.gz
|
|
||||||
|
|
||||||
# Get unigram counts (without bos/eos, but this doesn't matter here, it's
|
|
||||||
# only to get the word-map, which treats them specially & doesn't need their
|
|
||||||
# counts).
|
|
||||||
# Add a 1-count for each word in word-list by including that in the data,
|
|
||||||
# so all words appear.
|
|
||||||
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
|
|
||||||
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
|
|
||||||
sort -nr > $dir/unigram.counts
|
|
||||||
|
|
||||||
# Get "mapped" words-- a character encoding of the words that makes the common words very short.
|
|
||||||
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map
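# word_map holds one "word code" pair per line (that is how it is read back below,
# via map[$1]=$2), with the shortest codes assigned to the most frequent words so
# that the mapped training data in train.gz stays small.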
|
|
||||||
|
|
||||||
gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
|
|
||||||
{ for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz
|
|
||||||
|
|
||||||
# To save disk space, remove the un-mapped training data. We could
|
|
||||||
# easily generate it again if needed.
|
|
||||||
rm $dir/train_nounk.gz
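# If the un-mapped text were ever needed again, it could be recovered by inverting
# word_map, e.g. (sketch):
#   gunzip -c $dir/train.gz | awk -v wmap=$dir/word_map \
#     'BEGIN{while((getline<wmap)>0) inv[$2]=$1;}
#      {for(n=1;n<=NF;n++){printf("%s%s", inv[$n], (n<NF ? " " : "\n"));}}' | \
#     gzip -c > $dir/train_nounk.gz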
|
|
||||||
|
|
||||||
train_lm.sh --arpa --lmtype 3gram-mincount $dir
|
|
||||||
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
|
|
||||||
# 7.8 million N-grams.
|
|
||||||
|
|
||||||
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
|
|
||||||
# 1.45 million N-grams.
|
|
||||||
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
|
|
||||||
|
|
||||||
train_lm.sh --arpa --lmtype 4gram-mincount $dir
|
|
||||||
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
|
|
||||||
# 10.3 million N-grams.
|
|
||||||
|
|
||||||
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
|
|
||||||
# 1.50 million N-grams
|
|
||||||
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
|
|
||||||
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
|
|
||||||
### Below here, this script is showing various commands that
|
|
||||||
## were run during LM tuning.
|
|
||||||
|
|
||||||
train_lm.sh --arpa --lmtype 3gram-mincount $dir
|
|
||||||
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
|
|
||||||
# 7.8 million N-grams.
|
|
||||||
|
|
||||||
prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
|
|
||||||
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
|
|
||||||
# 2.5 million N-grams.
|
|
||||||
|
|
||||||
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
|
|
||||||
# 1.45 million N-grams.
|
|
||||||
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
|
|
||||||
|
|
||||||
train_lm.sh --arpa --lmtype 4gram-mincount $dir
|
|
||||||
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
|
|
||||||
# 10.3 million N-grams.
|
|
||||||
|
|
||||||
prune_lm.sh --arpa 3.0 $dir/4gram-mincount
|
|
||||||
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
|
|
||||||
# 2.6 million N-grams.
|
|
||||||
|
|
||||||
prune_lm.sh --arpa 4.0 $dir/4gram-mincount
|
|
||||||
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
|
|
||||||
# 2.15 million N-grams.
|
|
||||||
|
|
||||||
prune_lm.sh --arpa 5.0 $dir/4gram-mincount
|
|
||||||
# 1.86 million N-grams
|
|
||||||
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023
|
|
||||||
|
|
||||||
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
|
|
||||||
# 1.50 million N-grams
|
|
||||||
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
|
|
||||||
|
|
||||||
train_lm.sh --arpa --lmtype 3gram $dir
|
|
||||||
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
|
|
||||||
# 20.0 million N-grams
|
|
||||||
|
|
||||||
! which ngram-count \
|
|
||||||
&& echo "SRILM tools not installed so not doing the comparison" && exit 1;
|
|
||||||
|
|
||||||
#################
|
|
||||||
# You could finish the script here if you wanted.
|
|
||||||
# Below is to show how to do baselines with SRILM.
|
|
||||||
# You'd have to install the SRILM toolkit first.
|
|
||||||
|
|
||||||
heldout_sent=10000 # Don't change this if you want result to be comparable with
|
|
||||||
# kaldi_lm results
|
|
||||||
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
|
|
||||||
mkdir -p $sdir
|
|
||||||
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
|
|
||||||
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
|
|
||||||
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s
|
|
||||||
|
|
||||||
# 3-gram:
|
|
||||||
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
|
|
||||||
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
|
|
||||||
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
|
|
||||||
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
|
|
||||||
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
|
|
||||||
|
|
||||||
# Trying 4-gram:
|
|
||||||
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
|
|
||||||
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
|
|
||||||
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
|
|
||||||
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
|
|
||||||
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
|
|
||||||
|
|
||||||
#3-gram with pruning:
|
|
||||||
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
|
|
||||||
-prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
|
|
||||||
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
|
|
||||||
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
|
|
||||||
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
|
|
||||||
# Around 2.25M N-grams.
|
|
||||||
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
|
|
||||||
# above, which gave 2.5 million N-grams and a perplexity of 156.
|
|
||||||
|
|
||||||
# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
|
|
||||||
# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
|
|
||||||
# the kaldi_lm experiments above without "-mincount").
|
|
||||||
|
|
||||||
## From here is how to train with
|
|
||||||
# IRSTLM. This is not really working at the moment.
|
|
||||||
export IRSTLM=$KALDI_ROOT/tools/irstlm/
|
|
||||||
|
|
||||||
idir=$dir/irstlm
|
|
||||||
mkdir $idir
|
|
||||||
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
|
|
||||||
gzip -c > $idir/train.gz
|
|
||||||
|
|
||||||
$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
|
|
||||||
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
|
|
||||||
{print $0;}}' > vocab.irstlm.20k
|
|
||||||
|
|
||||||
|
|
||||||
$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
|
|
||||||
-n 3 -s improved-kneser-ney -b yes
|
|
||||||
# Testing perplexity with SRILM tools:
|
|
||||||
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
|
|
||||||
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
|
|
||||||
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
|
|
||||||
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
|
|
||||||
|
|
||||||
# Perplexity is very bad (should be ~141, since we used -p option,
|
|
||||||
# not 175),
|
|
||||||
# but adding -debug 3 to the command line shows that
|
|
||||||
# the IRSTLM LM does not seem to sum to one properly, so it seems that
|
|
||||||
# it produces an LM that isn't interpretable in the normal way as an ARPA
|
|
||||||
# LM.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,153 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson
|
|
||||||
|
|
||||||
# This script trains LMs on the WSJ LM-training data.
|
|
||||||
# It requires that you have already run wsj_extend_dict.sh,
|
|
||||||
# to get the larger-size dictionary including all of CMUdict
|
|
||||||
# plus any OOVs and possible acronyms that we could easily
|
|
||||||
# derive pronunciations for.
|
|
||||||
|
|
||||||
# This script takes no command-line arguments but takes the --cmd option.
|
|
||||||
|
|
||||||
# Begin configuration section.
|
|
||||||
rand_seed=0
|
|
||||||
cmd=run.pl
|
|
||||||
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
|
|
||||||
hidden=30
|
|
||||||
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
|
|
||||||
direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections.
|
|
||||||
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
|
|
||||||
# End configuration section.
|
|
||||||
|
|
||||||
[ -f ./path.sh ] && . ./path.sh
|
|
||||||
. utils/parse_options.sh
|
|
||||||
|
|
||||||
if [ $# != 1 ]; then
|
|
||||||
echo "Usage: local/wsj_train_rnnlms.sh [options] <dest-dir>"
|
|
||||||
echo "For options, see top of script file"
|
|
||||||
exit 1;
|
|
||||||
fi
|
|
||||||
|
|
||||||
dir=$1
|
|
||||||
srcdir=data/local/dict_larger
|
|
||||||
mkdir -p $dir
|
|
||||||
|
|
||||||
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH
|
|
||||||
|
|
||||||
|
|
||||||
( # First make sure the rnnlm toolkit is installed.
|
|
||||||
# Note: this didn't work out of the box for me, I had to
|
|
||||||
# change the g++ version to just "g++" (no cross-compilation
|
|
||||||
# needed for me) as I ran on a machine that had been set up
|
|
||||||
# as 64 bit by default.
|
|
||||||
cd $KALDI_ROOT/tools || exit 1;
|
|
||||||
if [ -d $rnnlm_ver ]; then
|
|
||||||
echo Not installing the rnnlm toolkit since it is already there.
|
|
||||||
else
|
|
||||||
echo Downloading and installing the rnnlm tools
|
|
||||||
# http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz
|
|
||||||
if [ ! -f $rnnlm_ver.tgz ]; then
|
|
||||||
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1;
|
|
||||||
fi
|
|
||||||
mkdir $rnnlm_ver
|
|
||||||
cd $rnnlm_ver
|
|
||||||
tar -xvzf ../$rnnlm_ver.tgz || exit 1;
|
|
||||||
make CC=g++ || exit 1;
|
|
||||||
echo Done making the rnnlm tools
|
|
||||||
fi
|
|
||||||
) || exit 1;
|
|
||||||
|
|
||||||
|
|
||||||
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
|
|
||||||
echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist";
|
|
||||||
echo "You need to run local/wsj_extend_dict.sh before running this script."
|
|
||||||
exit 1;
|
|
||||||
fi
|
|
||||||
|
|
||||||
cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all
|
|
||||||
|
|
||||||
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
|
|
||||||
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
|
|
||||||
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \
|
|
||||||
'BEGIN{while((getline<w)>0) v[$1]=1;}
|
|
||||||
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
|
|
||||||
| gzip -c > $dir/all.gz
|
|
||||||
|
|
||||||
echo "Splitting data into train and validation sets."
|
|
||||||
heldout_sent=10000
|
|
||||||
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
|
|
||||||
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
|
|
||||||
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
|
|
||||||
> $dir/train.in # training data
|
|
||||||
|
|
||||||
|
|
||||||
# The rest will consist of a word-class represented by <RNN_UNK>, that
|
|
||||||
# maps (with probabilities) to a whole class of words.
|
|
||||||
|
|
||||||
# Get unigram counts from our training data, and use this to select word-list
|
|
||||||
# for RNNLM training; e.g. the 10k most frequent words. The rest will go in a class
|
|
||||||
# for which we (manually, at the shell level) assign probabilities to the words
|
|
||||||
# that are in that class. Note: this word-list doesn't need to include </s>; this
|
|
||||||
# automatically gets added inside the rnnlm program.
|
|
||||||
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
|
|
||||||
# smoothing of the counts.
|
|
||||||
|
|
||||||
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
|
|
||||||
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
|
|
||||||
sort -nr > $dir/unigram.counts
|
|
||||||
|
|
||||||
head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn
|
|
||||||
|
|
||||||
tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts
|
|
||||||
|
|
||||||
tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
|
|
||||||
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs
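# unk.probs now has one "word probability" pair per line, where each probability is
# the word's relative frequency within the <RNN_UNK> class (they sum to 1 over the
# class); this is what lets <RNN_UNK> stand in for the whole tail of the vocabulary.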
|
|
||||||
|
|
||||||
|
|
||||||
for type in train valid; do
|
|
||||||
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
|
|
||||||
'BEGIN{while((getline<w)>0) v[$1]=1;}
|
|
||||||
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
|
|
||||||
> $dir/$type
|
|
||||||
done
|
|
||||||
rm $dir/train.in # no longer needed-- and big.
|
|
||||||
|
|
||||||
# Now randomize the order of the training data.
|
|
||||||
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
|
|
||||||
sort | cut -f 2 > $dir/foo
|
|
||||||
mv $dir/foo $dir/train
|
|
||||||
|
|
||||||
# OK we'll train the RNNLM on this data.
|
|
||||||
|
|
||||||
# todo: change 100 to 320.
|
|
||||||
# using 100 classes as square root of 10k.
|
|
||||||
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
|
|
||||||
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
|
|
||||||
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
|
|
||||||
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &
|
|
||||||
|
|
||||||
$cmd $dir/rnnlm.log \
|
|
||||||
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \
|
|
||||||
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \
|
|
||||||
-direct-order 4 -direct $direct -binary || exit 1;
|
|
||||||
|
|
||||||
|
|
||||||
# make it like a Kaldi table format, with fake utterance-ids.
|
|
||||||
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids
|
|
||||||
|
|
||||||
utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \
|
|
||||||
$dir/valid.scores
|
|
||||||
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes utterance-ids which
|
|
||||||
# add one extra word per sentence, to account for the </s> at the end of each sentence; this is the
|
|
||||||
# correct number to normalize by.
|
|
||||||
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
|
|
||||||
echo Perplexity is $p | tee $dir/perplexity.log
|
|
||||||
|
|
||||||
rm $dir/train $dir/all.gz
|
|
||||||
|
|
||||||
# This is a better setup, but takes a long time to train:
|
|
||||||
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
|
|
||||||
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
|
|
||||||
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
|
|
||||||
# -direct-order 4 -direct 2000 -binary
|
|
|
@ -38,11 +38,8 @@ echo Preparing language models for test
|
||||||
|
|
||||||
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
|
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
|
||||||
test=data/lang_test_${lm_suffix}
|
test=data/lang_test_${lm_suffix}
|
||||||
mkdir -p $test
|
cp -rT data/lang $test
|
||||||
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
|
|
||||||
phones/; do
|
|
||||||
cp -r data/lang/$f $test
|
|
||||||
done
|
|
||||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
||||||
|
|
||||||
|
@ -60,26 +57,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||||
fstisstochastic $test/G.fst
|
|
||||||
# The output is like:
|
|
||||||
# 9.14233e-05 -0.259833
|
|
||||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
|
||||||
# nonzero because the backoff weights make the states sum to >1).
|
|
||||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
|
||||||
# as close to zero as it could be.
|
|
||||||
|
|
||||||
# Everything below is only for diagnostic.
|
utils/validate_lang.pl $test || exit 1;
|
||||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
|
||||||
# this might cause determinization failure of CLG.
|
|
||||||
# #0 is treated as an empty word.
|
|
||||||
mkdir -p $tmpdir/g
|
|
||||||
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
|
|
||||||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
|
||||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
|
||||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
|
||||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
|
||||||
echo "Language model has cycles with empty words" && exit 1
|
|
||||||
rm -r $tmpdir/g
|
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "Succeeded in formatting data."
|
echo "Succeeded in formatting data."
|
||||||
|
|
|
@ -256,6 +256,3 @@ $cuda_cmd $dir/_train_nnet.log \
|
||||||
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri7a_dnn exp/tri7a_dnn/graph_tgpr_5k || exit 1;
|
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri7a_dnn exp/tri7a_dnn/graph_tgpr_5k || exit 1;
|
||||||
steps/nnet/decode.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
|
steps/nnet/decode.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
|
||||||
exp/tri7a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1;
|
exp/tri7a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -27,7 +27,7 @@ mkdir -p $tmpdir
|
||||||
for lm_suffix in tgpr; do
|
for lm_suffix in tgpr; do
|
||||||
test=data/lang_test_${lm_suffix}
|
test=data/lang_test_${lm_suffix}
|
||||||
mkdir -p $test
|
mkdir -p $test
|
||||||
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones/; do
|
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones oov.txt oov.int; do
|
||||||
cp -r data/lang/$f $test
|
cp -r data/lang/$f $test
|
||||||
done
|
done
|
||||||
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
|
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
|
||||||
|
@ -47,26 +47,8 @@ for lm_suffix in tgpr; do
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||||
fstisstochastic $test/G.fst
|
|
||||||
# The output is like:
|
|
||||||
# 9.14233e-05 -0.259833
|
|
||||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
|
||||||
# nonzero because the backoff weights make the states sum to >1).
|
|
||||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
|
||||||
# as close to zero as it could be.
|
|
||||||
|
|
||||||
# Everything below is only for diagnostic.
|
utils/validate_lang.pl $test || exit 1;
|
||||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
|
||||||
# this might cause determinization failure of CLG.
|
|
||||||
# #0 is treated as an empty word.
|
|
||||||
mkdir -p $tmpdir/g
|
|
||||||
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
|
|
||||||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
|
||||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
|
||||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
|
||||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
|
||||||
echo "Language model has cycles with empty words" && exit 1
|
|
||||||
rm -r $tmpdir/g
|
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "Succeeded in formatting data."
|
echo "Succeeded in formatting data."
|
||||||
|
|
|
@ -6,7 +6,15 @@
|
||||||
# Prepares the test time language model(G) transducers
|
# Prepares the test time language model(G) transducers
|
||||||
# (adapted from wsj/s5/local/wsj_format_data.sh)
|
# (adapted from wsj/s5/local/wsj_format_data.sh)
|
||||||
|
|
||||||
. path.sh
|
. ./path.sh || exit 1;
|
||||||
|
|
||||||
|
# begin configuration section
|
||||||
|
src_dict=data/local/dict/lexicon.txt # only needed for diagnostics, to identify empty words.
|
||||||
|
src_dir=data/lang
|
||||||
|
# end configuration section
|
||||||
|
|
||||||
|
. utils/parse_options.sh || exit 1;
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
if [ $# -ne 1 ]; then
|
if [ $# -ne 1 ]; then
|
||||||
|
@ -14,29 +22,41 @@ if [ $# -ne 1 ]; then
|
||||||
echo "e.g.: $0 /export/a15/vpanayotov/data/lm"
|
echo "e.g.: $0 /export/a15/vpanayotov/data/lm"
|
||||||
echo ", where:"
|
echo ", where:"
|
||||||
echo " <lm-dir> is the directory in which the language model is stored/downloaded"
|
echo " <lm-dir> is the directory in which the language model is stored/downloaded"
|
||||||
|
echo "Options:"
|
||||||
|
echo " --src-dir <dir> # source lang directory, default data/lang"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
lm_dir=$1
|
lm_dir=$1
|
||||||
|
|
||||||
tmpdir=data/local/lm_tmp
|
if [ ! -d $lm_dir ]; then
|
||||||
lexicon=data/local/lang_tmp/lexiconp.txt
|
echo "$0: expected source LM directory $lm_dir to exist"
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
if [ ! -f $src_dir/words.txt ]; then
|
||||||
|
echo "$0: expected $src_dir/words.txt to exist."
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
tmpdir=data/local/lm_tmp.$$
|
||||||
|
trap "rm -r $tmpdir" EXIT
|
||||||
|
|
||||||
mkdir -p $tmpdir
|
mkdir -p $tmpdir
|
||||||
|
|
||||||
for lm_suffix in tgsmall tgmed tglarge; do
|
for lm_suffix in tgsmall tgmed; do
|
||||||
test=data/lang_test_${lm_suffix}
|
# tglarge is prepared by a separate command, called from run.sh; we don't
|
||||||
mkdir -p $test
|
# want to compile G.fst for tglarge, as it takes a while.
|
||||||
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones/; do
|
test=${src_dir}_test_${lm_suffix}
|
||||||
cp -r data/lang/$f $test
|
cp -rT ${src_dir} $test
|
||||||
done
|
|
||||||
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
|
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
|
||||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1
|
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1
|
||||||
|
|
||||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
# stuff in it with multiple <s>'s in the history. Encountered some other
|
||||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
# similar things in a LM from Geoff. Removing all "illegal" combinations of
|
||||||
# which are supposed to occur only at being/end of utt. These can cause
|
# <s> and </s>, which are supposed to occur only at being/end of utt. These
|
||||||
# determinization failures of CLG [ends up being epsilon cycles].
|
# can cause determinization failures of CLG [ends up being epsilon cycles].
|
||||||
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
|
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
|
||||||
grep -v '<s> <s>' | \
|
grep -v '<s> <s>' | \
|
||||||
grep -v '</s> <s>' | \
|
grep -v '</s> <s>' | \
|
||||||
|
@ -44,31 +64,12 @@ for lm_suffix in tgsmall tgmed tglarge; do
|
||||||
arpa2fst - | fstprint | \
|
arpa2fst - | fstprint | \
|
||||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||||
fstisstochastic $test/G.fst || true
|
|
||||||
# The output is like:
|
|
||||||
# 9.14233e-05 -0.259833
|
|
||||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
|
||||||
# nonzero because the backoff weights make the states sum to >1).
|
|
||||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
|
||||||
# as close to zero as it could be.
|
|
||||||
|
|
||||||
# Everything below is only for diagnostic.
|
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
|
||||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
|
||||||
# this might cause determinization failure of CLG.
|
|
||||||
# #0 is treated as an empty word.
|
|
||||||
mkdir -p $tmpdir/g
|
|
||||||
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
|
|
||||||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
|
||||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
|
||||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
|
||||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
|
||||||
echo "Language model has cycles with empty words" && exit 1
|
|
||||||
rm -r $tmpdir/g
|
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "Succeeded in formatting data."
|
echo "Succeeded in formatting data."
|
||||||
rm -r $tmpdir
|
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
|
|
@ -50,10 +50,7 @@ idngram2lm -linear -idngram $lmdir/sprak.idngram -vocab \
|
||||||
test=data/lang_test_${lm_suffix}
|
test=data/lang_test_${lm_suffix}
|
||||||
mkdir -p $test
|
mkdir -p $test
|
||||||
|
|
||||||
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
|
cp -rT data/lang $test
|
||||||
phones/; do
|
|
||||||
cp -r data/lang/$f $test
|
|
||||||
done
|
|
||||||
|
|
||||||
cat $lmdir/sprak.arpa | \
|
cat $lmdir/sprak.arpa | \
|
||||||
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
|
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
|
||||||
|
@ -72,25 +69,9 @@ cat $lmdir/sprak.arpa | \
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||||
fstisstochastic $test/G.fst
|
|
||||||
# The output is like:
|
|
||||||
# 9.14233e-05 -0.259833
|
|
||||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
|
||||||
# nonzero because the backoff weights make the states sum to >1).
|
|
||||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
|
||||||
# as close to zero as it could be.
|
|
||||||
|
|
||||||
# Everything below is only for diagnostic.
|
|
||||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
|
||||||
# this might cause determinization failure of CLG.
|
|
||||||
# #0 is treated as an empty word.
|
|
||||||
mkdir -p $tmpdir
|
|
||||||
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
|
|
||||||
< "$lexicon" >$tmpdir/select_empty.fst.txt
|
|
||||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \
|
|
||||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst
|
|
||||||
fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' &&
|
|
||||||
echo "Language model has cycles with empty words" && exit 1
|
|
||||||
|
|
||||||
echo "Succeeded in formatting data."
|
utils/validate_lang.pl $test || exit 1;
|
||||||
rm -r $tmpdir
|
|
||||||
|
exit 0;
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,6 @@ lm_suffix=$3
|
||||||
N=$4
|
N=$4
|
||||||
lmdir=$5
|
lmdir=$5
|
||||||
extdict=${srcdict}_$lm_suffix
|
extdict=${srcdict}_$lm_suffix
|
||||||
tmpdir=data/local/lm_tmp
|
|
||||||
lang_tmp=data/local/lang_tmp
|
lang_tmp=data/local/lang_tmp
|
||||||
extlang=data/lang_$lm_suffix
|
extlang=data/lang_$lm_suffix
|
||||||
|
|
||||||
|
@ -137,10 +136,8 @@ tlm -tr=$lmdir/extra4.ngt -n=$N -lm=wb -o=$lmdir/extra${N}$lm_suffix
|
||||||
test=data/lang_test_${N}${lm_suffix}
|
test=data/lang_test_${N}${lm_suffix}
|
||||||
mkdir -p $test
|
mkdir -p $test
|
||||||
|
|
||||||
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
|
|
||||||
phones/; do
|
cp -r $extlang $test
|
||||||
cp -r $extlang/$f $test
|
|
||||||
done
|
|
||||||
|
|
||||||
cat $lmdir/extra${N}$lm_suffix | \
|
cat $lmdir/extra${N}$lm_suffix | \
|
||||||
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
|
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
|
||||||
|
@ -159,29 +156,7 @@ cat $lmdir/extra${N}$lm_suffix | \
|
||||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||||
fstisstochastic $test/G.fst
|
|
||||||
|
|
||||||
echo "Succeeded in formatting data."
|
utils/validate_lang.pl $test || exit 1;
|
||||||
# The output is like:
|
|
||||||
# 9.14233e-05 -0.259833
|
|
||||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
|
||||||
# nonzero because the backoff weights make the states sum to >1).
|
|
||||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
|
||||||
# as close to zero as it could be.
|
|
||||||
|
|
||||||
# Everything below is only for diagnostic.
|
exit 0;
|
||||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
|
||||||
# this might cause determinization failure of CLG.
|
|
||||||
# #0 is treated as an empty word.
|
|
||||||
echo "Running diagnostics. Investigate if the LM has cycles."
|
|
||||||
|
|
||||||
mkdir -p $tmpdir
|
|
||||||
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
|
|
||||||
< "$lmdir/text.filt" >$tmpdir/select_empty.fst.txt
|
|
||||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \
|
|
||||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst
|
|
||||||
fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' &&
|
|
||||||
echo "Language model has cycles with empty words" && exit 1
|
|
||||||
|
|
||||||
|
|
||||||
rm -rf $tmpdir
|
|
||||||
|
|
|
@ -51,13 +51,10 @@ wait
|
||||||
test=data/lang_test_${lm_suffix}
|
test=data/lang_test_${lm_suffix}
|
||||||
mkdir -p $test
|
mkdir -p $test
|
||||||
|
|
||||||
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
|
cp -rT $srcdir $test
|
||||||
phones/; do
|
|
||||||
cp -r $srcdir/$f $test
|
|
||||||
done
|
|
||||||
|
|
||||||
cat $lmdir/train${ngram}.arpa | \
|
cat $lmdir/train${ngram}.arpa | \
|
||||||
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
|
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
|
||||||
|
|
||||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||||
|
@@ -73,27 +70,10 @@ cat $lmdir/train${ngram}.arpa | \
   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
   --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
   fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-fstisstochastic $test/G.fst
-# The output is like:
-# 9.14233e-05 -0.259833
-# we do expect the first of these 2 numbers to be close to zero (the second is
-# nonzero because the backoff weights make the states sum to >1).
-# Because of the <s> fiasco for these particular LMs, the first number is not
-# as close to zero as it could be.
-
-# Everything below is only for diagnostic.
+utils/validate_lang.pl $test || exit 1;
-# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-# this might cause determinization failure of CLG.
-# #0 is treated as an empty word.
-
-awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-  < "$lmdir/lm_input" >$tmpdir/select_empty.fst.txt
-fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/select_empty.fst.txt | \
-  fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/empty_words.fst
-fstinfo $tmpdir/empty_words.fst | grep cyclic | grep -w 'y' &&
-  echo "Language model has cycles with empty words" && exit 1
 
 echo "Succeeded in formatting data."
+exit 0;
 
 #rm -rf $tmpdir
 #rm -f $ccs
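The hand-rolled stochasticity and cycle diagnostics are dropped in favour of utils/validate_lang.pl, which validates the whole lang_test directory (symbol tables, the phones/ files, L.fst/L_disambig.fst, and G.fst when present) and exits nonzero on failure, so the '|| exit 1' aborts the run early. A hedged standalone usage sketch, with an illustrative directory name:

# run from the recipe's s5 directory so utils/ resolves (assumption)
utils/validate_lang.pl data/lang_test_bg
echo $?   # 0 when all checks pass; nonzero otherwise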
@@ -22,7 +22,7 @@ cp $srcdict $dir/lexicon0.txt || exit 1;
 patch <local/dict.patch $dir/lexicon0.txt || exit 1;
 
 #(2a) Dictionary preparation:
-# Pre-processing (Upper-case, remove comments)
+# Pre-processing (lower-case, remove comments)
 awk 'BEGIN{getline}($0 !~ /^#/) {$0=tolower($0); print}' \
   $srcdict | sort | awk '($0 !~ /^[[:space:]]*$/) {print}' \
   > $dir/lexicon1.txt || exit 1;
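The comment now matches what the awk one-liner actually does: BEGIN{getline} discards the header line, lines starting with '#' are skipped, and everything else is lower-cased; the second awk then drops blank lines after sorting. A quick check on a fabricated three-line dictionary fragment:

printf 'HEADER LINE\n# a comment\nHELLO  hh ax l ow\n' | \
  awk 'BEGIN{getline}($0 !~ /^#/) {$0=tolower($0); print}'
# prints only:  hello  hh ax l ow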
@@ -38,11 +38,9 @@ echo Preparing language models for test
 for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
   test=data/lang_test_${lm_suffix}
-  mkdir -p $test
-  for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
-    phones/; do
-    cp -r data/lang/$f $test
-  done
+  cp -rT data/lang $test || exit 1;
 
   gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
     utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
 
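utils/find_arpa_oovs.pl reads an ARPA-format LM on stdin and prints each word that occurs in the LM but has no entry in the supplied words.txt; the resulting oovs_*.txt list is typically consumed further down the loop (outside this hunk) to strip those n-grams before G.fst is compiled. A hedged standalone check, with illustrative paths:

gunzip -c data/local/nist_lm/lm_tgpr.arpa.gz | \
  utils/find_arpa_oovs.pl data/lang_test_tgpr/words.txt > oovs_tgpr.txt
wc -l < oovs_tgpr.txt   # number of LM words missing from words.txt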
@@ -60,26 +58,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
     --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
     fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
-  fstisstochastic $test/G.fst
-  # The output is like:
-  # 9.14233e-05 -0.259833
-  # we do expect the first of these 2 numbers to be close to zero (the second is
-  # nonzero because the backoff weights make the states sum to >1).
-  # Because of the <s> fiasco for these particular LMs, the first number is not
-  # as close to zero as it could be.
-
-  # Everything below is only for diagnostic.
+  utils/validate_lang.pl --skip-determinization-check $test || exit 1;
-  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
-  # this might cause determinization failure of CLG.
-  # #0 is treated as an empty word.
-  mkdir -p $tmpdir/g
-  awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
-    < "$lexicon" >$tmpdir/g/select_empty.fst.txt
-  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
-    fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
-    echo "Language model has cycles with empty words" && exit 1
-  rm -r $tmpdir/g
 done
 
 echo "Succeeded in formatting data."
 
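The dropped fstisstochastic call prints two numbers summarizing how far the states of G.fst are from summing to one; per the old comments, the first is expected to be near zero while the second is nonzero for a back-off LM (the back-off weights make states sum to more than one), and the tool returns nonzero when the FST is not stochastic within tolerance. validate_lang.pl now covers the structural checks, with --skip-determinization-check avoiding the slower determinizability test. A hedged sketch of both calls, with an illustrative directory:

test=data/lang_test_tgpr    # illustrative
fstisstochastic $test/G.fst || \
  echo "G.fst not exactly stochastic (expected for a back-off LM)"
utils/validate_lang.pl --skip-determinization-check $test || exit 1;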
@@ -1,6 +1,7 @@
 #!/bin/bash
 
-# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
+# Copyright 2010-2012 Microsoft Corporation
+#           2012-2014 Johns Hopkins University (Author: Daniel Povey)
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
|
@ -70,14 +71,16 @@ grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
|
||||||
|
|
||||||
# Add to cmudict the silences, noises etc.
|
# Add to cmudict the silences, noises etc.
|
||||||
|
|
||||||
|
# the sort | uniq is to remove a duplicated pron from cmudict.
|
||||||
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
|
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
|
||||||
cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
|
cat - $dir/lexicon1_raw_nosil.txt | sort | uniq > $dir/lexicon2_raw.txt || exit 1;
|
||||||
|
|
||||||
|
|
||||||
# lexicon.txt is without the _B, _E, _S, _I markers.
|
# lexicon.txt is without the _B, _E, _S, _I markers.
|
||||||
# This is the input to wsj_format_data.sh
|
# This is the input to wsj_format_data.sh
|
||||||
cp $dir/lexicon2_raw.txt $dir/lexicon.txt
|
cp $dir/lexicon2_raw.txt $dir/lexicon.txt
|
||||||
|
|
||||||
|
rm $dir/lexiconp.txt 2>/dev/null
|
||||||
|
|
||||||
echo "Dictionary preparation succeeded"
|
echo "Dictionary preparation succeeded"
|
||||||
|
|
||||||
|
|
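The new sort | uniq pass exists because cmudict 0.7a carries a duplicated pronunciation line, and the lang-directory validation rejects duplicate lexicon entries; exact duplicates collapse to a single copy. The added rm of any stale lexiconp.txt presumably keeps an old probabilistic lexicon from going out of sync with the regenerated lexicon.txt. Toy illustration of the dedup:

printf 'a ey\na ey\nabandon ax b ae n d ax n\n' | sort | uniq
# prints "a ey" once, followed by the abandon entry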