trunk: some changes to dict_dir_add_pronprobs.sh to make it more robust to bad input; modify validate_dict_dir.pl to check for duplicate lexicon entries and Librispeech scripts to remove them. [bug in cmudict with repeated entry of SPIRIT was causing failure in dict_dir_add_pronprobs.sh]

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4736 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2015-01-01 21:57:26 +00:00
Родитель fc33f00b13
Коммит 265ad600ae
4 изменённых файлов: 28 добавлений и 15 удалений

Просмотреть файл

@@ -58,9 +58,9 @@ for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-
done
cd $dst_dir
ln -s 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz
ln -s 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz
ln -s 3-gram.arpa.gz lm_tglarge.arpa.gz
ln -s 4-gram.arpa.gz lm_fglarge.arpa.gz
ln -sf 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz
ln -sf 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz
ln -sf 3-gram.arpa.gz lm_tglarge.arpa.gz
ln -sf 4-gram.arpa.gz lm_fglarge.arpa.gz
exit 0

Просмотреть файл

@@ -136,7 +136,7 @@ fi
if [ $stage -le 4 ]; then
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\
cat - $lexicon_raw_nosil >$dst_dir/lexicon.txt
cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt
echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
fi

Просмотреть файл

@@ -26,12 +26,12 @@ utils/validate_dict_dir.pl $srcdir;
if [ -f $srcdir/lexicon.txt ]; then
src_lex=$srcdir/lexicon.txt
cp $srcdir/lexicon.txt $dir || exit 1;
perl -ane 'print join(" ", split(" ", $_)) . "\n";' <$src_lex >$dir/lexicon.txt
elif [ -f $srcdir/lexiconp.txt ]; then
echo "$0: removing the pron-probs from $srcdir/lexiconp.txt to create $dir/lexicon.txt"
# the second awk command below normalizes the spaces (avoid double space).
src_lex=$srcdir/lexiconp.txt
awk '{$2 = ""; print $0;}' <$srcdir/lexiconp.txt | awk '{print $0}' >$dir/lexicon.txt || exit 1;
awk '{$2 = ""; print $0;}' <$srcdir/lexiconp.txt | perl -ane 'print join(" ", split(" " ,$_)) . "\n";' >$dir/lexicon.txt || exit 1;
fi
@ -59,10 +59,11 @@ if [ "$n_old" != "$n_new" ]; then
fi
# now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are
# in the same order. The first awk command removes the pron, the second
# normalizes the space to avoid a double space.
cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | awk '{print $0}' >$dir/lexicon.txt
# in the same order.
cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/ / /g' >$dir/lexicon.txt
# add mandatory files.
for f in silence_phones.txt nonsilence_phones.txt; do
if [ ! -f $srcdir/$f ]; then
echo "$0: expected $srcdir/$f to exist."
@@ -71,10 +72,15 @@ for f in silence_phones.txt nonsilence_phones.txt; do
cp $srcdir/$f $dir/ || exit 1;
done
rm $dir/optional_silence.txt 2>/dev/null
if [ -f $srcdir/optional_silence.txt ]; then
cp $srcdir/optional_silence.txt $dir || exit 1;
fi
# add optional files (at least, I think these are optional; would have to check the docs).
for f in optional_silence.txt extra_questions.txt; do
rm $dir/$f 2>/dev/null
if [ -f $srcdir/$f ]; then
cp $srcdir/$f $dir || exit 1;
fi
done
echo "$0: produced dictionary directory with probabilities in $dir/"
echo "$0: validating $dir .."

Просмотреть файл

@ -132,12 +132,18 @@ print "\n";
sub check_lexicon {
my ($lexfn, $pron_probs) = @_;
print "Checking $lexfn\n";
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or not exists\n";}
my %seen_line = {};
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or does not exist\n";}
if(!open(L, "<$lexfn")) {set_to_fail(); print "--> ERROR: fail to open $lexfn\n";}
$idx = 1;
$success = 1;
print "--> reading $lexfn\n";
while (<L>) {
if (defined $seen_line{$_}) {
print "--> ERROR: line '$_' of $lexfn is repeated\n";
set_to_fail();
}
$seen_line{$_} = 1;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $lexfn does not end in newline.\n";
set_to_fail();
@@ -160,6 +166,7 @@ sub check_lexicon {
}
$idx ++;
}
%seen_line = {};
close(L);
$success == 0 || print "--> $lexfn is OK\n";
print "\n";