зеркало из https://github.com/mozilla/kaldi.git
trunk: make dict_dir_add_pronprobs.sh more robust to bad input; modify validate_dict_dir.pl to check for duplicate lexicon entries, and modify the Librispeech scripts to remove such duplicates. [A bug in cmudict (a repeated entry for SPIRIT) was causing dict_dir_add_pronprobs.sh to fail.]
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4736 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
fc33f00b13
Коммит
265ad600ae
|
@ -58,9 +58,9 @@ for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-
|
|||
done
|
||||
|
||||
cd $dst_dir
|
||||
ln -s 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz
|
||||
ln -s 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz
|
||||
ln -s 3-gram.arpa.gz lm_tglarge.arpa.gz
|
||||
ln -s 4-gram.arpa.gz lm_fglarge.arpa.gz
|
||||
ln -sf 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz
|
||||
ln -sf 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz
|
||||
ln -sf 3-gram.arpa.gz lm_tglarge.arpa.gz
|
||||
ln -sf 4-gram.arpa.gz lm_fglarge.arpa.gz
|
||||
|
||||
exit 0
|
||||
|
|
|
@ -136,7 +136,7 @@ fi
|
|||
|
||||
if [ $stage -le 4 ]; then
|
||||
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\
|
||||
cat - $lexicon_raw_nosil >$dst_dir/lexicon.txt
|
||||
cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt
|
||||
echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
|
||||
fi
|
||||
|
||||
|
|
|
@ -26,12 +26,12 @@ utils/validate_dict_dir.pl $srcdir;
|
|||
|
||||
if [ -f $srcdir/lexicon.txt ]; then
|
||||
src_lex=$srcdir/lexicon.txt
|
||||
cp $srcdir/lexicon.txt $dir || exit 1;
|
||||
perl -ane 'print join(" ", split(" ", $_)) . "\n";' <$src_lex >$dir/lexicon.txt
|
||||
elif [ -f $srcdir/lexiconp.txt ]; then
|
||||
echo "$0: removing the pron-probs from $srcdir/lexiconp.txt to create $dir/lexicon.txt"
|
||||
# the second awk command below normalizes the spaces (avoid double space).
|
||||
src_lex=$srcdir/lexiconp.txt
|
||||
awk '{$2 = ""; print $0;}' <$srcdir/lexiconp.txt | awk '{print $0}' >$dir/lexicon.txt || exit 1;
|
||||
awk '{$2 = ""; print $0;}' <$srcdir/lexiconp.txt | perl -ane 'print join(" ", split(" " ,$_)) . "\n";' >$dir/lexicon.txt || exit 1;
|
||||
fi
|
||||
|
||||
|
||||
|
@ -59,10 +59,11 @@ if [ "$n_old" != "$n_new" ]; then
|
|||
fi
|
||||
|
||||
# now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are
|
||||
# in the same order. The first awk command removes the pron, the second
|
||||
# normalizes the space to avoid a double space.
|
||||
cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | awk '{print $0}' >$dir/lexicon.txt
|
||||
# in the same order.
|
||||
cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/ / /g' >$dir/lexicon.txt
|
||||
|
||||
|
||||
# add mandatory files.
|
||||
for f in silence_phones.txt nonsilence_phones.txt; do
|
||||
if [ ! -f $srcdir/$f ]; then
|
||||
echo "$0: expected $srcdir/$f to exist."
|
||||
|
@ -71,10 +72,15 @@ for f in silence_phones.txt nonsilence_phones.txt; do
|
|||
cp $srcdir/$f $dir/ || exit 1;
|
||||
done
|
||||
|
||||
rm $dir/optional_silence.txt 2>/dev/null
|
||||
if [ -f $srcdir/optional_silence.txt ]; then
|
||||
cp $srcdir/optional_silence.txt $dir || exit 1;
|
||||
fi
|
||||
|
||||
# add optional files (at least, I think these are optional; would have to check the docs).
|
||||
for f in optional_silence.txt extra_questions.txt; do
|
||||
rm $dir/$f 2>/dev/null
|
||||
if [ -f $srcdir/$f ]; then
|
||||
cp $srcdir/$f $dir || exit 1;
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
echo "$0: produced dictionary directory with probabilities in $dir/"
|
||||
echo "$0: validating $dir .."
|
||||
|
|
|
@ -132,12 +132,18 @@ print "\n";
|
|||
sub check_lexicon {
|
||||
my ($lexfn, $pron_probs) = @_;
|
||||
print "Checking $lexfn\n";
|
||||
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or not exists\n";}
|
||||
my %seen_line = {};
|
||||
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or does not exist\n";}
|
||||
if(!open(L, "<$lexfn")) {set_to_fail(); print "--> ERROR: fail to open $lexfn\n";}
|
||||
$idx = 1;
|
||||
$success = 1;
|
||||
print "--> reading $lexfn\n";
|
||||
while (<L>) {
|
||||
if (defined $seen_line{$_}) {
|
||||
print "--> ERROR: line '$_' of $lexfn is repeated\n";
|
||||
set_to_fail();
|
||||
}
|
||||
$seen_line{$_} = 1;
|
||||
if (! s/\n$//) {
|
||||
print "--> ERROR: last line '$_' of $lexfn does not end in newline.\n";
|
||||
set_to_fail();
|
||||
|
@ -160,6 +166,7 @@ sub check_lexicon {
|
|||
}
|
||||
$idx ++;
|
||||
}
|
||||
%seen_line = {};
|
||||
close(L);
|
||||
$success == 0 || print "--> $lexfn is OK\n";
|
||||
print "\n";
|
||||
|
|
Загрузка…
Ссылка в новой задаче