trunk: some changes to dict_dir_add_pronprobs.sh to make it more robust to bad input; modify validate_dict_dir.pl to check for duplicate lexicon entries and Librispeech scripts to remove them. [bug in cmudict with repeated entry of SPIRIT was causing failure in dict_dir_add_pronprobs.sh]

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4736 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2015-01-01 21:57:26 +00:00
Родитель fc33f00b13
Коммит 265ad600ae
4 изменённых файлов: 28 добавлений и 15 удалений

Просмотреть файл

@@ -58,9 +58,9 @@ for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-
done
cd $dst_dir
ln -s 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz
ln -s 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz
ln -s 3-gram.arpa.gz lm_tglarge.arpa.gz
ln -s 4-gram.arpa.gz lm_fglarge.arpa.gz
ln -sf 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz
ln -sf 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz
ln -sf 3-gram.arpa.gz lm_tglarge.arpa.gz
ln -sf 4-gram.arpa.gz lm_fglarge.arpa.gz
exit 0

Просмотреть файл

@@ -136,7 +136,7 @@ fi
if [ $stage -le 4 ]; then
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\
cat - $lexicon_raw_nosil >$dst_dir/lexicon.txt
cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt
echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
fi

Просмотреть файл

@@ -26,12 +26,12 @@ utils/validate_dict_dir.pl $srcdir;
if [ -f $srcdir/lexicon.txt ]; then
src_lex=$srcdir/lexicon.txt
cp $srcdir/lexicon.txt $dir || exit 1;
perl -ane 'print join(" ", split(" ", $_)) . "\n";' <$src_lex >$dir/lexicon.txt
elif [ -f $srcdir/lexiconp.txt ]; then
echo "$0: removing the pron-probs from $srcdir/lexiconp.txt to create $dir/lexicon.txt"
# the second awk command below normalizes the spaces (avoid double space).
src_lex=$srcdir/lexiconp.txt
awk '{$2 = ""; print $0;}' <$srcdir/lexiconp.txt | awk '{print $0}' >$dir/lexicon.txt || exit 1;
awk '{$2 = ""; print $0;}' <$srcdir/lexiconp.txt | perl -ane 'print join(" ", split(" " ,$_)) . "\n";' >$dir/lexicon.txt || exit 1;
fi
@ -59,10 +59,11 @@ if [ "$n_old" != "$n_new" ]; then
fi
# now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are
# in the same order. The first awk command removes the pron, the second
# normalizes the space to avoid a double space.
cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | awk '{print $0}' >$dir/lexicon.txt
# in the same order.
cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/ / /g' >$dir/lexicon.txt
# add mandatory files.
for f in silence_phones.txt nonsilence_phones.txt; do
if [ ! -f $srcdir/$f ]; then
echo "$0: expected $srcdir/$f to exist."
@@ -71,10 +72,15 @@ for f in silence_phones.txt nonsilence_phones.txt; do
cp $srcdir/$f $dir/ || exit 1;
done
rm $dir/optional_silence.txt 2>/dev/null
if [ -f $srcdir/optional_silence.txt ]; then
cp $srcdir/optional_silence.txt $dir || exit 1;
fi
# add optional files (at least, I think these are optional; would have to check the docs).
for f in optional_silence.txt extra_questions.txt; do
rm $dir/$f 2>/dev/null
if [ -f $srcdir/$f ]; then
cp $srcdir/$f $dir || exit 1;
fi
done
echo "$0: produced dictionary directory with probabilities in $dir/"
echo "$0: validating $dir .."

Просмотреть файл

@ -132,12 +132,18 @@ print "\n";
sub check_lexicon {
my ($lexfn, $pron_probs) = @_;
print "Checking $lexfn\n";
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or not exists\n";}
my %seen_line = {};
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or does not exist\n";}
if(!open(L, "<$lexfn")) {set_to_fail(); print "--> ERROR: fail to open $lexfn\n";}
$idx = 1;
$success = 1;
print "--> reading $lexfn\n";
while (<L>) {
if (defined $seen_line{$_}) {
print "--> ERROR: line '$_' of $lexfn is repeated\n";
set_to_fail();
}
$seen_line{$_} = 1;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $lexfn does not end in newline.\n";
set_to_fail();
@@ -160,6 +166,7 @@ sub check_lexicon {
}
$idx ++;
}
%seen_line = {};
close(L);
$success == 0 || print "--> $lexfn is OK\n";
print "\n";