diff --git a/egs/wsj/s5/utils/make_phone_bigram_lang.sh b/egs/wsj/s5/utils/make_phone_bigram_lang.sh index fd1d52253..939e06798 100755 --- a/egs/wsj/s5/utils/make_phone_bigram_lang.sh +++ b/egs/wsj/s5/utils/make_phone_bigram_lang.sh @@ -42,11 +42,21 @@ rm -r $lang_out/phones 2>/dev/null cp -r $lang/phones/ $lang_out/ rm $lang_out/phones/word_boundary.* 2>/dev/null # these would # no longer be valid. -# List of disambig symbols will be empty. +# List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst +# are determinizable without any. echo -n > $lang_out/phones/disambig.txt echo -n > $lang_out/phones/disambig.int echo -n > $lang_out/phones/disambig.csl +# Let OOV symbol be the first phone. This is arbitrary, it's just +# so that validate_lang.pl succeeds. We should never actually use +# this. +oov_sym=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $1}') +oov_int=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $2}') +echo $oov_sym > $lang_out/oov.txt +echo $oov_int > $lang_out/oov.int + + # Get phone-level transcripts of training data and create a # language model. ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ @@ -96,3 +106,7 @@ grep -v '' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END # L and L_disambig are the same. cp $lang_out/L.fst $lang_out/L_disambig.fst + +utils/validate_lang.pl $lang_out || exit 1; +echo "$0: ignore warnings above from validate_lang.pl (these are expected)" + diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index dbc91548c..60b4aa68f 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -4,22 +4,25 @@ # # Validation script for data/lang -if(@ARGV != 1) { - die "Usage: validate_lang.pl lang_directory\n"; +if (@ARGV != 1) { + print "Usage: $0 \n"; + print "e.g.: $0 data/lang\n"; + exit(1); } $lang = shift @ARGV; $exit = 0; +$warning = 0; # Checking phones.txt ------------------------------- print "Checking $lang/phones.txt ...\n"; -if(-z "$lang/phones.txt") {print "--> ERROR: $lang/phones.txt is empty or not exists\n"; exit 1;} -if(!open(P, "<$lang/phones.txt")) {print "--> ERROR: fail to open $lang/phones.txt\n"; exit 1;} +if (-z "$lang/phones.txt") {print "--> ERROR: $lang/phones.txt is empty or does not exist\n"; exit 1;} +if (!open(P, "<$lang/phones.txt")) {print "--> ERROR: fail to open $lang/phones.txt\n"; exit 1;} $idx = 1; %psymtab = (); while(

) { chomp; my @col = split(" ", $_); - if(@col != 2) {print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)\n"; exit 1;} + if (@col != 2) {print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)\n"; exit 1;} my $phone = shift @col; my $id = shift @col; $psymtab{$phone} = $id; @@ -28,7 +31,7 @@ while(

) { close(P); %pint2sym = (); foreach(keys %psymtab) { - if($pint2sym{$psymtab{$_}}) {print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1;} + if ($pint2sym{$psymtab{$_}}) {print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1;} else {$pint2sym{$psymtab{$_}} = $_;} } print "--> $lang/phones.txt is OK\n"; @@ -36,14 +39,14 @@ print "\n"; # Check word.txt ------------------------------- print "Checking words.txt: #0 ...\n"; -if(-z "$lang/words.txt") {print "--> ERROR: $lang/words.txt is empty or not exists\n"; exit 1;} -if(!open(W, "<$lang/words.txt")) {print "--> ERROR: fail to open $lang/words.txt\n"; exit 1;} +if (-z "$lang/words.txt") {print "--> ERROR: $lang/words.txt is empty or does not exist\n"; exit 1;} +if (!open(W, "<$lang/words.txt")) {print "--> ERROR: fail to open $lang/words.txt\n"; exit 1;} $idx = 1; %wsymtab = (); while() { chomp; my @col = split(" ", $_); - if(@col != 2) {print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)\n"; exit 1;} + if (@col != 2) {print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)\n"; exit 1;} $word = shift @col; $id = shift @col; $wsymtab{$word} = $id; @@ -52,31 +55,37 @@ while() { close(W); %wint2sym = (); foreach(keys %wsymtab) { - if($wint2sym{$wsymtab{$_}}) {print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1;} + if ($wint2sym{$wsymtab{$_}}) {print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1;} else {$wint2sym{$wsymtab{$_}} = $_;} } -if(exists $wsymtab{"#0"}) { +if (exists $wsymtab{"#0"}) { print "--> $lang/words.txt has \"#0\"\n"; print "--> $lang/words.txt is OK\n"; -} else {print "--> ERROR: $lang/words.txt doesn't have \"#0\"\n"; $exit = 1;} +} else { + $warning = 1; + print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; + print "--> (if you are using ARPA-type language models, you will normally\n"; + print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; +} print "\n"; # Checking phones/* ------------------------------- sub check_txt_int_csl { my ($cat, $symtab) = @_; print "Checking $cat.\{txt, int, csl\} ...\n"; - if(-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or not exists\n";} - if(-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or not exists\n";} - if(-z "$cat.csl") {$exit = 1; return print "--> ERROR: $cat.csl is empty or not exists\n";} - if(!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";} - if(!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";} - if(!open(CSL, "<$cat.csl")) {$exit = 1; return print "--> ERROR: fail to open $cat.csl\n";} + if (!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";} + if (!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";} + if (!open(CSL, "<$cat.csl")) {$exit = 1; return print "--> ERROR: fail to open $cat.csl\n";} + + if (-z "$cat.txt") {$warning = 1; print "--> WARNING: $cat.txt is empty\n";} + if (-z "$cat.int") {$warning = 1; print "--> WARNING: $cat.int is empty\n";} + if (-z "$cat.csl") {$warning = 1; print "--> WARNING: $cat.csl is empty\n";} $idx1 = 1; while() { chomp; my @col = split(" ", $_); - if(@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)\n";} + if (@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)\n";} $entry[$idx1] = shift @col; $idx1 ++; } @@ -87,26 +96,36 @@ sub check_txt_int_csl { while() { chomp; my @col = split(" ", $_); - if(@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)\n";} - if($symtab->{$entry[$idx2]} ne shift @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";} + if (@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)\n";} + if ($symtab->{$entry[$idx2]} ne shift @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";} $idx2 ++; } close(INT); $idx2 --; - if($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";} + if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";} print "--> $cat.int corresponds to $cat.txt\n"; - $idx3 = 1; + $num_lines = 0; while() { chomp; my @col = split(":", $_); - if(@col != $idx1) {$exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $idx3)\n";} + $num_lines++; + if (@col != $idx1) {$exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $idx3)\n";} foreach(1 .. $idx1) { - if($symtab->{$entry[$_]} ne @col[$_-1]) {$exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $idx3, block $_)\n";} + if ($symtab->{$entry[$_]} ne @col[$_-1]) {$exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $idx3, block $_)\n";} + } + } + close(CSL); + if ($idx1 != 0) { # nonempty .txt,.int files + if ($num_lines != 1) { + $exit = 1; + return print "--> ERROR: expect 1 line in $cat.csl\n"; + } + } else { + if ($num_lines != 1 && $num_lines != 0) { + $exit = 1; + return print "--> ERROR: expect 0 or 1 line in $cat.csl, since empty .txt,int\n"; } - $idx3 ++; } - close(CSL); $idx3 --; - if($idx3 != 1) {$exit = 1; return print "--> ERROR: expect 1 row in $cat.csl (break at line ", $idx3+1, ")\n";} print "--> $cat.csl corresponds to $cat.txt\n"; return print "--> $cat.\{txt, int, csl\} are OK\n"; @@ -115,10 +134,10 @@ sub check_txt_int_csl { sub check_txt_int { my ($cat, $symtab) = @_; print "Checking $cat.\{txt, int\} ...\n"; - if(-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or not exists\n";} - if(-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or not exists\n";} - if(!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";} - if(!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";} + if (-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";} + if (-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";} + if (!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";} + if (!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";} $idx1 = 1; while() { @@ -146,14 +165,14 @@ sub check_txt_int { s/ singleton$//g; my @col = split(" ", $_); @set = split(" ", $entry[$idx2]); - if(@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";} + if (@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";} foreach(0 .. @set-1) { - if($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";} + if ($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";} } $idx2 ++; } close(INT); $idx2 --; - if($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";} + if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";} print "--> $cat.int corresponds to $cat.txt\n"; return print "--> $cat.\{txt, int\} are OK\n"; @@ -167,17 +186,19 @@ foreach(@list1) { foreach(@list2) { check_txt_int("$lang/phones/$_", \%psymtab); print "\n"; } -if((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) { +if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) { check_txt_int("$lang/phones/extra_questions", \%psymtab); print "\n"; } else { print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n"; - if((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) { + if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) { print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n"; + $warning = 1; } else { - $exit = 1; print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n"; + print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n"; + $exit = 1; } } -if(-e "$lang/phones/word_boundary.txt") { +if (-e "$lang/phones/word_boundary.txt") { check_txt_int("$lang/phones/word_boundary", \%psymtab); print "\n"; } @@ -187,7 +208,7 @@ sub intersect { @itset = (); %itset = (); foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { + if (exists $b->{$_} and !$itset{$_}) { push(@itset, $_); $itset{$_} = 1; } @@ -197,16 +218,16 @@ sub intersect { sub check_disjoint { print "Checking disjoint: silence.txt, nosilenct.txt, disambig.txt ...\n"; - if(!open(S, "<$lang/phones/silence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";} - if(!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";} - if(!open(D, "<$lang/phones/disambig.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";} + if (!open(S, "<$lang/phones/silence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";} + if (!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";} + if (!open(D, "<$lang/phones/disambig.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";} $idx = 1; while() { chomp; my @col = split(" ", $_); $phone = shift @col; - if($silence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)\n";} + if ($silence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)\n";} $silence{$phone} = 1; push(@silence, $phone); $idx ++; @@ -218,7 +239,7 @@ sub check_disjoint { chomp; my @col = split(" ", $_); $phone = shift @col; - if($nonsilence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)\n";} + if ($nonsilence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)\n";} $nonsilence{$phone} = 1; push(@nonsilence, $phone); $idx ++; @@ -230,7 +251,7 @@ sub check_disjoint { chomp; my @col = split(" ", $_); $phone = shift @col; - if($disambig{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)\n";} + if ($disambig{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)\n";} $disambig{$phone} = 1; $idx ++; } @@ -241,7 +262,7 @@ sub check_disjoint { my @itsect3 = intersect(\%disambig, \%nonsilence); $success = 1; - if(@itsect1 != 0) { + if (@itsect1 != 0) { $success = 0; $exit = 1; print "--> ERROR: silence.txt and nonsilence.txt have intersection -- "; foreach(@itsect1) { @@ -250,7 +271,7 @@ sub check_disjoint { print "\n"; } else {print "--> silence.txt and nonsilence.txt are disjoint\n";} - if(@itsect2 != 0) { + if (@itsect2 != 0) { $success = 0; $exit = 1; print "--> ERROR: silence.txt and disambig.txt have intersection -- "; foreach(@itsect2) { @@ -259,7 +280,7 @@ sub check_disjoint { print "\n"; } else {print "--> silence.txt and disambig.txt are disjoint\n";} - if(@itsect3 != 0) { + if (@itsect3 != 0) { $success = 0; $exit = 1; print "--> ERROR: disambig.txt and nonsilence.txt have intersection -- "; foreach(@itsect1) { @@ -274,9 +295,9 @@ sub check_disjoint { sub check_summation { print "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n"; - if(scalar(keys %silence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/silence.txt is empty or not exists\n";} - if(scalar(keys %nonsilence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or not exists\n";} - if(scalar(keys %disambig) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/disambig.txt is empty or not exists\n";} + if (scalar(keys %silence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/silence.txt is empty or does not exist\n";} + if (scalar(keys %nonsilence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist\n";} + if (scalar(keys %disambig) == 0) {$warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";} %sum = (%silence, %nonsilence, %disambig); $sum{""} = 1; @@ -285,23 +306,23 @@ sub check_summation { my @key1 = keys %sum; my @key2 = keys %psymtab; my %itset = (); foreach(@itset) {$itset{$_} = 1;} - if(@itset < @key1) { + if (@itset < @key1) { $exit = 1; print "--> ERROR: phones in silence.txt, nonsilence.txt, disambig.txt but not in phones.txt -- "; foreach(@key1) { - if(!$itset{$_}) {print "$_ ";} + if (!$itset{$_}) {print "$_ ";} } print "\n"; } - if(@itset < @key2) { + if (@itset < @key2) { $exit = 1; print "--> ERROR: phones in phones.txt but not in silence.txt, nonsilence.txt, disambig.txt -- "; foreach(@key2) { - if(!$itset{$_}) {print "$_ ";} + if (!$itset{$_}) {print "$_ ";} } print "\n"; } - if(@itset == @key1 and @itset == @key2) { + if (@itset == @key1 and @itset == @key2) { print "--> summation property is OK\n"; } return; @@ -319,8 +340,8 @@ check_summation; print "\n"; print "Checking optional_silence.txt ...\n"; $idx = 1; $success = 1; -if(-z "$lang/phones/optional_silence.txt") {$exit = 1; $success = 0; print "--> ERROR: $lang/phones/optional_silence.txt is empty or not exists\n";} -if(!open(OS, "<$lang/phones/optional_silence.txt")) {$exit = 1; $success = 0; print "--> ERROR: fail to open $lang/phones/optional_silence.txt\n";} +if (-z "$lang/phones/optional_silence.txt") {$exit = 1; $success = 0; print "--> ERROR: $lang/phones/optional_silence.txt is empty or does not exist\n";} +if (!open(OS, "<$lang/phones/optional_silence.txt")) {$exit = 1; $success = 0; print "--> ERROR: fail to open $lang/phones/optional_silence.txt\n";} print "--> reading $lang/phones/optional_silence.txt\n"; while() { chomp; @@ -338,35 +359,38 @@ print "\n"; # Check disambiguation symbols ------------------------------- print "Checking disambiguation symbols: #0 and #1\n"; -if(scalar(keys %disambig) == 0) {$exit = 1; print "--> ERROR: $lang/phones/disambig.txt is empty or not exists\n";} -if(exists $disambig{"#0"} and exists $disambig{"#1"}) { +if (scalar(keys %disambig) == 0) {$warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";} +if (exists $disambig{"#0"} and exists $disambig{"#1"}) { print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n"; print "--> $lang/phones/disambig.txt is OK\n\n"; } else { - $exit = 1; print "--> ERROR: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\"\n"; + print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n"; + print "--> this would not be OK with a conventional ARPA-type language\n"; + print "--> model or a conventional lexicon (L.fst)\n"; + $warning = 1; } # Check topo ------------------------------- print "Checking topo ...\n"; -if(-z "$lang/topo") {$exit = 1; print "--> ERROR: $lang/topo is empty or not exists\n";} -if(!open(T, "<$lang/topo")) {$exit = 1; print "--> ERROR: fail to open $lang/topo\n";} +if (-z "$lang/topo") {$exit = 1; print "--> ERROR: $lang/topo is empty or does not exist\n";} +if (!open(T, "<$lang/topo")) {$exit = 1; print "--> ERROR: fail to open $lang/topo\n";} $idx = 1; while() { chomp; - next if(m/^<.*>[ ]*$/); - if($idx == 1) {$nonsilence_seq = $_; $idx ++;} - if($idx == 2) {$silence_seq = $_;} + next if (m/^<.*>[ ]*$/); + if ($idx == 1) {$nonsilence_seq = $_; $idx ++;} + if ($idx == 2) {$silence_seq = $_;} } close(T); -if($silence_seq == 0 || $nonsilence_seq == 0) {$exit = 1; print "--> ERROR: $lang/topo doesn't have nonsilence section or silence section\n";} +if ($silence_seq == 0 || $nonsilence_seq == 0) {$exit = 1; print "--> ERROR: $lang/topo doesn't have nonsilence section or silence section\n";} @silence_seq = split(" ", $silence_seq); @nonsilence_seq = split(" ", $nonsilence_seq); $success1 = 1; -if(@nonsilence_seq != @nonsilence) {$exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";} +if (@nonsilence_seq != @nonsilence) {$exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";} else { foreach(0 .. scalar(@nonsilence)-1) { - if($psymtab{@nonsilence[$_]} ne @nonsilence_seq[$_]) { + if ($psymtab{@nonsilence[$_]} ne @nonsilence_seq[$_]) { $exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n"; $success = 0; } @@ -374,10 +398,10 @@ else { } $success1 != 1 || print "--> $lang/topo's nonsilence section is OK\n"; $success2 = 1; -if(@silence_seq != @silence) {$exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";} +if (@silence_seq != @silence) {$exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";} else { foreach(0 .. scalar(@silence)-1) { - if($psymtab{@silence[$_]} ne @silence_seq[$_]) { + if ($psymtab{@silence[$_]} ne @silence_seq[$_]) { $exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n"; $success = 0; } @@ -393,9 +417,9 @@ $begin = ""; $end = ""; $internal = ""; $singleton = ""; -if(-s "$lang/phones/word_boundary.txt") { +if (-s "$lang/phones/word_boundary.txt") { print "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n"; - if(!open (W, "<$lang/phones/word_boundary.txt")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n";} + if (!open (W, "<$lang/phones/word_boundary.txt")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n";} $idx = 1; %wb = (); while() { @@ -406,7 +430,7 @@ if(-s "$lang/phones/word_boundary.txt") { if (m/^.*end$/ ) {s/ end$//g; @col = split(" ", $_); if (@col == 1) {$end .= "$col[0] ";}} if (m/^.*internal$/ ) {s/ internal$//g; @col = split(" ", $_); if (@col == 1) {$internal .= "$col[0] ";}} if (m/^.*singleton$/) {s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";}} - if(@col != 1) {$exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n";} + if (@col != 1) {$exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n";} $wb{shift @col} = 1; $idx ++; } @@ -414,7 +438,7 @@ if(-s "$lang/phones/word_boundary.txt") { @itset = intersect(\%disambig, \%wb); $success1 = 1; - if(@itset != 0) { + if (@itset != 0) { $success1 = 0; $exit = 1; print "--> ERROR: $lang/phones/word_boundary.txt has disambiguation symbols -- "; foreach(@itset) {print "$_ ";} @@ -426,19 +450,19 @@ if(-s "$lang/phones/word_boundary.txt") { @itset = intersect(\%sum, \%wb); %itset = (); foreach(@itset) {$itset{$_} = 1;} $success2 = 1; - if(@itset < scalar(keys %sum)) { + if (@itset < scalar(keys %sum)) { $success2 = 0; $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in word_boundary.txt -- "; foreach(keys %sum) { - if(!$itset{$_}) {print "$_ ";} + if (!$itset{$_}) {print "$_ ";} } print "\n"; } - if(@itset < scalar(keys %wb)) { + if (@itset < scalar(keys %wb)) { $success2 = 0; $exit = 1; print "--> ERROR: phones in word_boundary.txt but not in nonsilence.txt or silence.txt -- "; foreach(keys %wb) { - if(!$itset{$_}) {print "$_ ";} + if (!$itset{$_}) {print "$_ ";} } print "\n"; } @@ -447,16 +471,16 @@ if(-s "$lang/phones/word_boundary.txt") { print "\n"; } -if(-s "$lang/phones/word_boundary.int") { +if (-s "$lang/phones/word_boundary.int") { print "Checking word_boundary.int and disambig.int\n"; - if(!open (W, "<$lang/phones/word_boundary.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n";} + if (!open (W, "<$lang/phones/word_boundary.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n";} while () { @A = split; if (@A != 2) { $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/word_boundary.int\n"; } $wbtype{$A[0]} = $A[1]; } close(W); - if(!open (D, "<$lang/phones/disambig.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n";} + if (!open (D, "<$lang/phones/disambig.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n";} while () { @A = split; if (@A != 1) { $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/disambig.int\n"; } @@ -477,7 +501,7 @@ if(-s "$lang/phones/word_boundary.int") { $sid ++; } $wordseq = $wordseq . "$sid 0"; - $phoneseq = `echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if(NF > 2) {print \$3}}';`; + $phoneseq = `echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if (NF > 2) {print \$3}}';`; @phoneseq = split(" ", $phoneseq); $transition = { }; # empty assoc. array of allowed transitions between phone types. 1 means we count a word, # 0 means transition is allowed. bos and eos are added as extra symbols here. @@ -529,6 +553,34 @@ if(-s "$lang/phones/word_boundary.int") { check_txt_int("$lang/oov", \%wsymtab); print "\n"; -if ($exit == 1) { print "--> ERROR\n"; exit 1;} -else { print "--> SUCCESS\n"; exit 0; } +# Check determinizability of G.fst +if (-e "$lang/G.fst") { + $cmd = "fstdeterminize $lang/G.fst /dev/null"; + $ret = system(". ./path.sh; $cmd"); + if ($ret == 0) { + print "--> command $cmd succeeded\n"; + } else { + print "--> ERROR: command $cmd failed\n"; + $exit = 1; + } +} + +if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") { + $cmd = "fstcompose $lang/L_disambig.fst $lang/G.fst | fstdeterminize > /dev/null"; + $ret = system(". ./path.sh; $cmd"); + if ($ret == 0) { + print "--> command $cmd succeeded\n"; + } else { + print "--> ERROR: command $cmd failed\n"; + $exit = 1; + } +} + + +if ($exit == 1) { print "--> ERROR (see error messages above)\n"; exit 1;} +else { + if ($warning == 1) { print "--> WARNING (check output above for warnings)\n"; exit 0; } + else { print "--> SUCCESS\n"; exit 0; } +} +