trunk: improvements to validate_lang.pl, and updates to make_phone_bigram_lang.sh to properly validate output.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3368 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2013-12-28 20:56:13 +00:00
Родитель 00ca8f8a31
Коммит 7ae89356b5
2 изменённых файлов: 153 добавлений и 87 удалений

Просмотреть файл

@ -42,11 +42,21 @@ rm -r $lang_out/phones 2>/dev/null
cp -r $lang/phones/ $lang_out/
rm $lang_out/phones/word_boundary.* 2>/dev/null # these would
# no longer be valid.
# List of disambig symbols will be empty.
# List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst
# are determinizable without any.
echo -n > $lang_out/phones/disambig.txt
echo -n > $lang_out/phones/disambig.int
echo -n > $lang_out/phones/disambig.csl
# Let the OOV symbol be the first phone. This choice is arbitrary; it is
# only there so that validate_lang.pl succeeds. We should never actually
# use this symbol.
oov_sym=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $1}')
oov_int=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $2}')
echo $oov_sym > $lang_out/oov.txt
echo $oov_int > $lang_out/oov.int
# Get phone-level transcripts of training data and create a
# language model.
ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
@ -96,3 +106,7 @@ grep -v '<eps>' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END
# L and L_disambig are the same.
cp $lang_out/L.fst $lang_out/L_disambig.fst
utils/validate_lang.pl $lang_out || exit 1;
echo "$0: ignore warnings above from validate_lang.pl (these are expected)"

Просмотреть файл

@ -4,22 +4,25 @@
#
# Validation script for data/lang
if(@ARGV != 1) {
die "Usage: validate_lang.pl lang_directory\n";
if (@ARGV != 1) {
print "Usage: $0 <lang_directory>\n";
print "e.g.: $0 data/lang\n";
exit(1);
}
$lang = shift @ARGV;
$exit = 0;
$warning = 0;
# Checking phones.txt -------------------------------
print "Checking $lang/phones.txt ...\n";
if(-z "$lang/phones.txt") {print "--> ERROR: $lang/phones.txt is empty or not exists\n"; exit 1;}
if(!open(P, "<$lang/phones.txt")) {print "--> ERROR: fail to open $lang/phones.txt\n"; exit 1;}
if (-z "$lang/phones.txt") {print "--> ERROR: $lang/phones.txt is empty or does not exist\n"; exit 1;}
if (!open(P, "<$lang/phones.txt")) {print "--> ERROR: fail to open $lang/phones.txt\n"; exit 1;}
$idx = 1;
%psymtab = ();
while(<P>) {
chomp;
my @col = split(" ", $_);
if(@col != 2) {print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)\n"; exit 1;}
if (@col != 2) {print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)\n"; exit 1;}
my $phone = shift @col;
my $id = shift @col;
$psymtab{$phone} = $id;
@ -28,7 +31,7 @@ while(<P>) {
close(P);
%pint2sym = ();
foreach(keys %psymtab) {
if($pint2sym{$psymtab{$_}}) {print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1;}
if ($pint2sym{$psymtab{$_}}) {print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1;}
else {$pint2sym{$psymtab{$_}} = $_;}
}
print "--> $lang/phones.txt is OK\n";
@ -36,14 +39,14 @@ print "\n";
# Check words.txt -------------------------------
print "Checking words.txt: #0 ...\n";
if(-z "$lang/words.txt") {print "--> ERROR: $lang/words.txt is empty or not exists\n"; exit 1;}
if(!open(W, "<$lang/words.txt")) {print "--> ERROR: fail to open $lang/words.txt\n"; exit 1;}
if (-z "$lang/words.txt") {print "--> ERROR: $lang/words.txt is empty or does not exist\n"; exit 1;}
if (!open(W, "<$lang/words.txt")) {print "--> ERROR: fail to open $lang/words.txt\n"; exit 1;}
$idx = 1;
%wsymtab = ();
while(<W>) {
chomp;
my @col = split(" ", $_);
if(@col != 2) {print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)\n"; exit 1;}
if (@col != 2) {print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)\n"; exit 1;}
$word = shift @col;
$id = shift @col;
$wsymtab{$word} = $id;
@ -52,31 +55,37 @@ while(<W>) {
close(W);
%wint2sym = ();
foreach(keys %wsymtab) {
if($wint2sym{$wsymtab{$_}}) {print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1;}
if ($wint2sym{$wsymtab{$_}}) {print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1;}
else {$wint2sym{$wsymtab{$_}} = $_;}
}
if(exists $wsymtab{"#0"}) {
if (exists $wsymtab{"#0"}) {
print "--> $lang/words.txt has \"#0\"\n";
print "--> $lang/words.txt is OK\n";
} else {print "--> ERROR: $lang/words.txt doesn't have \"#0\"\n"; $exit = 1;}
} else {
$warning = 1;
print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n";
print "--> (if you are using ARPA-type language models, you will normally\n";
print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n";
}
print "\n";
# Checking phones/* -------------------------------
sub check_txt_int_csl {
my ($cat, $symtab) = @_;
print "Checking $cat.\{txt, int, csl\} ...\n";
if(-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or not exists\n";}
if(-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or not exists\n";}
if(-z "$cat.csl") {$exit = 1; return print "--> ERROR: $cat.csl is empty or not exists\n";}
if(!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";}
if(!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";}
if(!open(CSL, "<$cat.csl")) {$exit = 1; return print "--> ERROR: fail to open $cat.csl\n";}
if (!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";}
if (!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";}
if (!open(CSL, "<$cat.csl")) {$exit = 1; return print "--> ERROR: fail to open $cat.csl\n";}
if (-z "$cat.txt") {$warning = 1; print "--> WARNING: $cat.txt is empty\n";}
if (-z "$cat.int") {$warning = 1; print "--> WARNING: $cat.int is empty\n";}
if (-z "$cat.csl") {$warning = 1; print "--> WARNING: $cat.csl is empty\n";}
$idx1 = 1;
while(<TXT>) {
chomp;
my @col = split(" ", $_);
if(@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)\n";}
if (@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)\n";}
$entry[$idx1] = shift @col;
$idx1 ++;
}
@ -87,26 +96,36 @@ sub check_txt_int_csl {
while(<INT>) {
chomp;
my @col = split(" ", $_);
if(@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)\n";}
if($symtab->{$entry[$idx2]} ne shift @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
if (@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)\n";}
if ($symtab->{$entry[$idx2]} ne shift @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
$idx2 ++;
}
close(INT); $idx2 --;
if($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
print "--> $cat.int corresponds to $cat.txt\n";
$idx3 = 1;
$num_lines = 0;
while(<CSL>) {
chomp;
my @col = split(":", $_);
if(@col != $idx1) {$exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $idx3)\n";}
$num_lines++;
if (@col != $idx1) {$exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $idx3)\n";}
foreach(1 .. $idx1) {
if($symtab->{$entry[$_]} ne @col[$_-1]) {$exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $idx3, block $_)\n";}
if ($symtab->{$entry[$_]} ne @col[$_-1]) {$exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $idx3, block $_)\n";}
}
}
close(CSL);
if ($idx1 != 0) { # nonempty .txt,.int files
if ($num_lines != 1) {
$exit = 1;
return print "--> ERROR: expect 1 line in $cat.csl\n";
}
} else {
if ($num_lines != 1 && $num_lines != 0) {
$exit = 1;
return print "--> ERROR: expect 0 or 1 line in $cat.csl, since empty .txt,int\n";
}
$idx3 ++;
}
close(CSL); $idx3 --;
if($idx3 != 1) {$exit = 1; return print "--> ERROR: expect 1 row in $cat.csl (break at line ", $idx3+1, ")\n";}
print "--> $cat.csl corresponds to $cat.txt\n";
return print "--> $cat.\{txt, int, csl\} are OK\n";
@ -115,10 +134,10 @@ sub check_txt_int_csl {
sub check_txt_int {
my ($cat, $symtab) = @_;
print "Checking $cat.\{txt, int\} ...\n";
if(-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or not exists\n";}
if(-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or not exists\n";}
if(!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";}
if(!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";}
if (-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";}
if (-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";}
if (!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";}
if (!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";}
$idx1 = 1;
while(<TXT>) {
@ -146,14 +165,14 @@ sub check_txt_int {
s/ singleton$//g;
my @col = split(" ", $_);
@set = split(" ", $entry[$idx2]);
if(@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
if (@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
foreach(0 .. @set-1) {
if($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";}
if ($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";}
}
$idx2 ++;
}
close(INT); $idx2 --;
if($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
print "--> $cat.int corresponds to $cat.txt\n";
return print "--> $cat.\{txt, int\} are OK\n";
@ -167,17 +186,19 @@ foreach(@list1) {
foreach(@list2) {
check_txt_int("$lang/phones/$_", \%psymtab); print "\n";
}
if((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
check_txt_int("$lang/phones/extra_questions", \%psymtab); print "\n";
} else {
print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
if((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n";
$warning = 1;
} else {
$exit = 1; print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
$exit = 1;
}
}
if(-e "$lang/phones/word_boundary.txt") {
if (-e "$lang/phones/word_boundary.txt") {
check_txt_int("$lang/phones/word_boundary", \%psymtab); print "\n";
}
@ -187,7 +208,7 @@ sub intersect {
@itset = ();
%itset = ();
foreach(keys %$a) {
if(exists $b->{$_} and !$itset{$_}) {
if (exists $b->{$_} and !$itset{$_}) {
push(@itset, $_);
$itset{$_} = 1;
}
@ -197,16 +218,16 @@ sub intersect {
sub check_disjoint {
print "Checking disjoint: silence.txt, nosilenct.txt, disambig.txt ...\n";
if(!open(S, "<$lang/phones/silence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";}
if(!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";}
if(!open(D, "<$lang/phones/disambig.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";}
if (!open(S, "<$lang/phones/silence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";}
if (!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";}
if (!open(D, "<$lang/phones/disambig.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";}
$idx = 1;
while(<S>) {
chomp;
my @col = split(" ", $_);
$phone = shift @col;
if($silence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)\n";}
if ($silence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)\n";}
$silence{$phone} = 1;
push(@silence, $phone);
$idx ++;
@ -218,7 +239,7 @@ sub check_disjoint {
chomp;
my @col = split(" ", $_);
$phone = shift @col;
if($nonsilence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)\n";}
if ($nonsilence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)\n";}
$nonsilence{$phone} = 1;
push(@nonsilence, $phone);
$idx ++;
@ -230,7 +251,7 @@ sub check_disjoint {
chomp;
my @col = split(" ", $_);
$phone = shift @col;
if($disambig{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)\n";}
if ($disambig{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)\n";}
$disambig{$phone} = 1;
$idx ++;
}
@ -241,7 +262,7 @@ sub check_disjoint {
my @itsect3 = intersect(\%disambig, \%nonsilence);
$success = 1;
if(@itsect1 != 0) {
if (@itsect1 != 0) {
$success = 0;
$exit = 1; print "--> ERROR: silence.txt and nonsilence.txt have intersection -- ";
foreach(@itsect1) {
@ -250,7 +271,7 @@ sub check_disjoint {
print "\n";
} else {print "--> silence.txt and nonsilence.txt are disjoint\n";}
if(@itsect2 != 0) {
if (@itsect2 != 0) {
$success = 0;
$exit = 1; print "--> ERROR: silence.txt and disambig.txt have intersection -- ";
foreach(@itsect2) {
@ -259,7 +280,7 @@ sub check_disjoint {
print "\n";
} else {print "--> silence.txt and disambig.txt are disjoint\n";}
if(@itsect3 != 0) {
if (@itsect3 != 0) {
$success = 0;
$exit = 1; print "--> ERROR: disambig.txt and nonsilence.txt have intersection -- ";
foreach(@itsect1) {
@ -274,9 +295,9 @@ sub check_disjoint {
sub check_summation {
print "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n";
if(scalar(keys %silence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/silence.txt is empty or not exists\n";}
if(scalar(keys %nonsilence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or not exists\n";}
if(scalar(keys %disambig) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/disambig.txt is empty or not exists\n";}
if (scalar(keys %silence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/silence.txt is empty or does not exist\n";}
if (scalar(keys %nonsilence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist\n";}
if (scalar(keys %disambig) == 0) {$warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";}
%sum = (%silence, %nonsilence, %disambig);
$sum{"<eps>"} = 1;
@ -285,23 +306,23 @@ sub check_summation {
my @key1 = keys %sum;
my @key2 = keys %psymtab;
my %itset = (); foreach(@itset) {$itset{$_} = 1;}
if(@itset < @key1) {
if (@itset < @key1) {
$exit = 1; print "--> ERROR: phones in silence.txt, nonsilence.txt, disambig.txt but not in phones.txt -- ";
foreach(@key1) {
if(!$itset{$_}) {print "$_ ";}
if (!$itset{$_}) {print "$_ ";}
}
print "\n";
}
if(@itset < @key2) {
if (@itset < @key2) {
$exit = 1; print "--> ERROR: phones in phones.txt but not in silence.txt, nonsilence.txt, disambig.txt -- ";
foreach(@key2) {
if(!$itset{$_}) {print "$_ ";}
if (!$itset{$_}) {print "$_ ";}
}
print "\n";
}
if(@itset == @key1 and @itset == @key2) {
if (@itset == @key1 and @itset == @key2) {
print "--> summation property is OK\n";
}
return;
@ -319,8 +340,8 @@ check_summation; print "\n";
print "Checking optional_silence.txt ...\n";
$idx = 1;
$success = 1;
if(-z "$lang/phones/optional_silence.txt") {$exit = 1; $success = 0; print "--> ERROR: $lang/phones/optional_silence.txt is empty or not exists\n";}
if(!open(OS, "<$lang/phones/optional_silence.txt")) {$exit = 1; $success = 0; print "--> ERROR: fail to open $lang/phones/optional_silence.txt\n";}
if (-z "$lang/phones/optional_silence.txt") {$exit = 1; $success = 0; print "--> ERROR: $lang/phones/optional_silence.txt is empty or does not exist\n";}
if (!open(OS, "<$lang/phones/optional_silence.txt")) {$exit = 1; $success = 0; print "--> ERROR: fail to open $lang/phones/optional_silence.txt\n";}
print "--> reading $lang/phones/optional_silence.txt\n";
while(<OS>) {
chomp;
@ -338,35 +359,38 @@ print "\n";
# Check disambiguation symbols -------------------------------
print "Checking disambiguation symbols: #0 and #1\n";
if(scalar(keys %disambig) == 0) {$exit = 1; print "--> ERROR: $lang/phones/disambig.txt is empty or not exists\n";}
if(exists $disambig{"#0"} and exists $disambig{"#1"}) {
if (scalar(keys %disambig) == 0) {$warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";}
if (exists $disambig{"#0"} and exists $disambig{"#1"}) {
print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n";
print "--> $lang/phones/disambig.txt is OK\n\n";
} else {
$exit = 1; print "--> ERROR: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\"\n";
print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n";
print "--> this would not be OK with a conventional ARPA-type language\n";
print "--> model or a conventional lexicon (L.fst)\n";
$warning = 1;
}
# Check topo -------------------------------
print "Checking topo ...\n";
if(-z "$lang/topo") {$exit = 1; print "--> ERROR: $lang/topo is empty or not exists\n";}
if(!open(T, "<$lang/topo")) {$exit = 1; print "--> ERROR: fail to open $lang/topo\n";}
if (-z "$lang/topo") {$exit = 1; print "--> ERROR: $lang/topo is empty or does not exist\n";}
if (!open(T, "<$lang/topo")) {$exit = 1; print "--> ERROR: fail to open $lang/topo\n";}
$idx = 1;
while(<T>) {
chomp;
next if(m/^<.*>[ ]*$/);
if($idx == 1) {$nonsilence_seq = $_; $idx ++;}
if($idx == 2) {$silence_seq = $_;}
next if (m/^<.*>[ ]*$/);
if ($idx == 1) {$nonsilence_seq = $_; $idx ++;}
if ($idx == 2) {$silence_seq = $_;}
}
close(T);
if($silence_seq == 0 || $nonsilence_seq == 0) {$exit = 1; print "--> ERROR: $lang/topo doesn't have nonsilence section or silence section\n";}
if ($silence_seq == 0 || $nonsilence_seq == 0) {$exit = 1; print "--> ERROR: $lang/topo doesn't have nonsilence section or silence section\n";}
@silence_seq = split(" ", $silence_seq);
@nonsilence_seq = split(" ", $nonsilence_seq);
$success1 = 1;
if(@nonsilence_seq != @nonsilence) {$exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";}
if (@nonsilence_seq != @nonsilence) {$exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";}
else {
foreach(0 .. scalar(@nonsilence)-1) {
if($psymtab{@nonsilence[$_]} ne @nonsilence_seq[$_]) {
if ($psymtab{@nonsilence[$_]} ne @nonsilence_seq[$_]) {
$exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";
$success = 0;
}
@ -374,10 +398,10 @@ else {
}
$success1 != 1 || print "--> $lang/topo's nonsilence section is OK\n";
$success2 = 1;
if(@silence_seq != @silence) {$exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";}
if (@silence_seq != @silence) {$exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";}
else {
foreach(0 .. scalar(@silence)-1) {
if($psymtab{@silence[$_]} ne @silence_seq[$_]) {
if ($psymtab{@silence[$_]} ne @silence_seq[$_]) {
$exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";
$success = 0;
}
@ -393,9 +417,9 @@ $begin = "";
$end = "";
$internal = "";
$singleton = "";
if(-s "$lang/phones/word_boundary.txt") {
if (-s "$lang/phones/word_boundary.txt") {
print "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n";
if(!open (W, "<$lang/phones/word_boundary.txt")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n";}
if (!open (W, "<$lang/phones/word_boundary.txt")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n";}
$idx = 1;
%wb = ();
while(<W>) {
@ -406,7 +430,7 @@ if(-s "$lang/phones/word_boundary.txt") {
if (m/^.*end$/ ) {s/ end$//g; @col = split(" ", $_); if (@col == 1) {$end .= "$col[0] ";}}
if (m/^.*internal$/ ) {s/ internal$//g; @col = split(" ", $_); if (@col == 1) {$internal .= "$col[0] ";}}
if (m/^.*singleton$/) {s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";}}
if(@col != 1) {$exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n";}
if (@col != 1) {$exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n";}
$wb{shift @col} = 1;
$idx ++;
}
@ -414,7 +438,7 @@ if(-s "$lang/phones/word_boundary.txt") {
@itset = intersect(\%disambig, \%wb);
$success1 = 1;
if(@itset != 0) {
if (@itset != 0) {
$success1 = 0;
$exit = 1; print "--> ERROR: $lang/phones/word_boundary.txt has disambiguation symbols -- ";
foreach(@itset) {print "$_ ";}
@ -426,19 +450,19 @@ if(-s "$lang/phones/word_boundary.txt") {
@itset = intersect(\%sum, \%wb);
%itset = (); foreach(@itset) {$itset{$_} = 1;}
$success2 = 1;
if(@itset < scalar(keys %sum)) {
if (@itset < scalar(keys %sum)) {
$success2 = 0;
$exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in word_boundary.txt -- ";
foreach(keys %sum) {
if(!$itset{$_}) {print "$_ ";}
if (!$itset{$_}) {print "$_ ";}
}
print "\n";
}
if(@itset < scalar(keys %wb)) {
if (@itset < scalar(keys %wb)) {
$success2 = 0;
$exit = 1; print "--> ERROR: phones in word_boundary.txt but not in nonsilence.txt or silence.txt -- ";
foreach(keys %wb) {
if(!$itset{$_}) {print "$_ ";}
if (!$itset{$_}) {print "$_ ";}
}
print "\n";
}
@ -447,16 +471,16 @@ if(-s "$lang/phones/word_boundary.txt") {
print "\n";
}
if(-s "$lang/phones/word_boundary.int") {
if (-s "$lang/phones/word_boundary.int") {
print "Checking word_boundary.int and disambig.int\n";
if(!open (W, "<$lang/phones/word_boundary.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n";}
if (!open (W, "<$lang/phones/word_boundary.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n";}
while (<W>) {
@A = split;
if (@A != 2) { $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/word_boundary.int\n"; }
$wbtype{$A[0]} = $A[1];
}
close(W);
if(!open (D, "<$lang/phones/disambig.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n";}
if (!open (D, "<$lang/phones/disambig.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n";}
while (<D>) {
@A = split;
if (@A != 1) { $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/disambig.int\n"; }
@ -477,7 +501,7 @@ if(-s "$lang/phones/word_boundary.int") {
$sid ++;
}
$wordseq = $wordseq . "$sid 0";
$phoneseq = `echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if(NF > 2) {print \$3}}';`;
$phoneseq = `echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if (NF > 2) {print \$3}}';`;
@phoneseq = split(" ", $phoneseq);
$transition = { }; # empty assoc. array of allowed transitions between phone types. 1 means we count a word,
# 0 means transition is allowed. bos and eos are added as extra symbols here.
@ -529,6 +553,34 @@ if(-s "$lang/phones/word_boundary.int") {
check_txt_int("$lang/oov", \%wsymtab); print "\n";
if ($exit == 1) { print "--> ERROR\n"; exit 1;}
else { print "--> SUCCESS\n"; exit 0; }
# Check determinizability of G.fst
if (-e "$lang/G.fst") {
$cmd = "fstdeterminize $lang/G.fst /dev/null";
$ret = system(". ./path.sh; $cmd");
if ($ret == 0) {
print "--> command $cmd succeeded\n";
} else {
print "--> ERROR: command $cmd failed\n";
$exit = 1;
}
}
if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") {
$cmd = "fstcompose $lang/L_disambig.fst $lang/G.fst | fstdeterminize > /dev/null";
$ret = system(". ./path.sh; $cmd");
if ($ret == 0) {
print "--> command $cmd succeeded\n";
} else {
print "--> ERROR: command $cmd failed\n";
$exit = 1;
}
}
if ($exit == 1) { print "--> ERROR (see error messages above)\n"; exit 1;}
else {
if ($warning == 1) { print "--> WARNING (check output above for warnings)\n"; exit 0; }
else { print "--> SUCCESS\n"; exit 0; }
}