trunk: improvements to validate_lang.pl, and updates to make_phone_bigram_lang.sh to properly validate output.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3368 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2013-12-28 20:56:13 +00:00 · 2013-12-28 20:56:13 +00:00 · 7ae89356b5
--- a/egs/wsj/s5/utils/make_phone_bigram_lang.sh
+++ b/egs/wsj/s5/utils/make_phone_bigram_lang.sh
@ -42,11 +42,21 @@ rm -r $lang_out/phones 2>/dev/null
 cp -r $lang/phones/ $lang_out/
 rm $lang_out/phones/word_boundary.* 2>/dev/null # these would
  # no longer be valid.
-# List of disambig symbols will be empty.
+# List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst
+# are determinizable without any.
 echo -n > $lang_out/phones/disambig.txt
 echo -n > $lang_out/phones/disambig.int
 echo -n > $lang_out/phones/disambig.csl

+# Let the OOV symbol be the first phone (the second line of phones.txt).
+# This is arbitrary; it's just so that validate_lang.pl succeeds.  We
+# should never actually use this.
+oov_sym=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $1}')
+oov_int=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $2}')
+echo $oov_sym > $lang_out/oov.txt
+echo $oov_int > $lang_out/oov.int
+
+
 # Get phone-level transcripts of training data and create a
 # language model.
 ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
@ -96,3 +106,7 @@ grep -v '<eps>' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END

 # L and L_disambig are the same.
 cp $lang_out/L.fst $lang_out/L_disambig.fst
+
+utils/validate_lang.pl $lang_out || exit 1;
+echo "$0: ignore warnings above from validate_lang.pl (these are expected)"
+
--- a/egs/wsj/s5/utils/validate_lang.pl
+++ b/egs/wsj/s5/utils/validate_lang.pl
@ -4,22 +4,25 @@
 #
 # Validation script for data/lang

-if(@ARGV != 1) {
-  die "Usage: validate_lang.pl lang_directory\n";
+if (@ARGV != 1) {
+  print "Usage: $0 <lang_directory>\n";
+  print "e.g.:  $0 data/lang\n";
+  exit(1);
 }

 $lang = shift @ARGV;
 $exit = 0;
+$warning = 0;
 # Checking phones.txt -------------------------------
 print "Checking $lang/phones.txt ...\n";
-if(-z "$lang/phones.txt") {print "--> ERROR: $lang/phones.txt is empty or not exists\n"; exit 1;}
-if(!open(P, "<$lang/phones.txt")) {print "--> ERROR: fail to open $lang/phones.txt\n"; exit 1;}
+if (-z "$lang/phones.txt") {print "--> ERROR: $lang/phones.txt is empty or does not exist\n"; exit 1;}
+if (!open(P, "<$lang/phones.txt")) {print "--> ERROR: fail to open $lang/phones.txt\n"; exit 1;}
 $idx = 1;
 %psymtab = ();
 while(<P>) {
  chomp;
  my @col = split(" ", $_);
-  if(@col != 2) {print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)\n"; exit 1;}
+  if (@col != 2) {print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)\n"; exit 1;}
  my $phone = shift @col;
  my $id = shift @col;
  $psymtab{$phone} = $id;
@ -28,7 +31,7 @@ while(<P>) {
 close(P);
 %pint2sym = (); 
 foreach(keys %psymtab) {
-  if($pint2sym{$psymtab{$_}}) {print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1;} 
+  if ($pint2sym{$psymtab{$_}}) {print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1;} 
  else {$pint2sym{$psymtab{$_}} = $_;}
 }
 print "--> $lang/phones.txt is OK\n";
@ -36,14 +39,14 @@ print "\n";

 # Check word.txt -------------------------------
 print "Checking words.txt: #0 ...\n";
-if(-z "$lang/words.txt") {print "--> ERROR: $lang/words.txt is empty or not exists\n"; exit 1;}
-if(!open(W, "<$lang/words.txt")) {print "--> ERROR: fail to open $lang/words.txt\n"; exit 1;}
+if (-z "$lang/words.txt") {print "--> ERROR: $lang/words.txt is empty or does not exist\n"; exit 1;}
+if (!open(W, "<$lang/words.txt")) {print "--> ERROR: fail to open $lang/words.txt\n"; exit 1;}
 $idx = 1;
 %wsymtab = ();
 while(<W>) {
  chomp;
  my @col = split(" ", $_);
-  if(@col != 2) {print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)\n"; exit 1;}
+  if (@col != 2) {print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)\n"; exit 1;}
  $word = shift @col;
  $id = shift @col;
  $wsymtab{$word} = $id;
@ -52,31 +55,37 @@ while(<W>) {
 close(W);
 %wint2sym = (); 
 foreach(keys %wsymtab) {
-  if($wint2sym{$wsymtab{$_}}) {print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1;} 
+  if ($wint2sym{$wsymtab{$_}}) {print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1;} 
  else {$wint2sym{$wsymtab{$_}} = $_;}
 }
-if(exists $wsymtab{"#0"}) {
+if (exists $wsymtab{"#0"}) {
  print "--> $lang/words.txt has \"#0\"\n";
  print "--> $lang/words.txt is OK\n";
-} else {print "--> ERROR: $lang/words.txt doesn't have \"#0\"\n"; $exit = 1;}
+} else {
+  $warning = 1;
+  print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n";
+  print "-->          (if you are using ARPA-type language models, you will normally\n";
+  print "-->           need the disambiguation symbol \"#0\" to ensure determinizability)\n";
+}
 print "\n";

 # Checking phones/* -------------------------------
 sub check_txt_int_csl {
  my ($cat, $symtab) = @_;
  print "Checking $cat.\{txt, int, csl\} ...\n";
-  if(-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or not exists\n";}
-  if(-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or not exists\n";}
-  if(-z "$cat.csl") {$exit = 1; return print "--> ERROR: $cat.csl is empty or not exists\n";}
-  if(!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";}
-  if(!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";}
-  if(!open(CSL, "<$cat.csl")) {$exit = 1; return print "--> ERROR: fail to open $cat.csl\n";}
+  if (!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";}
+  if (!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";}
+  if (!open(CSL, "<$cat.csl")) {$exit = 1; return print "--> ERROR: fail to open $cat.csl\n";}
+
+  if (-z "$cat.txt") {$warning = 1; print "--> WARNING: $cat.txt is empty\n";}
+  if (-z "$cat.int") {$warning = 1; print "--> WARNING: $cat.int is empty\n";}
+  if (-z "$cat.csl") {$warning = 1; print "--> WARNING: $cat.csl is empty\n";}

  $idx1 = 1;
  while(<TXT>) {
    chomp;
    my @col = split(" ", $_);
-    if(@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)\n";}
+    if (@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)\n";}
    $entry[$idx1] = shift @col;
    $idx1 ++;
  }
@ -87,26 +96,36 @@ sub check_txt_int_csl {
  while(<INT>) {
    chomp;
    my @col = split(" ", $_);
-    if(@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)\n";}
-    if($symtab->{$entry[$idx2]} ne shift @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
+    if (@col != 1) {$exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)\n";}
+    if ($symtab->{$entry[$idx2]} ne shift @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
    $idx2 ++;
  }
  close(INT); $idx2 --;
-  if($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
+  if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
  print "--> $cat.int corresponds to $cat.txt\n";

-  $idx3 = 1;
+  $num_lines = 0;
  while(<CSL>) {
    chomp;
    my @col = split(":", $_);
-    if(@col != $idx1) {$exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $idx3)\n";}
+    $num_lines++;  # 1-based number of the current .csl line; reported in the error messages below
+    if (@col != $idx1) {$exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $num_lines)\n";}
    foreach(1 .. $idx1) {
-      if($symtab->{$entry[$_]} ne @col[$_-1]) {$exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $idx3, block $_)\n";}
+      if ($symtab->{$entry[$_]} ne @col[$_-1]) {$exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $num_lines, block $_)\n";}
+    }
+  }
+  close(CSL);
+  if ($idx1 != 0) { # nonempty .txt,.int files
+    if ($num_lines != 1) {
+      $exit = 1; 
+      return print "--> ERROR: expect 1 line in $cat.csl\n";
+    }
+  } else {
+    if ($num_lines != 1 && $num_lines != 0) {
+      $exit = 1;
+      return print "--> ERROR: expect 0 or 1 line in $cat.csl, since empty .txt,int\n";
    }
-    $idx3 ++;
  }
-  close(CSL); $idx3 --;
-  if($idx3 != 1) {$exit = 1; return print "--> ERROR: expect 1 row in $cat.csl (break at line ", $idx3+1, ")\n";}
  print "--> $cat.csl corresponds to $cat.txt\n";

  return print "--> $cat.\{txt, int, csl\} are OK\n";
@ -115,10 +134,10 @@ sub check_txt_int_csl {
 sub check_txt_int {
  my ($cat, $symtab) = @_;
  print "Checking $cat.\{txt, int\} ...\n";
-  if(-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or not exists\n";}
-  if(-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or not exists\n";}
-  if(!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";}
-  if(!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";}
+  if (-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";}
+  if (-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";}
+  if (!open(TXT, "<$cat.txt")) {$exit = 1; return print "--> ERROR: fail to open $cat.txt\n";}
+  if (!open(INT, "<$cat.int")) {$exit = 1; return print "--> ERROR: fail to open $cat.int\n";}

  $idx1 = 1;
  while(<TXT>) {
@ -146,14 +165,14 @@ sub check_txt_int {
    s/ singleton$//g;
    my @col = split(" ", $_);
    @set = split(" ", $entry[$idx2]);
-    if(@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
+    if (@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
    foreach(0 .. @set-1) {
-      if($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";}
+      if ($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";}
    }
    $idx2 ++;
  }
  close(INT); $idx2 --;
-  if($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
+  if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
  print "--> $cat.int corresponds to $cat.txt\n";

  return print "--> $cat.\{txt, int\} are OK\n";
@ -167,17 +186,19 @@ foreach(@list1) {
 foreach(@list2) {
  check_txt_int("$lang/phones/$_", \%psymtab); print "\n";
 }
-if((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
+if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
  check_txt_int("$lang/phones/extra_questions", \%psymtab); print "\n";
 } else {
  print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
-  if((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
+  if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
    print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n";
+    $warning = 1;
  } else {
-    $exit = 1; print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
+    print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
+    $exit = 1;
  }
 } 
-if(-e "$lang/phones/word_boundary.txt") {
+if (-e "$lang/phones/word_boundary.txt") {
  check_txt_int("$lang/phones/word_boundary", \%psymtab); print "\n";
 }

@ -187,7 +208,7 @@ sub intersect {
  @itset = ();
  %itset = ();
  foreach(keys %$a) {
-    if(exists $b->{$_} and !$itset{$_}) {
+    if (exists $b->{$_} and !$itset{$_}) {
      push(@itset, $_);
      $itset{$_} = 1;
    }
@ -197,16 +218,16 @@ sub intersect {

 sub check_disjoint {
  print "Checking disjoint: silence.txt, nosilenct.txt, disambig.txt ...\n";
-  if(!open(S, "<$lang/phones/silence.txt"))    {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";}
-  if(!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";}
-  if(!open(D, "<$lang/phones/disambig.txt"))   {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";}
+  if (!open(S, "<$lang/phones/silence.txt"))    {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";}
+  if (!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";}
+  if (!open(D, "<$lang/phones/disambig.txt"))   {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";}

  $idx = 1;
  while(<S>) {
    chomp;
    my @col = split(" ", $_);
    $phone = shift @col;
-    if($silence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)\n";}
+    if ($silence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)\n";}
    $silence{$phone} = 1;
    push(@silence, $phone);
    $idx ++;
@ -218,7 +239,7 @@ sub check_disjoint {
    chomp;
    my @col = split(" ", $_);
    $phone = shift @col;
-    if($nonsilence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)\n";}
+    if ($nonsilence{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)\n";}
    $nonsilence{$phone} = 1;
    push(@nonsilence, $phone);
    $idx ++;
@ -230,7 +251,7 @@ sub check_disjoint {
    chomp;
    my @col = split(" ", $_);
    $phone = shift @col;
-    if($disambig{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)\n";}
+    if ($disambig{$phone}) {$exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)\n";}
    $disambig{$phone} = 1;
    $idx ++;
  }
@ -241,7 +262,7 @@ sub check_disjoint {
  my @itsect3 = intersect(\%disambig, \%nonsilence);

  $success = 1;
-  if(@itsect1 != 0) {
+  if (@itsect1 != 0) {
    $success = 0;
    $exit = 1; print "--> ERROR: silence.txt and nonsilence.txt have intersection -- ";
    foreach(@itsect1) {
@ -250,7 +271,7 @@ sub check_disjoint {
    print "\n";
  } else {print "--> silence.txt and nonsilence.txt are disjoint\n";}

-  if(@itsect2 != 0) {
+  if (@itsect2 != 0) {
    $success = 0;
    $exit = 1; print "--> ERROR: silence.txt and disambig.txt have intersection -- ";
    foreach(@itsect2) {
@ -259,7 +280,7 @@ sub check_disjoint {
    print "\n";
  } else {print "--> silence.txt and disambig.txt are disjoint\n";}

-  if(@itsect3 != 0) {
+  if (@itsect3 != 0) {
    $success = 0;
    $exit = 1; print "--> ERROR: disambig.txt and nonsilence.txt have intersection -- ";
    foreach(@itsect1) {
@ -274,9 +295,9 @@ sub check_disjoint {

 sub check_summation {
  print "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n";
-  if(scalar(keys %silence) == 0)      {$exit = 1; return print "--> ERROR: $lang/phones/silence.txt is empty or not exists\n";}
-  if(scalar(keys %nonsilence) == 0)   {$exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or not exists\n";}
-  if(scalar(keys %disambig) == 0)     {$exit = 1; return print "--> ERROR: $lang/phones/disambig.txt is empty or not exists\n";}
+  if (scalar(keys %silence) == 0)    {$exit = 1; return print "--> ERROR: $lang/phones/silence.txt is empty or does not exist\n";}
+  if (scalar(keys %nonsilence) == 0) {$exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist\n";}
+  if (scalar(keys %disambig) == 0)   {$warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";}

  %sum = (%silence, %nonsilence, %disambig);
  $sum{"<eps>"} = 1;
@ -285,23 +306,23 @@ sub check_summation {
  my @key1 = keys %sum;
  my @key2 = keys %psymtab;
  my %itset = (); foreach(@itset) {$itset{$_} = 1;}
-  if(@itset < @key1) {
+  if (@itset < @key1) {
    $exit = 1; print "--> ERROR: phones in silence.txt, nonsilence.txt, disambig.txt but not in phones.txt -- ";
    foreach(@key1) {
-      if(!$itset{$_}) {print "$_ ";}
+      if (!$itset{$_}) {print "$_ ";}
    }
    print "\n";
  }

-  if(@itset < @key2) {
+  if (@itset < @key2) {
    $exit = 1; print "--> ERROR: phones in phones.txt but not in silence.txt, nonsilence.txt, disambig.txt -- ";
    foreach(@key2) {
-      if(!$itset{$_}) {print "$_ ";}
+      if (!$itset{$_}) {print "$_ ";}
    }
    print "\n";
  }

-  if(@itset == @key1 and @itset == @key2) {
+  if (@itset == @key1 and @itset == @key2) {
    print "--> summation property is OK\n";
  }
  return;
@ -319,8 +340,8 @@ check_summation; print "\n";
 print "Checking optional_silence.txt ...\n";
 $idx = 1;
 $success = 1;
-if(-z "$lang/phones/optional_silence.txt") {$exit = 1; $success = 0; print "--> ERROR: $lang/phones/optional_silence.txt is empty or not exists\n";}
-if(!open(OS, "<$lang/phones/optional_silence.txt")) {$exit = 1; $success = 0; print "--> ERROR: fail to open $lang/phones/optional_silence.txt\n";}
+if (-z "$lang/phones/optional_silence.txt") {$exit = 1; $success = 0; print "--> ERROR: $lang/phones/optional_silence.txt is empty or does not exist\n";}
+if (!open(OS, "<$lang/phones/optional_silence.txt")) {$exit = 1; $success = 0; print "--> ERROR: fail to open $lang/phones/optional_silence.txt\n";}
 print "--> reading $lang/phones/optional_silence.txt\n";
 while(<OS>) {
  chomp;
@ -338,35 +359,38 @@ print "\n";

 # Check disambiguation symbols -------------------------------
 print "Checking disambiguation symbols: #0 and #1\n";
-if(scalar(keys %disambig) == 0) {$exit = 1; print "--> ERROR: $lang/phones/disambig.txt is empty or not exists\n";}
-if(exists $disambig{"#0"} and exists $disambig{"#1"}) {
+if (scalar(keys %disambig) == 0) {$warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";}
+if (exists $disambig{"#0"} and exists $disambig{"#1"}) {
  print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n";
  print "--> $lang/phones/disambig.txt is OK\n\n";
 } else {
-  $exit = 1; print "--> ERROR: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\"\n";
+  print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n";
+  print "-->          this would not be OK with a conventional ARPA-type language\n";
+  print "-->          model or a conventional lexicon (L.fst)\n";
+  $warning = 1;
 }


 # Check topo -------------------------------
 print "Checking topo ...\n";
-if(-z "$lang/topo") {$exit = 1; print "--> ERROR: $lang/topo is empty or not exists\n";}
-if(!open(T, "<$lang/topo")) {$exit = 1; print "--> ERROR: fail to open $lang/topo\n";}
+if (-z "$lang/topo") {$exit = 1; print "--> ERROR: $lang/topo is empty or does not exist\n";}
+if (!open(T, "<$lang/topo")) {$exit = 1; print "--> ERROR: fail to open $lang/topo\n";}
 $idx = 1;
 while(<T>) {
  chomp;
-  next if(m/^<.*>[ ]*$/);
-  if($idx == 1) {$nonsilence_seq = $_; $idx ++;}
-  if($idx == 2) {$silence_seq = $_;}
+  next if (m/^<.*>[ ]*$/);
+  if ($idx == 1) {$nonsilence_seq = $_; $idx ++;}
+  if ($idx == 2) {$silence_seq = $_;}
 }
 close(T);
-if($silence_seq == 0 || $nonsilence_seq == 0) {$exit = 1; print "--> ERROR: $lang/topo doesn't have nonsilence section or silence section\n";}
+if ($silence_seq == 0 || $nonsilence_seq == 0) {$exit = 1; print "--> ERROR: $lang/topo doesn't have nonsilence section or silence section\n";}
@silence_seq = split(" ", $silence_seq);
@nonsilence_seq = split(" ", $nonsilence_seq);
 $success1 = 1;
-if(@nonsilence_seq != @nonsilence) {$exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";}
+if (@nonsilence_seq != @nonsilence) {$exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";}
 else {
  foreach(0 .. scalar(@nonsilence)-1) {
-    if($psymtab{@nonsilence[$_]} ne @nonsilence_seq[$_]) {
+    if ($psymtab{@nonsilence[$_]} ne @nonsilence_seq[$_]) {
      $exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n";
      $success = 0;
    }
@ -374,10 +398,10 @@ else {
 }
 $success1 != 1 || print "--> $lang/topo's nonsilence section is OK\n";
 $success2 = 1;
-if(@silence_seq != @silence) {$exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";}
+if (@silence_seq != @silence) {$exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";}
 else {
  foreach(0 .. scalar(@silence)-1) {
-    if($psymtab{@silence[$_]} ne @silence_seq[$_]) {
+    if ($psymtab{@silence[$_]} ne @silence_seq[$_]) {
      $exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n";
      $success = 0;
    }
@ -393,9 +417,9 @@ $begin     = "";
 $end       = "";
 $internal  = "";
 $singleton = "";
-if(-s "$lang/phones/word_boundary.txt") {
+if (-s "$lang/phones/word_boundary.txt") {
  print "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n";
-  if(!open (W, "<$lang/phones/word_boundary.txt")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n";}
+  if (!open (W, "<$lang/phones/word_boundary.txt")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n";}
  $idx = 1;
  %wb = ();
  while(<W>) {
@ -406,7 +430,7 @@ if(-s "$lang/phones/word_boundary.txt") {
    if (m/^.*end$/      ) {s/ end$//g;       @col = split(" ", $_); if (@col == 1) {$end       .= "$col[0] ";}}
    if (m/^.*internal$/ ) {s/ internal$//g;  @col = split(" ", $_); if (@col == 1) {$internal  .= "$col[0] ";}}
    if (m/^.*singleton$/) {s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";}}
-    if(@col != 1) {$exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n";}
+    if (@col != 1) {$exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n";}
    $wb{shift @col} = 1;
    $idx ++;
  }
@ -414,7 +438,7 @@ if(-s "$lang/phones/word_boundary.txt") {

  @itset = intersect(\%disambig, \%wb);
  $success1 = 1;
-  if(@itset != 0) {
+  if (@itset != 0) {
    $success1 = 0;
    $exit = 1; print "--> ERROR: $lang/phones/word_boundary.txt has disambiguation symbols -- ";
    foreach(@itset) {print "$_ ";}
@ -426,19 +450,19 @@ if(-s "$lang/phones/word_boundary.txt") {
  @itset = intersect(\%sum, \%wb);
  %itset = (); foreach(@itset) {$itset{$_} = 1;}
  $success2 = 1;
-  if(@itset < scalar(keys %sum)) {
+  if (@itset < scalar(keys %sum)) {
    $success2 = 0;
    $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in word_boundary.txt -- ";
    foreach(keys %sum) {
-      if(!$itset{$_}) {print "$_ ";}            
+      if (!$itset{$_}) {print "$_ ";}            
    }
    print "\n";
  }
-  if(@itset < scalar(keys %wb)) {
+  if (@itset < scalar(keys %wb)) {
    $success2 = 0;
    $exit = 1; print "--> ERROR: phones in word_boundary.txt but not in nonsilence.txt or silence.txt -- ";
    foreach(keys %wb) {
-      if(!$itset{$_}) {print "$_ ";}
+      if (!$itset{$_}) {print "$_ ";}
    }
    print "\n";
  }
@ -447,16 +471,16 @@ if(-s "$lang/phones/word_boundary.txt") {
  print "\n";
 }

-if(-s "$lang/phones/word_boundary.int") {
+if (-s "$lang/phones/word_boundary.int") {
  print "Checking word_boundary.int and disambig.int\n";
-  if(!open (W, "<$lang/phones/word_boundary.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n";}
+  if (!open (W, "<$lang/phones/word_boundary.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n";}
  while (<W>) {
    @A = split;
    if (@A != 2) { $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/word_boundary.int\n"; }
    $wbtype{$A[0]} = $A[1];
  }
  close(W);
-  if(!open (D, "<$lang/phones/disambig.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n";}
+  if (!open (D, "<$lang/phones/disambig.int")) {$exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n";}
  while (<D>) { 
    @A = split;
    if (@A != 1) { $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/disambig.int\n"; }
@ -477,7 +501,7 @@ if(-s "$lang/phones/word_boundary.int") {
      $sid ++;
    }
    $wordseq = $wordseq . "$sid 0";
-    $phoneseq = `echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if(NF > 2) {print \$3}}';`;
+    $phoneseq = `echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if (NF > 2) {print \$3}}';`;
    @phoneseq = split(" ", $phoneseq);
    $transition = { }; # empty assoc. array of allowed transitions between phone types.  1 means we count a word,
    # 0 means transition is allowed.  bos and eos are added as extra symbols here.
@ -529,6 +553,34 @@ if(-s "$lang/phones/word_boundary.int") {
 check_txt_int("$lang/oov", \%wsymtab); print "\n";


-if ($exit == 1) { print "--> ERROR\n"; exit 1;}
-else { print "--> SUCCESS\n"; exit 0; }
+# Check determinizability of G.fst
+if (-e "$lang/G.fst") {
+  $cmd = "fstdeterminize $lang/G.fst /dev/null";
+  $ret = system(". ./path.sh; $cmd");
+  if ($ret == 0) {
+    print "--> command $cmd succeeded\n";
+  } else {
+    print "--> ERROR: command $cmd failed\n";
+    $exit = 1;
+  }
+}
+
+if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") {
+  $cmd = "fstcompose $lang/L_disambig.fst $lang/G.fst | fstdeterminize > /dev/null";
+  $ret = system(". ./path.sh; $cmd");
+  if ($ret == 0) {
+    print "--> command $cmd succeeded\n";
+  } else {
+    print "--> ERROR: command $cmd failed\n";
+    $exit = 1;
+  }
+}
+
+
+if ($exit == 1) { print "--> ERROR (see error messages above)\n"; exit 1;}
+else {
+  if ($warning == 1) { print "--> WARNING (check output above for warnings)\n"; exit 0; }
+  else { print "--> SUCCESS\n"; exit 0; }
+}
+