trunk: adding some scripts that were skipped while merging sandbox/language_id.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3935 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-04-21 01:56:39 +00:00
Parent 95120e8bb2
Commit d68f1be037
3 changed files with 97 additions and 9 deletions

View File

@@ -0,0 +1,69 @@
#!/usr/bin/perl -w
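# Usage: balance_priors_to_test.pl <train-utt2lang> <test-utt2lang> \
#          <languages-file> <priors-out>
# Writes a priors vector that rebalances the training-set language
# distribution toward the test set: for each language in <languages-file>,
# prior(lang) = count(lang in test) / count(lang in train).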
my ($train_file, $test_file, $lang_file, $priors_file) = @ARGV;
open(UTT2LANG_TRAIN, "<$train_file") or die "no utt2lang training file";
%train_count = ();
$train_tot = 0;
while (<UTT2LANG_TRAIN>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[1];
  if (not exists($train_count{$lang})) {
    $train_count{$lang} = 1;
  } else {
    $train_count{$lang} += 1;
  }
  $train_tot += 1;
}
open(UTT2LANG_TEST, "<$test_file") or die "no utt2lang test file";
%test_count = ();
$test_tot = 0;
while (<UTT2LANG_TEST>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[1];
  if (not exists($test_count{$lang})) {
    $test_count{$lang} = 1;
  } else {
    $test_count{$lang} += 1;
  }
  $test_tot += 1;
}
foreach my $key (keys %train_count) {
  if (not exists($test_count{$key})) {
    $test_count{$key} = 0;
  }
}
# load languages file
open(LANGUAGES, "<$lang_file") or die "no languages file";
@idx_to_lang = ();
$largest_idx = 0;
while (<LANGUAGES>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[0];
  $idx = $words[1];
  $idx_to_lang[$idx + 0] = $lang;
  if ($idx > $largest_idx) {
    $largest_idx = $idx;
  }
}
$priors = " [ ";
foreach $lang (@idx_to_lang) {
  # Guard against a language that appears in languages.txt but not in the
  # training utt2lang; otherwise the division below is undefined.
  defined($train_count{$lang}) or die "language $lang has no training examples";
  $ratio = (1.0 * $test_count{$lang}) / $train_count{$lang};
  $priors .= "$ratio ";
}
$priors .= " ]";
open(PRIORS, ">$priors_file") or die "could not open $priors_file for writing";
print PRIORS $priors;
close(PRIORS);

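A quick way to sanity-check the priors script (a sketch only: the toy utt2lang contents are made up, and the lid/ install path is taken from the run script below):

# toy data: 2 english + 1 french utterances in training, 1 of each in test
printf 'utt1 english\nutt2 english\nutt3 french\n' > utt2lang_train
printf 'utt4 english\nutt5 french\n' > utt2lang_test
printf 'english 0\nfrench 1\n' > languages.txt
lid/balance_priors_to_test.pl utt2lang_train utt2lang_test \
  languages.txt priors.vec
cat priors.vec   # expect " [ 0.5 1  ]" (test/train ratio per language)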
View File

@@ -0,0 +1,19 @@
#!/usr/bin/perl
# Removes the dialect part of each entry in an utt2lang file.
# For example, "<utt> chinese.wu" is converted to "<utt> chinese".
my ($utt2lang_file) = @ARGV;
open(UTT2LANG, "<$utt2lang_file") or die "no utt2lang file";
$utt2lang_short = "";
while (<UTT2LANG>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $utt = $words[0];
  $lang_long = $words[1];
  @lang_parts = split('[.]', $lang_long);
  # The first dot-separated part is the actual language; the remaining
  # parts are dialects or subcategories.
  $lang = $lang_parts[0];
  $utt2lang_short .= $utt . " " . $lang . "\n";
}
print $utt2lang_short;

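A minimal usage sketch for the dialect-stripping script (toy input; the lid/ path again comes from the run script below):

printf 'utt1 chinese.wu\nutt2 english\n' > utt2lang_example
lid/remove_dialect.pl utt2lang_example
# prints:
# utt1 chinese
# utt2 english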
View File

@@ -11,28 +11,28 @@ set -e
config=conf/logistic-regression.conf
-awk '{print $2}' <(utils/remove_dialect.pl data/train/utt2lang) | sort -u | \
+awk '{print $2}' <(lid/remove_dialect.pl data/train/utt2lang) | sort -u | \
awk '{print $1, NR-1}' > exp/ivectors_train/languages.txt
model=exp/ivectors_train/logistic_regression
model_rebalanced=exp/ivectors_train/logistic_regression_rebalanced
train_ivectors="ark:ivector-normalize-length \
scp:exp/ivectors_train/ivector.scp ark:- |";
classes="ark:utils/remove_dialect.pl data/train/utt2lang \
classes="ark:lid/remove_dialect.pl data/train/utt2lang \
| utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt - |"
# An alternative prior.
#utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt \
-# <(utils/remove_dialect.pl data/train/utt2lang) | \
+# <(lid/remove_dialect.pl data/train/utt2lang) | \
# awk '{print $2}' | sort -n | uniq -c | \
# awk 'BEGIN{printf(" [ ");} {printf("%s ", 1.0/$1); } END{print(" ]"); }' \
# >exp/ivectors_train/inv_priors.vec
# Create priors to rebalance the model. The following script computes
# per-language priors as count(lang_test) / count(lang_train).
-utils/balance_priors_to_test.pl \
-  <(utils/remove_dialect.pl data/train/utt2lang) \
-  <(utils/remove_dialect.pl data/lre07/utt2lang) \
+lid/balance_priors_to_test.pl \
+  <(lid/remove_dialect.pl data/train/utt2lang) \
+  <(lid/remove_dialect.pl data/lre07/utt2lang) \
exp/ivectors_train/languages.txt \
exp/ivectors_train/priors.vec
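After this step the rebalanced priors can be inspected directly; the values below only illustrate the format and are not real LRE07 numbers:

cat exp/ivectors_train/priors.vec
# e.g. " [ 0.25 0.5 ... ]": one count(test)/count(train) ratio per language,
# ordered by the indices in exp/ivectors_train/languages.txt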
@@ -44,7 +44,7 @@ logistic-regression-train --config=$config "$train_ivectors" \
logistic-regression-copy --scale-priors=exp/ivectors_train/priors.vec \
$model $model_rebalanced
trials="utils/remove_dialect.pl data/train/utt2lang \
trials="lid/remove_dialect.pl data/train/utt2lang \
| utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt -|"
scores="|utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt \
>exp/ivectors_train/train_scores"
@@ -63,7 +63,7 @@ cat exp/ivectors_train/posteriors | \
# note: we treat each language label as a one-word sentence, so the WER/SER
# reported by compute-wer is exactly the language identification error rate.
-compute-wer --mode=present --text ark:<(utils/remove_dialect.pl data/train/utt2lang) \
+compute-wer --mode=present --text ark:<(lid/remove_dialect.pl data/train/utt2lang) \
ark:exp/ivectors_train/output
# %WER 4.19 [ 3000 / 71668, 0 ins, 0 del, 3000 sub ] [PARTIAL]
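As a worked check of that figure (plain arithmetic on the numbers above): each reference "sentence" is a single language label, so every error is a substitution and %WER is simply errors over utterances:

# 3000 sub / 71668 utterances = 0.0419, i.e. %WER 4.19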
@@ -76,7 +76,7 @@ logistic-regression-eval $model_rebalanced \
print $1, (argmax - 3); }' | \
utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/ivectors_lre07/output
-compute-wer --text ark:<(utils/remove_dialect.pl data/lre07/utt2lang) \
+compute-wer --text ark:<(lid/remove_dialect.pl data/lre07/utt2lang) \
ark:exp/ivectors_lre07/output
# %WER 32.58 [ 2452 / 7527, 0 ins, 0 del, 2452 sub ]
# %SER 32.58 [ 2452 / 7527 ]