sandbox/language_id: Adding scripts to produce the LRE07 General Closed-Set Language Recognition eval. Also fixing a minor bug in run_logistic_regression.sh when rebalancing priors.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@4005 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
David Snyder 2014-05-19 23:11:46 +00:00
Родитель e5c630fd75
Коммит 55c24a43aa
5 изменённых файлов: 1366 добавлений и 5 удалений

Просмотреть файл

@ -0,0 +1,92 @@
#!/bin/bash
# Copyright 2014 David Snyder
# Apache 2.0.
#
# Calculates the 3s, 10s, and 30s error rates and C_avgs
# on the LRE07 General Language Recognition closed-set
# using the logistic regression model passed in as an argument.
# Detailed results such as the probability of misses for individual
# languages are computed in local/lre07_results.
#
# Usage: local/lre07_logistic_regression_eval.sh <logistic-regression-model>

. cmd.sh
. path.sh
set -e

if [ $# -ne 1 ]; then
  echo "Usage: $0 <logistic-regression-model>" >&2
  exit 1
fi
model=$1

lre07dir=local/lre07_results
mkdir -p "$lre07dir"

# Compute the posterior probabilities for all durations (3s, 10s, and 30s),
# as well as the target and nontarget files.
test_ivectors="ark:ivector-normalize-length \
  scp:exp/ivectors_lre07/ivector.scp ark:- |"
logistic-regression-eval "$model" "$test_ivectors" \
  ark,t:exp/ivectors_lre07/posteriors

local/lre07_targets.pl exp/ivectors_lre07/posteriors data/lre07/utt2lang \
  exp/ivectors_train/languages.txt "$lre07dir"/targets \
  "$lre07dir"/nontargets >/dev/null

# Create the score (e.g., targets.scr) file.
local/score_lre07.v01d.pl -t "$lre07dir"/targets -n "$lre07dir"/nontargets

# Compute the posterior probabilities for each duration, as well as
# the target and nontarget files.
for dur in 3 10 30; do
  # Restrict the i-vectors and utt2lang to utterances of this nominal
  # duration (data/lre07/{3,10,30}sec list the matching utt-ids).
  utils/filter_scp.pl -f 0 data/lre07/${dur}sec \
    exp/ivectors_lre07/ivector.scp > \
    exp/ivectors_lre07/ivector_${dur}sec.scp
  test_ivectors="ark:ivector-normalize-length \
    scp:exp/ivectors_lre07/ivector_${dur}sec.scp ark:- |"
  logistic-regression-eval "$model" "$test_ivectors" \
    ark,t:exp/ivectors_lre07/posteriors_${dur}sec
  local/lre07_targets.pl exp/ivectors_lre07/posteriors_${dur}sec \
    <(utils/filter_scp.pl -f 0 data/lre07/${dur}sec data/lre07/utt2lang) \
    exp/ivectors_train/languages.txt \
    "$lre07dir"/targets_${dur}sec "$lre07dir"/nontargets_${dur}sec >/dev/null
  local/score_lre07.v01d.pl -t "$lre07dir"/targets_${dur}sec -n \
    "$lre07dir"/nontargets_${dur}sec >/dev/null
done

# Print a small table: overall ("avg") plus per-duration results.
printf '% 15s' 'Duration (sec):'
for dur in avg 3 10 30; do
  printf '% 7s' "$dur"
done
echo

printf '% 15s' 'ER (%):'
# Get the overall classification error rate and then the individual
# duration error rates (compute-wer's %WER field, column 2).
er=$(compute-wer --text ark:<(lid/remove_dialect.pl data/lre07/utt2lang) \
  ark:exp/ivectors_lre07/output 2>/dev/null | grep "WER" | awk '{print $2 }')
printf '% 7.2f' "$er"
for dur in 3 10 30; do
  er=$(compute-wer --text ark:<(utils/filter_scp.pl -f 0 \
    data/lre07/${dur}sec data/lre07/utt2lang | lid/remove_dialect.pl -) \
    ark:<(utils/filter_scp.pl -f 0 data/lre07/${dur}sec \
    exp/ivectors_lre07/output) \
    2>/dev/null | grep "WER" | awk '{print $2 }')
  printf '% 7.2f' "$er"
done
echo

printf '% 15s' 'C_avg (%):'
# Get the overall C_avg and then C_avgs for the individual durations.
# score_lre07.v01d.pl writes C_avg in column 4 of the last line of the
# .scr file; scale it to a percentage.
cavg=$(tail -n 1 "$lre07dir"/targets.scr \
  | awk '{print 100*$4 }')
printf '% 7.2f' "$cavg"
for dur in 3 10 30; do
  cavg=$(tail -n 1 "$lre07dir"/targets_${dur}sec.scr \
    | awk '{print 100.0*$4 }')
  printf '% 7.2f' "$cavg"
done
echo

# Duration (sec):    avg      3     10     30
#         ER (%):  33.04  53.21  29.55  16.37
#      C_avg (%):  17.65  29.53  15.64   7.79

109
egs/lre/v1/local/lre07_targets.pl Executable file
Просмотреть файл

@ -0,0 +1,109 @@
#! /usr/bin/perl
#
# Copyright 2014 David Snyder
# Apache 2.0.
#
# Creates the target and nontarget files used by score_lre07.v01d.pl for
# NIST LRE 2007 General Language Recognition closed-set evaluation.
# See http://www.itl.nist.gov/iad/mig//tests/lre/2007/LRE07EvalPlan-v8b.pdf
# for more details on the evaluation.
#
# Input posteriors are Kaldi text-format rows: "utt-id  [ p0 p1 ... ]",
# where p_i is the log-probability of the language with index i in
# languages.txt.  Dialects are matched to their base language by substring
# (e.g. actual language "english.american" matches model class "english").

if (@ARGV != 5) {
  print STDERR "Usage: $0 <path-to-posteriors> <path-to-utt2lang> \
<path-to-languages.txt> <path-to-targets-output> \
<path-to-nontargets-output>\n";
  exit(1);
}

($posts, $utt2lang, $languages, $targets, $nontargets) = @ARGV;

%lang_to_idx = ();
%idx_to_lang = ();
%utt_to_lang = ();
# Sentinel for "no decision yet"; sorts after any real language name.
$oos_lang = "zzz";

# Read the language <-> posterior-column-index mapping from training.
open(LANG2IDX, "<", $languages) || die "Cannot open $languages file";
while (<LANG2IDX>) {
  chomp;
  @toks = split(" ", $_);
  $lang = $toks[0];
  $idx = $toks[1];
  $lang_to_idx{$lang} = $idx;
  $idx_to_lang{$idx} = $lang;
}
close(LANG2IDX) || die;

# Read the reference language (possibly with dialect suffix) per utterance.
open(UTT2LANG, "<", $utt2lang) || die "Cannot open $utt2lang file";
while (<UTT2LANG>) {
  chomp;
  @toks = split(" ", $_);
  $utt = $toks[0];
  $lang = $toks[1];
  $utt_to_lang{$utt} = $lang;
}
close(UTT2LANG) || die;

open(POSTS, "<", $posts) || die "Cannot open $posts file";
open(TARGETS, ">", $targets) || die "Cannot open $targets file";
open(NONTARGETS, ">", $nontargets) || die "Cannot open $nontargets file";
while ($line = <POSTS>) {
  chomp($line);
  $line =~ s/[\[\]]//g;  # strip the Kaldi vector brackets
  @toks = split(" ", $line);
  $utt = $toks[0];
  $actual_lang = $utt_to_lang{$utt};
  $size = $#toks + 1;
  $max_lang = $oos_lang;
  $max_log_prob = -9**9**9; # -inf
  $target_prob = 0;
  # Handle target: find the most probable language and the probability
  # the model assigned to the utterance's actual language.
  for ($i = 1; $i < $size; $i++) {
    if ($max_log_prob < $toks[$i]) {
      $max_log_prob = $toks[$i];
      $max_lang = $idx_to_lang{$i-1};
    }
    if (index($actual_lang, $idx_to_lang{$i-1}) != -1
        || $actual_lang eq $idx_to_lang{$i-1}) {
      $target_prob = exp($toks[$i]);
    }
  }
  # Reduce a dialect label like "english.american" to its base language.
  if (index($actual_lang, ".") != -1) {
    @lang_parts = split("[.]", $actual_lang);
    $lang = $lang_parts[0];
  } else {
    $lang = $actual_lang;
  }
  # Only the 14 LRE07 closed-set languages participate in scoring.
  if ($lang =~ /(arabic|bengali|farsi|german|japanese|korean|russian|tamil|thai|vietnamese|chinese|english|hindustani|spanish)/i) {
    # 't' if the system's top choice matches the actual language,
    # 'f' otherwise; $target_prob is the score for the true language.
    if (index($actual_lang, $max_lang) != -1 || $actual_lang eq $max_lang) {
      print TARGETS "general_lr $lang closed_set $utt t $target_prob "
        ."$actual_lang\n";
    } else {
      print TARGETS "general_lr $lang closed_set $utt f $target_prob "
        ."$actual_lang\n";
    }
  }
  # Handle nontarget: one trial per in-set language other than the
  # actual language of the utterance.
  for ($i = 1; $i < $size; $i++) {
    $nontarget_lang = $idx_to_lang{$i-1};
    next if (index($actual_lang, $nontarget_lang) != -1
      || $actual_lang eq $nontarget_lang);
    # if the nontarget lang is most probable
    if ($nontarget_lang =~ /(arabic|bengali|farsi|german|japanese|korean|russian|tamil|thai|vietnamese|chinese|english|hindustani|spanish)/i) {
      $prob = exp($toks[$i]);
      if (index($max_lang, $nontarget_lang) != -1
          || $max_lang eq $nontarget_lang) {
        print NONTARGETS "general_lr $nontarget_lang closed_set $utt t "
          ."$prob $actual_lang\n";
      } else {
        print NONTARGETS "general_lr $nontarget_lang closed_set $utt f "
          ."$prob $actual_lang\n";
      }
    }
  }
}
close(POSTS) || die;
close(TARGETS) || die;
close(NONTARGETS) || die;

Просмотреть файл

@ -36,14 +36,19 @@ open(WAV, ">$dir/wav.scp") || die "Failed opening output file $out_dir/wav.scp";
open(UTT2SPK, ">$dir/utt2spk") || die "Failed opening output file $dir/utt2spk";
open(SPK2UTT, ">$dir/spk2utt") || die "Failed opening output file $dir/spk2utt";
open(UTT2LANG, ">$dir/utt2lang") || die "Failed opening output file $dir/utt2lang";
open(DUR3, ">$dir/3sec") || die "Failed opening output file $dir/3sec";
open(DUR10, ">$dir/10sec") || die "Failed opening output file $dir/10sec";
open(DUR30, ">$dir/30sec") || die "Failed opening output file $dir/30sec";
my $key_str = `wget -qO- "http://www.itl.nist.gov/iad/mig/tests/lang/2007/lid07key_v5.txt"`;
@key_lines = split("\n",$key_str);
%utt2lang = ();
%utt2dur = ();
foreach (@key_lines) {
@words = split(' ', $_);
if (index($words[0], "#") == -1) {
$utt2lang{$words[0]} = $words[1];
$utt2dur{$words[0]} = $words[5];
}
}
@ -55,11 +60,23 @@ foreach (sort keys(%wav)) {
print UTT2SPK "$uttId $uttId\n";
print SPK2UTT "$uttId $uttId\n";
print UTT2LANG "$uttId $utt2lang{$uttId}\n";
if ($utt2dur{$uttId} == 3) {
print DUR3 "$uttId\n";
} elsif ($utt2dur{$uttId} == 10) {
print DUR10 "$uttId\n";
} elsif ($utt2dur{$uttId} == 30) {
print DUR30 "$uttId\n";
} else {
die "Invalid nominal duration in test segment";
}
}
close(WAV) || die;
close(UTT2SPK) || die;
close(SPK2UTT) || die;
close(UTT2LANG) || die;
close(DUR3) || die;
close(DUR10) || die;
close(DUR30) || die;
close(WAVLIST) || die;
system("rm -r $dir/tmp");

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -31,7 +31,8 @@ classes="ark:lid/remove_dialect.pl data/train/utt2lang \
# Create priors to rebalance the model. The following script rebalances
# the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
lid/balance_priors_to_test.pl \
<(lid/remove_dialect.pl data/train/utt2lang) \
<(lid/remove_dialect.pl <(utils/filter_scp.pl -f 0 \
exp/ivectors_train/ivector.scp data/train/utt2lang)) \
<(lid/remove_dialect.pl data/lre07/utt2lang) \
exp/ivectors_train/languages.txt \
exp/ivectors_train/priors.vec
@ -66,8 +67,8 @@ cat exp/ivectors_train/posteriors | \
compute-wer --mode=present --text ark:<(lid/remove_dialect.pl data/train/utt2lang) \
ark:exp/ivectors_train/output
# %WER 4.19 [ 3000 / 71668, 0 ins, 0 del, 3000 sub ] [PARTIAL]
# %SER 4.19 [ 3000 / 71668 ]
# %WER 4.73 [ 3389 / 71668, 0 ins, 0 del, 3389 sub ] [PARTIAL]
# %SER 4.73 [ 3389 / 71668 ]
# Scored 71668 sentences, 16 not present in hyp.
logistic-regression-eval $model_rebalanced \
'ark:ivector-normalize-length scp:exp/ivectors_lre07/ivector.scp ark:- |' ark,t:- | \
@ -78,7 +79,13 @@ logistic-regression-eval $model_rebalanced \
compute-wer --text ark:<(lid/remove_dialect.pl data/lre07/utt2lang) \
ark:exp/ivectors_lre07/output
# %WER 32.58 [ 2452 / 7527, 0 ins, 0 del, 2452 sub ]
# %SER 32.58 [ 2452 / 7527 ]
# %WER 33.04 [ 2487 / 7527, 0 ins, 0 del, 2487 sub ]
# %SER 33.04 [ 2487 / 7527 ]
# Scored 7527 sentences, 0 not present in hyp.
# General LR closed-set eval.
local/lre07_logistic_regression_eval.sh $model_rebalanced
#Duration (sec): avg 3 10 30
# ER (%): 33.04 53.21 29.55 16.37
# C_avg (%): 17.65 29.53 15.64 7.79