sandbox/language_id: Adding scripts to produce the LRE07 General Closed-Set Language Recognition eval. Also fixing a minor bug in run_logistic_regression.sh when rebalancing priors.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@4005 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
David Snyder 2014-05-19 23:11:46 +00:00
Родитель e5c630fd75
Коммит 55c24a43aa
5 изменённых файлов: 1366 добавлений и 5 удалений

Просмотреть файл

@ -0,0 +1,92 @@
#!/bin/bash
# Copyright 2014 David Snyder
# Apache 2.0.
#
# Calculates the 3s, 10s, and 30s error rates and C_avgs
# on the LRE07 General Language Recognition closed-set
# using the logistic regression model passed in as an argument.
# Detailed results such as the probability of misses for individual
# languages are computed in local/lre07_results.
#
# Usage: local/lre07_logistic_regression_eval.sh <logistic-regression-model>

. cmd.sh
. path.sh
set -e

if [ $# -ne 1 ]; then
  echo "Usage: $0 <logistic-regression-model>" >&2
  exit 1
fi
model=$1

lre07dir=local/lre07_results
mkdir -p "$lre07dir"

# Compute the posterior probabilities for all durations (3s, 10s, and 30s),
# as well as the target and nontarget files.
test_ivectors="ark:ivector-normalize-length \
  scp:exp/ivectors_lre07/ivector.scp ark:- |"
logistic-regression-eval "$model" "$test_ivectors" \
  ark,t:exp/ivectors_lre07/posteriors

local/lre07_targets.pl exp/ivectors_lre07/posteriors data/lre07/utt2lang \
  exp/ivectors_train/languages.txt "$lre07dir"/targets \
  "$lre07dir"/nontargets >/dev/null

# Create the score (e.g., targets.scr) file.
local/score_lre07.v01d.pl -t "$lre07dir"/targets -n "$lre07dir"/nontargets

# Compute the posterior probabilities for each duration, as well as
# the target and nontarget files.
for dur in 3 10 30; do
  # Restrict the i-vectors and utt2lang to utterances of this nominal
  # duration (data/lre07/{3,10,30}sec list the matching utt-ids).
  utils/filter_scp.pl -f 0 data/lre07/${dur}sec \
    exp/ivectors_lre07/ivector.scp > \
    exp/ivectors_lre07/ivector_${dur}sec.scp
  test_ivectors="ark:ivector-normalize-length \
    scp:exp/ivectors_lre07/ivector_${dur}sec.scp ark:- |"
  logistic-regression-eval "$model" "$test_ivectors" \
    ark,t:exp/ivectors_lre07/posteriors_${dur}sec
  local/lre07_targets.pl exp/ivectors_lre07/posteriors_${dur}sec \
    <(utils/filter_scp.pl -f 0 data/lre07/${dur}sec data/lre07/utt2lang) \
    exp/ivectors_train/languages.txt \
    "$lre07dir"/targets_${dur}sec "$lre07dir"/nontargets_${dur}sec >/dev/null
  local/score_lre07.v01d.pl -t "$lre07dir"/targets_${dur}sec -n \
    "$lre07dir"/nontargets_${dur}sec >/dev/null
done

# Print a small table: overall ("avg") plus per-duration results.
printf '% 15s' 'Duration (sec):'
for dur in avg 3 10 30; do
  printf '% 7s' "$dur"
done
echo

printf '% 15s' 'ER (%):'
# Get the overall classification error rate and then the individual
# duration error rates (compute-wer's %WER field, column 2).
er=$(compute-wer --text ark:<(lid/remove_dialect.pl data/lre07/utt2lang) \
  ark:exp/ivectors_lre07/output 2>/dev/null | grep "WER" | awk '{print $2 }')
printf '% 7.2f' "$er"
for dur in 3 10 30; do
  er=$(compute-wer --text ark:<(utils/filter_scp.pl -f 0 \
    data/lre07/${dur}sec data/lre07/utt2lang | lid/remove_dialect.pl -) \
    ark:<(utils/filter_scp.pl -f 0 data/lre07/${dur}sec \
    exp/ivectors_lre07/output) \
    2>/dev/null | grep "WER" | awk '{print $2 }')
  printf '% 7.2f' "$er"
done
echo

printf '% 15s' 'C_avg (%):'
# Get the overall C_avg and then C_avgs for the individual durations.
# score_lre07.v01d.pl writes C_avg in column 4 of the last line of the
# .scr file; scale it to a percentage.
cavg=$(tail -n 1 "$lre07dir"/targets.scr \
  | awk '{print 100*$4 }')
printf '% 7.2f' "$cavg"
for dur in 3 10 30; do
  cavg=$(tail -n 1 "$lre07dir"/targets_${dur}sec.scr \
    | awk '{print 100.0*$4 }')
  printf '% 7.2f' "$cavg"
done
echo

# Duration (sec):    avg      3     10     30
#         ER (%):  33.04  53.21  29.55  16.37
#      C_avg (%):  17.65  29.53  15.64   7.79

109
egs/lre/v1/local/lre07_targets.pl Executable file
Просмотреть файл

@ -0,0 +1,109 @@
#! /usr/bin/perl
#
# Copyright 2014 David Snyder
# Apache 2.0.
#
# Creates the target and nontarget files used by score_lre07.v01d.pl for
# NIST LRE 2007 General Language Recognition closed-set evaluation.
# See http://www.itl.nist.gov/iad/mig//tests/lre/2007/LRE07EvalPlan-v8b.pdf
# for more details on the evaluation.
#
# Input posteriors are Kaldi text-format rows: "utt-id  [ p0 p1 ... ]",
# where p_i is the log-probability of the language with index i in
# languages.txt.  Dialects are matched to their base language by substring
# (e.g. actual language "english.american" matches model class "english").

if (@ARGV != 5) {
  print STDERR "Usage: $0 <path-to-posteriors> <path-to-utt2lang> \
<path-to-languages.txt> <path-to-targets-output> \
<path-to-nontargets-output>\n";
  exit(1);
}

($posts, $utt2lang, $languages, $targets, $nontargets) = @ARGV;

%lang_to_idx = ();
%idx_to_lang = ();
%utt_to_lang = ();
# Sentinel for "no decision yet"; sorts after any real language name.
$oos_lang = "zzz";

# Read the language <-> posterior-column-index mapping from training.
open(LANG2IDX, "<", $languages) || die "Cannot open $languages file";
while (<LANG2IDX>) {
  chomp;
  @toks = split(" ", $_);
  $lang = $toks[0];
  $idx = $toks[1];
  $lang_to_idx{$lang} = $idx;
  $idx_to_lang{$idx} = $lang;
}
close(LANG2IDX) || die;

# Read the reference language (possibly with dialect suffix) per utterance.
open(UTT2LANG, "<", $utt2lang) || die "Cannot open $utt2lang file";
while (<UTT2LANG>) {
  chomp;
  @toks = split(" ", $_);
  $utt = $toks[0];
  $lang = $toks[1];
  $utt_to_lang{$utt} = $lang;
}
close(UTT2LANG) || die;

open(POSTS, "<", $posts) || die "Cannot open $posts file";
open(TARGETS, ">", $targets) || die "Cannot open $targets file";
open(NONTARGETS, ">", $nontargets) || die "Cannot open $nontargets file";
while ($line = <POSTS>) {
  chomp($line);
  $line =~ s/[\[\]]//g;  # strip the Kaldi vector brackets
  @toks = split(" ", $line);
  $utt = $toks[0];
  $actual_lang = $utt_to_lang{$utt};
  $size = $#toks + 1;
  $max_lang = $oos_lang;
  $max_log_prob = -9**9**9; # -inf
  $target_prob = 0;
  # Handle target: find the most probable language and the probability
  # the model assigned to the utterance's actual language.
  for ($i = 1; $i < $size; $i++) {
    if ($max_log_prob < $toks[$i]) {
      $max_log_prob = $toks[$i];
      $max_lang = $idx_to_lang{$i-1};
    }
    if (index($actual_lang, $idx_to_lang{$i-1}) != -1
        || $actual_lang eq $idx_to_lang{$i-1}) {
      $target_prob = exp($toks[$i]);
    }
  }
  # Reduce a dialect label like "english.american" to its base language.
  if (index($actual_lang, ".") != -1) {
    @lang_parts = split("[.]", $actual_lang);
    $lang = $lang_parts[0];
  } else {
    $lang = $actual_lang;
  }
  # Only the 14 LRE07 closed-set languages participate in scoring.
  if ($lang =~ /(arabic|bengali|farsi|german|japanese|korean|russian|tamil|thai|vietnamese|chinese|english|hindustani|spanish)/i) {
    # 't' if the system's top choice matches the actual language,
    # 'f' otherwise; $target_prob is the score for the true language.
    if (index($actual_lang, $max_lang) != -1 || $actual_lang eq $max_lang) {
      print TARGETS "general_lr $lang closed_set $utt t $target_prob "
        ."$actual_lang\n";
    } else {
      print TARGETS "general_lr $lang closed_set $utt f $target_prob "
        ."$actual_lang\n";
    }
  }
  # Handle nontarget: one trial per in-set language other than the
  # actual language of the utterance.
  for ($i = 1; $i < $size; $i++) {
    $nontarget_lang = $idx_to_lang{$i-1};
    next if (index($actual_lang, $nontarget_lang) != -1
      || $actual_lang eq $nontarget_lang);
    # if the nontarget lang is most probable
    if ($nontarget_lang =~ /(arabic|bengali|farsi|german|japanese|korean|russian|tamil|thai|vietnamese|chinese|english|hindustani|spanish)/i) {
      $prob = exp($toks[$i]);
      if (index($max_lang, $nontarget_lang) != -1
          || $max_lang eq $nontarget_lang) {
        print NONTARGETS "general_lr $nontarget_lang closed_set $utt t "
          ."$prob $actual_lang\n";
      } else {
        print NONTARGETS "general_lr $nontarget_lang closed_set $utt f "
          ."$prob $actual_lang\n";
      }
    }
  }
}
close(POSTS) || die;
close(TARGETS) || die;
close(NONTARGETS) || die;

Просмотреть файл

@ -36,14 +36,19 @@ open(WAV, ">$dir/wav.scp") || die "Failed opening output file $out_dir/wav.scp";
open(UTT2SPK, ">$dir/utt2spk") || die "Failed opening output file $dir/utt2spk";
open(SPK2UTT, ">$dir/spk2utt") || die "Failed opening output file $dir/spk2utt";
open(UTT2LANG, ">$dir/utt2lang") || die "Failed opening output file $dir/utt2lang";
open(DUR3, ">$dir/3sec") || die "Failed opening output file $dir/3sec";
open(DUR10, ">$dir/10sec") || die "Failed opening output file $dir/10sec";
open(DUR30, ">$dir/30sec") || die "Failed opening output file $dir/30sec";
my $key_str = `wget -qO- "http://www.itl.nist.gov/iad/mig/tests/lang/2007/lid07key_v5.txt"`;
@key_lines = split("\n",$key_str);
%utt2lang = ();
%utt2dur = ();
foreach (@key_lines) {
@words = split(' ', $_);
if (index($words[0], "#") == -1) {
$utt2lang{$words[0]} = $words[1];
$utt2dur{$words[0]} = $words[5];
}
}
@ -55,11 +60,23 @@ foreach (sort keys(%wav)) {
print UTT2SPK "$uttId $uttId\n";
print SPK2UTT "$uttId $uttId\n";
print UTT2LANG "$uttId $utt2lang{$uttId}\n";
if ($utt2dur{$uttId} == 3) {
print DUR3 "$uttId\n";
} elsif ($utt2dur{$uttId} == 10) {
print DUR10 "$uttId\n";
} elsif ($utt2dur{$uttId} == 30) {
print DUR30 "$uttId\n";
} else {
die "Invalid nominal duration in test segment";
}
}
close(WAV) || die;
close(UTT2SPK) || die;
close(SPK2UTT) || die;
close(UTT2LANG) || die;
close(DUR3) || die;
close(DUR10) || die;
close(DUR30) || die;
close(WAVLIST) || die;
system("rm -r $dir/tmp");

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -31,7 +31,8 @@ classes="ark:lid/remove_dialect.pl data/train/utt2lang \
# Create priors to rebalance the model. The following script rebalances
# the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
lid/balance_priors_to_test.pl \
<(lid/remove_dialect.pl data/train/utt2lang) \
<(lid/remove_dialect.pl <(utils/filter_scp.pl -f 0 \
exp/ivectors_train/ivector.scp data/train/utt2lang)) \
<(lid/remove_dialect.pl data/lre07/utt2lang) \
exp/ivectors_train/languages.txt \
exp/ivectors_train/priors.vec
@ -66,8 +67,8 @@ cat exp/ivectors_train/posteriors | \
compute-wer --mode=present --text ark:<(lid/remove_dialect.pl data/train/utt2lang) \
ark:exp/ivectors_train/output
# %WER 4.19 [ 3000 / 71668, 0 ins, 0 del, 3000 sub ] [PARTIAL]
# %SER 4.19 [ 3000 / 71668 ]
# %WER 4.73 [ 3389 / 71668, 0 ins, 0 del, 3389 sub ] [PARTIAL]
# %SER 4.73 [ 3389 / 71668 ]
# Scored 71668 sentences, 16 not present in hyp.
logistic-regression-eval $model_rebalanced \
'ark:ivector-normalize-length scp:exp/ivectors_lre07/ivector.scp ark:- |' ark,t:- | \
@ -78,7 +79,13 @@ logistic-regression-eval $model_rebalanced \
compute-wer --text ark:<(lid/remove_dialect.pl data/lre07/utt2lang) \
ark:exp/ivectors_lre07/output
# %WER 32.58 [ 2452 / 7527, 0 ins, 0 del, 2452 sub ]
# %SER 32.58 [ 2452 / 7527 ]
# %WER 33.04 [ 2487 / 7527, 0 ins, 0 del, 2487 sub ]
# %SER 33.04 [ 2487 / 7527 ]
# Scored 7527 sentences, 0 not present in hyp.
# General LR closed-set eval.
local/lre07_logistic_regression_eval.sh $model_rebalanced
#Duration (sec): avg 3 10 30
# ER (%): 33.04 53.21 29.55 16.37
# C_avg (%): 17.65 29.53 15.64 7.79