trunk: adding some scripts that were skipped while merging sandbox/language_id.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3935 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-04-21 01:56:39 +00:00
Parent 95120e8bb2
Commit d68f1be037
3 changed files with 97 additions and 9 deletions

View File

@@ -0,0 +1,69 @@
#!/usr/bin/perl -w
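# Usage: balance_priors_to_test.pl <train-utt2lang> <test-utt2lang> \
#          <languages-file> <priors-out>
# Writes a priors vector that rebalances the training-set language
# distribution toward the test set: for each language in <languages-file>,
# prior(lang) = count(lang in test) / count(lang in train).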
my ($train_file, $test_file, $lang_file, $priors_file) = @ARGV;
open(UTT2LANG_TRAIN, "<$train_file") or die "no utt2lang training file";
%train_count = ();
$train_tot = 0;
while (<UTT2LANG_TRAIN>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[1];
  if (not exists($train_count{$lang})) {
    $train_count{$lang} = 1;
  } else {
    $train_count{$lang} += 1;
  }
  $train_tot += 1;
}
open(UTT2LANG_TEST, "<$test_file") or die "no utt2lang test file";
%test_count = ();
$test_tot = 0;
while (<UTT2LANG_TEST>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[1];
  if (not exists($test_count{$lang})) {
    $test_count{$lang} = 1;
  } else {
    $test_count{$lang} += 1;
  }
  $test_tot += 1;
}
foreach my $key (keys %train_count) {
  if (not exists($test_count{$key})) {
    $test_count{$key} = 0;
  }
}
# load languages file
open(LANGUAGES, "<$lang_file") or die "no languages file";
@idx_to_lang = ();
$largest_idx = 0;
while (<LANGUAGES>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[0];
  $idx = $words[1];
  $idx_to_lang[$idx + 0] = $lang;
  if ($idx > $largest_idx) {
    $largest_idx = $idx;
  }
}
$priors = " [ ";
foreach $lang (@idx_to_lang) {
  # Guard against a language that appears in languages.txt but not in the
  # training utt2lang; otherwise the division below is undefined.
  defined($train_count{$lang}) or die "language $lang has no training examples";
  $ratio = (1.0 * $test_count{$lang}) / $train_count{$lang};
  $priors .= "$ratio ";
}
$priors .= " ]";
open(PRIORS, ">$priors_file") or die "could not open $priors_file for writing";
print PRIORS $priors;
close(PRIORS);

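A quick way to sanity-check the priors script (a sketch only: the toy utt2lang contents are made up, and the lid/ install path is taken from the run script below):

# toy data: 2 english + 1 french utterances in training, 1 of each in test
printf 'utt1 english\nutt2 english\nutt3 french\n' > utt2lang_train
printf 'utt4 english\nutt5 french\n' > utt2lang_test
printf 'english 0\nfrench 1\n' > languages.txt
lid/balance_priors_to_test.pl utt2lang_train utt2lang_test \
  languages.txt priors.vec
cat priors.vec   # expect " [ 0.5 1  ]" (test/train ratio per language)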
View File

@@ -0,0 +1,19 @@
#!/usr/bin/perl
# Removes the dialect part of each entry in an utt2lang file.
# For example, "<utt> chinese.wu" is converted to "<utt> chinese".
my ($utt2lang_file) = @ARGV;
open(UTT2LANG, "<$utt2lang_file") or die "no utt2lang file";
$utt2lang_short = "";
while (<UTT2LANG>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $utt = $words[0];
  $lang_long = $words[1];
  @lang_parts = split('[.]', $lang_long);
  # The first dot-separated part is the actual language; the remaining
  # parts are dialects or subcategories.
  $lang = $lang_parts[0];
  $utt2lang_short .= $utt . " " . $lang . "\n";
}
print $utt2lang_short;

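A minimal usage sketch for the dialect-stripping script (toy input; the lid/ path again comes from the run script below):

printf 'utt1 chinese.wu\nutt2 english\n' > utt2lang_example
lid/remove_dialect.pl utt2lang_example
# prints:
# utt1 chinese
# utt2 english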
View File

@@ -11,28 +11,28 @@ set -e
config=conf/logistic-regression.conf
-awk '{print $2}' <(utils/remove_dialect.pl data/train/utt2lang) | sort -u | \
+awk '{print $2}' <(lid/remove_dialect.pl data/train/utt2lang) | sort -u | \
awk '{print $1, NR-1}' > exp/ivectors_train/languages.txt
model=exp/ivectors_train/logistic_regression
model_rebalanced=exp/ivectors_train/logistic_regression_rebalanced
train_ivectors="ark:ivector-normalize-length \
scp:exp/ivectors_train/ivector.scp ark:- |";
classes="ark:utils/remove_dialect.pl data/train/utt2lang \
classes="ark:lid/remove_dialect.pl data/train/utt2lang \
| utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt - |"
# An alternative prior.
#utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt \
-# <(utils/remove_dialect.pl data/train/utt2lang) | \
+# <(lid/remove_dialect.pl data/train/utt2lang) | \
# awk '{print $2}' | sort -n | uniq -c | \
# awk 'BEGIN{printf(" [ ");} {printf("%s ", 1.0/$1); } END{print(" ]"); }' \
# >exp/ivectors_train/inv_priors.vec
# Create priors to rebalance the model. The following script computes
# per-language priors as count(lang_test) / count(lang_train).
-utils/balance_priors_to_test.pl \
-  <(utils/remove_dialect.pl data/train/utt2lang) \
-  <(utils/remove_dialect.pl data/lre07/utt2lang) \
+lid/balance_priors_to_test.pl \
+  <(lid/remove_dialect.pl data/train/utt2lang) \
+  <(lid/remove_dialect.pl data/lre07/utt2lang) \
exp/ivectors_train/languages.txt \
exp/ivectors_train/priors.vec
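After this step the rebalanced priors can be inspected directly; the values below only illustrate the format and are not real LRE07 numbers:

cat exp/ivectors_train/priors.vec
# e.g. " [ 0.25 0.5 ... ]": one count(test)/count(train) ratio per language,
# ordered by the indices in exp/ivectors_train/languages.txt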
@@ -44,7 +44,7 @@ logistic-regression-train --config=$config "$train_ivectors" \
logistic-regression-copy --scale-priors=exp/ivectors_train/priors.vec \
$model $model_rebalanced
trials="utils/remove_dialect.pl data/train/utt2lang \
trials="lid/remove_dialect.pl data/train/utt2lang \
| utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt -|"
scores="|utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt \
>exp/ivectors_train/train_scores"
@@ -63,7 +63,7 @@ cat exp/ivectors_train/posteriors | \
# note: we treat each language label as a one-word sentence, so the WER/SER
# reported by compute-wer is exactly the language identification error rate.
-compute-wer --mode=present --text ark:<(utils/remove_dialect.pl data/train/utt2lang) \
+compute-wer --mode=present --text ark:<(lid/remove_dialect.pl data/train/utt2lang) \
ark:exp/ivectors_train/output
# %WER 4.19 [ 3000 / 71668, 0 ins, 0 del, 3000 sub ] [PARTIAL]
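As a worked check of that figure (plain arithmetic on the numbers above): each reference "sentence" is a single language label, so every error is a substitution and %WER is simply errors over utterances:

# 3000 sub / 71668 utterances = 0.0419, i.e. %WER 4.19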
@@ -76,7 +76,7 @@ logistic-regression-eval $model_rebalanced \
print $1, (argmax - 3); }' | \
utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/ivectors_lre07/output
-compute-wer --text ark:<(utils/remove_dialect.pl data/lre07/utt2lang) \
+compute-wer --text ark:<(lid/remove_dialect.pl data/lre07/utt2lang) \
ark:exp/ivectors_lre07/output
# %WER 32.58 [ 2452 / 7527, 0 ins, 0 del, 2452 sub ]
# %SER 32.58 [ 2452 / 7527 ]