Mirror of https://github.com/mozilla/kaldi.git
trunk: adding some scripts that were skipped while merging sandbox/language_id.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3935 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Parent: 95120e8bb2
Commit: d68f1be037
lid/balance_priors_to_test.pl (new file)
@@ -0,0 +1,69 @@
#!/usr/bin/perl -w

my ($train_file, $test_file, $lang_file, $priors_file) = @ARGV;
open(UTT2LANG_TRAIN, "<$train_file") or die "no utt2lang training file";

%train_count = ();
$train_tot = 0;
while(<UTT2LANG_TRAIN>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[1];
  if (not exists($train_count{$lang})) {
    $train_count{$lang} = 1;
  } else {
    $train_count{$lang} += 1;
  }
  $train_tot += 1;
}

open(UTT2LANG_TEST, "<$test_file");

%test_count = ();
$test_tot = 0;
while(<UTT2LANG_TEST>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[1];
  if (not exists($test_count{$lang})) {
    $test_count{$lang} = 1;
  } else {
    $test_count{$lang} += 1;
  }
  $test_tot += 1;
}

foreach my $key (keys %train_count) {
  if (not exists($test_count{$key})) {
    $test_count{$key} = 0;
  }
}

# load languages file
open(LANGUAGES, "<$lang_file");
@idx_to_lang = ();

$largest_idx = 0;
while(<LANGUAGES>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[0];
  $idx = $words[1];
  $idx_to_lang[$idx + 0] = $lang;
  if ($idx > $largest_idx) {
    $largest_idx = $idx;
  }
}

$priors = " [ ";
foreach $lang (@idx_to_lang) {
  $ratio = (1.0*$test_count{$lang}) / $train_count{$lang};
  $priors .= "$ratio ";
}

$priors .= " ]";
open(PRIORS, ">$priors_file");
print PRIORS $priors;
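
A minimal usage sketch (file names here are hypothetical, and the toy counts are chosen so the arithmetic is easy to follow): with two English and one French training utterances against one English and two French test utterances, the per-language test/train ratios are 0.5 and 2.

printf 'utt1 english\nutt2 english\nutt3 french\n' > train_utt2lang
printf 'utt4 english\nutt5 french\nutt6 french\n' > test_utt2lang
printf 'english 0\nfrench 1\n' > languages.txt
lid/balance_priors_to_test.pl train_utt2lang test_utt2lang languages.txt priors.vec
cat priors.vec
# [ 0.5 2  ]   <- one test/train count ratio per language, in index order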
lid/remove_dialect.pl (new file)
@@ -0,0 +1,19 @@
#!/usr/bin/perl
# Removes the dialect parts on an utt2lang file.
# For example <utt> chinese.wu is converted to <utt> chinese.

my ($utt2lang_file) = @ARGV;
open(UTT2LANG, "<$utt2lang_file") or die "no utt2lang file";
$utt2lang_short = "";
while(<UTT2LANG>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $utt = $words[0];
  $lang_long = $words[1];
  @lang_parts = split('[.]', $lang_long);
  # The actual language. Other parts are dialects or subcategories.
  $lang = $lang_parts[0];
  $utt2lang_short .= $utt . " " . $lang . "\n";
}
print $utt2lang_short;
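
A minimal usage sketch (the input file is hypothetical; chinese.wu is the example from the header comment):

printf 'utt1 chinese.wu\nutt2 english\n' > utt2lang
lid/remove_dialect.pl utt2lang
# utt1 chinese
# utt2 english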
@@ -11,28 +11,28 @@ set -e
config=conf/logistic-regression.conf

-awk '{print $2}' <(utils/remove_dialect.pl data/train/utt2lang) | sort -u | \
+awk '{print $2}' <(lid/remove_dialect.pl data/train/utt2lang) | sort -u | \
  awk '{print $1, NR-1}' > exp/ivectors_train/languages.txt

model=exp/ivectors_train/logistic_regression
model_rebalanced=exp/ivectors_train/logistic_regression_rebalanced
train_ivectors="ark:ivector-normalize-length \
  scp:exp/ivectors_train/ivector.scp ark:- |";
-classes="ark:utils/remove_dialect.pl data/train/utt2lang \
+classes="ark:lid/remove_dialect.pl data/train/utt2lang \
  | utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt - |"

# An alternative prior.
#utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt \
-#  <(utils/remove_dialect.pl data/train/utt2lang) | \
+#  <(lid/remove_dialect.pl data/train/utt2lang) | \
#  awk '{print $2}' | sort -n | uniq -c | \
#  awk 'BEGIN{printf(" [ ");} {printf("%s ", 1.0/$1); } END{print(" ]"); }' \
#  >exp/ivectors_train/inv_priors.vec

# Create priors to rebalance the model. The following script rebalances
# the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
-utils/balance_priors_to_test.pl \
-  <(utils/remove_dialect.pl data/train/utt2lang) \
-  <(utils/remove_dialect.pl data/lre07/utt2lang) \
+lid/balance_priors_to_test.pl \
+  <(lid/remove_dialect.pl data/train/utt2lang) \
+  <(lid/remove_dialect.pl data/lre07/utt2lang) \
  exp/ivectors_train/languages.txt \
  exp/ivectors_train/priors.vec
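
To make the awk pipeline at the top of this hunk concrete, a toy run (invented input): remove_dialect.pl strips the dialect suffixes, the first awk keeps the language column, sort -u deduplicates, and the final awk pairs each language with a zero-based index.

printf 'u1 english\nu2 farsi.western\nu3 english\n' > utt2lang
awk '{print $2}' <(lid/remove_dialect.pl utt2lang) | sort -u | \
  awk '{print $1, NR-1}'
# english 0
# farsi 1

One detail worth noting: as implemented in balance_priors_to_test.pl above, each entry of priors.vec is count_test(lang) / count_train(lang); the comment's count(lang_test) / (count(lang_test) + count(lang_train)) describes the rebalancing intent rather than the exact ratio the script writes.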
@@ -44,7 +44,7 @@ logistic-regression-train --config=$config "$train_ivectors" \
logistic-regression-copy --scale-priors=exp/ivectors_train/priors.vec \
  $model $model_rebalanced

-trials="utils/remove_dialect.pl data/train/utt2lang \
+trials="lid/remove_dialect.pl data/train/utt2lang \
  | utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt -|"
scores="|utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt \
  >exp/ivectors_train/train_scores"
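
A note on the quoting in $train_ivectors, $classes, and $scores: Kaldi table specifiers may embed shell pipelines, so an rspecifier ending in '|' (e.g. "ark:ivector-normalize-length ... ark:- |") reads the table from that command's output, while a wspecifier starting with '|' (as in $scores) writes the table through the command. A hypothetical round trip with the stock copy-int-vector tool:

copy-int-vector 'ark:echo utt1 0 |' \
  'ark,t:| utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt > labels.txt'
# labels.txt would then contain 'utt1 english', assuming english has index 0.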
@@ -63,7 +63,7 @@ cat exp/ivectors_train/posteriors | \
# note: we treat the language as a sentence; it happens that the WER/SER
# corresponds to the recognition error rate.
-compute-wer --mode=present --text ark:<(utils/remove_dialect.pl data/train/utt2lang) \
+compute-wer --mode=present --text ark:<(lid/remove_dialect.pl data/train/utt2lang) \
  ark:exp/ivectors_train/output

# %WER 4.19 [ 3000 / 71668, 0 ins, 0 del, 3000 sub ] [PARTIAL]
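
A quick check of the numbers: each reference "sentence" is a single language label, so insertions and deletions cannot occur, every error is a substitution, and WER coincides with SER: 3000 / 71668 ≈ 0.0419, i.e. 4.19%.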
@@ -76,7 +76,7 @@ logistic-regression-eval $model_rebalanced \
  print $1, (argmax - 3); }' | \
  utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/ivectors_lre07/output

-compute-wer --text ark:<(utils/remove_dialect.pl data/lre07/utt2lang) \
+compute-wer --text ark:<(lid/remove_dialect.pl data/lre07/utt2lang) \
  ark:exp/ivectors_lre07/output
# %WER 32.58 [ 2452 / 7527, 0 ins, 0 del, 2452 sub ]
# %SER 32.58 [ 2452 / 7527 ]
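
Same arithmetic here: 2452 / 7527 ≈ 0.3258, so WER and SER are both 32.58%; with one word per utterance the two metrics are necessarily identical.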