зеркало из https://github.com/mozilla/kaldi.git
sandbox/language_id: Adding local/make_callfriend.pl for handling the ldc96* datasets and a table for mapping callfriend datasets to languages. With lre07 for testing, run.sh now uses all available training data for training. Bug fix in make_lre07.pl.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3740 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
377231677c
Коммит
a1561bb1c4
|
@ -0,0 +1,7 @@
|
|||
48 french
|
||||
49 arabic.standard
|
||||
54 korean
|
||||
55 chinese.mandarin.mainland
|
||||
56 chinese.mandarin.taiwan
|
||||
57 spanish.caribbean
|
||||
58 spanish.noncaribbean
|
|
@ -1,138 +0,0 @@
|
|||
#! /usr/bin/perl
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use local::load_lang;
|
||||
|
||||
# Command line: <dataset-number> <in-dir> <out-dir>.
#
# The dataset number selects one of the CallFriend releases handled by this
# script (48, 49, 54, 55, 56, 57 or 58); in-dir is the corpus root and out-dir
# the directory under which the ldc96s* data directories are created.

# basename() is called fully qualified below, so load the module explicitly
# (importing nothing) instead of depending on some other module having
# loaded it as a side effect.
use File::Basename ();

my ($dataset, $in_top, $out_top) = @ARGV;

# The alternation is grouped so that both anchors apply to every branch.
# Ungrouped, /^4[89]|5[4-8]$/ means "starts with 48/49" OR "ends with 54..58"
# and therefore also accepts values such as "480" or "x54".
die 'Usage: ' . File::Basename::basename($0)
    . " {48|49|54|55|56|57|58} in-dir out-dir\n"
    unless @ARGV == 3
        && $dataset =~ /\A(?:4[89]|5[4-8])\z/
        && $in_top
        && $out_top;
|
||||
|
||||
# open_or_die(MODE, PATH)
#
# Three-argument open() of PATH using MODE (e.g. '<', '>' or '|-').
# Dies with "PATH: <system error>\n" if the open fails.
# Returns the list (PATH, HANDLE) so callers can keep the name next to the
# handle for later diagnostics.
sub open_or_die ($$) {
    my $mode   = $_[0];
    my $target = $_[1];

    # The prototype admits only two arguments, so this slot is normally undef
    # and open() autovivifies a fresh handle into it.
    my $handle = $_[2];

    unless (open($handle, $mode, $target)) {
        die "$target: $!\n";
    }

    return ($target, $handle);
}
|
||||
# Language tables. load_lang is provided by local::load_lang (not visible in
# this file); it is called in list context and its first three return values
# are kept.
my $lang_abbreviation_file = "local/language_abbreviations.txt";
my ($long_lang, $abbr_lang, $num_lang) = load_lang($lang_abbreviation_file);

# Sub-directory, relative to the corpus root, that holds the callinfo.tbl and
# spkrinfo.tbl tables, keyed by dataset number. Most releases use '/doc/';
# 48 and 57 are laid out differently.
my %doc = (
    (map { ($_ => '/doc/') } qw(49 54 55 56 58)),
    '48' => '/callfriend_fre_1/cf_fre/docs/',
    '57' => '/docs/',
);
|
||||
|
||||
# Directory holding the metadata tables for the selected dataset.
my $doc = $in_top . $doc{$dataset};

# %speaker: first field of each callinfo.tbl line => second field, where
# fields are separated by ' PIN=' or '|'.
my %speaker;
my ($meta_path, $meta_file) = open_or_die('<', "${doc}callinfo.tbl");

while (<$meta_file>) {
    my ($call_id, $pin) = split(' PIN=|\|', $_);
    $speaker{$call_id} = $pin;
}

close $meta_file or warn "$meta_path: $!\n";

# %gender: first comma-separated field of each spkrinfo.tbl line => second
# field, with 'F'/'M' lower-cased to 'f'/'m'.
my %gender;
($meta_path, $meta_file) = open_or_die('<', "${doc}spkrinfo.tbl");

while (<$meta_file>) {
    my ($call_id, $sex) = split(',', $_);
    $sex =~ tr/FM/fm/;
    $gender{$call_id} = $sex;
}

close $meta_file or warn "$meta_path: $!\n";
|
||||
# Global print formatting for the rest of the script: fields of a print list
# are joined with a single space and every print is terminated by a newline.
$, = ' ';
$\ = "\n";

# Common prefix of the output directories; the subset name is appended later.
$out_top .= "/ldc96s${dataset}_";

# Sub-directory, relative to the corpus root, that holds the audio, keyed by
# dataset number. Most releases use '/data/'; 48 and 58 differ.
my %data = (
    (map { ($_ => '/data/') } qw(49 54 55 56 57)),
    '48' => '/callfriend_fre_1/cf_fre/data/',
    '58' => '/cf_spa_n/',
);

# Language (and dialect) label written to utt2lang, keyed by dataset number.
my %lang_name = qw(
    48 french
    49 arabic.standard
    54 korean
    55 chinese.mandarin.mainland
    56 chinese.mandarin.taiwan
    57 spanish.caribbean
    58 spanish.noncaribbean
);

my $lang_name = $lang_name{$dataset};

# NOTE(review): this reads the package hashes %main::abbr_lang and
# %main::num_lang, not the lexicals $abbr_lang / $num_lang returned by
# load_lang() earlier in the script -- confirm that load_lang also populates
# those package hashes, otherwise $lang_code is undefined.
my $lang_code = $main::num_lang{ $main::abbr_lang{$lang_name} };

# From here on $in_top points at the audio directory of the dataset.
$in_top .= $data{$dataset};
|
||||
|
||||
# open4sort(PATH [, FLAGS])
#
# Opens a write pipe to sort(1) whose output is redirected into PATH; FLAGS,
# when given and true, are inserted between "sort" and the redirection.
# Returns whatever open_or_die returns, i.e. (COMMAND, HANDLE) -- note that
# the first element is the shell command string, not the bare PATH.
sub open4sort ($;$) {
    my ($destination, $sort_options) = @_;

    my $command = 'sort ';
    $command .= $sort_options . ' ' if $sort_options;
    $command .= '>' . $destination;

    return open_or_die('|-', $command);
}
|
||||
|
||||
use File::Path;
|
||||
use File::Find;
|
||||
|
||||
# _system_failure(STATUS, OS_ERROR)
#
# Turns the status left in $? by system() into a readable reason. $! is only
# meaningful when the command could not be started at all (status -1); in
# every other case the exit code or terminating signal is what matters.
sub _system_failure {
    my ($status, $os_error) = @_;
    return "failed to execute: $os_error" if $status == -1;
    return 'killed by signal ' . ($status & 127) if $status & 127;
    return 'exit status ' . ($status >> 8);
}

# Build one data directory per corpus subset: wav.scp, utt2lang, utt2spk and
# spk2gender are written through sort(1) pipes, then spk2utt and lang2utt are
# derived and the directory is fixed up and validated.
#
# All prints below rely on $, and $\ having been set to ' ' and "\n" earlier
# in the script.
foreach my $subset ('devtest', 'evltest', 'train') {

    my $out_sub = $out_top . $subset;
    File::Path::make_path($out_sub);
    $out_sub .= '/';

    # The "*_path" values are the sort command strings returned by open4sort;
    # they are used only in the close() diagnostics further down.
    my ($wav_path, $wav_file) = open4sort($out_sub . 'wav.scp');
    my ($utt2lang_path, $utt2lang_file) =
        open4sort($out_sub . 'utt2lang');
    my ($utt2spk_path, $utt2spk_file) =
        open4sort($out_sub . 'utt2spk');
    # -u: the same speaker is printed once per utterance, keep one line each.
    my ($spk2gender_path, $spk2gender_file) =
        open4sort($out_sub . 'spk2gender', '-u');

    File::Find::find(sub {

        # Only *.sph files are of interest; the part before the extension is
        # the call id used as key in %speaker and %gender.
        my ($call) = /^(.*)\.sph$/ or return;
        my $orig_speaker = $speaker{$call};
        my $gender = $gender{$call};
        if (!defined $orig_speaker) {
            warn "No speaker defined for call $call.\n";
            return;
        }
        if (!defined $gender || !($gender eq "f" || $gender eq "m")) {
            # Same message text as before, but without tripping an
            # "uninitialized value" warning when the gender is missing.
            my $shown = defined $gender ? $gender : '';
            warn "No gender defined or bad gender '$shown' for call $call.\n";
            return;
        }
        my $speaker = $lang_code . '_' . $orig_speaker;
        my $utt = $speaker . '_ldc96s' . $dataset . '_' . $call;
        print $wav_file
            $utt, 'sph2pipe -f wav -p -c 1', $File::Find::name, '|';
        print $utt2lang_file $utt, $lang_name;
        print $utt2spk_file $utt, $speaker;
        print $spk2gender_file $speaker, $gender;

    }, $in_top . $subset);

    close $wav_file or warn "$wav_path: $!\n";
    close $utt2lang_file or warn "$utt2lang_path: $!\n";
    close $utt2spk_file or warn "$utt2spk_path: $!\n";
    close $spk2gender_file or warn "$spk2gender_path: $!\n";

    # Echo the first derived-file command on standard output.
    print("utils/utt2spk_to_spk2utt.pl ${out_sub}utt2spk > ${out_sub}spk2utt");
    if (system("utils/utt2spk_to_spk2utt.pl ${out_sub}utt2spk > ${out_sub}spk2utt") != 0) {
        die "${out_sub}utt2spk: utt2spk_to_spk2utt.pl: "
            . _system_failure($?, $!) . "\n";
    }
    if (system("utils/utt2spk_to_spk2utt.pl ${out_sub}utt2lang > ${out_sub}lang2utt") != 0) {
        die "${out_sub}utt2lang: utt2spk_to_spk2utt.pl: "
            . _system_failure($?, $!) . "\n";
    }

    # The status of the fix-up step is not checked here; the validation that
    # follows is what aborts the run.
    system("utils/fix_data_dir.sh $out_sub");
    if (system("utils/validate_data_dir.sh --no-text --no-feats $out_sub") != 0) {
        die "Failed validating output directory";
    }
}
|
|
@ -62,4 +62,4 @@ close(UTT2LANG) || die;
|
|||
close(WAVLIST) || die;
|
||||
system("rm -r $dir/tmp");
|
||||
|
||||
(system("utils/validate_data_dir.sh --no-text --no-feats data/lre07") == 0) || die "Error validating data dir.";
|
||||
(system("utils/validate_data_dir.sh --no-text --no-feats $dir") == 0) || die "Error validating data dir.";
|
||||
|
|
|
@ -16,44 +16,59 @@ vaddir=`pwd`/mfcc
|
|||
# data/sre08_train_3conv_female data/sre08_train_3conv_male data/sre08_train_8conv_female \
|
||||
# data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female /export/a14/kumar/kaldi/language_id/egs/lre/v1/data/ldc96s*
|
||||
|
||||
local/make_sre_2008_train.pl local/language_abbreviations.txt /export/corpora5/LDC/LDC2011S05 data
|
||||
local/make_ldc96s.pl 49 /export/corpora5/LDC/LDC96S49 data
|
||||
local/make_ldc96s.pl 54 /export/corpora5/LDC/LDC96S54 data
|
||||
local/make_ldc96s.pl 55 /export/corpora5/LDC/LDC96S55 data
|
||||
local/make_ldc96s.pl 56 /export/corpora5/LDC/LDC96S56 data
|
||||
local/make_ldc96s.pl 57 /export/corpora5/LDC/LDC96S57 data
|
||||
local/make_ldc96s.pl 58 /export/corpora5/LDC/LDC96S58 data
|
||||
lang=local/callfriend_lang.txt
|
||||
lang_abbrev=local/language_abbreviations.txt
|
||||
local/make_callfriend.pl $lang_abbrev \
|
||||
/export/corpora5/LDC/LDC96S49 49 $lang data
|
||||
local/make_callfriend.pl $lang_abbrev \
|
||||
/export/corpora5/LDC/LDC96S54 54 $lang data
|
||||
local/make_callfriend.pl $lang_abbrev \
|
||||
/export/corpora5/LDC/LDC96S55 55 $lang data
|
||||
local/make_callfriend.pl $lang_abbrev \
|
||||
/export/corpora5/LDC/LDC96S56 56 $lang data
|
||||
local/make_callfriend.pl $lang_abbrev \
|
||||
/export/corpora5/LDC/LDC96S57 57 $lang data
|
||||
local/make_callfriend.pl $lang_abbrev \
|
||||
/export/corpora5/LDC/LDC96S58 58 $lang data
|
||||
local/make_sre_2008_train.pl local/language_abbreviations.txt \
|
||||
/export/corpora5/LDC/LDC2011S05 data
|
||||
|
||||
# we're not doing anything with the lre07 data currently.
|
||||
local/make_lre07.pl /export/corpora5/LDC/LDC2009S04 data/lre07
|
||||
local/make_lre07.pl /export/corpora5/LDC/LDC2009S04 data/test
|
||||
|
||||
utils/combine_data.sh data/all data/sre08_train_10sec_female data/sre08_train_10sec_male \
|
||||
data/sre08_train_3conv_female data/sre08_train_3conv_male data/sre08_train_8conv_female \
|
||||
data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female data/ldc96s*
|
||||
utils/combine_data.sh data/train data/sre08_train_10sec_female \
|
||||
data/sre08_train_10sec_male data/sre08_train_3conv_female \
|
||||
data/sre08_train_3conv_male data/sre08_train_8conv_female \
|
||||
data/sre08_train_8conv_male data/sre08_train_short2_male \
|
||||
data/sre08_train_short2_female data/ldc96s*
|
||||
|
||||
mfccdir=`pwd`/mfcc
|
||||
vaddir=`pwd`/mfcc
|
||||
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
|
||||
data/train exp/make_mfcc $mfccdir
|
||||
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
|
||||
data/test exp/make_mfcc $mfccdir
|
||||
|
||||
set -e
|
||||
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" data/all exp/make_mfcc $mfccdir
|
||||
|
||||
lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/all exp/make_vad $vaddir
|
||||
lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/train \
|
||||
exp/make_vad $vaddir
|
||||
lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/test \
|
||||
exp/make_vad $vaddir
|
||||
|
||||
# Use 4k of the 14k utterances for testing, but make sure the speakers do not
|
||||
# overlap with the rest of the data, which will be used for training.
|
||||
utils/subset_data_dir.sh --speakers data/all 4000 data/test
|
||||
utils/filter_scp.pl --exclude data/test/spk2utt < data/all/spk2utt | awk '{print $1}' > foo
|
||||
utils/subset_data_dir.sh --spk-list foo data/all data/train
|
||||
#utils/subset_data_dir.sh --speakers data/all 4000 data/test
|
||||
#utils/filter_scp.pl --exclude data/test/spk2utt < data/all/spk2utt | awk '{print $1}' > foo
|
||||
#utils/subset_data_dir.sh --spk-list foo data/all data/train
|
||||
|
||||
|
||||
utils/subset_data_dir.sh data/train 3000 data/train_3k
|
||||
utils/subset_data_dir.sh data/train 6000 data/train_6k
|
||||
|
||||
|
||||
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_3k 2048 exp/diag_ubm_2048
|
||||
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_6k exp/diag_ubm_2048 exp/full_ubm_2048_6k
|
||||
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_3k 2048 \
|
||||
exp/diag_ubm_2048
|
||||
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_6k \
|
||||
exp/diag_ubm_2048 exp/full_ubm_2048_6k
|
||||
|
||||
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train exp/full_ubm_2048_6k exp/full_ubm_2048
|
||||
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \
|
||||
exp/full_ubm_2048_6k exp/full_ubm_2048
|
||||
|
||||
|
||||
lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \
|
||||
|
|
|
@ -35,7 +35,7 @@ classes="ark:utils/remove_dialect.pl data/train/utt2lang \
|
|||
# the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
|
||||
utils/balance_priors_to_test.pl \
|
||||
<(utils/remove_dialect.pl data/train/utt2lang) \
|
||||
<(utils/remove_dialect.pl data/lre07/utt2lang) \
|
||||
<(utils/remove_dialect.pl data/test/utt2lang) \
|
||||
exp/ivectors_train/languages.txt \
|
||||
exp/ivectors_train/priors.vec
|
||||
|
||||
|
@ -76,19 +76,19 @@ compute-wer --text ark:<(utils/remove_dialect.pl data/train/utt2lang) \
|
|||
|
||||
|
||||
logistic-regression-eval $model_rebalanced \
|
||||
scp:exp/lre07/ivector.scp ark,t:- | \
|
||||
scp:exp/test/ivector.scp ark,t:- | \
|
||||
awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
|
||||
{ max=$f; argmax=f; }}
|
||||
print $1, (argmax - 3); }' | \
|
||||
utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/lre07/output
|
||||
utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/test/output
|
||||
|
||||
|
||||
# someone needs to extend this to run on the dev data.
|
||||
|
||||
compute-wer --text ark:<(utils/remove_dialect.pl data/lre07/utt2lang)\
|
||||
ark:exp/lre07/output
|
||||
# compute-wer --text ark:/dev/fd/63 ark:exp/lre07/output
|
||||
# %WER 52.58 [ 3958 / 7527, 0 ins, 0 del, 3958 sub ]
|
||||
# %SER 52.58 [ 3958 / 7527 ]
|
||||
compute-wer --text ark:<(utils/remove_dialect.pl data/test/utt2lang)\
|
||||
ark:exp/test/output
|
||||
# compute-wer --text ark:/dev/fd/63 ark:exp/test/output
|
||||
# %WER 58.83 [ 3958 / 7527, 0 ins, 0 del, 3958 sub ]
|
||||
# %SER 58.83 [ 3958 / 7527 ]
|
||||
# Scored 7527 sentences, 0 not present in hyp.
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче