sandbox/language_id: Adding additional LDC training data.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3584 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
David Snyder 2014-02-24 20:53:42 +00:00
Родитель e7c733ad70
Коммит 90132a6b22
2 изменённых файлов: 153 добавлений и 8 удалений

134
egs/lre/v1/local/make_ldc96s.pl Executable file
Просмотреть файл

@ -0,0 +1,134 @@
#! /usr/bin/perl
use strict;
use warnings;
use local::load_lang;
# Command line: <dataset> <in-dir> <out-dir>.  <dataset> is the numeric
# suffix of an LDC96S* release and selects the entries of the per-dataset
# tables used further down.
my ($dataset, $in_top, $out_top) = @ARGV;
unless (@ARGV == 3
        # The alternation is grouped so that both anchors apply to both
        # branches; ungrouped, /^4[89]|5[4-8]$/ also accepts e.g. "480"
        # or "x55".
        && $dataset =~ /\A(?:4[89]|5[4-8])\z/
        && $in_top && $out_top) {
  # Loaded here so the usage message does not depend on some other module
  # happening to pull File::Basename in.
  require File::Basename;
  die 'Usage: ' . File::Basename::basename($0)
    . " {48|49|54|55|56|57|58} in-dir out-dir\n";
}
# Open $target with the given open() mode.  On failure, die with
# "<target>: <OS error>".  On success, return the list (target, handle) so
# the caller keeps the name around for later diagnostics.
sub open_or_die ($$) {
  my ($open_mode, $target) = @_;
  my $handle;
  unless (open($handle, $open_mode, $target)) {
    die "$target: $!\n";
  }
  return ($target, $handle);
}
# Language table.  load_lang is imported from local::load_lang; its three
# return values are kept in these lexicals.
my $lang_abbreviation_file = "local/language_abbreviations.txt";
my ($long_lang, $abbr_lang, $num_lang) = load_lang($lang_abbreviation_file);

# Where each dataset keeps its metadata tables, relative to in-dir.
my %doc = (
  '48' => '/callfriend_fre_1/cf_fre/docs/',
  '49' => '/doc/',
  '54' => '/doc/',
  '55' => '/doc/',
  '56' => '/doc/',
  '57' => '/docs/',
  '58' => '/doc/'
);
my $doc = $in_top . $doc{$dataset};

# callinfo.tbl: call id -> speaker.  The text before " PIN=" is taken as the
# call id and the text up to the next "|" as the speaker.
my %speaker;
my ($meta_path, $meta_file) = open_or_die('<', $doc . 'callinfo.tbl');
while (my $line = <$meta_file>) {
  # Without the chomp a speaker that is the last field on its line would
  # carry the newline into every id built from it.
  chomp $line;
  my ($call, $speaker) = split(/ PIN=|\|/, $line);
  # Blank or PIN-less lines carry no usable mapping.
  next unless defined $call && defined $speaker;
  $speaker{$call} = $speaker;
}
close $meta_file or warn "$meta_path: $!\n";

# spkrinfo.tbl: first comma-separated field -> second field, with F/M
# lower-cased.  The first field is used as the lookup key for calls later
# on.
($meta_path, $meta_file) = open_or_die('<', $doc . 'spkrinfo.tbl');
my %gender;
while (my $line = <$meta_file>) {
  chomp $line;
  my ($call, $gender) = split(/,/, $line);
  # Skip lines without a gender field instead of storing undef.
  next unless defined $call && defined $gender;
  $gender =~ tr/FM/fm/;
  $gender{$call} = $gender;
}
close $meta_file or warn "$meta_path: $!\n";
# Every print below relies on these: fields are joined by a single space and
# each record is terminated by a newline.
($, , $\) = (' ', "\n");

# Output directories are named <out-dir>/ldc96s<dataset>_<subset>.
$out_top .= '/ldc96s' . $dataset . '_';

# Where each dataset keeps its audio, relative to in-dir.
my %data = (
  '48' => '/callfriend_fre_1/cf_fre/data/',
  '49' => '/data/',
  '54' => '/data/',
  '55' => '/data/',
  '56' => '/data/',
  '57' => '/data/',
  '58' => '/cf_spa_n/'
);

# Language label written to utt2lang for each dataset.
my %lang_name = (
  '48' => 'french',
  '49' => 'arabic.standard',
  '54' => 'korean',
  '55' => 'chinese.mandarin.mainland',
  '56' => 'chinese.mandarin.taiwan',
  '57' => 'spanish.caribbean',
  '58' => 'spanish.noncaribbean'
);

my $lang_name = $lang_name{$dataset};
defined $lang_name or die "$dataset: unknown dataset\n";
$in_top .= $data{$dataset};

# Language code used as the prefix of utterance and speaker ids.  The package
# hashes %::abbr_lang / %::num_lang are consulted first, exactly as before.
# NOTE(review): nothing visible in this script fills them, so presumably
# load_lang does -- confirm.  When they yield nothing, fall back to the
# values load_lang returned, provided those are hash references.
my $lang_abbr = $::abbr_lang{$lang_name};
if (!defined $lang_abbr && ref($abbr_lang) eq 'HASH') {
  $lang_abbr = $abbr_lang->{$lang_name};
}
my $lang_code;
if (defined $lang_abbr) {
  $lang_code = $::num_lang{$lang_abbr};
  if (!defined $lang_code && ref($num_lang) eq 'HASH') {
    $lang_code = $num_lang->{$lang_abbr};
  }
}
# An undefined code would silently produce ids that start with "_".
defined $lang_code
  or die "$lang_abbreviation_file: no language code for $lang_name\n";
# Start a `sort` process whose output is redirected to the given path and
# return whatever open_or_die returns for it: the command string and a handle
# connected to sort's standard input.  The optional second argument, when
# true, is placed verbatim between "sort" and the redirection.
sub open4sort ($;$) {
  my ($dest, $sort_opts) = @_;
  my $command = 'sort ';
  $command .= $sort_opts . ' ' if $sort_opts;
  $command .= '>' . $dest;
  return open_or_die('|-', $command);
}
use File::Path;
use File::Find;
# Describe why the most recent child process (system() or a piped close)
# failed, from $? and, when the command could not be started, $!.
sub _child_failure {
  return "failed to start: $!" if $? == -1;
  return 'killed by signal ' . ($? & 127) if $? & 127;
  return 'exit status ' . ($? >> 8);
}

# Build one data directory per subset: wav.scp, utt2lang, utt2spk and
# spk2gender are written through `sort`, then spk2utt / lang2utt are derived
# and the directory is fixed up and validated with the utils/ scripts.
foreach my $subset ('devtest', 'evltest', 'train') {
  my $out_sub = $out_top . $subset;
  File::Path::make_path($out_sub);
  $out_sub .= '/';
  my ($wav_path, $wav_file) = open4sort($out_sub . 'wav.scp');
  my ($utt2lang_path, $utt2lang_file) =
    open4sort($out_sub . 'utt2lang');
  my ($utt2spk_path, $utt2spk_file) =
    open4sort($out_sub . 'utt2spk');
  # -u: a speaker appears once per call but must be listed only once.
  my ($spk2gender_path, $spk2gender_file) =
    open4sort($out_sub . 'spk2gender', '-u');

  # Inside the callback $_ is the base name of the file being visited.
  File::Find::find(sub {
    my ($call) = /^(.*)\.sph$/ or return;
    my $speaker = $speaker{$call};
    if (!defined $speaker || $speaker eq '') {
      warn "$call: No call metadata.\n";
      return;
    }
    my $spk = $lang_code . '_' . $speaker;
    my $utt = $spk . '_ldc96s' . $dataset . '_' . $call;
    # Kaldi treats a wav.scp entry as a command to run only when it ends in
    # "|"; without it the whole string is taken as a file name.
    print $wav_file
      $utt, 'sph2pipe -f wav -p -c 1', $File::Find::name, '|';
    print $utt2lang_file $utt, $lang_name;
    print $utt2spk_file $utt, $spk;
    # Keyed by the same speaker id as utt2spk so the two files agree.
    if (defined $gender{$call}) {
      print $spk2gender_file $spk, $gender{$call};
    }
    else {
      warn "$call: No speaker gender.\n";
    }
  }, $in_top . $subset);

  # Closing a pipe waits for sort to finish, so the files are complete
  # before they are read below.  When sort merely exits non-zero, close
  # fails with $! set to 0 and the status in $?.
  foreach my $sorted ([$wav_path, $wav_file],
                      [$utt2lang_path, $utt2lang_file],
                      [$utt2spk_path, $utt2spk_file],
                      [$spk2gender_path, $spk2gender_file]) {
    my ($path, $file) = @$sorted;
    close $file
      or warn "$path: " . ($! ? "$!" : _child_failure()) . "\n";
  }

  my $spk2utt_cmd =
    "utils/utt2spk_to_spk2utt.pl ${out_sub}utt2spk > ${out_sub}spk2utt";
  print($spk2utt_cmd);
  if (system($spk2utt_cmd) != 0) {
    die "${out_sub}utt2spk: utt2spk_to_spk2utt.pl: "
      . _child_failure() . "\n";
  }
  # The same inversion applied to utt2lang yields lang2utt.
  if (system("utils/utt2spk_to_spk2utt.pl ${out_sub}utt2lang > ${out_sub}lang2utt") != 0) {
    die "${out_sub}utt2lang: utt2spk_to_spk2utt.pl: "
      . _child_failure() . "\n";
  }
  # These two remain non-fatal, but a failure is no longer silent.
  system("utils/fix_data_dir.sh $out_sub 1") == 0
    or warn "${out_sub}: fix_data_dir.sh: " . _child_failure() . "\n";
  system("utils/validate_data_dir.sh --no-text --no-feats $out_sub") == 0
    or warn "${out_sub}: validate_data_dir.sh: " . _child_failure() . "\n";
}

Просмотреть файл

@ -11,27 +11,38 @@ set -e
# NOTE(review): this region looks like a diff hunk whose +/- markers were
# lost, so superseded and replacement lines sit side by side (for example the
# data/sre08_train and data/train forms of the same command) -- confirm
# against the real script before running any of it.
# Both the MFCC and the VAD output directories point at ./mfcc.
mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc
#local/make_sre_2008_train.pl local/language_abbreviations.txt /export/corpora5/LDC/LDC2011S05 data
#utils/combine_data.sh data/sre08_train data/sre08_train_10sec_female data/sre08_train_10sec_male \
# data/sre08_train_3conv_female data/sre08_train_3conv_male data/sre08_train_8conv_female \
# data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female /export/a14/kumar/kaldi/language_id/egs/lre/v1/data/ldc96s*
# Data preparation: the SRE 2008 script, then one make_ldc96s.pl run per
# LDC96S dataset (49 and 54-58), all writing under data/.
# NOTE(review): the combine_data.sh line below ends in a continuation and is
# followed directly by a make_ldc96s.pl line; presumably it is a superseded
# line from the old version -- confirm.
local/make_sre_2008_train.pl local/language_abbreviations.txt /export/corpora5/LDC/LDC2011S05 data
utils/combine_data.sh data/sre08_train data/sre08_train_10sec_female data/sre08_train_10sec_male \
local/make_ldc96s.pl 49 /export/corpora5/LDC/LDC96S49 data
local/make_ldc96s.pl 54 /export/corpora5/LDC/LDC96S54 data
local/make_ldc96s.pl 55 /export/corpora5/LDC/LDC96S55 data
local/make_ldc96s.pl 56 /export/corpora5/LDC/LDC96S56 data
local/make_ldc96s.pl 57 /export/corpora5/LDC/LDC96S57 data
local/make_ldc96s.pl 58 /export/corpora5/LDC/LDC96S58 data
# Combine the SRE08 subsets into data/train; the last list line appears in
# two forms, the second of which adds data/ldc96s*.
utils/combine_data.sh data/train data/sre08_train_10sec_female data/sre08_train_10sec_male \
data/sre08_train_3conv_female data/sre08_train_3conv_male data/sre08_train_8conv_female \
data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female
data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female data/ldc96s*
mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc
set -e
# MFCC extraction and VAD decisions (data/sre08_train and data/train forms).
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" data/sre08_train exp/make_mfcc $mfccdir
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" data/train exp/make_mfcc $mfccdir
lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/sre08_train exp/make_vad $vaddir
lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/train exp/make_vad $vaddir
# UBM training: train_diag_ubm.sh into exp/diag_ubm_2048, then
# train_full_ubm.sh into exp/full_ubm_2048.
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/sre08_train 2048 exp/diag_ubm_2048
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/sre08_train exp/diag_ubm_2048 exp/full_ubm_2048
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train 2048 exp/diag_ubm_2048
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train exp/diag_ubm_2048 exp/full_ubm_2048
# i-vector extractor training from exp/full_ubm_2048/final.ubm, followed by
# i-vector extraction with exp/extractor_2048.
lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \
--num-iters 5 exp/full_ubm_2048/final.ubm data/sre08_train \
--num-iters 5 exp/full_ubm_2048/final.ubm data/train \
exp/extractor_2048
lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \
exp/extractor_2048 data/sre08_train exp/ivectors_sre08_train
exp/extractor_2048 data/train exp/ivectors_train