зеркало из https://github.com/mozilla/kaldi.git
sandbox/language_id: Adding additional LDC training data.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3584 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
e7c733ad70
Коммит
90132a6b22
|
@ -0,0 +1,134 @@
|
||||||
|
#! /usr/bin/perl
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use warnings;
|
||||||
|
use local::load_lang;
|
||||||
|
|
||||||
|
# File::Basename is only needed for the usage message; load it here so the
# fully-qualified call below cannot fail with "Undefined subroutine".
use File::Basename ();

# Command line: <dataset> <in-dir> <out-dir>.
#   dataset  two-digit suffix of the LDC96Sxx corpus to convert
#   in_top   top-level directory of that corpus
#   out_top  directory under which the output data directories are created
my ($dataset, $in_top, $out_top) = @ARGV;

# The alternation is grouped so that both anchors apply to every branch;
# ungrouped, /^4[89]|5[4-8]$/ means "starts with 48/49" OR "ends with 54..58"
# and would accept values such as "480" or "x55".
# length() rather than plain truthiness so a directory named "0" is accepted.
die 'Usage: ' . File::Basename::basename($0)
    . " {48|49|54|55|56|57|58} in-dir out-dir\n"
    unless @ARGV == 3
        && $dataset =~ /\A(?:4[89]|5[4-8])\z/
        && length($in_top)
        && length($out_top);
|
||||||
|
|
||||||
|
# Open $path with the given three-argument open() mode, or die.
#
# Parameters:
#   $mode  open() mode string, e.g. '<' for reading or '|-' for a pipe to a
#          command
#   $path  file name (or, for pipe modes, the command line)
# Returns the list ($path, $filehandle), so a caller can keep the name next
# to the handle for later diagnostics.
# Dies with "$path: <OS error>" when the open fails.
#
# The ($$) prototype is kept unchanged so existing call sites parse exactly
# as before.
sub open_or_die ($$) {
    my ($mode, $path) = @_;

    # Fresh lexical handle; open() autovivifies it.
    open(my $file, $mode, $path) or die "$path: $!\n";

    return ($path, $file);
}
|
||||||
|
# Language tables. load_lang() is expected to come from local::load_lang
# (loaded at the top of this script); what the three returned values hold is
# not visible here -- NOTE(review): confirm against local/load_lang.pm.
my $lang_abbreviation_file = "local/language_abbreviations.txt";
my ($long_lang, $abbr_lang, $num_lang) = load_lang($lang_abbreviation_file);

# Per-dataset location, relative to the corpus top directory, of the
# directory that holds callinfo.tbl and spkrinfo.tbl.
my %doc = (
    48 => '/callfriend_fre_1/cf_fre/docs/',
    49 => '/doc/',
    54 => '/doc/',
    55 => '/doc/',
    56 => '/doc/',
    57 => '/docs/',
    58 => '/doc/',
);

# Documentation directory of the selected dataset (ends with a slash).
my $doc = $in_top . $doc{$dataset};
|
||||||
|
# Call metadata: callinfo.tbl maps a call id to the value that follows
# " PIN=", which the rest of the script uses as the speaker id.
my %speaker;
my ($meta_path, $meta_file) = open_or_die('<', $doc . 'callinfo.tbl');

while (my $line = <$meta_file>) {
    # Strip the newline so it cannot end up inside the last field.
    chomp $line;

    # Fields are separated by " PIN=" or by "|"; only the first two matter.
    my ($call, $speaker) = split(/ PIN=|\|/, $line);

    # Blank or malformed lines carry no usable mapping.
    next unless defined $call && defined $speaker;

    $speaker{$call} = $speaker;
}

close $meta_file or warn "$meta_path: $!\n";

# Speaker metadata: spkrinfo.tbl is comma separated; the first field is used
# as the key and the second field is the gender letter, stored in lower case.
($meta_path, $meta_file) = open_or_die('<', $doc . 'spkrinfo.tbl');
my %gender;

while (my $line = <$meta_file>) {
    chomp $line;

    my ($call, $gender) = split(/,/, $line);

    # Skip lines without a gender field instead of running tr/// on undef.
    next unless defined $call && defined $gender;

    $gender =~ tr/FM/fm/;
    $gender{$call} = $gender;
}

close $meta_file or warn "$meta_path: $!\n";
|
||||||
|
# From here on every print() joins its arguments with a single space and
# terminates the record with a newline.
$, = ' ';
$\ = "\n";

# Common prefix of the output directories; the subset name is appended later.
$out_top .= '/ldc96s' . $dataset . '_';

# Per-dataset location, relative to the corpus top directory, of the
# directory that contains the audio subset directories.
my %data = (
    48 => '/callfriend_fre_1/cf_fre/data/',
    49 => '/data/',
    54 => '/data/',
    55 => '/data/',
    56 => '/data/',
    57 => '/data/',
    58 => '/cf_spa_n/',
);

# Language name written to utt2lang for each dataset.
my %lang_name = (
    48 => 'french',
    49 => 'arabic.standard',
    54 => 'korean',
    55 => 'chinese.mandarin.mainland',
    56 => 'chinese.mandarin.taiwan',
    57 => 'spanish.caribbean',
    58 => 'spanish.noncaribbean',
);

my $lang_name = $lang_name{$dataset};

# NOTE(review): this lookup reads the package hashes %main::abbr_lang and
# %main::num_lang, not the lexical scalars $abbr_lang / $num_lang assigned
# from load_lang()'s return value. Confirm that load_lang() fills those
# package hashes; if it does not, $lang_code is undef here.
my $lang_code = $::num_lang{$::abbr_lang{$lang_name{$dataset}}};

# From now on $in_top points at the dataset's audio directory.
$in_top .= $data{$dataset};
|
||||||
|
|
||||||
|
# Open a write pipe to sort(1) whose sorted output is redirected to $path.
#
# Parameters:
#   $path   file that receives the sorted output
#   $flags  optional extra sort(1) options (e.g. '-u'), passed through as-is
# Returns whatever open_or_die() returns: the command line that was run and
# the pipe filehandle to print unsorted records to.
# Dies (inside open_or_die) when the pipe cannot be opened.
sub open4sort ($;$) {
    my ($path, $flags) = @_;

    # The command goes through the shell because of the redirection, so the
    # path is single-quoted (embedded quotes escaped as '\'') to survive
    # spaces and shell metacharacters.
    (my $quoted = $path) =~ s/'/'\\''/g;

    my $command = 'sort ' . ($flags ? $flags . ' ' : '') . ">'" . $quoted . "'";

    return open_or_die('|-', $command);
}
|
||||||
|
|
||||||
|
use File::Path;
|
||||||
|
use File::Find;
|
||||||
|
|
||||||
|
# Describe why the most recent system() or pipe close() failed, based on $?:
# the OS error when the child could not be started, otherwise the signal or
# exit status it ended with.
sub _child_failure {
    return "cannot execute: $!" if $? == -1;
    return 'died with signal ' . ($? & 127) if $? & 127;
    return 'exit status ' . ($? >> 8);
}

# Build one output data directory per corpus subset. Each directory gets
# wav.scp, utt2lang, utt2spk and spk2gender (written through sort pipes),
# then spk2utt / lang2utt are derived and the directory is fixed and
# validated with the utils/ scripts.
foreach my $subset ('devtest', 'evltest', 'train') {

    my $out_sub = $out_top . $subset;
    File::Path::make_path($out_sub);
    $out_sub .= '/';

    my ($wav_path, $wav_file) = open4sort($out_sub . 'wav.scp');
    my ($utt2lang_path, $utt2lang_file) =
        open4sort($out_sub . 'utt2lang');
    my ($utt2spk_path, $utt2spk_file) =
        open4sort($out_sub . 'utt2spk');
    # -u: a speaker who appears in several calls yields identical lines.
    my ($spk2gender_path, $spk2gender_file) =
        open4sort($out_sub . 'spk2gender', '-u');

    File::Find::find(sub {

        # Only *.sph files are of interest; the base name is the call id.
        my ($call) = /^(.*)\.sph$/ or return;
        my $speaker = $speaker{$call};

        if (!$speaker) {
            warn "$call: No call metadata.\n";
            return;
        }

        # The same speaker id must be used in utt2spk and spk2gender, and it
        # is also the prefix of the utterance id.
        my $spk = $lang_code . '_' . $speaker;
        my $utt = $spk . '_ldc96s' . $dataset . '_' . $call;

        # The trailing '|' marks the wav.scp entry as a command whose output
        # is read, rather than as a file name.
        print $wav_file
            $utt, 'sph2pipe -f wav -p -c 1', $File::Find::name, '|';
        print $utt2lang_file $utt, $lang_name;
        print $utt2spk_file $utt, $spk;

        # Never emit a spk2gender line without a gender column.
        my $gender = $gender{$call};
        if (defined $gender) {
            print $spk2gender_file $spk, $gender;
        }
        else {
            warn "$call: No gender metadata.\n";
        }

    }, $in_top . $subset);

    # Closing a pipe waits for sort; when sort merely exits non-zero $! is 0,
    # so fall back to the child status for the message.
    foreach my $sink ([$wav_path,        $wav_file],
                      [$utt2lang_path,   $utt2lang_file],
                      [$utt2spk_path,    $utt2spk_file],
                      [$spk2gender_path, $spk2gender_file]) {
        my ($sort_cmd, $handle) = @$sink;
        close $handle
            or warn "$sort_cmd: " . ($! || _child_failure()) . "\n";
    }

    my $spk2utt_cmd =
        "utils/utt2spk_to_spk2utt.pl ${out_sub}utt2spk > ${out_sub}spk2utt";
    print($spk2utt_cmd);

    # system() reports failure through $?, not $!.
    system($spk2utt_cmd) == 0
        or die "${out_sub}utt2spk: utt2spk_to_spk2utt.pl: "
            . _child_failure() . "\n";

    # The same inversion script turns utt2lang into lang2utt.
    system("utils/utt2spk_to_spk2utt.pl ${out_sub}utt2lang > ${out_sub}lang2utt") == 0
        or die "${out_sub}utt2lang: utt2spk_to_spk2utt.pl: "
            . _child_failure() . "\n";

    # List-form system() bypasses the shell. Failures of these two steps were
    # previously ignored; they stay non-fatal but are now reported.
    system('utils/fix_data_dir.sh', $out_sub, '1') == 0
        or warn "utils/fix_data_dir.sh $out_sub: " . _child_failure() . "\n";
    system('utils/validate_data_dir.sh', '--no-text', '--no-feats', $out_sub) == 0
        or warn "utils/validate_data_dir.sh $out_sub: "
            . _child_failure() . "\n";

}
|
|
@ -11,27 +11,38 @@ set -e
|
||||||
mfccdir=`pwd`/mfcc
|
mfccdir=`pwd`/mfcc
|
||||||
vaddir=`pwd`/mfcc
|
vaddir=`pwd`/mfcc
|
||||||
|
|
||||||
|
#local/make_sre_2008_train.pl local/language_abbreviations.txt /export/corpora5/LDC/LDC2011S05 data
|
||||||
|
#utils/combine_data.sh data/sre08_train data/sre08_train_10sec_female data/sre08_train_10sec_male \
|
||||||
|
# data/sre08_train_3conv_female data/sre08_train_3conv_male data/sre08_train_8conv_female \
|
||||||
|
# data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female /export/a14/kumar/kaldi/language_id/egs/lre/v1/data/ldc96s*
|
||||||
|
|
||||||
local/make_sre_2008_train.pl local/language_abbreviations.txt /export/corpora5/LDC/LDC2011S05 data
|
local/make_sre_2008_train.pl local/language_abbreviations.txt /export/corpora5/LDC/LDC2011S05 data
|
||||||
utils/combine_data.sh data/sre08_train data/sre08_train_10sec_female data/sre08_train_10sec_male \
|
local/make_ldc96s.pl 49 /export/corpora5/LDC/LDC96S49 data
|
||||||
|
local/make_ldc96s.pl 54 /export/corpora5/LDC/LDC96S54 data
|
||||||
|
local/make_ldc96s.pl 55 /export/corpora5/LDC/LDC96S55 data
|
||||||
|
local/make_ldc96s.pl 56 /export/corpora5/LDC/LDC96S56 data
|
||||||
|
local/make_ldc96s.pl 57 /export/corpora5/LDC/LDC96S57 data
|
||||||
|
local/make_ldc96s.pl 58 /export/corpora5/LDC/LDC96S58 data
|
||||||
|
utils/combine_data.sh data/train data/sre08_train_10sec_female data/sre08_train_10sec_male \
|
||||||
data/sre08_train_3conv_female data/sre08_train_3conv_male data/sre08_train_8conv_female \
|
data/sre08_train_3conv_female data/sre08_train_3conv_male data/sre08_train_8conv_female \
|
||||||
data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female
|
data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female data/ldc96s*
|
||||||
|
|
||||||
mfccdir=`pwd`/mfcc
|
mfccdir=`pwd`/mfcc
|
||||||
vaddir=`pwd`/mfcc
|
vaddir=`pwd`/mfcc
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" data/sre08_train exp/make_mfcc $mfccdir
|
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" data/train exp/make_mfcc $mfccdir
|
||||||
|
|
||||||
lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/sre08_train exp/make_vad $vaddir
|
lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/train exp/make_vad $vaddir
|
||||||
|
|
||||||
|
|
||||||
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/sre08_train 2048 exp/diag_ubm_2048
|
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train 2048 exp/diag_ubm_2048
|
||||||
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/sre08_train exp/diag_ubm_2048 exp/full_ubm_2048
|
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train exp/diag_ubm_2048 exp/full_ubm_2048
|
||||||
|
|
||||||
|
|
||||||
lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \
|
lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \
|
||||||
--num-iters 5 exp/full_ubm_2048/final.ubm data/sre08_train \
|
--num-iters 5 exp/full_ubm_2048/final.ubm data/train \
|
||||||
exp/extractor_2048
|
exp/extractor_2048
|
||||||
|
|
||||||
lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \
|
lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \
|
||||||
exp/extractor_2048 data/sre08_train exp/ivectors_sre08_train
|
exp/extractor_2048 data/train exp/ivectors_train
|
||||||
|
|
Загрузка…
Ссылка в новой задаче