sandbox/language_id: Adding local/make_callfriend.pl for handling the ldc96* datasets and a table for mapping callfriend datasets to languages. With lre07 for testing, run.sh now uses all available training data for training. Bug fix in make_lre07.pl.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3740 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2014-03-08 08:49:33 +00:00 · 2014-03-08 08:49:33 +00:00 · a1561bb1c4
--- a/egs/lre/v1/local/callfriend_lang.txt
+++ b/egs/lre/v1/local/callfriend_lang.txt
@ -0,0 +1,7 @@
+48 french
+49 arabic.standard
+54 korean
+55 chinese.mandarin.mainland
+56 chinese.mandarin.taiwan
+57 spanish.caribbean
+58 spanish.noncaribbean
--- a/egs/lre/v1/local/make_ldc96s.pl
+++ b/egs/lre/v1/local/make_ldc96s.pl
@ -1,138 +0,0 @@
-#! /usr/bin/perl
-
-use strict;
-use warnings;
-use local::load_lang;
-
-my ($dataset, $in_top, $out_top) = @ARGV;
-die 'Usage: ' . File::Basename::basename($0)
-  . " {48|49|54|55|56|57|58} in-dir out-dir\n"
-  unless @ARGV == 3 && $dataset =~ /^4[89]|5[4-8]$/o
-  && $in_top && $out_top;
-
-sub open_or_die ($$) {
-  my ($mode, $path, $file) = @_;
-  open($file, $mode, $path) or die "$path: $!\n";
-  return ($path, $file);
-}
-my $lang_abbreviation_file = "local/language_abbreviations.txt";
-my ($long_lang, $abbr_lang, $num_lang) = load_lang($lang_abbreviation_file);
-my %doc = (
-           '48' => '/callfriend_fre_1/cf_fre/docs/',
-           '49' => '/doc/',
-           '54' => '/doc/',
-           '55' => '/doc/',
-           '56' => '/doc/',
-           '57' => '/docs/',
-           '58' => '/doc/'
-          );
-
-my $doc = $in_top . $doc{$dataset};
-my ($meta_path, $meta_file, %speaker) =
-  open_or_die('<', $doc . 'callinfo.tbl');
-
-while (<$meta_file>) {
-  my ($call, $speaker) = split(' PIN=|\|');
-  $speaker{$call} = $speaker;
-}
-
-close $meta_file or warn "$meta_path: $!\n";
-($meta_path, $meta_file) = open_or_die('<', $doc . 'spkrinfo.tbl');
-my %gender;
-
-while (<$meta_file>) {
-  my ($call, $gender) = split(',');
-  $gender =~ tr/FM/fm/;
-  $gender{$call} = $gender;
-}
-
-close $meta_file or warn "$meta_path: $!\n";
-($, , $\) = (' ', "\n");
-$out_top .= '/ldc96s' . $dataset . '_';
-
-my %data = (
-            '48' => '/callfriend_fre_1/cf_fre/data/',
-            '49' => '/data/',
-            '54' => '/data/',
-            '55' => '/data/',
-            '56' => '/data/',
-            '57' => '/data/',
-            '58' => '/cf_spa_n/'
-           );
-my %lang_name = (
-                 '48' => 'french',
-                 '49' => 'arabic.standard',
-                 '54' => 'korean',
-                 '55' => 'chinese.mandarin.mainland',
-                 '56' => 'chinese.mandarin.taiwan',
-                 '57' => 'spanish.caribbean',
-                 '58' => 'spanish.noncaribbean'
-                );
-my $lang_code = $::num_lang{$::abbr_lang{$lang_name{$dataset}}};
-$in_top .= $data{$dataset};
-my $lang_name = $lang_name{$dataset};
-
-sub open4sort ($;$) {
-  my ($path, $flags) = @_;
-  open_or_die('|-',
-              ($flags ? 'sort ' . $flags . ' >' : 'sort >')
-              . $path);
-}
-
-use File::Path;
-use File::Find;
-
-foreach ('devtest', 'evltest', 'train') {
-
-  my $out_sub = $out_top . $_;
-  File::Path::make_path($out_sub);
-  $out_sub .= '/';
-
-  my ($wav_path, $wav_file) = open4sort($out_sub . 'wav.scp');
-  my ($utt2lang_path, $utt2lang_file) =
-	open4sort($out_sub . 'utt2lang');
-  my ($utt2spk_path, $utt2spk_file) =
-	open4sort($out_sub . 'utt2spk');
-  my ($spk2gender_path, $spk2gender_file) =
-	open4sort($out_sub . 'spk2gender', '-u');
-
-  File::Find::find(sub {
-
-                     my ($call   ) = /^(.*)\.sph$/o or return;
-                     my $orig_speaker  = $speaker{$call};
-                     my $gender = $gender{$call};
-                     if (!defined $orig_speaker) {
-                       warn "No speaker defined for call $call.\n";
-                       return;
-                     }
-                     if (!defined $gender || !($gender eq "f" || $gender eq "m")) {
-                       warn "No gender defined or bad gender '$gender' for call $call.\n";
-                       return;
-                     }
-                     my $speaker = $lang_code . '_' . $orig_speaker;
-                     my $utt = $speaker . '_ldc96s' . $dataset . '_' . $call;
-                     print $wav_file
-                       $utt, 'sph2pipe -f wav -p -c 1', $File::Find::name, '|';
-                     print $utt2lang_file   $utt    , $lang_name;
-                     print $utt2spk_file    $utt    , $speaker;
-                     print $spk2gender_file $speaker, $gender;
-
-                   }, $in_top . $_);
-
-  close $wav_file        or warn "$wav_path: $!\n";
-  close $utt2lang_file   or warn "$utt2lang_path: $!\n";
-  close $utt2spk_file    or warn "$utt2spk_path: $!\n";
-  close $spk2gender_file or warn "$spk2gender_path: $!\n";
-
-  print("utils/utt2spk_to_spk2utt.pl ${out_sub}utt2spk > ${out_sub}spk2utt");
-  if (system("utils/utt2spk_to_spk2utt.pl ${out_sub}utt2spk > ${out_sub}spk2utt") != 0) {
-    die "${out_sub}utt2spk: utt2spk_to_spk2utt.pl: $!\n";
-  }
-  if (system("utils/utt2spk_to_spk2utt.pl ${out_sub}utt2lang > ${out_sub}lang2utt") != 0) {
-    die "${out_sub}utt2lang: utt2spk_to_spk2utt.pl: $!\n";
-  }
-  system("utils/fix_data_dir.sh $out_sub");
-  if (system("utils/validate_data_dir.sh --no-text --no-feats $out_sub") != 0) {
-    die "Failed validating output directory";
-  }
-}
--- a/egs/lre/v1/local/make_lre07.pl
+++ b/egs/lre/v1/local/make_lre07.pl
@ -62,4 +62,4 @@ close(UTT2LANG) || die;
 close(WAVLIST) || die;
 system("rm -r $dir/tmp");

-(system("utils/validate_data_dir.sh --no-text --no-feats data/lre07") == 0) || die "Error validating data dir.";
+(system("utils/validate_data_dir.sh --no-text --no-feats $dir") == 0) || die "Error validating data dir.";
--- a/egs/lre/v1/run.sh
+++ b/egs/lre/v1/run.sh
@ -16,44 +16,59 @@ vaddir=`pwd`/mfcc
 #    data/sre08_train_3conv_female data/sre08_train_3conv_male data/sre08_train_8conv_female \
 #    data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female /export/a14/kumar/kaldi/language_id/egs/lre/v1/data/ldc96s*

-local/make_sre_2008_train.pl local/language_abbreviations.txt /export/corpora5/LDC/LDC2011S05 data
-local/make_ldc96s.pl 49 /export/corpora5/LDC/LDC96S49 data
-local/make_ldc96s.pl 54 /export/corpora5/LDC/LDC96S54 data
-local/make_ldc96s.pl 55 /export/corpora5/LDC/LDC96S55 data
-local/make_ldc96s.pl 56 /export/corpora5/LDC/LDC96S56 data
-local/make_ldc96s.pl 57 /export/corpora5/LDC/LDC96S57 data
-local/make_ldc96s.pl 58 /export/corpora5/LDC/LDC96S58 data
+lang=local/callfriend_lang.txt
+lang_abbrev=local/language_abbreviations.txt
+local/make_callfriend.pl $lang_abbrev \
+  /export/corpora5/LDC/LDC96S49 49 $lang data
+local/make_callfriend.pl $lang_abbrev \
+  /export/corpora5/LDC/LDC96S54 54 $lang data
+local/make_callfriend.pl $lang_abbrev \
+  /export/corpora5/LDC/LDC96S55 55 $lang data
+local/make_callfriend.pl $lang_abbrev \
+  /export/corpora5/LDC/LDC96S56 56 $lang data
+local/make_callfriend.pl $lang_abbrev \
+  /export/corpora5/LDC/LDC96S57 57 $lang data
+local/make_callfriend.pl $lang_abbrev \
+  /export/corpora5/LDC/LDC96S58 58 $lang data
+local/make_sre_2008_train.pl local/language_abbreviations.txt \
+  /export/corpora5/LDC/LDC2011S05 data

-# we're not doing anything with the lre07 data currently.
-local/make_lre07.pl /export/corpora5/LDC/LDC2009S04 data/lre07
+local/make_lre07.pl /export/corpora5/LDC/LDC2009S04 data/test

-utils/combine_data.sh data/all data/sre08_train_10sec_female data/sre08_train_10sec_male \
-    data/sre08_train_3conv_female data/sre08_train_3conv_male data/sre08_train_8conv_female \
-    data/sre08_train_8conv_male data/sre08_train_short2_male data/sre08_train_short2_female data/ldc96s*
+utils/combine_data.sh data/train data/sre08_train_10sec_female \
+    data/sre08_train_10sec_male data/sre08_train_3conv_female \
+    data/sre08_train_3conv_male data/sre08_train_8conv_female \
+    data/sre08_train_8conv_male data/sre08_train_short2_male \
+    data/sre08_train_short2_female data/ldc96s*

-mfccdir=`pwd`/mfcc
-vaddir=`pwd`/mfcc
+steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
+  data/train exp/make_mfcc $mfccdir
+steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
+  data/test exp/make_mfcc $mfccdir

-set -e
-steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" data/all exp/make_mfcc $mfccdir
-
-lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/all exp/make_vad $vaddir
+lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/train \
+  exp/make_vad $vaddir
+lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/test \
+  exp/make_vad $vaddir

 # Use 4k of the 14k utterances for testing, but make sure the speakers do not
 # overlap with the rest of the data, which will be used for training.
-utils/subset_data_dir.sh --speakers data/all 4000 data/test
-utils/filter_scp.pl --exclude data/test/spk2utt < data/all/spk2utt  | awk '{print $1}' > foo
-utils/subset_data_dir.sh --spk-list foo data/all data/train
+#utils/subset_data_dir.sh --speakers data/all 4000 data/test
+#utils/filter_scp.pl --exclude data/test/spk2utt < data/all/spk2utt  | awk '{print $1}' > foo
+#utils/subset_data_dir.sh --spk-list foo data/all data/train


 utils/subset_data_dir.sh data/train 3000 data/train_3k
 utils/subset_data_dir.sh data/train 6000 data/train_6k


-lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_3k 2048 exp/diag_ubm_2048
-lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_6k exp/diag_ubm_2048 exp/full_ubm_2048_6k
+lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_3k 2048 \
+  exp/diag_ubm_2048
+lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_6k \
+  exp/diag_ubm_2048 exp/full_ubm_2048_6k

-lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train exp/full_ubm_2048_6k exp/full_ubm_2048
+lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \
+  exp/full_ubm_2048_6k exp/full_ubm_2048


 lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \
--- a/egs/lre/v1/run_logistic_regression.sh
+++ b/egs/lre/v1/run_logistic_regression.sh
@ -35,7 +35,7 @@ classes="ark:utils/remove_dialect.pl data/train/utt2lang \
 # the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
 utils/balance_priors_to_test.pl \
    <(utils/remove_dialect.pl data/train/utt2lang) \
-    <(utils/remove_dialect.pl data/lre07/utt2lang) \
+    <(utils/remove_dialect.pl data/test/utt2lang) \
    exp/ivectors_train/languages.txt \
    exp/ivectors_train/priors.vec

@ -76,19 +76,19 @@ compute-wer --text ark:<(utils/remove_dialect.pl data/train/utt2lang) \


 logistic-regression-eval $model_rebalanced \
-  scp:exp/lre07/ivector.scp ark,t:- | \
+  scp:exp/test/ivector.scp ark,t:- | \
  awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max) 
                          { max=$f; argmax=f; }}  
                          print $1, (argmax - 3); }' | \
-  utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/lre07/output
+  utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/test/output


 # someone needs to extend this to run on the dev data.

-compute-wer --text ark:<(utils/remove_dialect.pl data/lre07/utt2lang)\
-  ark:exp/lre07/output
-# compute-wer --text ark:/dev/fd/63 ark:exp/lre07/output 
-# %WER 52.58 [ 3958 / 7527, 0 ins, 0 del, 3958 sub ]
-# %SER 52.58 [ 3958 / 7527 ]
+compute-wer --text ark:<(utils/remove_dialect.pl data/test/utt2lang)\
+  ark:exp/test/output
+# compute-wer --text ark:/dev/fd/63 ark:exp/test/output 
+# %WER 58.83 [ 4428 / 7527, 0 ins, 0 del, 4428 sub ]
+# %SER 58.83 [ 4428 / 7527 ]
 # Scored 7527 sentences, 0 not present in hyp.