sandbox/language_id: Adding vad-based utterance splitting scripts in lid setup

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3826 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2014-04-01 04:12:07 +00:00 · 2014-04-01 04:12:07 +00:00 · 7954900219
--- a/egs/lre/v1/local/vad_split_utts.sh
+++ b/egs/lre/v1/local/vad_split_utts.sh
@ -0,0 +1,57 @@
+#!/bin/bash
+
+max_voiced=3000
+stage=0
+cleanup=true
+
+. utils/parse_options.sh
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [options] <in-data-dir> <split-mfcc-out-dir> <out-data-dir>"
+  echo "e.g.: $0 --max-voiced 3000 data/train mfcc data/train_split"
+  echo "This script splits up long utterances into smaller pieces."
+  echo "It assumes the wav.scp contains has a certain form, with .sph"
+  echo "files in it (so the script is not completely general)."
+  exit 1;
+fi
+
+in_dir=$1
+mfccdir=$2
+dir=$3
+
+for f in $in_dir/{utt2spk,spk2utt,wav.scp,utt2lang,feats.scp,vad.scp}; do
+  if [ ! -f $f ]; then
+    echo "$0: expected input file $f to exist";
+    exit 1;
+  fi
+done
+
+if [ $stage -le 0 ]; then
+  utils/validate_data_dir.sh --no-text $in_dir || exit 1;
+  mkdir -p $dir/temp || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+
+create-split-from-vad --max-voiced=$max_voiced scp:$in_dir/vad.scp $dir/frame_indexed_segments;
+
+extract-rows $dir/frame_indexed_segments scp:$in_dir/feats.scp ark,scp:$mfccdir/raw_mfcc_split.ark,$dir/feats.scp;
+
+copy-vector-segments $dir/frame_indexed_segments scp:$in_dir/vad.scp ark,scp:$mfccdir/vad_split.ark,$dir/temp/vad.scp;
+sort $dir/temp/vad.scp > $dir/vad.scp;
+fi
+
+if [ $stage -le 2 ]; then
+local/vad_split_utts_fix_data.pl $in_dir $dir;
+fi
+
+utils/filter_scp.pl -f 0 \
+<(echo "`awk < "$dir/segments" '{ print $2 }'`") $in_dir/wav.scp \
+ > $dir/wav.scp
+
+utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+utils/validate_data_dir.sh --no-text --no-feats $dir || exit 1;
+
+$cleanup && rm -r $dir/temp
+
+exit 0;
--- a/egs/lre/v1/local/vad_split_utts_fix_data.pl
+++ b/egs/lre/v1/local/vad_split_utts_fix_data.pl
@ -0,0 +1,50 @@
+#! /usr/bin/perl
+#
+# Copyright 2014  David Snyder
+# Apache 2.0.
+
+if (@ARGV != 2) {
+  print STDERR "Usage: $0 <in-data-dir> <out-data-dir>\n";
+  print STDERR "e.g. $0 data/train_unsplit data/train\n";
+  exit(1);
+}
+
+($in_dir, $out_dir) = @ARGV;
+
+%utt2lang = ();
+%utt2spk = ();
+
+open(UTT2LANG, "<$in_dir/utt2lang") or die "Cannot open utt2lang";
+while($line = <UTT2LANG>) {
+  ($utt, $lang) = split(" ", $line);
+  $utt2lang{$utt} = $lang;
+}
+close(UTT2LANG) or die;
+
+open(UTT2SPK, "<$in_dir/utt2spk") or die "Cannot open utt2spk";
+while($line = <UTT2SPK>) {
+  ($utt, $spk) = split(" ", $line);
+  $utt2spk{$utt} = $spk;
+}
+close(UTT2SPK) or die;
+
+open(FEATSEG, "<$out_dir/frame_indexed_segments") 
+  or die "Unable to open feats_segment";
+open(UTT2LANG, ">$out_dir/utt2lang") or die "Cannot open utt2lang";
+open(UTT2SPK, ">$out_dir/utt2spk") or die "Cannot open utt2spk";
+open(SEGMENT, ">$out_dir/segments") or die "Cannot open segments";
+
+while($seg = <FEATSEG>) {
+  ($split_utt, $utt, $start, $end) = split(" ", $seg);
+  print UTT2LANG "$split_utt $utt2lang{$utt}\n";
+  print UTT2SPK "$split_utt $utt\n";
+  $start_t = $start * 0.01;
+  $end_t = $end * 0.01;
+  print SEGMENT "$split_utt $utt $start_t $end_t\n";
+}
+
+close(FEATSEG) || die;
+close(UTT2LANG) || die;
+close(UTT2SPK) || die;
+close(SEGMENT) || die;
+system("utils/fix_data_dir.sh $out_dir");
--- a/egs/lre/v1/run.sh
+++ b/egs/lre/v1/run.sh
@ -2,8 +2,7 @@
 # Copyright  2014   David Snyder
 # Apache 2.0.
 #
-# An incomplete run.sh for this example. Currently this only trains up up a gender 
-# independent UBM and ivector with the SRE08 training data.
+# An incomplete run.sh for this example.

 . cmd.sh
 . path.sh
@ -49,9 +48,11 @@ rm foo

 local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train

-##
-## HERE
-##
+# This commented script is an alternative to the above utterance
+# splitting method. Here we split the utterance based on the number of 
+# frames which are voiced, rather than the total number of frames.
+# max_voiced=3000 
+# local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train

 steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 100 --cmd "$train_cmd" \
  data/train exp/make_mfcc $mfccdir