sandbox/language_id: Adding vad-based utterance splitting scripts in lid setup

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3826 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
David Snyder 2014-04-01 04:12:07 +00:00
Родитель f39b8993ab
Коммит 7954900219
3 изменённых файлов: 113 добавлений и 5 удалений

Просмотреть файл

@ -0,0 +1,57 @@
#!/bin/bash
# Splits the utterances of a data directory into shorter pieces using the
# voice-activity decisions in <in-data-dir>/vad.scp, and writes a new data
# directory describing the pieces.
#
# Arguments:
#   $1  <in-data-dir>         must contain utt2spk, spk2utt, wav.scp,
#                             utt2lang, feats.scp and vad.scp
#   $2  <split-mfcc-out-dir>  receives raw_mfcc_split.ark and vad_split.ark
#   $3  <out-data-dir>        receives feats.scp, vad.scp, segments, wav.scp,
#                             utt2spk, spk2utt (and utt2lang via the helper)
#
# Variables below are defaults; presumably utils/parse_options.sh overrides
# them from --max-voiced / --stage / --cleanup on the command line.
max_voiced=3000   # forwarded to create-split-from-vad as --max-voiced
stage=0           # stages numbered below this value are skipped
cleanup=true      # when "true", remove <out-data-dir>/temp at the end

. utils/parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <in-data-dir> <split-mfcc-out-dir> <out-data-dir>"
  echo "e.g.: $0 --max-voiced 3000 data/train mfcc data/train_split"
  echo "This script splits up long utterances into smaller pieces."
  echo "It assumes the wav.scp contains has a certain form, with .sph"
  echo "files in it (so the script is not completely general)."
  exit 1;
fi

in_dir=$1
mfccdir=$2
dir=$3

# All of these inputs are read further down (directly or by the helper).
for f in "$in_dir"/{utt2spk,spk2utt,wav.scp,utt2lang,feats.scp,vad.scp}; do
  if [ ! -f "$f" ]; then
    echo "$0: expected input file $f to exist";
    exit 1;
  fi
done

# Stage 0: sanity-check the input data directory.
if [ "$stage" -le 0 ]; then
  utils/validate_data_dir.sh --no-text "$in_dir" || exit 1;
fi

# The scratch directory is needed by stage 1 and removed by the cleanup step,
# so create it regardless of the starting stage (previously it was only
# created in stage 0, which broke runs started with --stage 1).
mkdir -p "$dir/temp" || exit 1;

# Stage 1: compute the split points from the VAD, then cut the features and
# the VAD vectors accordingly. Each tool is checked so that a failure does
# not leave a half-written data directory behind unnoticed.
if [ "$stage" -le 1 ]; then
  mkdir -p "$mfccdir" || exit 1;
  create-split-from-vad --max-voiced="$max_voiced" "scp:$in_dir/vad.scp" \
    "$dir/frame_indexed_segments" || exit 1;
  extract-rows "$dir/frame_indexed_segments" "scp:$in_dir/feats.scp" \
    "ark,scp:$mfccdir/raw_mfcc_split.ark,$dir/feats.scp" || exit 1;
  copy-vector-segments "$dir/frame_indexed_segments" "scp:$in_dir/vad.scp" \
    "ark,scp:$mfccdir/vad_split.ark,$dir/temp/vad.scp" || exit 1;
  sort "$dir/temp/vad.scp" > "$dir/vad.scp" || exit 1;
fi

# Stage 2: the helper reads frame_indexed_segments from the output directory
# and writes utt2lang, utt2spk and segments there.
if [ "$stage" -le 2 ]; then
  local/vad_split_utts_fix_data.pl "$in_dir" "$dir" || exit 1;
fi

# Keep only the wav.scp entries whose key appears in the second column of
# the new segments file.
# NOTE(review): "-f 0" is kept exactly as in the original; confirm it selects
# the intended field of wav.scp in utils/filter_scp.pl.
utils/filter_scp.pl -f 0 \
  <(awk '{ print $2 }' "$dir/segments") "$in_dir/wav.scp" \
  > "$dir/wav.scp" || exit 1;

utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt" || exit 1;

utils/validate_data_dir.sh --no-text --no-feats "$dir" || exit 1;

if [ "$cleanup" = true ]; then
  rm -r "$dir/temp"
fi

exit 0;

Просмотреть файл

@ -0,0 +1,50 @@
#! /usr/bin/perl
#
# Copyright 2014 David Snyder
# Apache 2.0.
#
# Writes utt2lang, utt2spk and segments into <out-data-dir> for the split
# utterances listed in <out-data-dir>/frame_indexed_segments, using the
# language labels from <in-data-dir>/utt2lang, and then runs
# utils/fix_data_dir.sh on <out-data-dir>.
#
# Each line of frame_indexed_segments is read as four whitespace-separated
# fields: <split-utt-id> <original-utt-id> <start> <end>.

use strict;
use warnings;

if (@ARGV != 2) {
  print STDERR "Usage: $0 <in-data-dir> <out-data-dir>\n";
  print STDERR "e.g. $0 data/train_unsplit data/train\n";
  exit(1);
}
my ($in_dir, $out_dir) = @ARGV;

# Map each original utterance id to its language label.
my %utt2lang = ();
open(my $utt2lang_in, "<", "$in_dir/utt2lang") or die "Cannot open utt2lang";
while (my $line = <$utt2lang_in>) {
  my ($utt, $lang) = split(" ", $line);
  next unless defined $lang;  # ignore blank or incomplete lines
  $utt2lang{$utt} = $lang;
}
close($utt2lang_in) or die;

# The input utt2spk must be readable, as before, but its contents are not
# used: the output utt2spk maps every split piece to its original utterance
# id (see the loop below).
open(my $utt2spk_in, "<", "$in_dir/utt2spk") or die "Cannot open utt2spk";
close($utt2spk_in) or die;

open(my $featseg, "<", "$out_dir/frame_indexed_segments")
  or die "Unable to open feats_segment";
open(my $utt2lang_out, ">", "$out_dir/utt2lang") or die "Cannot open utt2lang";
open(my $utt2spk_out, ">", "$out_dir/utt2spk") or die "Cannot open utt2spk";
open(my $segment_out, ">", "$out_dir/segments") or die "Cannot open segments";

while (my $seg = <$featseg>) {
  next unless $seg =~ /\S/;  # skip blank lines instead of emitting empty records
  my ($split_utt, $utt, $start, $end) = split(" ", $seg);
  defined($end)
    or die "Bad line in $out_dir/frame_indexed_segments: $seg";
  # Fail loudly rather than writing a utt2lang line with no language.
  defined($utt2lang{$utt})
    or die "Utterance $utt has no entry in $in_dir/utt2lang";

  print $utt2lang_out "$split_utt $utt2lang{$utt}\n";
  print $utt2spk_out "$split_utt $utt\n";
  # Scale the start/end values by 0.01 before writing them to segments
  # (presumably frame indices at a 10 ms frame shift -> seconds; confirm
  # against the feature configuration).
  my $start_t = $start * 0.01;
  my $end_t = $end * 0.01;
  print $segment_out "$split_utt $utt $start_t $end_t\n";
}
close($featseg) || die;
close($utt2lang_out) || die;
close($utt2spk_out) || die;
close($segment_out) || die;

# List form avoids passing $out_dir through a shell; a failure of the fix-up
# script is now reported instead of being ignored.
system("utils/fix_data_dir.sh", $out_dir) == 0
  or die "utils/fix_data_dir.sh failed for $out_dir";

Просмотреть файл

@ -2,8 +2,7 @@
# Copyright 2014 David Snyder
# Apache 2.0.
#
# An incomplete run.sh for this example. Currently this only trains up a gender
# independent UBM and ivector with the SRE08 training data.
# An incomplete run.sh for this example.
. cmd.sh
. path.sh
@ -49,9 +48,11 @@ rm foo
local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
##
## HERE
##
# This commented script is an alternative to the above utterance
# splitting method. Here we split the utterance based on the number of
# frames which are voiced, rather than the total number of frames.
# max_voiced=3000
# local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train
steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 100 --cmd "$train_cmd" \
data/train exp/make_mfcc $mfccdir