sandbox/lid: introducing the splitting of long utterances into smaller pieces; various utility script updates.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3757 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-03-10 04:55:22 +00:00
Родитель 18e1be0067
Коммит a1a368dc83
9 изменённых файлов: 132 добавлений и 27 удалений

Просмотреть файл

@ -0,0 +1,103 @@
#!/bin/bash

# Splits the long utterances of a data directory into shorter pieces.
#
# Usage: split_long_utts.sh [options] <in-data-dir> <out-data-dir>
#
# Reads   <in-data-dir>/{utt2spk,spk2utt,wav.scp,utt2lang}
# Writes  <out-data-dir>/{segments,utt2spk,utt2lang,wav.scp,spk2utt}
#
# Each input utterance-id becomes a recording-id in the output directory, and
# the output utterances are named <old-utt-id>-<piece-number>.  Utterance
# lengths are read from the headers of the .sph files named in wav.scp, and
# the audio is required to be sampled at 8000 Hz.
#
# The three variables below are defaults.  utils/parse_options.sh is sourced
# right after them; the usage message shows --max-utt-len being passed on the
# command line.

max_utt_len=60 # 60 seconds.
stage=0        # stages numbered <= $stage are run; see the "if" tests below.
cleanup=true   # if true, remove <out-data-dir>/temp at the end.

. utils/parse_options.sh

if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <in-data-dir> <out-data-dir>"
  echo "e.g.: $0 --max-utt-len 120 data/train data/train_split"
  echo "This script splits up long utterances into smaller pieces."
  echo "It assumes the wav.scp contains has a certain form, with .sph"
  echo "files in it (so the script is not completely general)."
  exit 1;
fi

in_dir=$1
dir=$2

for f in "$in_dir"/{utt2spk,spk2utt,wav.scp,utt2lang}; do
  if [ ! -f "$f" ]; then
    echo "$0: expected input file $f to exist";
    exit 1;
  fi
done

if [ $stage -le 0 ]; then
  utils/validate_data_dir.sh --no-text --no-feats "$in_dir" || exit 1;

  mkdir -p "$dir/temp" || exit 1;

  # Stage 0: write temp/utt2len with lines "<utt-id> <length-in-seconds>".
  # For every wav.scp line, the last token ending in .sph is opened and its
  # text header is scanned up to "end_head" for sample_rate and sample_count.
  cat "$in_dir/wav.scp" | perl -ane '
    $_ =~ m:(^\S+) .* (\S+\.sph): || die "bad line $_";
    $utt=$1; $sph=$2;
    open (F, "<$sph") || die "Could not open sph file $sph";
    $samp_count = -1;
    while (<F>) {
      if (m/sample_rate -i (\d+)/) { $1 == 8000 || die "bad sample rate for $sph: $_"; }
      if (m/sample_count -i (\d+)/) { $samp_count = $1; }
      if (m/end_head/) { last; }
    }
    close(F);
    $samp_count > 0 || die "Could not get sample count for sph file $sph\n";
    $secs = $samp_count / 8000;
    print "$utt $secs\n";
  ' > "$dir/temp/utt2len" || exit 1;
fi

# The "paste" calls in stage 1 join utt2len, utt2spk and utt2lang line by
# line, so all three must at least have the same number of lines.
if [ $(cat "$dir/temp/utt2len" | wc -l) -ne $(cat "$in_dir/utt2spk" | wc -l) ]; then
  echo "utt2spk and utt2len files have mismatched lengths";
  exit 1;
fi
if [ $(cat "$dir/temp/utt2len" | wc -l) -ne $(cat "$in_dir/utt2lang" | wc -l) ]; then
  echo "utt2len and utt2lang files have mismatched lengths";
  exit 1;
fi

if [ $stage -le 1 ]; then
  # Create, in a pipe, a file with lines
  # <utt-id> <length> <speaker-id> <language-id>
  # and pipe it into a perl script that outputs the segments, utt2spk and
  # utt2lang files.
  # NOTE(review): this relies on utt2len (derived from wav.scp), utt2spk and
  # utt2lang listing the utterances in the same order -- presumably guaranteed
  # by the sorted-order check of validate_data_dir.sh; confirm.
  awk '{print $2}' "$in_dir/utt2spk" | paste "$dir/temp/utt2len" - | \
    paste - <(awk '{print $2}' "$in_dir/utt2lang") | perl -e '
    ($max_utt_len, $out_dir) = @ARGV;
    open(UTT2SPK, ">$out_dir/utt2spk") || die "opening utt2spk file $out_dir/utt2spk";
    open(SEGMENTS, ">$out_dir/segments") || die "opening segments file $out_dir/segments";
    open(UTT2LANG, ">$out_dir/utt2lang") || die "opening utt2lang file $out_dir/utt2lang";
    while(<STDIN>) {
      ($utt, $len, $speaker, $language) = split(" ", $_);
      # $language is the last field, so this also covers a missing speaker.
      defined $language || die "Bad line $_";
      $reco = $utt; # old utt-id becomes recording-id.
      if ($len <= $max_utt_len) {
        # Short enough: keep it as a single piece.  An end time of -1 is
        # written instead of the actual length.
        print SEGMENTS "${utt}-1 ${reco} 0 -1\n";
        print UTT2SPK "${utt}-1 $speaker\n";
        # Every utterance needs a language label, not only the split ones.
        print UTT2LANG "${utt}-1 $language\n";
      } else {
        # Roughly ceil(len / max_utt_len) equal-sized pieces, so each piece is
        # at most max_utt_len long (up to the 0.1% slack of the 0.999 factor).
        $num_split = int(($len + 0.999*$max_utt_len) / $max_utt_len);
        $num_split >= 1 || die;
        $split_len = $len / $num_split;
        # Zero-pad every piece number to the width of the largest one, so the
        # new utterance-ids remain in string-sorted order for any piece count.
        $width = length($num_split);
        for ($n = 1; $n <= $num_split; $n++) {
          $n_text = sprintf("%0*d", $width, $n);
          $t_start = $split_len * ($n - 1); $t_end = $split_len * $n;
          print SEGMENTS "${utt}-$n_text ${reco} $t_start $t_end\n";
          print UTT2SPK "${utt}-$n_text $speaker\n";
          print UTT2LANG "${utt}-$n_text $language\n";
        }
      }
    }
    close(SEGMENTS)||die; close(UTT2SPK)||die; close(UTT2LANG)||die; ' "$max_utt_len" "$dir" || exit 1;
fi

# wav.scp is copied unchanged: the old utterance-ids it is indexed by are the
# recording-ids of the new segments file.
cp "$in_dir/wav.scp" "$dir/" || exit 1;

utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt" || exit 1;

utils/validate_data_dir.sh --no-text --no-feats "$dir" || exit 1;

$cleanup && rm -r "$dir/temp"

exit 0;

Просмотреть файл

@ -38,15 +38,17 @@ src_list="data/sre08_train_10sec_female \
# sources have this info, it will cause problems with combine_data.sh
for d in $src_list; do rm $d/spk2gender 2>/dev/null; done
utils/combine_data.sh data/train $src_list
utils/combine_data.sh data/train_unsplit $src_list
# original utt2lang will remain in data/train/.backup/utt2lang.
utils/apply_map.pl -f 2 --permissive local/lang_map.txt < data/train/utt2lang 2>/dev/null > foo
cp foo data/train/utt2lang
# original utt2lang will remain in data/train_unsplit/.backup/utt2lang.
utils/apply_map.pl -f 2 --permissive local/lang_map.txt < data/train_unsplit/utt2lang 2>/dev/null > foo
cp foo data/train_unsplit/utt2lang
echo "**Language count in training:**"
awk '{print $2}' foo | sort | uniq -c | sort -nr
rm foo
local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
##
## HERE
##
@ -68,17 +70,17 @@ lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/lre07 \
#utils/subset_data_dir.sh --spk-list foo data/all data/train
utils/subset_data_dir.sh data/train 3000 data/train_3k
utils/subset_data_dir.sh data/train 6000 data/train_6k
utils/subset_data_dir.sh data/train 5000 data/train_5k
utils/subset_data_dir.sh data/train 10000 data/train_10k
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_3k 2048 \
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k 2048 \
exp/diag_ubm_2048
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_6k \
exp/diag_ubm_2048 exp/full_ubm_2048_6k
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_10k \
exp/diag_ubm_2048 exp/full_ubm_2048_10k
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \
exp/full_ubm_2048_6k exp/full_ubm_2048
exp/full_ubm_2048_10k exp/full_ubm_2048
lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \

Просмотреть файл

@ -81,7 +81,7 @@ else
done
utils/split_scp.pl $scp $split_scps || exit 1;
$cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp ark:- \| \
copy-feats --compress=$compress ark:- \

Просмотреть файл

@ -92,13 +92,13 @@ else
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
split_scps=""
for ((n=1; n<=nj; n++)); do
split_scps="$split_scps $logdir/wav.$n.scp"
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
done
utils/split_scp.pl $scp $split_scps || exit 1;
mfcc_feats="ark:compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp ark:- |"
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
mfcc_feats="ark:compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav_${name}.JOB.scp ark:- |"
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
$cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \
@ -120,7 +120,7 @@ for ((n=1; n<=nj; n++)); do
cat $mfcc_pitch_dir/raw_mfcc_pitch_$name.$n.scp || exit 1;
done > $data/feats.scp
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`

Просмотреть файл

@ -89,13 +89,13 @@ else
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
split_scps=""
for ((n=1; n<=nj; n++)); do
split_scps="$split_scps $logdir/wav.$n.scp"
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
done
utils/split_scp.pl $scp $split_scps || exit 1;
$cmd JOB=1:$nj $logdir/make_pitch.JOB.log \
compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav.JOB.scp ark:- \| \
compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav_${name}.JOB.scp ark:- \| \
process-kaldi-pitch-feats $postprocess_config_opt ark:- \
ark,scp:$pitchdir/pitch_$name.JOB.ark,$pitchdir/pitch_$name.JOB.scp \
|| exit 1;
@ -113,7 +113,7 @@ for ((n=1; n<=nj; n++)); do
cat $pitchdir/pitch_$name.$n.scp || exit 1;
done > $data/feats.scp
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`

Просмотреть файл

@ -77,13 +77,13 @@ else
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
split_scps=""
for ((n=1; n<=nj; n++)); do
split_scps="$split_scps $logdir/wav.$n.scp"
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
done
utils/split_scp.pl $scp $split_scps || exit 1;
$cmd JOB=1:$nj $logdir/make_plp_${name}.JOB.log \
compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav.JOB.scp ark:- \| \
compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav_${name}.JOB.scp ark:- \| \
copy-feats --compress=$compress ark:- \
ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \
|| exit 1;
@ -102,7 +102,7 @@ for ((n=1; n<=nj; n++)); do
cat $plpdir/raw_plp_$name.$n.scp || exit 1;
done > $data/feats.scp
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`

Просмотреть файл

@ -92,13 +92,13 @@ else
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
split_scps=""
for ((n=1; n<=nj; n++)); do
split_scps="$split_scps $logdir/wav.$n.scp"
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
done
utils/split_scp.pl $scp $split_scps || exit 1;
plp_feats="ark:compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav.JOB.scp ark:- |"
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
plp_feats="ark:compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav_${name}.JOB.scp ark:- |"
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
$cmd JOB=1:$nj $logdir/make_plp_pitch_${name}.JOB.log \
paste-feats --length-tolerance=$paste_length_tolerance "$plp_feats" "$pitch_feats" ark:- \| \
@ -120,7 +120,7 @@ for ((n=1; n<=nj; n++)); do
cat $plp_pitch_dir/raw_plp_pitch_$name.$n.scp || exit 1;
done > $data/feats.scp
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`

Просмотреть файл

@ -145,7 +145,7 @@ function filter_utts {
new_nutts=$(cat $tmpdir/utts | wc -l)
old_nutts=$(cat $data/utt2spk | wc -l)
if [ $new_nutts -ne $old_nutts ]; then
echo "fix_data_dir.sh: kept $new_utts utterances out of $old_nutts"
echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts"
else
echo "fix_data_dir.sh: kept all $old_nutts utterances."
fi

Просмотреть файл

@ -116,7 +116,7 @@ if [ -f $data/wav.scp ]; then
check_sorted_and_uniq $data/segments
# We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
! cat $data/segments | \
awk '{if (NF != 4 || !($4 > $3)) { print "Bad line in segments file", $0; exit(1); }}' && \
awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \
echo "$0: badly formatted segments file" && exit 1;
segments_len=`cat $data/segments | wc -l`