зеркало из https://github.com/mozilla/kaldi.git
sandbox/lid: introducing the splitting of long utterances into smaller pieces; various utility script updates.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3757 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
18e1be0067
Коммит
a1a368dc83
|
@ -0,0 +1,103 @@
|
|||
#!/bin/bash

# Splits up long utterances of a data directory into smaller pieces and
# writes the result to a new data directory.
#
# Usage: $0 [options] <in-data-dir> <out-data-dir>
#
# The variables below are defaults; they are declared before
# utils/parse_options.sh is sourced (presumably so that command-line options
# such as --max-utt-len, shown in the usage example, can override them --
# confirm against parse_options.sh).

max_utt_len=60 # 60 seconds.
stage=0
cleanup=true

# Abort if the option parser cannot be sourced or reports an error, rather
# than silently continuing with the defaults.
. utils/parse_options.sh || exit 1;

if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <in-data-dir> <out-data-dir>"
  echo "e.g.: $0 --max-utt-len 120 data/train data/train_split"
  echo "This script splits up long utterances into smaller pieces."
  echo "It assumes the wav.scp contains has a certain form, with .sph"
  echo "files in it (so the script is not completely general)."
  exit 1;
fi

in_dir=$1
dir=$2

# All four input files are required by the later stages of this script.
# The directory is quoted so that paths containing whitespace still work;
# the brace list is left unquoted so that it expands to one word per file.
for f in "$in_dir"/{utt2spk,spk2utt,wav.scp,utt2lang}; do
  if [ ! -f "$f" ]; then
    echo "$0: expected input file $f to exist";
    exit 1;
  fi
done
|
||||
|
||||
# Stage 0: compute the duration in seconds of every entry of wav.scp and
# write "<utt-id> <seconds>" lines to $dir/temp/utt2len.
if [ "$stage" -le 0 ]; then
  utils/validate_data_dir.sh --no-text --no-feats "$in_dir" || exit 1;

  mkdir -p "$dir/temp" || exit 1;

  # For each wav.scp line, take the first field as the utterance id and the
  # last whitespace-delimited token ending in ".sph" as the audio file, then
  # read that file line by line up to "end_head" to pick up the sample count.
  # Only files whose header declares a sample rate of 8000 are accepted,
  # because the duration is computed as sample_count / 8000.
  perl -ne '
    $_ =~ m:(^\S+) .* (\S+\.sph): || die "bad line $_";
    $utt=$1; $sph=$2;
    open (F, "<$sph") || die "Could not open sph file $sph";
    $samp_count = -1;
    while (<F>) {
      if (m/sample_rate -i (\d+)/) { $1 == 8000 || die "bad sample rate for $sph: $_"; }
      if (m/sample_count -i (\d+)/) { $samp_count = $1; }
      if (m/end_head/) { last; }
    }
    close(F);
    $samp_count > 0 || die "Could not get sample count for sph file $sph\n";
    $secs = $samp_count / 8000;
    print "$utt $secs\n";
  ' <"$in_dir/wav.scp" >"$dir/temp/utt2len" || exit 1;
fi

# The checks below also run when stage 0 was skipped, so make sure the file
# produced by stage 0 is actually there before counting its lines.
if [ ! -f "$dir/temp/utt2len" ]; then
  echo "$0: expected file $dir/temp/utt2len to exist (it is created by stage 0)";
  exit 1;
fi

# The next stage pastes utt2len, utt2spk and utt2lang together line by line,
# so all three must have the same number of lines.
num_utt2len=$(wc -l <"$dir/temp/utt2len")
num_utt2spk=$(wc -l <"$in_dir/utt2spk")
num_utt2lang=$(wc -l <"$in_dir/utt2lang")

if [ "$num_utt2len" -ne "$num_utt2spk" ]; then
  echo "utt2spk and utt2len files have mismatched lengths";
  exit 1;
fi
if [ "$num_utt2len" -ne "$num_utt2lang" ]; then
  echo "utt2len and utt2lang files have mismatched lengths";
  exit 1;
fi
|
||||
|
||||
|
||||
if [ $stage -le 1 ]; then
|
||||
|
||||
# Create, in a pipe, a file with lines
|
||||
# <utt-id> <length> <speaker-id> <language-id>
|
||||
# and pipe it into a perl script that outputs the segments file.
|
||||
awk '{print $2}' $in_dir/utt2spk | paste $dir/temp/utt2len - | \
|
||||
paste - <(awk '{print $2}' $in_dir/utt2lang) | perl -e '
|
||||
($max_utt_len, $out_dir) = @ARGV;
|
||||
open(UTT2SPK, ">$out_dir/utt2spk") || die "opening utt2spk file $out_dir/utt2spk";
|
||||
open(SEGMENTS, ">$out_dir/segments") || die "opening segments file $out_dir/segments";
|
||||
open(UTT2LANG, ">$out_dir/utt2lang") || die "opening segments file $out_dir/utt2lang";
|
||||
while(<STDIN>) {
|
||||
($utt, $len, $speaker, $language) = split(" ", $_);
|
||||
defined $speaker || die "Bad line $_";
|
||||
$reco = $utt; # old utt-id becomes recording-id.
|
||||
if ($len <= $max_utt_len) {
|
||||
print SEGMENTS "${utt}-1 ${utt} 0 -1\n";
|
||||
print UTT2SPK "${utt}-1 $speaker\n";
|
||||
} else {
|
||||
# We will now allow split length to exceed max_utt_len.
|
||||
$num_split = int(($len + 0.999*$max_utt_len) / $max_utt_len);
|
||||
$num_split >= 1 || die;
|
||||
$split_len = $len / $num_split;
|
||||
for ($n = 1; $n <= $num_split; $n++) {
|
||||
$n_text = $n; # this will help remain in string-sorted order
|
||||
if ($num_split >= 10 && $n < 10) { $n_text = "0$n_text"; }
|
||||
if ($num_split >= 100 && $n < 100) { $n_text = "00$n_text"; }
|
||||
$t_start = $split_len * ($n - 1); $t_end = $split_len * $n;
|
||||
print SEGMENTS "${utt}-$n_text ${utt} $t_start $t_end\n";
|
||||
print UTT2SPK "${utt}-$n_text $speaker\n";
|
||||
print UTT2LANG "${utt}-$n_text $language\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
close(SEGMENTS)||die; close(UTT2SPK)||die; close(UTT2LANG)||die; ' $max_utt_len $dir
|
||||
fi
|
||||
|
||||
# The segments file written above uses the original utterance ids as
# recording ids, so the input wav.scp is copied unchanged.
cp "$in_dir/wav.scp" "$dir/" || exit 1;
utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt" || exit 1;
utils/validate_data_dir.sh --no-text --no-feats "$dir" || exit 1;

# Remove the intermediate files only when cleanup is exactly "true"; compare
# the value instead of running it as a command.  ${dir:?} aborts instead of
# removing /temp should dir ever be empty.
if [ "$cleanup" = true ]; then
  rm -r "${dir:?}/temp"
fi

exit 0;
|
|
@ -38,15 +38,17 @@ src_list="data/sre08_train_10sec_female \
|
|||
# sources have this info, it will cause problems with combine_data.sh
|
||||
for d in $src_list; do rm $d/spk2gender 2>/dev/null; done
|
||||
|
||||
utils/combine_data.sh data/train $src_list
|
||||
utils/combine_data.sh data/train_unsplit $src_list
|
||||
|
||||
# original utt2lang will remain in data/train/.backup/utt2lang.
|
||||
utils/apply_map.pl -f 2 --permissive local/lang_map.txt < data/train/utt2lang 2>/dev/null > foo
|
||||
cp foo data/train/utt2lang
|
||||
# original utt2lang will remain in data/train_unsplit/.backup/utt2lang.
|
||||
utils/apply_map.pl -f 2 --permissive local/lang_map.txt < data/train_unsplit/utt2lang 2>/dev/null > foo
|
||||
cp foo data/train_unsplit/utt2lang
|
||||
echo "**Language count in training:**"
|
||||
awk '{print $2}' foo | sort | uniq -c | sort -nr
|
||||
rm foo
|
||||
|
||||
local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
|
||||
|
||||
##
|
||||
## HERE
|
||||
##
|
||||
|
@ -68,17 +70,17 @@ lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/lre07 \
|
|||
#utils/subset_data_dir.sh --spk-list foo data/all data/train
|
||||
|
||||
|
||||
utils/subset_data_dir.sh data/train 3000 data/train_3k
|
||||
utils/subset_data_dir.sh data/train 6000 data/train_6k
|
||||
utils/subset_data_dir.sh data/train 5000 data/train_5k
|
||||
utils/subset_data_dir.sh data/train 10000 data/train_10k
|
||||
|
||||
|
||||
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_3k 2048 \
|
||||
lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k 2048 \
|
||||
exp/diag_ubm_2048
|
||||
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_6k \
|
||||
exp/diag_ubm_2048 exp/full_ubm_2048_6k
|
||||
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_10k \
|
||||
exp/diag_ubm_2048 exp/full_ubm_2048_10k
|
||||
|
||||
lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \
|
||||
exp/full_ubm_2048_6k exp/full_ubm_2048
|
||||
exp/full_ubm_2048_10k exp/full_ubm_2048
|
||||
|
||||
|
||||
lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \
|
||||
|
|
|
@ -81,7 +81,7 @@ else
|
|||
done
|
||||
|
||||
utils/split_scp.pl $scp $split_scps || exit 1;
|
||||
|
||||
|
||||
$cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
|
||||
compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp ark:- \| \
|
||||
copy-feats --compress=$compress ark:- \
|
||||
|
|
|
@ -92,13 +92,13 @@ else
|
|||
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
|
||||
split_scps=""
|
||||
for ((n=1; n<=nj; n++)); do
|
||||
split_scps="$split_scps $logdir/wav.$n.scp"
|
||||
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
|
||||
done
|
||||
|
||||
utils/split_scp.pl $scp $split_scps || exit 1;
|
||||
|
||||
mfcc_feats="ark:compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp ark:- |"
|
||||
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
|
||||
mfcc_feats="ark:compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav_${name}.JOB.scp ark:- |"
|
||||
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
|
||||
|
||||
$cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
|
||||
paste-feats --length-tolerance=$paste_length_tolerance "$mfcc_feats" "$pitch_feats" ark:- \| \
|
||||
|
@ -120,7 +120,7 @@ for ((n=1; n<=nj; n++)); do
|
|||
cat $mfcc_pitch_dir/raw_mfcc_pitch_$name.$n.scp || exit 1;
|
||||
done > $data/feats.scp
|
||||
|
||||
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
|
||||
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
|
||||
|
||||
nf=`cat $data/feats.scp | wc -l`
|
||||
nu=`cat $data/utt2spk | wc -l`
|
||||
|
|
|
@ -89,13 +89,13 @@ else
|
|||
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
|
||||
split_scps=""
|
||||
for ((n=1; n<=nj; n++)); do
|
||||
split_scps="$split_scps $logdir/wav.$n.scp"
|
||||
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
|
||||
done
|
||||
|
||||
utils/split_scp.pl $scp $split_scps || exit 1;
|
||||
|
||||
$cmd JOB=1:$nj $logdir/make_pitch.JOB.log \
|
||||
compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav.JOB.scp ark:- \| \
|
||||
compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav_${name}.JOB.scp ark:- \| \
|
||||
process-kaldi-pitch-feats $postprocess_config_opt ark:- \
|
||||
ark,scp:$pitchdir/pitch_$name.JOB.ark,$pitchdir/pitch_$name.JOB.scp \
|
||||
|| exit 1;
|
||||
|
@ -113,7 +113,7 @@ for ((n=1; n<=nj; n++)); do
|
|||
cat $pitchdir/pitch_$name.$n.scp || exit 1;
|
||||
done > $data/feats.scp
|
||||
|
||||
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
|
||||
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
|
||||
|
||||
nf=`cat $data/feats.scp | wc -l`
|
||||
nu=`cat $data/utt2spk | wc -l`
|
||||
|
|
|
@ -77,13 +77,13 @@ else
|
|||
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
|
||||
split_scps=""
|
||||
for ((n=1; n<=nj; n++)); do
|
||||
split_scps="$split_scps $logdir/wav.$n.scp"
|
||||
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
|
||||
done
|
||||
|
||||
utils/split_scp.pl $scp $split_scps || exit 1;
|
||||
|
||||
$cmd JOB=1:$nj $logdir/make_plp_${name}.JOB.log \
|
||||
compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav.JOB.scp ark:- \| \
|
||||
compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav_${name}.JOB.scp ark:- \| \
|
||||
copy-feats --compress=$compress ark:- \
|
||||
ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \
|
||||
|| exit 1;
|
||||
|
@ -102,7 +102,7 @@ for ((n=1; n<=nj; n++)); do
|
|||
cat $plpdir/raw_plp_$name.$n.scp || exit 1;
|
||||
done > $data/feats.scp
|
||||
|
||||
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
|
||||
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
|
||||
|
||||
nf=`cat $data/feats.scp | wc -l`
|
||||
nu=`cat $data/utt2spk | wc -l`
|
||||
|
|
|
@ -92,13 +92,13 @@ else
|
|||
echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
|
||||
split_scps=""
|
||||
for ((n=1; n<=nj; n++)); do
|
||||
split_scps="$split_scps $logdir/wav.$n.scp"
|
||||
split_scps="$split_scps $logdir/wav_${name}.$n.scp"
|
||||
done
|
||||
|
||||
utils/split_scp.pl $scp $split_scps || exit 1;
|
||||
|
||||
plp_feats="ark:compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav.JOB.scp ark:- |"
|
||||
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
|
||||
plp_feats="ark:compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav_${name}.JOB.scp ark:- |"
|
||||
pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config scp:$logdir/wav_${name}.JOB.scp ark:- | process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"
|
||||
|
||||
$cmd JOB=1:$nj $logdir/make_plp_pitch_${name}.JOB.log \
|
||||
paste-feats --length-tolerance=$paste_length_tolerance "$plp_feats" "$pitch_feats" ark:- \| \
|
||||
|
@ -120,7 +120,7 @@ for ((n=1; n<=nj; n++)); do
|
|||
cat $plp_pitch_dir/raw_plp_pitch_$name.$n.scp || exit 1;
|
||||
done > $data/feats.scp
|
||||
|
||||
rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null
|
||||
rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null
|
||||
|
||||
nf=`cat $data/feats.scp | wc -l`
|
||||
nu=`cat $data/utt2spk | wc -l`
|
||||
|
|
|
@ -145,7 +145,7 @@ function filter_utts {
|
|||
new_nutts=$(cat $tmpdir/utts | wc -l)
|
||||
old_nutts=$(cat $data/utt2spk | wc -l)
|
||||
if [ $new_nutts -ne $old_nutts ]; then
|
||||
echo "fix_data_dir.sh: kept $new_utts utterances out of $old_nutts"
|
||||
echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts"
|
||||
else
|
||||
echo "fix_data_dir.sh: kept all $old_nutts utterances."
|
||||
fi
|
||||
|
|
|
@ -116,7 +116,7 @@ if [ -f $data/wav.scp ]; then
|
|||
check_sorted_and_uniq $data/segments
|
||||
# We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
|
||||
! cat $data/segments | \
|
||||
awk '{if (NF != 4 || !($4 > $3)) { print "Bad line in segments file", $0; exit(1); }}' && \
|
||||
awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \
|
||||
echo "$0: badly formatted segments file" && exit 1;
|
||||
|
||||
segments_len=`cat $data/segments | wc -l`
|
||||
|
|
Загрузка…
Ссылка в новой задаче