sandbox/language_id: rationalize the way the utt2lang is treated.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3581 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-02-24 20:35:22 +00:00
Родитель 8cd6ba8928
Коммит 677a1d1677
5 изменённых файлов: 15 добавлений и 50 удалений

Просмотреть файл

@ -0,0 +1,3 @@
Note: these scripts are mostly the same as the speaker-id scripts in
../../../sre08/v1/sid, except that they use add-deltas-sdc instead of
add-deltas, and a smaller default ivector dimension (400 vs. 600).

Просмотреть файл

@ -78,6 +78,6 @@ if [ $stage -le 2 ]; then
echo "$0: computing mean of iVectors for each speaker and length-normalizing"
$cmd $dir/log/speaker_mean.log \
ivector-normalize-length scp:$dir/ivector.scp ark:- \| \
ivector-mean ark:$data/lang2utt ark:- ark:- ark,t:$dir/num_utts.ark \| \
ivector-mean "ark:utils/utt2spk_to_spk2utt.pl $data/utt2lang|" ark:- ark:- ark,t:$dir/num_utts.ark \| \
ivector-normalize-length ark:- ark,scp:$dir/lang_ivector.ark,$dir/lang_ivector.scp || exit 1;
fi

Просмотреть файл

@ -46,7 +46,6 @@ for file in utt2spk utt2lang feats.scp text cmvn.scp segments reco2file_and_chan
done
utils/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt
utils/utt2spk_to_spk2utt.pl <$dest/utt2lang >$dest/lang2utt
# fix_data_dir.sh accepts exactly one argument (the data directory); passing a
# second one makes it print its usage message and exit with status 1.
utils/fix_data_dir.sh $dest || exit 1;

Просмотреть файл

@ -5,16 +5,13 @@
# text, and utt2spk are present in any of them.
# It puts the original contents of data-dir into
# data-dir/.backup
# giving the second option of <LANG> to do utt2lang processing instead of utt2spk processing
if [ $# -lt 1 ]; then
echo "Usage: fix_data_dir.sh data-dir <LANG=1/0>"
if [ $# != 1 ]; then
echo "Usage: fix_data_dir.sh data-dir"
exit 1
fi
data=$1
lang=$2
mkdir -p $data/.backup
[ ! -d $data ] && echo "$0: no such directory $data" && exit 1;
@ -26,6 +23,7 @@ trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
export LC_ALL=C
function check_sorted {
file=$1
sort -k1,1 -u <$file >$file.tmp
@ -37,13 +35,14 @@ function check_sorted {
fi
}
for x in utt2lang lang2utt utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp reco2file_and_channel spk2gender; do
for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp reco2file_and_channel spk2gender utt2lang; do
if [ -f $data/$x ]; then
cp $data/$x $data/.backup/$x
check_sorted $data/$x
fi
done
function filter_file {
filter=$1
file_to_filter=$2
@ -115,32 +114,6 @@ function filter_speakers {
done
}
# filter_langs: rebuild lang2utt from utt2lang, narrow the list of language ids
# using cmvn.scp (only when that file exists), then write the filtered result
# back to lang2utt, utt2lang and cmvn.scp.
# Takes no arguments; reads the globals $data (data directory) and $tmpdir
# (scratch directory). Uses the check_sorted and filter_file helpers from this
# script and the utils/*.pl conversion/filter scripts.
# NOTE(review): the language list is filtered against the keys of cmvn.scp;
# confirm that cmvn.scp is really keyed by language id in this setup.
function filter_langs {
# throughout this program, we regard utt2lang as primary and lang2utt as derived, so...
utils/utt2spk_to_spk2utt.pl $data/utt2lang > $data/lang2utt
# Both maps are passed through check_sorted before they are used below.
check_sorted $data/utt2lang;
check_sorted $data/lang2utt;
# Start from the first column of lang2utt, i.e. the language ids.
cat $data/lang2utt | awk '{print $1}' > $tmpdir/languages
# Narrow the language list using cmvn.scp when it is present.
# Presumably filter_scp.pl keeps the lines of its second argument whose first
# field appears in its first argument -- confirm against utils/filter_scp.pl.
for s in cmvn.scp; do
f=$data/$s
if [ -f $f ]; then
utils/filter_scp.pl $f $tmpdir/languages > $tmpdir/languages.tmp
mv $tmpdir/languages.tmp $tmpdir/languages
fi
done
# Apply the surviving language list to lang2utt, then regenerate utt2lang from
# it so the two files stay consistent.
filter_file $tmpdir/languages $data/lang2utt
utils/spk2utt_to_utt2spk.pl $data/lang2utt > $data/utt2lang
# Finally apply the same language list to cmvn.scp itself, if it exists.
for s in cmvn.scp; do
f=$data/$s
if [ -f $f ]; then
filter_file $tmpdir/languages $f
fi
done
}
function filter_utts {
cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
@ -156,19 +129,10 @@ function filter_utts {
! cat $data/spk2utt | sort | cmp - $data/spk2utt && \
echo "spk2utt is not in sorted order (fix this yourself)" && exit 1;
! cat $data/utt2lang | sort -k2 | cmp - $data/utt2lang && \
echo "utt2lang is not in sorted order (fix this yourself)" && exit 1;
! cat $data/utt2lang | sort | cmp - $data/utt2lang && \
echo "utt2lang is not in sorted order (fix this yourself)" && exit 1;
! cat $data/lang2utt | sort | diff - $data/lang2utt && \
echo "lang2utt is not in sorted order (fix this yourself)" && exit 1;
maybe_wav=
[ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
for x in feats.scp text segments $maybe_wav; do
for x in feats.scp text segments utt2lang $maybe_wav; do
if [ -f $data/$x ]; then
utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp
mv $tmpdir/utts.tmp $tmpdir/utts
@ -183,7 +147,6 @@ function filter_utts {
else
nfeats=0
fi
ntext=`cat $data/text 2>/dev/null | wc -l`
if [ "$nutts" -ne "$nfeats" -o "$nutts" -ne "$ntext" ]; then
echo "fix_data_dir.sh: kept $nutts utterances, vs. $nfeats features and $ntext transcriptions."
@ -191,7 +154,7 @@ function filter_utts {
echo "fix_data_dir.sh: kept all $nutts utterances."
fi
for x in utt2lang utt2spk feats.scp text segments $maybe_wav; do
for x in utt2spk feats.scp text segments utt2lang $maybe_wav; do
if [ -f $data/$x ]; then
cp $data/$x $data/.backup/$x
if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then
@ -203,9 +166,6 @@ function filter_utts {
}
filter_recordings
if [ $lang -eq 1 ] ; then
filter_langs
fi
filter_speakers
filter_utts
filter_recordings

Просмотреть файл

@ -61,7 +61,7 @@ if [ ! -d $s1 ]; then
else
need_to_split=false
for f in utt2spk spk2utt feats.scp text wav.scp cmvn.scp spk2gender \
vad.scp segments reco2file_and_channel; do
vad.scp segments reco2file_and_channel utt2lang; do
if [[ -f $data/$f && ( ! -f $s1/$f || $s1/$f -ot $data/$f ) ]]; then
need_to_split=true
fi
@ -78,6 +78,7 @@ for n in `seq $numsplit`; do
vads="$vads $data/split$numsplit/$n/vad.scp"
texts="$texts $data/split$numsplit/$n/text"
utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk"
utt2langs="$utt2langs $data/split$numsplit/$n/utt2lang"
done
if $split_per_spk; then
@ -94,6 +95,8 @@ utils/split_scp.pl $utt2spk_opt $data/feats.scp $feats || exit 1
[ -f $data/vad.scp ] && utils/split_scp.pl $utt2spk_opt $data/vad.scp $vads
[ -f $data/utt2lang ] && utils/split_scp.pl $utt2spk_opt $data/utt2lang $utt2langs
# If lockfile is not installed, just don't lock it. It's not a big deal.
which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock