trunk: Fixes and extensions to fix_data_dir.sh and validate_data_dir.sh (e.g. regarding spk2gender); associated improvement to documentation, and fixes to Fisher and RM data-prep scripts.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2658 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2013-07-05 19:05:15 +00:00
Родитель 680e7455f0
Коммит 91401e76ef
5 изменённых файлов: 35 добавлений и 9 удалений

Просмотреть файл

@ -137,7 +137,8 @@ if [ $stage -le 2 ]; then
sed 's:\[lipsmack\]:[noise]:g' > $tmpdir/text.2
cp $tmpdir/text.2 data/train_all/text
# create segments file and utt2spk file...
cat data/train_all/text | perl -ane 'm:([^-]+)-(\S+): || die; print "$1-$2 $1\n"; ' > data/train_all/utt2spk
! cat data/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > data/train_all/utt2spk \
&& echo "Error producing utt2spk file" && exit 1;
cat data/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; $s = sprintf("%.2f", 0.01*$3);
$e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' > data/train_all/segments
@ -153,4 +154,15 @@ if [ $stage -le 3 ]; then
sort -k1,1 -u > data/train_all/wav.scp || exit 1;
fi
if [ $stage -le 4 ]; then
  # Get the spk2gender information.  This is not a standard part of our
  # file formats.
  # Each line of the filetable files is expected to look like:
  #   fe_03_p2_sph1 fe_03_05852.sph ff
  # i.e. a first field (ignored here), the .sph file name, and two gender
  # letters (each "f" or "m").  The first letter is written out for the "-A"
  # side of the recording and the second for the "-B" side.
  # NOTE(review): filter_scp.pl presumably keeps only the entries whose
  # speaker-id appears in data/train_all/spk2utt -- confirm against the script.
  # The "|| exit 1" only sees the exit status of the last pipeline stage
  # (pipefail is not set here), but it stops us from reporting success
  # when writing spk2gender itself failed.
  cat $links/fe_03_p1_sph{1,2,3,4,5,6,7}/filetable.txt \
    $links/fe_03_p2_sph{1,2,3,4,5,6,7}/docs/filetable2.txt | \
    perl -ane 'm:^\S+ (\S+)\.sph ([fm])([fm]): || die "bad line $_;"; print "$1-A $2\n", "$1-B $3\n"; ' | \
    sort | uniq | utils/filter_scp.pl data/train_all/spk2utt > data/train_all/spk2gender || exit 1;
fi
echo "Data preparation succeeded"

Просмотреть файл

@ -34,7 +34,7 @@ if [ ! -d $RMROOT/rm1_audio1 -o ! -d $RMROOT/rm1_audio2 ]; then
fi
if [ ! -d $RMROOT/rm2_audio ]; then
echo "**Warning: $RMROOT/rm2_audio does not exist; won't create spk2gender.map file correctly***"
echo "**Warning: $RMROOT/rm2_audio does not exist; won't create spk2gender file correctly***"
sleep 1
fi
@ -86,10 +86,10 @@ done
cat $RMROOT/rm1_audio2/2_5_1/rm1/doc/al_spkrs.txt \
$RMROOT/rm2_audio/3-1.2/rm2/doc/al_spkrs.txt | \
perl -ane 'tr/A-Z/a-z/;print;' | grep -v ';' | \
awk '{print $1, $2}' | sort | uniq > $tmpdir/spk2gender.map || exit 1;
awk '{print $1, $2}' | sort | uniq > $tmpdir/spk2gender || exit 1;
for t in train test_mar87 test_oct87 test_feb89 test_oct89 test_feb91 test_sep92; do
utils/filter_scp.pl data/$t/spk2utt $tmpdir/spk2gender.map >data/$t/spk2gender.map
utils/filter_scp.pl data/$t/spk2utt $tmpdir/spk2gender >data/$t/spk2gender
done
local/make_rm_lm.pl $RMROOT/rm1_audio1/rm1/doc/wp_gram.txt > $tmpdir/G.txt || exit 1;

Просмотреть файл

@ -103,7 +103,8 @@ function filter_speakers {
fi
if [ -f $data/spk2gender ]; then
# We don't handle the case when the spk2gender does not cover all speakers.
filter_file $data/spk2utt $data/spk2gender
cat $data/spk2utt | awk '{print $1}' >$tmpdir/speakers
filter_file $tmpdir/speakers $data/spk2gender
fi
}

Просмотреть файл

@ -205,12 +205,12 @@ if [ -f $data/spk2gender ]; then
check_sorted $data/spk2gender
! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \
echo "Mal-formed spk2gender file" && exit 1;
cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.cmvn
cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.cmvn}; then
echo "$0: Error: in $data, speaker lists extracted from spkutt and cmvn"
if ! cmp -s $tmpdir/speakers{,.spk2gender}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.cmvn}
partial_diff $tmpdir/speakers{,.spk2gender}
exit 1;
fi
fi

Просмотреть файл

@ -181,6 +181,17 @@ calls -- but it's good enough for our purposes. If you have no information at a
the speaker identities, you can just make the speaker-ids the same as the utterance-ids,
so the format of the file would be just <DFN>\<utterance-id\> \<utterance-id\></DFN>.
There is another file that exists in some setups; it is used only occasionally, and
is not used by the Kaldi system build itself. We show what it looks like in the Resource Management
(RM) setup:
\verbatim
s5# head -3 ../../rm/s5/data/train/spk2gender
adg0 f
ahh0 m
ajp0 m
\endverbatim
This file maps from speaker-id to either "m" or "f" depending on the speaker gender.
All of these files should be sorted. If they are not sorted, you will get errors
when you run the scripts. In \ref io_sec_tables we explain why this is needed.
It has to do with the I/O framework; the ultimate reason for the sorting is to
@ -262,6 +273,8 @@ steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
\endverbatim
(this example is from <DFN>egs/swbd/s5/run.sh</DFN>).
\section data_prep_lang Data preparation-- the "lang" directory.
Now we turn our attention to the "lang" directory.