зеркало из https://github.com/mozilla/kaldi.git
trunk: Fixes and extensions to fix_data_dir.sh and validate_data_dir.sh (e.g. regarding spk2gender); associated improvement to documentation, and fixes to Fisher and RM data-prep scripts.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2658 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
680e7455f0
Коммит
91401e76ef
|
@ -137,7 +137,8 @@ if [ $stage -le 2 ]; then
|
|||
sed 's:\[lipsmack\]:[noise]:g' > $tmpdir/text.2
|
||||
cp $tmpdir/text.2 data/train_all/text
|
||||
# create segments file and utt2spk file...
|
||||
cat data/train_all/text | perl -ane 'm:([^-]+)-(\S+): || die; print "$1-$2 $1\n"; ' > data/train_all/utt2spk
|
||||
! cat data/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > data/train_all/utt2spk \
|
||||
&& echo "Error producing utt2spk file" && exit 1;
|
||||
|
||||
cat data/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; $s = sprintf("%.2f", 0.01*$3);
|
||||
$e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' > data/train_all/segments
|
||||
|
@ -153,4 +154,15 @@ if [ $stage -le 3 ]; then
|
|||
sort -k1,1 -u > data/train_all/wav.scp || exit 1;
|
||||
fi
|
||||
|
||||
if [ $stage -le 4 ]; then
|
||||
# get the spk2gender information. This is not a standard part of our
|
||||
# file formats
|
||||
# The files filetable.txt / filetable2.txt have lines like "fe_03_p2_sph1 fe_03_05852.sph ff" (the last field gives the genders of sides A and B).
|
||||
cat $links/fe_03_p1_sph{1,2,3,4,5,6,7}/filetable.txt \
|
||||
$links/fe_03_p2_sph{1,2,3,4,5,6,7}/docs/filetable2.txt | \
|
||||
perl -ane 'm:^\S+ (\S+)\.sph ([fm])([fm]): || die "bad line $_;"; print "$1-A $2\n", "$1-B $3\n"; ' | \
|
||||
sort | uniq | utils/filter_scp.pl data/train_all/spk2utt > data/train_all/spk2gender
|
||||
fi
|
||||
|
||||
echo "Data preparation succeeded"
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ if [ ! -d $RMROOT/rm1_audio1 -o ! -d $RMROOT/rm1_audio2 ]; then
|
|||
fi
|
||||
|
||||
if [ ! -d $RMROOT/rm2_audio ]; then
|
||||
echo "**Warning: $RMROOT/rm2_audio does not exist; won't create spk2gender.map file correctly***"
|
||||
echo "**Warning: $RMROOT/rm2_audio does not exist; won't create spk2gender file correctly***"
|
||||
sleep 1
|
||||
fi
|
||||
|
||||
|
@ -86,10 +86,10 @@ done
|
|||
cat $RMROOT/rm1_audio2/2_5_1/rm1/doc/al_spkrs.txt \
|
||||
$RMROOT/rm2_audio/3-1.2/rm2/doc/al_spkrs.txt | \
|
||||
perl -ane 'tr/A-Z/a-z/;print;' | grep -v ';' | \
|
||||
awk '{print $1, $2}' | sort | uniq > $tmpdir/spk2gender.map || exit 1;
|
||||
awk '{print $1, $2}' | sort | uniq > $tmpdir/spk2gender || exit 1;
|
||||
|
||||
for t in train test_mar87 test_oct87 test_feb89 test_oct89 test_feb91 test_sep92; do
|
||||
utils/filter_scp.pl data/$t/spk2utt $tmpdir/spk2gender.map >data/$t/spk2gender.map
|
||||
utils/filter_scp.pl data/$t/spk2utt $tmpdir/spk2gender >data/$t/spk2gender
|
||||
done
|
||||
|
||||
local/make_rm_lm.pl $RMROOT/rm1_audio1/rm1/doc/wp_gram.txt > $tmpdir/G.txt || exit 1;
|
||||
|
|
|
@ -103,7 +103,8 @@ function filter_speakers {
|
|||
fi
|
||||
if [ -f $data/spk2gender ]; then
|
||||
# We don't handle the case when the spk2gender does not cover all speakers.
|
||||
filter_file $data/spk2utt $data/spk2gender
|
||||
cat $data/spk2utt | awk '{print $1}' >$tmpdir/speakers
|
||||
filter_file $tmpdir/speakers $data/spk2gender
|
||||
fi
|
||||
}
|
||||
|
||||
|
|
|
@ -205,12 +205,12 @@ if [ -f $data/spk2gender ]; then
|
|||
check_sorted $data/spk2gender
|
||||
! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \
|
||||
echo "Mal-formed spk2gender file" && exit 1;
|
||||
cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.cmvn
|
||||
cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender
|
||||
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
|
||||
if ! cmp -s $tmpdir/speakers{,.cmvn}; then
|
||||
echo "$0: Error: in $data, speaker lists extracted from spkutt and cmvn"
|
||||
if ! cmp -s $tmpdir/speakers{,.spk2gender}; then
|
||||
echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
|
||||
echo "$0: differ, partial diff is:"
|
||||
partial_diff $tmpdir/speakers{,.cmvn}
|
||||
partial_diff $tmpdir/speakers{,.spk2gender}
|
||||
exit 1;
|
||||
fi
|
||||
fi
|
||||
|
|
|
@ -181,6 +181,17 @@ calls -- but it's good enough for our purposes. If you have no information at a
|
|||
the speaker identities, you can just make the speaker-ids the same as the utterance-ids,
|
||||
so the format of the file would be just <DFN>\<utterance-id\> \<utterance-id\></DFN>.
|
||||
|
||||
There is another file, <DFN>spk2gender</DFN>, that exists in some setups; it is used only occasionally and
|
||||
is not used in the Kaldi system build.  We show what it looks like in the Resource Management
|
||||
(RM) setup:
|
||||
\verbatim
|
||||
s5# head -3 ../../rm/s5/data/train/spk2gender
|
||||
adg0 f
|
||||
ahh0 m
|
||||
ajp0 m
|
||||
\endverbatim
|
||||
This file maps from speaker-id to either "m" or "f" depending on the speaker gender.
|
||||
|
||||
All of these files should be sorted. If they are not sorted, you will get errors
|
||||
when you run the scripts. In \ref io_sec_tables we explain why this is needed.
|
||||
It has to do with the I/O framework; the ultimate reason for the sorting is to
|
||||
|
@ -262,6 +273,8 @@ steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
|
|||
\endverbatim
|
||||
(this example is from <DFN>egs/swbd/s5/run.sh</DFN>).
|
||||
|
||||
|
||||
|
||||
\section data_prep_lang Data preparation-- the "lang" directory.
|
||||
|
||||
Now we turn our attention to the "lang" directory.
|
||||
|
|
Загрузка…
Ссылка в новой задаче