зеркало из https://github.com/mozilla/kaldi.git
sandbox/language_id: rationalize the way the utt2lang is treated.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@3581 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
8cd6ba8928
Коммит
677a1d1677
|
@ -0,0 +1,3 @@
|
|||
Note: these scripts are mostly the same as the speaker-id scripts in
|
||||
../../../sre08/v1/sid, except that they use add-deltas-sdc instead of
|
||||
add-deltas, and a smaller default ivector dimension (400 vs. 600).
|
|
@ -78,6 +78,6 @@ if [ $stage -le 2 ]; then
|
|||
echo "$0: computing mean of iVectors for each speaker and length-normalizing"
|
||||
$cmd $dir/log/speaker_mean.log \
|
||||
ivector-normalize-length scp:$dir/ivector.scp ark:- \| \
|
||||
ivector-mean ark:$data/lang2utt ark:- ark:- ark,t:$dir/num_utts.ark \| \
|
||||
ivector-mean "ark:utils/spk2utt_to_utt2spl.pl $data/utt2lang|" ark:- ark:- ark,t:$dir/num_utts.ark \| \
|
||||
ivector-normalize-length ark:- ark,scp:$dir/lang_ivector.ark,$dir/lang_ivector.scp || exit 1;
|
||||
fi
|
||||
|
|
|
@ -46,7 +46,6 @@ for file in utt2spk utt2lang feats.scp text cmvn.scp segments reco2file_and_chan
|
|||
done
|
||||
|
||||
utils/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt
|
||||
utils/utt2spk_to_spk2utt.pl <$dest/utt2lang >$dest/lang2utt
|
||||
|
||||
utils/fix_data_dir.sh $dest 1 || exit 1;
|
||||
|
||||
|
|
|
@ -5,16 +5,13 @@
|
|||
# text, and utt2spk are present in any of them.
|
||||
# It puts the original contents of data-dir into
|
||||
# data-dir/.backup
|
||||
# giving the second option of <LANG> to do utt2lang processing instead of utt2spk processing
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "Usage: fix_data_dir.sh data-dir <LANG=1/0>"
|
||||
if [ $# != 1 ]; then
|
||||
echo "Usage: fix_data_dir.sh data-dir"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
data=$1
|
||||
lang=$2
|
||||
|
||||
mkdir -p $data/.backup
|
||||
|
||||
[ ! -d $data ] && echo "$0: no such directory $data" && exit 1;
|
||||
|
@ -26,6 +23,7 @@ trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
|
|||
|
||||
export LC_ALL=C
|
||||
|
||||
|
||||
function check_sorted {
|
||||
file=$1
|
||||
sort -k1,1 -u <$file >$file.tmp
|
||||
|
@ -37,13 +35,14 @@ function check_sorted {
|
|||
fi
|
||||
}
|
||||
|
||||
for x in utt2lang lang2utt utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp reco2file_and_channel spk2gender; do
|
||||
for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp reco2file_and_channel spk2gender utt2lang; do
|
||||
if [ -f $data/$x ]; then
|
||||
cp $data/$x $data/.backup/$x
|
||||
check_sorted $data/$x
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
function filter_file {
|
||||
filter=$1
|
||||
file_to_filter=$2
|
||||
|
@ -115,32 +114,6 @@ function filter_speakers {
|
|||
done
|
||||
}
|
||||
|
||||
function filter_langs {
|
||||
# throughout this program, we regard utt2lang as primary and lang2utt as derived, so...
|
||||
utils/utt2spk_to_spk2utt.pl $data/utt2lang > $data/lang2utt
|
||||
check_sorted $data/utt2lang;
|
||||
check_sorted $data/lang2utt;
|
||||
|
||||
cat $data/lang2utt | awk '{print $1}' > $tmpdir/languages
|
||||
for s in cmvn.scp; do
|
||||
f=$data/$s
|
||||
if [ -f $f ]; then
|
||||
utils/filter_scp.pl $f $tmpdir/languages > $tmpdir/languages.tmp
|
||||
mv $tmpdir/languages.tmp $tmpdir/languages
|
||||
fi
|
||||
done
|
||||
|
||||
filter_file $tmpdir/languages $data/lang2utt
|
||||
utils/spk2utt_to_utt2spk.pl $data/lang2utt > $data/utt2lang
|
||||
|
||||
for s in cmvn.scp; do
|
||||
f=$data/$s
|
||||
if [ -f $f ]; then
|
||||
filter_file $tmpdir/languages $f
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
function filter_utts {
|
||||
cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
|
||||
|
||||
|
@ -156,19 +129,10 @@ function filter_utts {
|
|||
! cat $data/spk2utt | sort | cmp - $data/spk2utt && \
|
||||
echo "spk2utt is not in sorted order (fix this yourself)" && exit 1;
|
||||
|
||||
! cat $data/utt2lang | sort -k2 | cmp - $data/utt2lang && \
|
||||
echo "utt2lang is not in sorted order (fix this yourself)" && exit 1;
|
||||
|
||||
! cat $data/utt2lang | sort | cmp - $data/utt2lang && \
|
||||
echo "utt2lang is not in sorted order (fix this yourself)" && exit 1;
|
||||
|
||||
! cat $data/lang2utt | sort | diff - $data/lang2utt && \
|
||||
echo "lang2utt is not in sorted order (fix this yourself)" && exit 1;
|
||||
|
||||
|
||||
maybe_wav=
|
||||
[ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
|
||||
for x in feats.scp text segments $maybe_wav; do
|
||||
for x in feats.scp text segments utt2lang $maybe_wav; do
|
||||
if [ -f $data/$x ]; then
|
||||
utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp
|
||||
mv $tmpdir/utts.tmp $tmpdir/utts
|
||||
|
@ -183,7 +147,6 @@ function filter_utts {
|
|||
else
|
||||
nfeats=0
|
||||
fi
|
||||
|
||||
ntext=`cat $data/text 2>/dev/null | wc -l`
|
||||
if [ "$nutts" -ne "$nfeats" -o "$nutts" -ne "$ntext" ]; then
|
||||
echo "fix_data_dir.sh: kept $nutts utterances, vs. $nfeats features and $ntext transcriptions."
|
||||
|
@ -191,7 +154,7 @@ function filter_utts {
|
|||
echo "fix_data_dir.sh: kept all $nutts utterances."
|
||||
fi
|
||||
|
||||
for x in utt2lang utt2spk feats.scp text segments $maybe_wav; do
|
||||
for x in utt2spk feats.scp text segments utt2lang $maybe_wav; do
|
||||
if [ -f $data/$x ]; then
|
||||
cp $data/$x $data/.backup/$x
|
||||
if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then
|
||||
|
@ -203,9 +166,6 @@ function filter_utts {
|
|||
}
|
||||
|
||||
filter_recordings
|
||||
if [ $lang -eq 1 ] ; then
|
||||
filter_langs
|
||||
fi
|
||||
filter_speakers
|
||||
filter_utts
|
||||
filter_recordings
|
||||
|
|
|
@ -61,7 +61,7 @@ if [ ! -d $s1 ]; then
|
|||
else
|
||||
need_to_split=false
|
||||
for f in utt2spk spk2utt feats.scp text wav.scp cmvn.scp spk2gender \
|
||||
vad.scp segments reco2file_and_channel; do
|
||||
vad.scp segments reco2file_and_channel utt2lang; do
|
||||
if [[ -f $data/$f && ( ! -f $s1/$f || $s1/$f -ot $data/$f ) ]]; then
|
||||
need_to_split=true
|
||||
fi
|
||||
|
@ -78,6 +78,7 @@ for n in `seq $numsplit`; do
|
|||
vads="$vads $data/split$numsplit/$n/vad.scp"
|
||||
texts="$texts $data/split$numsplit/$n/text"
|
||||
utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk"
|
||||
utt2langs="$utt2langs $data/split$numsplit/$n/utt2lang"
|
||||
done
|
||||
|
||||
if $split_per_spk; then
|
||||
|
@ -94,6 +95,8 @@ utils/split_scp.pl $utt2spk_opt $data/feats.scp $feats || exit 1
|
|||
|
||||
[ -f $data/vad.scp ] && utils/split_scp.pl $utt2spk_opt $data/vad.scp $vads
|
||||
|
||||
[ -f $data/utt2lang ] && utils/split_scp.pl $utt2spk_opt $data/utt2lang $utt2langs
|
||||
|
||||
# If lockfile is not installed, just don't lock it. It's not a big deal.
|
||||
which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче