From b4106d7dec7443d94042258d9eb0f0bc5a8c09af Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Tue, 22 Mar 2016 03:38:54 -0400
Subject: [PATCH] rt03 script

---
Review notes (everything between the three-dash line and the first
"diff --git" line is ignored by `git am`; none of this is part of the commit).

What the patch does:

* Adds egs/swbd/s5c/local/rt03_data_prep.sh. It takes the RT-03 root
  directory as its only argument, works in data/local/rt03 and copies
  wav.scp, segments, text, utt2spk, spk2utt, stm, glm and
  reco2file_and_channel to data/rt03.
* egs/fisher_swbd/s5/local/rt03_data_prep.sh: the segments and text
  pipelines now also drop lines containing inter_segment_gap.
* egs/fisher_swbd/s5/local/score_sclite.sh: for rt03, the "swbd" subset now
  removes lines matching '^fsh_' and the "fsh" subset removes lines matching
  '^sw_' (the two filters were the other way round before).
* egs/swbd/s5c/local/score_sclite.sh: the eval2000 scoring is re-indented
  under its own case arm and an rt03* arm with swbd/fsh subsets is added.
* egs/swbd/s5c/local/nnet3/run_ivector_common.sh: rt03 is added to the
  hires-MFCC and i-vector extraction loops.

Changes made while restoring this patch from a whitespace-collapsed copy:

* New rt03_data_prep.sh: the "Expecting directory" message for the
  references directory expanded $tdir before tdir was assigned, so it printed
  a path with an empty prefix. It now uses $sdir, which is what the test on
  the preceding line checks.
* New rt03_data_prep.sh: the usage message had no argument placeholder and
  two sed expressions had empty patterns ('s:::g'), which makes sed reuse the
  previous regex instead of doing nothing. They are restored here as
  <rt03-dir>, <B_ASIDE> and <E_ASIDE>. TODO: confirm these three strings
  against the repository copy.
* Because of the fixes above, the blob id on the new file's "index" line
  (a18637a6a) no longer describes the content added here, and the new file
  is no longer byte-identical to the egs/fisher_swbd copy.
* Leading indentation of every line was reconstructed. If a context line
  fails to match, apply with `git apply --ignore-whitespace`.
* The From: header carries no e-mail address; `git am` may refuse the
  message until one is supplied.

 egs/fisher_swbd/s5/local/rt03_data_prep.sh    |   4 +-
 egs/fisher_swbd/s5/local/score_sclite.sh      |   8 +-
 .../s5c/local/nnet3/run_ivector_common.sh     |   4 +-
 egs/swbd/s5c/local/rt03_data_prep.sh          | 113 ++++++++++++++++++
 egs/swbd/s5c/local/score_sclite.sh            |  40 +++++--
 5 files changed, 152 insertions(+), 17 deletions(-)
 create mode 100755 egs/swbd/s5c/local/rt03_data_prep.sh

diff --git a/egs/fisher_swbd/s5/local/rt03_data_prep.sh b/egs/fisher_swbd/s5/local/rt03_data_prep.sh
index 35dd9b399..a18637a6a 100755
--- a/egs/fisher_swbd/s5/local/rt03_data_prep.sh
+++ b/egs/fisher_swbd/s5/local/rt03_data_prep.sh
@@ -51,7 +51,7 @@ awk -v sph2pipe=$sph2pipe '{
 # en_4156 A unknown_speaker 301.85 302.48
 
 #grep -v ';;' $pem \
-cat $tdir/*.stm | grep -v ';;' \
+cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
   | awk '{
            spk=$1"-"(($2==1)?"A":"B");
            utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
@@ -63,7 +63,7 @@ cat $tdir/*.stm | grep -v ';;' \
 # TODO(arnab): We should really be lowercasing this since the Edinburgh
 # recipe uses lowercase. This is not used in the actual scoring.
 #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
-cat $tdir/*.stm | grep -v ';;' \
+cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
   | awk '{
            spk=$1"-"(($2==1)?"A":"B");
            utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
diff --git a/egs/fisher_swbd/s5/local/score_sclite.sh b/egs/fisher_swbd/s5/local/score_sclite.sh
index c8f29d68b..a5ac4932e 100755
--- a/egs/fisher_swbd/s5/local/score_sclite.sh
+++ b/egs/fisher_swbd/s5/local/score_sclite.sh
@@ -128,8 +128,8 @@ rt03* )
   if [ $stage -le 3 ]; then
     for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
       $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \
-        grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
-        grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
+        grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
+        grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
         $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1;
     done
   fi
@@ -137,8 +137,8 @@ rt03* )
   if [ $stage -le 3 ]; then
     for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
       $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.fsh.LMWT.${wip}.log \
-        grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \
-        grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \
+        grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \
+        grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \
         $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.fsh $dir/score_LMWT_${wip}/${name}.ctm.fsh || exit 1;
     done
   fi
diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh
index f1e335426..3f7c782ff 100755
--- a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh
+++ b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh
@@ -81,7 +81,7 @@ for line in sys.stdin.readlines():
     utils/fix_data_dir.sh data/${dataset}_hires;
   done
 
-  for dataset in eval2000 train_dev; do
+  for dataset in eval2000 train_dev rt03; do
     # Create MFCCs for the eval set
     utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
     steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \
@@ -133,7 +133,7 @@ if [ $stage -le 8 ]; then
   steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
     data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1;
 
-  for data_set in eval2000 train_dev; do
+  for data_set in eval2000 train_dev rt03; do
     steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
       data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1;
   done
diff --git a/egs/swbd/s5c/local/rt03_data_prep.sh b/egs/swbd/s5c/local/rt03_data_prep.sh
new file mode 100755
index 000000000..a18637a6a
--- /dev/null
+++ b/egs/swbd/s5c/local/rt03_data_prep.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+# RT-03 data preparation (conversational telephone speech part only)
+# Adapted from Arnab Ghoshal's script for Hub-5 Eval 2000 by Peng Qi
+
+# To be run from one directory above this script.
+
+# Expects the standard directory layout for RT-03
+
+if [ $# -ne 1 ]; then
+  echo "Usage: "`basename $0`" <rt03-dir>"
+  echo "See comments in the script for more details"
+  exit 1
+fi
+
+sdir=$1
+[ ! -d $sdir/data/audio/eval03/english/cts ] \
+  && echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1;
+[ ! -d $sdir/data/references/eval03/english/cts ] \
+  && echo Expecting directory $sdir/data/references/eval03/english/cts to be present && exit 1;
+
+. path.sh
+
+dir=data/local/rt03
+mkdir -p $dir
+
+rtroot=$sdir
+tdir=$sdir/data/references/eval03/english/cts
+sdir=$sdir/data/audio/eval03/english/cts
+
+find $sdir -iname '*.sph' | sort > $dir/sph.flist
+sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
+  > $dir/sph.scp
+
+sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
+[ ! -x $sph2pipe ] \
+  && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
+
+awk -v sph2pipe=$sph2pipe '{
+  printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
+  printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
+}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
+#side A - channel 1, side B - channel 2
+
+# Get segments file...
+# segments file format is: utt-id side-id start-time end-time, e.g.:
+# sw02001-A_000098-001156 sw02001-A 0.98 11.56
+#pem=$sdir/english/hub5e_00.pem
+#[ ! -f $pem ] && echo "No such file $pem" && exit 1;
+# pem file has lines like:
+# en_4156 A unknown_speaker 301.85 302.48
+
+#grep -v ';;' $pem \
+cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
+  | awk '{
+           spk=$1"-"(($2==1)?"A":"B");
+           utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
+           print utt,spk,$4,$5;}' \
+  | sort -u > $dir/segments
+
+# stm file has lines like:
+# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER
+# TODO(arnab): We should really be lowercasing this since the Edinburgh
+# recipe uses lowercase. This is not used in the actual scoring.
+#grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
+cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
+  | awk '{
+           spk=$1"-"(($2==1)?"A":"B");
+           utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
+           printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \
+  | sort > $dir/text.all
+
+# We'll use the stm file for sclite scoring. There seem to be various errors
+# in the stm file that upset hubscr.pl, and we fix them here.
+cat $tdir/*.stm | \
+  sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' | \
+  grep -v inter_segment_gap | \
+  awk '{
+    printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\
+  > $dir/stm
+#$tdir/reference/hub5e00.english.000405.stm > $dir/stm
+cp $rtroot/data/trans_rules/en20030506.glm $dir/glm
+
+# next line uses command substitution
+# Just checking that the segments are the same in pem vs. stm.
+! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \
+  echo "Segments from pem file and stm file do not match." && exit 1;
+
+grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text
+
+# create an utt2spk file that assumes each conversation side is
+# a separate speaker.
+awk '{print $1,$2;}' $dir/segments > $dir/utt2spk
+utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+
+# cp $dir/segments $dir/segments.tmp
+# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
+#   $dir/segments.tmp > $dir/segments
+
+awk '{print $1}' $dir/wav.scp \
+  | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
+               print "$1-$2 $1 $2\n"; ' \
+  > $dir/reco2file_and_channel || exit 1;
+
+dest=data/rt03
+mkdir -p $dest
+for x in wav.scp segments text utt2spk spk2utt stm glm reco2file_and_channel; do
+  cp $dir/$x $dest/$x
+done
+
+echo Data preparation and formatting completed for RT-03
+echo "(but not MFCC extraction)"
+
diff --git a/egs/swbd/s5c/local/score_sclite.sh b/egs/swbd/s5c/local/score_sclite.sh
index 1b551d365..7ac33fdd2 100755
--- a/egs/swbd/s5c/local/score_sclite.sh
+++ b/egs/swbd/s5c/local/score_sclite.sh
@@ -110,23 +110,45 @@ if [ $stage -le 2 ]; then
 fi
 
 # For eval2000 score the subsets
-case "$name" in eval2000* )
-  # Score only the, swbd part...
+case "$name" in
+  eval2000*)
+    # Score only the, swbd part...
+    if [ $stage -le 3 ]; then
+      for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+        $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \
+          grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
+          grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
+          $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1;
+      done
+    fi
+    # Score only the, callhome part...
+    if [ $stage -le 3 ]; then
+      for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+        $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \
+          grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \
+          grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \
+          $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.callhm $dir/score_LMWT_${wip}/${name}.ctm.callhm || exit 1;
+      done
+    fi
+    ;;
+rt03* )
+
+  # Score only the swbd part...
   if [ $stage -le 3 ]; then
     for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
       $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \
-        grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
-        grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
+        grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
+        grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
         $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1;
     done
   fi
-  # Score only the, callhome part...
+  # Score only the fisher part...
   if [ $stage -le 3 ]; then
     for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
-      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \
-        grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \
-        grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \
-        $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.callhm $dir/score_LMWT_${wip}/${name}.ctm.callhm || exit 1;
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.fsh.LMWT.${wip}.log \
+        grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \
+        grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \
+        $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.fsh $dir/score_LMWT_${wip}/${name}.ctm.fsh || exit 1;
     done
   fi
   ;;