From b4106d7dec7443d94042258d9eb0f0bc5a8c09af Mon Sep 17 00:00:00 2001
From: Vimal Manohar <vimal.manohar91@gmail.com>
Date: Tue, 22 Mar 2016 03:38:54 -0400
Subject: [PATCH] rt03 script

---
 egs/fisher_swbd/s5/local/rt03_data_prep.sh    |   4 +-
 egs/fisher_swbd/s5/local/score_sclite.sh      |   8 +-
 .../s5c/local/nnet3/run_ivector_common.sh     |   4 +-
 egs/swbd/s5c/local/rt03_data_prep.sh          | 113 ++++++++++++++++++
 egs/swbd/s5c/local/score_sclite.sh            |  40 +++++--
 5 files changed, 152 insertions(+), 17 deletions(-)
 create mode 100755 egs/swbd/s5c/local/rt03_data_prep.sh

diff --git a/egs/fisher_swbd/s5/local/rt03_data_prep.sh b/egs/fisher_swbd/s5/local/rt03_data_prep.sh
index 35dd9b399..a18637a6a 100755
--- a/egs/fisher_swbd/s5/local/rt03_data_prep.sh
+++ b/egs/fisher_swbd/s5/local/rt03_data_prep.sh
@@ -51,7 +51,7 @@ awk -v sph2pipe=$sph2pipe '{
 # en_4156 A unknown_speaker 301.85 302.48
 
 #grep -v ';;' $pem \
-cat $tdir/*.stm | grep -v ';;' \
+cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
   | awk '{
            spk=$1"-"(($2==1)?"A":"B");
            utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
@@ -63,7 +63,7 @@ cat $tdir/*.stm | grep -v ';;' \
 # TODO(arnab): We should really be lowercasing this since the Edinburgh
 # recipe uses lowercase. This is not used in the actual scoring.
 #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
-cat $tdir/*.stm | grep -v ';;' \
+cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
   | awk '{
            spk=$1"-"(($2==1)?"A":"B");
            utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
diff --git a/egs/fisher_swbd/s5/local/score_sclite.sh b/egs/fisher_swbd/s5/local/score_sclite.sh
index c8f29d68b..a5ac4932e 100755
--- a/egs/fisher_swbd/s5/local/score_sclite.sh
+++ b/egs/fisher_swbd/s5/local/score_sclite.sh
@@ -128,8 +128,8 @@ rt03* )
   if [ $stage -le 3 ]; then
     for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
       $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \
-        grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
-        grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
+        grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
+        grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
         $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1;
     done
   fi
@@ -137,8 +137,8 @@ rt03* )
   if [ $stage -le 3 ]; then
     for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
       $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.fsh.LMWT.${wip}.log \
-        grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \
-        grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \
+        grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \
+        grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \
         $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.fsh $dir/score_LMWT_${wip}/${name}.ctm.fsh || exit 1;
     done
   fi
diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh
index f1e335426..3f7c782ff 100755
--- a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh
+++ b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh
@@ -81,7 +81,7 @@ for line in sys.stdin.readlines():
     utils/fix_data_dir.sh data/${dataset}_hires;
   done
 
-  for dataset in eval2000 train_dev; do
+  for dataset in eval2000 train_dev rt03; do
     # Create MFCCs for the eval set
     utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
     steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \
@@ -133,7 +133,7 @@ if [ $stage -le 8 ]; then
   steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
     data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1;
 
-  for data_set in eval2000 train_dev; do
+  for data_set in eval2000 train_dev rt03; do
     steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
       data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1;
   done
diff --git a/egs/swbd/s5c/local/rt03_data_prep.sh b/egs/swbd/s5c/local/rt03_data_prep.sh
new file mode 100755
index 000000000..a18637a6a
--- /dev/null
+++ b/egs/swbd/s5c/local/rt03_data_prep.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+# RT-03 data preparation (conversational telephone speech part only) 
+# Adapted from Arnab Ghoshal's script for Hub-5 Eval 2000 by Peng Qi
+
+# To be run from one directory above this script.
+
+# Expects the standard directory layout for RT-03
+
+if [ $# -ne 1 ]; then
+  echo "Usage: "`basename $0`" <rt03-dir>"
+  echo "See comments in the script for more details"
+  exit 1
+fi
+
+sdir=$1
+[ ! -d $sdir/data/audio/eval03/english/cts ] \
+  && echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1;
+[ ! -d $sdir/data/references/eval03/english/cts ] \
+  && echo Expecting directory $sdir/data/references/eval03/english/cts to be present && exit 1;
+
+. path.sh 
+
+dir=data/local/rt03
+mkdir -p $dir
+
+rtroot=$sdir
+tdir=$sdir/data/references/eval03/english/cts
+sdir=$sdir/data/audio/eval03/english/cts
+
+find $sdir -iname '*.sph' | sort > $dir/sph.flist
+sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
+  > $dir/sph.scp
+
+sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
+[ ! -x $sph2pipe ] \
+  && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
+
+awk -v sph2pipe=$sph2pipe '{
+  printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); 
+  printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
+}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
+#side A - channel 1, side B - channel 2
+
+# Get segments file...
+# segments file format is: utt-id side-id start-time end-time, e.g.:
+# sw02001-A_000098-001156 sw02001-A 0.98 11.56
+#pem=$sdir/english/hub5e_00.pem
+#[ ! -f $pem ] && echo "No such file $pem" && exit 1;
+# pem file has lines like: 
+# en_4156 A unknown_speaker 301.85 302.48
+
+#grep -v ';;' $pem \
+cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
+  | awk '{
+           spk=$1"-"(($2==1)?"A":"B");
+           utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
+           print utt,spk,$4,$5;}' \
+  | sort -u > $dir/segments
+
+# stm file has lines like:
+# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F>  HE IS A POLICE OFFICER 
+# TODO(arnab): We should really be lowercasing this since the Edinburgh
+# recipe uses lowercase. This is not used in the actual scoring.
+#grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
+cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \
+  | awk '{
+           spk=$1"-"(($2==1)?"A":"B");
+           utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
+           printf("%s", utt); for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \
+  | sort > $dir/text.all
+
+# We'll use the stm file for sclite scoring.  There seem to be various errors
+# in the stm file that upset hubscr.pl, and we fix them here.
+cat $tdir/*.stm | \
+  sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' | \
+  grep -v inter_segment_gap | \
+  awk '{
+           printf("%s", $1); if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\
+  > $dir/stm  
+#$tdir/reference/hub5e00.english.000405.stm >  $dir/stm
+cp $rtroot/data/trans_rules/en20030506.glm  $dir/glm
+
+# The next line uses bash process substitution, <( ... ).
+# Check that the utterance ids in text.all and segments (both built from the stm files) match.
+! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \
+   echo "Segments from pem file and stm file do not match." && exit 1;
+
+grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text
+   
+# create an utt2spk file that assumes each conversation side is
+# a separate speaker.
+awk '{print $1,$2;}' $dir/segments > $dir/utt2spk  
+utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+
+# cp $dir/segments $dir/segments.tmp
+# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
+#   $dir/segments.tmp > $dir/segments
+
+awk '{print $1}' $dir/wav.scp \
+  | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
+               print "$1-$2 $1 $2\n"; ' \
+  > $dir/reco2file_and_channel || exit 1;
+
+dest=data/rt03
+mkdir -p $dest
+for x in wav.scp segments text utt2spk spk2utt stm glm reco2file_and_channel; do
+  cp $dir/$x $dest/$x
+done
+
+echo Data preparation and formatting completed for RT-03
+echo "(but not MFCC extraction)"
+
diff --git a/egs/swbd/s5c/local/score_sclite.sh b/egs/swbd/s5c/local/score_sclite.sh
index 1b551d365..7ac33fdd2 100755
--- a/egs/swbd/s5c/local/score_sclite.sh
+++ b/egs/swbd/s5c/local/score_sclite.sh
@@ -110,23 +110,45 @@ if [ $stage -le 2 ]; then
 fi
 
 # For eval2000 score the subsets
-case "$name" in eval2000* )
-  # Score only the, swbd part...
+case "$name" in 
+  eval2000*)
+    # Score only the swbd part...
+    if [ $stage -le 3 ]; then
+      for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+        $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \
+          grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
+          grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
+          $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1;
+      done
+    fi
+    # Score only the callhome part...
+    if [ $stage -le 3 ]; then
+      for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+        $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \
+          grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \
+          grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \
+          $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.callhm $dir/score_LMWT_${wip}/${name}.ctm.callhm || exit 1;
+      done
+    fi
+    ;;
+rt03* )
+    
+  # Score only the swbd part...
   if [ $stage -le 3 ]; then
     for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
       $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \
-        grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
-        grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
+        grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
+        grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
         $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1;
     done
   fi
-  # Score only the, callhome part...
+  # Score only the fisher part...
   if [ $stage -le 3 ]; then
     for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
-      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \
-        grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \
-        grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \
-        $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.callhm $dir/score_LMWT_${wip}/${name}.ctm.callhm || exit 1;
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.fsh.LMWT.${wip}.log \
+        grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \
+        grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \
+        $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.fsh $dir/score_LMWT_${wip}/${name}.ctm.fsh || exit 1;
     done
   fi
  ;;