sandbox/pawel: scoring, final beamforming and mdm data prep scripts

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/pawel@4085 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2014-06-27 07:34:10 +00:00 · 2014-06-27 07:34:10 +00:00 · 618ecf2300
--- a/egs/ami/s5/local/ami_beamform.sh
+++ b/egs/ami/s5/local/ami_beamform.sh
@ -58,40 +58,17 @@ do
  echo $channels >> $wdir/channels_$numch
 done < $meetings

-######
+# Noise cancellation (Wiener filtering) is not implemented: bail out if requested.
+# The variable is quoted so an empty/unset value just compares as "not true".
+if [ "$wiener_filtering" == "true" ]; then
+  echo "Wiener filtering not yet implemented."
+  exit 1;
+fi
+
 #do beamforming
-######

 echo -e "Beamforming\n"

 $cmd JOB=0:$nj $wdir/log/beamform.JOB.log \
     local/beamformit.sh $nj JOB $numch $meetings $sdir $odir

-: << "C"
-(
-
-  utils/split_scp.pl -j $nj JOB $meetings $meetings.JOB
-
-  while read line; do
-    BeamformIt -s $line -c $wdir/channels_$numch \
-                        --config_file=conf/beamformit.cfg \
-                        --source_dir=$sdir \
-                        --result_dir=$odir/temp_dir \
-                        --do_compute_reference=1
-
-    mkdir -p $odir/$line 
-    mv $odir/temp_dir/$line/${line}_seg.del  $odir/$line/${line}_MDM$numch.del
-    mv $odir/temp_dir/$line/${line}_seg.del2 $odir/$line/${line}_MDM$numch.del2
-    mv $odir/temp_dir/$line/${line}_seg.info $odir/$line/${line}_MDM$numch.info
-    mv $odir/temp_dir/$line/${line}_seg.ovl  $odir/$line/${line}_MDM$numch.ovl
-    mv $odir/temp_dir/$line/${line}_seg.weat $odir/$line/${line}_MDM$numch.weat
-    mv $odir/temp_dir/$line/${line}_seg.wa*  $odir/$line/${line}_MDM$numch.wav
-    mv $odir/temp_dir/$line/${line}_seg2.wa* $odir/$line/${line}_MDM${numch}_seg2.wav
-   
-    rm -r $odir/temp_dir  
-  done < $meetings.JOB
-
-)
-C
-
-
--- a/egs/ami/s5/local/ami_mdm_scoring_data_prep_edin.sh
+++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep_edin.sh
@ -1,104 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
-# AMI Corpus dev/eval data preparation 
-
-. path.sh
-
-#check existing directories
-if [ $# != 4 ]; then
-  echo "Usage: ami_sdm_scoring_data_prep_edin.sh /path/to/AMI rt09-seg-file set-name mic"
-  exit 1; 
-fi 
-
-AMI_DIR=$1
-SEGS=$2 #assuming here all normalisation stuff was done
-SET=$3
-mic=$4
-
-tmpdir=data/local/$mic/$SET
-dir=data/$mic/$SET
-
-mkdir -p $tmpdir
-
-# Audio data directory check
-if [ ! -d $AMI_DIR ]; then
-  echo "Error: run.sh requires a directory argument"
-  exit 1; 
-fi  
-
-# find headset wav audio files only, here we again get all
-# the files in the corpora and filter only specific sessions
-# while building segments
-
-find $AMI_DIR -iname '*bmf[248].wav' | sort > $tmpdir/wav.flist
-n=`cat $tmpdir/wav.flist | wc -l`
-echo "In total, $n files were found."
-
-# (1a) Transcriptions preparation
-# here we start with rt09 transcriptions, hence not much to do
-
-awk '{meeting=$1; channel="MDM"; speaker=$3; stime=$4; etime=$5;
- printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5));
- for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort  > $tmpdir/text
-
-
-# (1c) Make segment files from transcript
-#segments file format is: utt-id side-id start-time end-time, e.g.:
-#AMI_ES2011a_H00_FEE041_0003415_0003484
-awk '{ 
-       segment=$1;
-       split(segment,S,"[_]");
-       audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6];
-       print segment " " audioname " " startf/100 " " endf/100 " " 0
-}' < $tmpdir/text > $tmpdir/segments
-
-#EN2001a.Array1-01.wav
-#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \
-#  > $dir/wav.scp
-
-sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \
- perl -ne 'split; $_ =~ m/(.*)_bmf([0-9])/; print "AMI_$1_MDM\n"' | \
-  paste - $tmpdir/wav.flist > $tmpdir/wav.scp
-
-#Keep only devset part of waves
-awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav.scp | sort -o $tmpdir/wav.scp
-
-# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
-# to the file name sw02001 and the A, e.g.
-# sw02001-A  sw02001 A
-# In this case it's trivial, but in other corpora the information might
-# be less obvious.  Later it will be needed for ctm scoring.
-
-awk '{print $1 $2}' $tmpdir/wav.scp | \
-  perl -ane '$_ =~ m:^(\S+MDM).*\/([IETB].*)\.wav$: || die "bad label $_"; 
-       print "$1 $2 0\n"; '\
-  > $tmpdir/reco2file_and_channel || exit 1;
-
-# we assume we adapt to the session only
-awk '{print $1}' $tmpdir/segments | \
-  perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; 
-          print "$1$2$3 $1\n";'  \
-    > $tmpdir/utt2spk || exit 1;
-
-sort -k 2 $tmpdir/utt2spk | utils/utt2spk_to_spk2utt.pl > $tmpdir/spk2utt || exit 1;
-
-# but we want to properly score the overlapped segments, hence we generate the extra
-# utt2spk_stm file containing speakers ids used to generate the stms for mdm/sdm case
-awk '{print $1}' $tmpdir/segments | \
-  perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; 
-          print "$1$2$3 $1$2\n";'  \
-    > $tmpdir/utt2spk_stm || exit 1;
-
-# Copy stuff into its final locations [this has been moved from the format_data
-# script]
-mkdir -p $dir
-for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do
-  cp $tmpdir/$f $dir/$f || exit 1;
-done
-
-cp local/english.glm $dir/glm
-utils/convert2stm.pl $dir utt2spk_stm > $dir/stm
-
-echo AMI $SET set data preparation succeeded.
-
--- a/egs/ami/s5/local/ami_prepare_dict.sh
+++ b/egs/ami/s5/local/ami_prepare_dict.sh
@ -1,8 +1,8 @@
 #!/bin/bash

-# Formatting the Mississippi State dictionary for use in Edinburgh. Differs 
-# from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013)
+#Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)

+# Formatting the CMUDict for use in the AMI recipe
 # To be run from one directory above this script.

 . path.sh
@ -39,13 +39,19 @@ echo sil > $dir/optional_silence.txt
 # have stress or tone.
 echo -n >$dir/extra_questions.txt

+# Prepend a few selected backchannels to the lexicon.
+# ('UM-HUH spn' is left out on purpose: it is already in CMUDict.)
+printf '%s\n' 'MM-HMM m' 'HMM eh m' 'MM m' \
+  | cat - $dir/lexicon1.txt \
+  > $dir/lexicon2.txt || exit 1;
+
 # Add to the lexicon the silences, noises etc.
 ( echo '!sil sil'; echo '[vocalized-noise] spn'; echo '[noise] nsn'; \
  echo '[laughter] lau'; echo '<unk> spn' ) \
-  | cat - $dir/lexicon1.txt  > $dir/lexicon2.txt || exit 1;
+  | cat - $dir/lexicon2.txt  > $dir/lexicon3.txt || exit 1;

 pushd $wdir >&/dev/null
-ln -sf lexicon2.txt lexicon.txt # This is the final lexicon.
+ln -sf lexicon3.txt lexicon.txt # This is the final lexicon.
 popd >&/dev/null

 echo Prepared input dictionary and phone-sets for AMI phase 1.
--- a/egs/ami/s5/local/beamformit.sh
+++ b/egs/ami/s5/local/beamformit.sh
@ -16,21 +16,18 @@ utils/split_scp.pl -j $nj $job $meetings $meetings.$job

 while read line; do

+  mkdir -p $odir/$line
  BeamformIt -s $line -c $wdir/channels_$numch \
                        --config_file `pwd`/conf/ami.cfg \
                        --source_dir $sdir \
-                        --result_dir $odir/temp_dir
-
+                        --result_dir $odir/$line
  mkdir -p $odir/$line
-  mv $odir/temp_dir/$line/${line}_seg.del  $odir/$line/${line}_MDM$numch.del
-  mv $odir/temp_dir/$line/${line}_seg.del2 $odir/$line/${line}_MDM$numch.del2
-  mv $odir/temp_dir/$line/${line}_seg.info $odir/$line/${line}_MDM$numch.info
-  mv $odir/temp_dir/$line/${line}_seg.ovl  $odir/$line/${line}_MDM$numch.ovl
-  mv $odir/temp_dir/$line/${line}_seg.weat $odir/$line/${line}_MDM$numch.weat
-  mv $odir/temp_dir/$line/${line}_seg.wa*  $odir/$line/${line}_MDM$numch.wav
-  mv $odir/temp_dir/$line/${line}_seg2.wa* $odir/$line/${line}_MDM${numch}_seg2.wav
-
-  rm -r $odir/temp_dir
+  # Tag the result files in $odir/$line with the microphone setup, i.e.
+  # rename <meeting>.<ext> to <meeting>_MDM<numch>.<ext> for each of the
+  # extensions listed below.
+  for suffix in del del2 info ovl weat wav; do
+    mv $odir/$line/${line}.$suffix $odir/$line/${line}_MDM$numch.$suffix
+  done

 done < $meetings.$job

--- a/egs/ami/s5/local/score.sh
+++ b/egs/ami/s5/local/score.sh
@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
+# Copyright University of Edinburgh (Author: Pawel Swietojanski) 2014
+# Apache 2.0
+
+orig_args=  # filled in below: each original argument wrapped in single quotes
+for x in "$@"; do orig_args="$orig_args '$x'"; done  # captured before parse_options.sh is sourced; replayed via eval further down (an argument containing ' would not survive the re-quoting)
+
+# begin configuration section.  we include all the options that score_sclite.sh or
+# score_basic.sh might need, or parse_options.sh will die.
+cmd=run.pl
+stage=0
+min_lmwt=9
+max_lmwt=20
+reverse=false
+asclite=false
+#end configuration section.
+
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then  # need exactly <data-dir> <lang-dir|graph-dir> <decode-dir>; otherwise print the full usage
+  echo "Usage: local/score.sh [options] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+  echo "    --min_lmwt <int>                # minumum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  echo "    --reverse (true/false)          # score with time reversed features "
+  echo "    --asclite (true/false)          # score with ascltie instead of sclite (overlapped speech)"
+  exit 1;  # wrong usage is an error: report failure to the caller
+fi
+
+data=$1
+
+# Pick the scorer: without $data/stm use score_basic.sh; with it, use
+# score_asclite.sh or score_sclite.sh depending on $asclite.
+if ! [ -f $data/stm ]; then
+  echo "$data/stm does not exist: using local/score_basic.sh"
+  eval local/score_basic.sh $orig_args
+elif $asclite; then
+  echo "$data/stm exists and asclite is $asclite: using local/score_asclite.sh"
+  eval local/score_asclite.sh $orig_args
+else
+  echo "$data/stm exists: using local/score_sclite.sh"
+  eval local/score_sclite.sh $orig_args
+fi
--- a/egs/ami/s5/local/score_asclite.sh
+++ b/egs/ami/s5/local/score_asclite.sh
@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.
+
+# begin configuration section.
+cmd=run.pl
+stage=0
+min_lmwt=9
+max_lmwt=20
+reverse=false
+asclite=false
+#end configuration section.
+
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: local/score_asclite.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+  echo "    --min_lmwt <int>                # minumum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  echo "    --reverse (true/false)          # score with time reversed features "
+  exit 1;
+fi
+
+data=$1
+lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
+dir=$3
+
+model=$dir/../final.mdl # assume model one level up from decoding dir.
+
+hubscr=$KALDI_ROOT/tools/sctk-2.4.0/bin/hubscr.pl 
+[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
+hubdir=`dirname $hubscr`
+
+for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \
+     $model $data/segments $data/reco2file_and_channel $dir/lat.1.gz; do
+  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
+done
+
+name=`basename $data`; # e.g. eval2000
+
+mkdir -p $dir/ascoring/log
+
+if [ $stage -le 0 ]; then  # stage 0: lattices -> $dir/ascore_LMWT/$name.ctm (LMWT range is handed to $cmd; presumably one job per weight - confirm)
+  if $reverse; then  # --reverse: adds lattice-reverse and aligns words with --reorder=false
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/get_ctm.LMWT.log \
+      mkdir -p $dir/ascore_LMWT/ '&&' \
+      lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+      lattice-reverse ark:- ark:- \| \
+      lattice-align-words --reorder=false $lang/phones/word_boundary.int $model ark:- ark:- \| \
+      nbest-to-ctm ark:- - \| \
+      utils/int2sym.pl -f 5 $lang/words.txt  \| \
+      utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
+      '>' $dir/ascore_LMWT/$name.ctm || exit 1;
+  else  # default: same pipeline without the reversal step
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/get_ctm.LMWT.log \
+      mkdir -p $dir/ascore_LMWT/ '&&' \
+      lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+      lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
+      nbest-to-ctm ark:- - \| \
+      utils/int2sym.pl -f 5 $lang/words.txt  \| \
+      utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
+      '>' $dir/ascore_LMWT/$name.ctm || exit 1;
+  fi
+fi
+
+if [ $stage -le 1 ]; then
+  # Drop tokens we don't score ([noise], [laughter], [vocalized-noise], <unk>) from each ctm.
+  # The regex group is required: ungrouped, '|' would also match the bare word "laughter".
+  for x in $dir/ascore_*/$name.ctm; do
+    cp $x $dir/tmpf;
+    grep -i -v -E '\[(noise|laughter|vocalized-noise)\]' $dir/tmpf | \
+      grep -i -v -E '<unk>' > $x;
+  done
+fi
+
+if [ $stage -le 2 ]; then  # stage 2: copy the stm into each ascore_LMWT dir, then run $hubscr on that stm and the matching ctm
+  $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/score.LMWT.log \
+    cp $data/stm $dir/ascore_LMWT/ '&&' \
+    $hubscr -G -v -m 1:2 -o4 -a -C -B 8192 -p $hubdir -V -l english \
+         -h rt-stt -g $data/glm -r $dir/ascore_LMWT/stm $dir/ascore_LMWT/${name}.ctm || exit 1;
+fi
+
+exit 0
--- a/egs/ami/s5/path.sh
+++ b/egs/ami/s5/path.sh
@ -10,7 +10,8 @@ KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/tiedbin

 FSTBIN=$KALDI_ROOT/tools/openfst/bin
 LMBIN=$KALDI_ROOT/tools/irstlm/bin
-BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt-3.51
+BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt-3.51  # default: the copy under the Kaldi tools directory
+[ -d $BEAMFORMIT ] || BEAMFORMIT=/disk/data1/s1136550/BeamformIt-3.51  # site-local fallback when the tools copy is absent

 [ -d $PWD/local ] || { echo "Error: 'local' subdirectory not found."; }
 [ -d $PWD/utils ] || { echo "Error: 'utils' subdirectory not found."; }
--- a/egs/ami/s5/run_mdm.sh
+++ b/egs/ami/s5/run_mdm.sh
@ -10,14 +10,18 @@ mic=mdm$nmics
 AMI_DIR=
 #AMI_DIR=/gpfs/scratch/s1136550/ami/amicorpus
 AMI_DIR=/disk/data2/amicorpus
+MDM_DIR=/disk/data1/s1136550/ami/mdm

-local/ami_beamform.sh --nj 12 $nmics $AMI_DIR /disk/data1/ami
+#local/ami_beamform.sh --nj 12 $nmics $AMI_DIR /disk/data1/s1136550/ami/mdm

-exit 1;
 #PREPARE DATA STARTING FROM RT09 SEGMENTATIONS

-local/ami_text_prep.sh
-local/ami_mdm_data_prep.sh $AMI_DIR
+#local/ami_text_prep.sh
+local/ami_mdm_data_prep.sh $MDM_DIR $mic
+local/ami_mdm_scoring_data_prep.sh $MDM_DIR $mic dev
+local/ami_mdm_scoring_data_prep.sh $MDM_DIR $mic eval
+
+exit 1;

 # We will keep the dict and lang the same as in IHM case
 # local/ami_prepare_dict.sh
--- a/tools/Makefile
+++ b/tools/Makefile
@ -163,15 +163,14 @@ openblas_compiled:
 	-git clone git://github.com/xianyi/OpenBLAS
 	$(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=0 -C OpenBLAS all install

+beamformit: beamformit-3.51

-beamformit: beamformit-3.5
-
-.PHONY: beamformit-3.5
-
-beamformit-3.5: beamformit-3.5.tgz
-
-beamformit-3.5.tgz:
-    wget http://www.xavieranguera.com/beamformit/releases/BeamformIt-3.5.tgz
+.PHONY: beamformit-3.51

+beamformit-3.51: beamformit-3.51.tgz  # unpack the downloaded release, configure it with cmake, then build it in place
+	tar -xozf BeamformIt-3.51.tgz && cd BeamformIt-3.51 && cmake .
+	$(MAKE) -C BeamformIt-3.51

+beamformit-3.51.tgz:
+	wget -c -T 10 http://www.xavieranguera.com/beamformit/releases/BeamformIt-3.51.tgz