diff --git a/egs/ami/README.txt b/egs/ami/README.txt new file mode 100644 index 000000000..17eca146c --- /dev/null +++ b/egs/ami/README.txt @@ -0,0 +1,32 @@ + +About the AMI corpus: + +WEB: http://groups.inf.ed.ac.uk/ami/corpus/ +LICENCE: http://groups.inf.ed.ac.uk/ami/corpus/license.shtml + +"The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals synchronized to a common timeline. These include close-talking and far-field microphones, individual and room-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings, the participants also have unsynchronized pens available to them that record what is written. The meetings were recorded in English using three different rooms with different acoustic properties, and include mostly non-native speakers." See http://groups.inf.ed.ac.uk/ami/corpus/overview.shtml for more details. + + +About the recipe: + +s5) + +The scripts under this directory build systems using AMI data only; this includes the training, development and evaluation sets (following the Full ASR split on http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml). This is different from RT evaluation campaigns, which usually combined a couple of different meeting datasets from multiple sources. In general, the recipe reproduces the baseline systems built in [1] but without proprietary components*, which means we use CMUDict [2] and in the future will try to use open texts to estimate the background language model. + +Currently, one can build the systems for the close-talking scenario, to which we refer as +-- IHM (Individual Headset Microphones) +and two variants of distant speech +-- SDM (Single Distant Microphone) using 1st micarray and, +-- MDM (Multiple Distant Microphones) where the mics are combined using BeamformIt [3] toolkit. 
+ +To run all sub-recipes the following (non-standard) software is expected to be installed +1) SRILM - to build language models (look at KALDI_ROOT/tools/install_srilm.sh) +2) BeamformIt (for the MDM scenario, installed with Kaldi tools) +3) Java (optional, but if available will be used to extract transcripts from XML) + +[1] "Hybrid acoustic models for distant and multichannel large vocabulary speech recognition", Pawel Swietojanski, Arnab Ghoshal and Steve Renals, In Proc. ASRU, December 2013 +[2] http://www.speech.cs.cmu.edu/cgi-bin/cmudict +[3] "Acoustic beamforming for speaker diarization of meetings", Xavier Anguera, Chuck Wooters and Javier Hernando, IEEE Transactions on Audio, Speech and Language Processing, September 2007, volume 15, number 7, pp.2011-2023. + +*) there is still an optional dependency on the Fisher transcripts (LDC2004T19, LDC2005T19) to build the background language model and closely reproduce [1]. + diff --git a/egs/ami/s5/RESULTS_ihm b/egs/ami/s5/RESULTS_ihm new file mode 100644 index 000000000..23c9eee61 --- /dev/null +++ b/egs/ami/s5/RESULTS_ihm @@ -0,0 +1,15 @@ + + +dev +exp/ihm/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.dtl:Percent Total Error = 38.0% (35925) +exp/ihm/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_14/dev.ctm.filt.dtl:Percent Total Error = 35.3% (33329) +exp/ihm/tri4a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.dtl:Percent Total Error = 32.1% (30364) +exp/ihm/tri4a_mmi_b0.1/decode_dev_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/dev.ctm.filt.dtl:Percent Total Error = 29.9% (28220) + +eval +exp/ihm/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval.ctm.filt.dtl:Percent Total Error = 43.7% (39330) +exp/ihm/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.dtl:Percent Total Error = 40.4% (36385) +exp/ihm/tri4a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval_o4.ctm.filt.dtl:Percent Total Error = 35.0% (31463) 
+exp/ihm/tri4a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.dtl:Percent Total Error = 31.7% (28518) + + diff --git a/egs/ami/s5/RESULTS_mdm b/egs/ami/s5/RESULTS_mdm new file mode 100644 index 000000000..200c42257 --- /dev/null +++ b/egs/ami/s5/RESULTS_mdm @@ -0,0 +1,15 @@ + + +#Beamforming of 8 microphones, WER scores with up to 4 overlapping speakers + +dev +exp/mdm8/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 58.8% (55568) +exp/mdm8/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 57.0% (53855) +exp/mdm8/tri3a_mmi_b0.1/decode_dev_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/dev_o4.ctm.filt.dtl:Percent Total Error = 54.9% (51926) + +eval +exp/mdm8/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval_o4.ctm.filt.dtl:Percent Total Error = 64.4% (57916) +exp/mdm8/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval_o4.ctm.filt.dtl:Percent Total Error = 61.9% (55738) +exp/mdm8/tri3a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/eval_o4.ctm.filt.dtl:Percent Total Error = 59.3% (53370) + + diff --git a/egs/ami/s5/RESULTS_sdm b/egs/ami/s5/RESULTS_sdm new file mode 100644 index 000000000..b417bad79 --- /dev/null +++ b/egs/ami/s5/RESULTS_sdm @@ -0,0 +1,14 @@ + + +#the below are WER scores with up to 4 overlapping speakers + +dev +exp/sdm1/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 66.9% (63190) +exp/sdm1/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 64.5% (60963) +exp/sdm1/tri3a_mmi_b0.1/decode_dev_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/dev_o4.ctm.filt.dtl:Percent Total Error = 62.2% (58772) + +eval +exp/sdm1/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval_o4.ctm.filt.dtl:Percent Total Error = 71.8% (64577) +exp/sdm1/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.dtl:Percent Total Error = 69.5% (62576) 
+exp/sdm1/tri3a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/eval_o4.ctm.filt.dtl:Percent Total Error = 67.2% (60447) + diff --git a/egs/ami/s5/cmd.sh b/egs/ami/s5/cmd.sh new file mode 100644 index 000000000..ea88482dd --- /dev/null +++ b/egs/ami/s5/cmd.sh @@ -0,0 +1,17 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. + +# On Eddie use: +#export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00" +#export decode_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4" +#export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4" +#export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00" + +# To run locally, use: +export train_cmd=run.pl +export decode_cmd=run.pl +export highmem_cmd=run.pl diff --git a/egs/ami/s5/conf/ami.cfg b/egs/ami/s5/conf/ami.cfg new file mode 100644 index 000000000..70fdd8586 --- /dev/null +++ b/egs/ami/s5/conf/ami.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use 
the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/ami/s5/conf/decode.conf b/egs/ami/s5/conf/decode.conf new file mode 100644 index 000000000..c8a0ece58 --- /dev/null +++ b/egs/ami/s5/conf/decode.conf @@ -0,0 +1,3 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + diff --git a/egs/ami/s5/conf/fbank.conf b/egs/ami/s5/conf/fbank.conf new file mode 100644 index 000000000..df18aec63 --- /dev/null +++ b/egs/ami/s5/conf/fbank.conf @@ -0,0 +1,10 @@ +--window-type=hamming # disable Dans window, use the standard +--use-energy=false # only fbank outputs +--sample-frequency=16000 # AMI is sampled at 16kHz + +#--low-freq=64 # typical setup from Frantisek Grezl +#--high-freq=3800 +--dither=1 + +--num-mel-bins=40 # 8kHz so we use 15 bins +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/ami/s5/conf/mfcc.conf b/egs/ami/s5/conf/mfcc.conf new file mode 100644 index 000000000..a1aa3d6c1 --- /dev/null +++ b/egs/ami/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. 
+--sample-frequency=16000 diff --git a/egs/ami/s5/local/ami_beamform.sh b/egs/ami/s5/local/ami_beamform.sh new file mode 100755 index 000000000..058ac9c55 --- /dev/null +++ b/egs/ami/s5/local/ami_beamform.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +#Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) +#Apache 2.0 + +wiener_filtering=false +nj=4 +cmd=run.pl + +# End configuration section +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 4)" + echo "Usage: steps/ami_beamform.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --wiener-filtering # Cancel noise with Wiener filter prior to beamforming" + exit 1; +fi + +numch=$1 +sdir=$2 +odir=$3 +wdir=data/local/beamforming + +mkdir -p $odir +mkdir -p $wdir/log + +meetings=$wdir/meetings.list + +cat local/split_train.orig local/split_dev.orig local/split_eval.orig | sort > $meetings + +ch_inc=$((8/$numch)) +bmf= +for ch in `seq 1 $ch_inc 8`; do + bmf="$bmf $ch" +done + +echo "Will use the following channels: $bmf" + +#make the channel file +if [ -f $wdir/channels_$numch ]; then + rm $wdir/channels_$numch +fi +touch $wdir/channels_$numch + +while read line; +do + channels="$line " + for ch in $bmf; do + channels="$channels $line/audio/$line.Array1-0$ch.wav" + done + echo $channels >> $wdir/channels_$numch +done < $meetings + +#do noise cancellation + +if [ $wiener_filtering == "true" ]; then + echo "Wiener filtering not yet implemented." 
+ exit 1; +fi + +#do beamforming + +echo -e "Beamforming\n" + +$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ + local/beamformit.sh $nj JOB $numch $meetings $sdir $odir + diff --git a/egs/ami/s5/local/ami_download.sh b/egs/ami/s5/local/ami_download.sh new file mode 100755 index 000000000..49708128b --- /dev/null +++ b/egs/ami/s5/local/ami_download.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski, Jonathan Kilgour) + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo " where is either ihm, sdm or mdm and is download space." + exit 1; +fi + +mic=$1 +adir=$2 +amiurl=http://groups.inf.ed.ac.uk/ami +annotver=ami_public_manual_1.6.1 +wdir=data/local/downloads + +if [[ ! "$mic" =~ ^(ihm|sdm|mdm)$ ]]; then + echo "$0. Wrong option." + exit 1; +fi + +mics="1 2 3 4 5 6 7 8" +if [ "$mic" == "sdm" ]; then + mics=1 +fi + +mkdir -p $adir +mkdir -p $wdir/log + +#download annotations + +annot="$adir/$annotver" +if [[ ! -d $adir/annotations || ! -f "$annot" ]]; then + echo "Downloading annotiations..." + wget -nv -O $annot.zip $amiurl/AMICorpusAnnotations/$annotver.zip &> $wdir/log/download_ami_annot.log + mkdir -p $adir/annotations + unzip -o -d $adir/annotations $annot.zip &> /dev/null +fi +[ ! -f "$adir/annotations/AMI-metadata.xml" ] && echo "$0: File AMI-Metadata.xml not found under $adir/annotations." 
&& exit 1; + +#download waves + +cat local/split_train.orig local/split_eval.orig local/split_dev.orig > $wdir/ami_meet_ids.flist + +wgetfile=$wdir/wget_$mic.sh +manifest="wget -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-04237-Sun-Jun-15-2014.manifest.txt" +license="wget -O $adir/LICENCE.TXT http://groups.inf.ed.ac.uk/ami/download/temp/Creative-Commons-Attribution-NonCommercial-ShareAlike-2.5.txt" + +echo "#!/bin/bash" > $wgetfile +echo $manifest >> $wgetfile +echo $license >> $wgetfile +while read line; do + if [ "$mic" == "ihm" ]; then + extra_headset= #some meetings have 5 sepakers (headsets) + for mtg in EN2001a EN2001d EN2001e; do + [ "$mtg" == "$line" ] && extra_headset=4; + done + for m in 0 1 2 3 $extra_headset; do + echo "wget -nv -c -P $adir/$line/audio $amiurl/AMICorpusMirror/amicorpus/$line/audio/$line.Headset-$m.wav" >> $wgetfile + done + else + for m in $mics; do + echo "wget -nv -c -P $adir/$line/audio $amiurl/AMICorpusMirror/amicorpus/$line/audio/$line.Array1-0$m.wav" >> $wgetfile + done + fi +done < $wdir/ami_meet_ids.flist + +chmod +x $wgetfile +echo "Downloading audio files for $mic scenario." +echo "Look at $wdir/log/download_ami_$mic.log for progress" +$wgetfile &> $wdir/log/download_ami_$mic.log + +#do rough check if #wavs is as expected, it will fail anyway in data prep stage if it isn't +if [ "$mic" == "ihm" ]; then + num_files=`find $adir -iname *Headset*` + if [ $num_files -ne 687 ]; then + echo "Warning: Found $num_files headset wavs but expected 687. Check $wdir/log/download_ami_$mic.log for details." + exit 1; + fi +else + num_files=`find $adir -iname *Array1*` + if [[ $num_files -lt 1352 && "$mic" == "mdm" ]]; then + echo "Warning: Found $num_files distant Array1 waves but expected 1352 for mdm. Check $wdir/log/download_ami_$mic.log for details." + exit 1; + elif [[ $num_files -lt 169 && "$mic" == "sdm" ]]; then + echo "Warning: Found $num_files distant Array1 waves but expected 169 for sdm. 
Check $wdir/log/download_ami_$mic.log for details." + exit 1; + fi +fi + +echo "Downloads of AMI corpus completed succesfully. License can be found under $adir/LICENCE.TXT" +exit 0; + + + diff --git a/egs/ami/s5/local/ami_format_data.sh b/egs/ami/s5/local/ami_format_data.sh new file mode 100755 index 000000000..b76f321bb --- /dev/null +++ b/egs/ami/s5/local/ami_format_data.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# + +if [ -f path.sh ]; then . path.sh; fi + +if [ $# -ne 1 ]; then + echo 'Usage: $0 ' + exit +fi + +silprob=0.5 +arpa_lm=$1 + +[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; + +cp -r data/lang data/lang_test + +# grep -v ' ' etc. is only for future-proofing this script. Our +# LM doesn't have these "invalid combinations". These can cause +# determinization failures of CLG [ends up being epsilon cycles]. +# Note: remove_oovs.pl takes a list of words in the LM that aren't in +# our word list. Since our LM doesn't have any, we just give it +# /dev/null [we leave it in the script to show how you'd do it]. +gunzip -c "$arpa_lm" | \ + grep -v ' ' | \ + grep -v ' ' | \ + grep -v ' ' | \ + arpa2fst - | fstprint | \ + utils/remove_oovs.pl /dev/null | \ + utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ + --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon > data/lang_test/G.fst + fstisstochastic data/lang_test/G.fst + +echo "Checking how stochastic G is (the first of these numbers should be small):" +fstisstochastic data/lang_test/G.fst + +## Check lexicon. +## just have a look and make sure it seems sane. +echo "First few lines of lexicon FST:" +fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head + +echo Performing further checks + +# Checking that G.fst is determinizable. +fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. + +# Checking that L_disambig.fst is determinizable. 
+fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. + +# Checking that disambiguated lexicon times G is determinizable +# Note: we do this with fstdeterminizestar not fstdeterminize, as +# fstdeterminize was taking forever (presumbaly relates to a bug +# in this version of OpenFst that makes determinization slow for +# some case). +fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ + fstdeterminizestar >/dev/null || echo Error + +# Checking that LG is stochastic: +fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ + fstisstochastic || echo LG is not stochastic + +echo AMI_format_data succeeded. + diff --git a/egs/ami/s5/local/ami_ihm_data_prep.sh b/egs/ami/s5/local/ami_ihm_data_prep.sh new file mode 100755 index 000000000..3a1d43d1e --- /dev/null +++ b/egs/ami/s5/local/ami_ihm_data_prep.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) +# AMI Corpus training data preparation +# Apache 2.0 + +# To be run from one directory above this script. + +. path.sh + +#check existing directories +if [ $# != 1 ]; then + echo "Usage: ami_ihm_data_prep.sh /path/to/AMI" + exit 1; +fi + +AMI_DIR=$1 + +SEGS=data/local/annotations/train.txt +dir=data/local/ihm/train +mkdir -p $dir + +# Audio data directory check +if [ ! -d $AMI_DIR ]; then + echo "Error: $AMI_DIR directory does not exists." + exit 1; +fi + +# And transcripts check +if [ ! -f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." + exit 1; +fi + + +# find headset wav audio files only +find $AMI_DIR -iname '*.Headset-*.wav' | sort > $dir/wav.flist +n=`cat $dir/wav.flist | wc -l` +echo "In total, $n headset files were found." 
+[ $n -ne 687 ] && \ + echo "Warning: expected 687 (168 mtgs x 4 mics + 3 mtgs x 5 mics) data files, found $n" + +# (1a) Transcriptions preparation +# here we start with normalised transcriptions, the utt ids follow the convention +# AMI_MEETING_CHAN_SPK_STIME_ETIME +# AMI_ES2011a_H00_FEE041_0003415_0003484 +# we use uniq as some (rare) entries are doubled in transcripts + +awk '{meeting=$1; channel=$2; speaker=$3; stime=$4; etime=$5; + printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); + for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort | uniq > $dir/text + +# (1b) Make segment files from transcript + +awk '{ + segment=$1; + split(segment,S,"[_]"); + audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; + print segment " " audioname " " startf*10/1000 " " endf*10/1000 " " +}' < $dir/text > $dir/segments + +# (1c) Make wav.scp file. + +sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ + perl -ne 'split; $_ =~ m/(.*)\..*\-([0-9])/; print "AMI_$1_H0$2\n"' | \ + paste - $dir/wav.flist > $dir/wav1.scp + +#Keep only train part of waves +awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp > $dir/wav2.scp + +#replace path with an appropriate sox command that select single channel only +awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp + +# (1d) reco2file_and_channel +cat $dir/wav.scp \ + | perl -ane '$_ =~ m:^(\S+)(H0[0-4])\s+.*\/([IETB].*)\.wav.*$: || die "bad label $_"; + print "$1$2 $3 A\n"; ' > $dir/reco2file_and_channel || exit 1; + + +awk '{print $1}' $dir/segments | \ + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + print "$1$2$3 $1$2\n";' > $dir/utt2spk || exit 1; + +sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; + +# Copy stuff into its final location +mkdir -p data/ihm/train +for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do + cp $dir/$f 
data/ihm/train/$f || exit 1; +done + +utils/validate_data_dir.sh --no-feats data/ihm/train || exit 1; + +echo AMI IHM data preparation succeeded. + diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh new file mode 100755 index 000000000..a0cca9c5f --- /dev/null +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) +# AMI Corpus dev/eval data preparation + +. path.sh + +#check existing directories +if [ $# != 2 ]; then + echo "Usage: ami_*_scoring_data_prep_edin.sh /path/to/AMI set-name" + exit 1; +fi + +AMI_DIR=$1 +SET=$2 +SEGS=data/local/annotations/$SET.txt + +dir=data/local/ihm/$SET +mkdir -p $dir + +# Audio data directory check +if [ ! -d $AMI_DIR ]; then + echo "Error: run.sh requires a directory argument" + exit 1; +fi + +# And transcripts check +if [ ! -f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." + exit 1; +fi + +# find headset wav audio files only, here we again get all +# the files in the corpora and filter only specific sessions +# while building segments + +find $AMI_DIR -iname '*.Headset-*.wav' | sort > $dir/wav.flist +n=`cat $dir/wav.flist | wc -l` +echo "In total, $n headset files were found." 
+[ $n -ne 687 ] && \ + echo "Warning: expected 687 (168 mtgs x 4 mics + 3 mtgs x 5 mics) data files, found $n" + +# (1a) Transcriptions preparation +# here we start with normalised transcriptions, the utt ids follow the convention +# AMI_MEETING_CHAN_SPK_STIME_ETIME +# AMI_ES2011a_H00_FEE041_0003415_0003484 + +awk '{meeting=$1; channel=$2; speaker=$3; stime=$4; etime=$5; + printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); + for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort | uniq > $dir/text + +# (1c) Make segment files from transcript +#segments file format is: utt-id side-id start-time end-time, e.g.: + +awk '{ + segment=$1; + split(segment,S,"[_]"); + audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; + print segment " " audioname " " startf*10/1000 " " endf*10/1000 " " +}' < $dir/text > $dir/segments + +#prepare wav.scp +sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ + perl -ne 'split; $_ =~ m/(.*)\..*\-([0-9])/; print "AMI_$1_H0$2\n"' | \ + paste - $dir/wav.flist > $dir/wav1.scp + +#Keep only train part of waves +awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp > $dir/wav2.scp + +#replace path with an appropriate sox command that select single channel only +awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp + +# (1d) reco2file_and_channel +cat $dir/wav.scp \ + | perl -ane '$_ =~ m:^(\S+)(H0[0-4])\s+.*\/([IETB].*)\.wav.*$: || die "bad label $_"; + print "$1$2 $3 A\n"; ' > $dir/reco2file_and_channel || exit 1; + +awk '{print $1}' $dir/segments | \ + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "segments: bad label $_"; + print "$1$2$3 $1$2\n";' > $dir/utt2spk || exit 1; + +sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; + +#check and correct the case when segment timings for given speaker overlap themself +#(important for simulatenous asclite scoring to proceed). 
+#There is actually only one such case for devset and automatic segmentetions +join $dir/utt2spkm $dir/segments | \ + perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; + if ($pu eq $_[1] && $pt > $_[3]) { + print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + } + $pu=$_[1]; $pt=$_[4]; + }' > $dir/segments_to_fix +if [ `cat $dir/segments_to_fix | wc -l` -gt 0 ]; then + echo "$0. Applying following fixes to segments" + cat $dir/segments_to_fix + while read line; do + p1=`echo $line | awk -F'>' '{print $1}'` + p2=`echo $line | awk -F'>' '{print $2}'` + sed -ir "s!$p1!$p2!" $dir/segments + done < $dir/segments_to_fix +fi + +# Copy stuff into its final locations +fdir=data/ihm/$SET +mkdir -p $fdir +for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do + cp $dir/$f $fdir/$f || exit 1; +done + +#Produce STMs for sclite scoring +local/convert2stm.pl $dir > $fdir/stm +cp local/english.glm $fdir/glm + +utils/validate_data_dir.sh --no-feats $fdir || exit 1; + +echo AMI $SET set data preparation succeeded. + diff --git a/egs/ami/s5/local/ami_mdm_data_prep.sh b/egs/ami/s5/local/ami_mdm_data_prep.sh new file mode 100755 index 000000000..bc7e4180b --- /dev/null +++ b/egs/ami/s5/local/ami_mdm_data_prep.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) +# AMI Corpus dev/eval data preparation + +# To be run from one directory above this script. + +. path.sh + +#check existing directories +if [ $# != 2 ]; then + echo "Usage: ami_data_prep.sh " + exit 1; +fi + +AMI_DIR=$1 +mic=$2 + +SEGS=data/local/annotations/train.txt +dir=data/local/$mic/train +odir=data/$mic/train +mkdir -p $dir + +# Audio data directory check +if [ ! -d $AMI_DIR ]; then + echo "Error: run.sh requires a directory argument" + exit 1; +fi + +# And transcripts check +if [ ! -f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." 
+ exit 1; +fi + +# find MDM mics +find $AMI_DIR -iname "*${mic}.wav" | sort > $dir/wav.flist + +n=`cat $dir/wav.flist | wc -l` +echo "In total, $n headset files were found." +[ $n -ne 169 ] && \ + echo Warning: expected 169 data data files, found $n + +# (1a) Transcriptions preparation +# here we start with rt09 transcriptions, hence not much to do + +awk '{meeting=$1; channel="MDM"; speaker=$3; stime=$4; etime=$5; + printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); + for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort | uniq > $dir/text + +# (1c) Make segment files from transcript +#segments file format is: utt-id side-id start-time end-time, e.g.: +#AMI_ES2011a_H00_FEE041_0003415_0003484 +awk '{ + segment=$1; + split(segment,S,"[_]"); + audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; + print segment " " audioname " " startf/100 " " endf/100 " " +}' < $dir/text > $dir/segments + +#EN2001a.Array1-01.wav +#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \ +# > $dir/wav.scp + +sed -e 's?.*/??' -e 's?.wav??' 
$dir/wav.flist | \ + perl -ne 'split; $_ =~ m/(.*)\_.*/; print "AMI_$1_MDM\n"' | \ + paste - $dir/wav.flist > $dir/wav1.scp + +#Keep only training part of waves +awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp | sort -o $dir/wav2.scp +#Two distant recordings are missing, agree segments with wav.scp +awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ + awk '{print $2" "$1" "$3" "$4" "$5}' > $dir/s; mv $dir/s $dir/segments +#...and text with segments +awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text + +#replace path with an appropriate sox command that select single channel only +awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp + +#prep reco2file_and_channel +cat $dir/wav.scp | \ + perl -ane '$_ =~ m:^(\S+MDM).*\/([IETB].*)\.wav.*$: || die "bad label $_"; + print "$1 $2 A\n"; ' > $dir/reco2file_and_channel || exit 1; + +# we assume we adapt to the session only +awk '{print $1}' $dir/segments | \ + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + print "$1$2$3 $1\n";' \ + > $dir/utt2spk || exit 1; + +sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; + +# Copy stuff into its final locations +mkdir -p $odir +for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do + cp $dir/$f $odir/$f | exit 1; +done + +utils/validate_data_dir.sh --no-feats $odir + +echo AMI MDM data preparation succeeded. + diff --git a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh new file mode 100755 index 000000000..406add86b --- /dev/null +++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) +# AMI Corpus dev/eval data preparation + +. 
path.sh + +#check existing directories +if [ $# != 3 ]; then + echo "Usage: ami_mdm_scoring_data_prep.sh /path/to/AMI-MDM mic-name set-name" + exit 1; +fi + +AMI_DIR=$1 +mic=$2 +SET=$3 + +SEGS=data/local/annotations/$SET.txt +tmpdir=data/local/$mic/$SET +dir=data/$mic/$SET + +mkdir -p $tmpdir + +# Audio data directory check +if [ ! -d $AMI_DIR ]; then + echo "Error: run.sh requires a directory argument" + exit 1; +fi + +# And transcripts check +if [ ! -f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." + exit 1; +fi + +# find selected mdm wav audio files only +find $AMI_DIR -iname "*${mic}.wav" | sort > $tmpdir/wav.flist +n=`cat $tmpdir/wav.flist | wc -l` +if [ $n -ne 169 ]; then + echo "Warning. Expected to find 169 files but found $n." +fi + +# (1a) Transcriptions preparation +awk '{meeting=$1; channel="MDM"; speaker=$3; stime=$4; etime=$5; + printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); + for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort | uniq > $tmpdir/text + +# (1c) Make segment files from transcript +#segments file format is: utt-id side-id start-time end-time, e.g.: +#AMI_ES2011a_H00_FEE041_0003415_0003484 +awk '{ + segment=$1; + split(segment,S,"[_]"); + audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; + print segment " " audioname " " startf/100 " " endf/100 " " +}' < $tmpdir/text > $tmpdir/segments + +#EN2001a.Array1-01.wav +#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \ +# > $dir/wav.scp + +sed -e 's?.*/??' -e 's?.wav??' 
$tmpdir/wav.flist | \ + perl -ne 'split; $_ =~ m/(.*)\_.*/; print "AMI_$1_MDM\n"' | \ + paste - $tmpdir/wav.flist > $tmpdir/wav1.scp + +#Keep only devset part of waves +awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp + +#replace path with an appropriate sox command that select single channel only +awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp + +#prep reco2file_and_channel +cat $tmpdir/wav.scp | \ + perl -ane '$_ =~ m:^(\S+MDM)\s+.*\/([IETB].*)\.wav.*$: || die "bad label $_"; + print "$1 $2 A\n"; ' > $tmpdir/reco2file_and_channel || exit 1; + +# we assume we adapt to the session only +awk '{print $1}' $tmpdir/segments | \ + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + print "$1$2$3 $1\n";' \ + > $tmpdir/utt2spk || exit 1; + +sort -k 2 $tmpdir/utt2spk | utils/utt2spk_to_spk2utt.pl > $tmpdir/spk2utt || exit 1; + +# but we want to properly score the overlapped segments, hence we generate the extra +# utt2spk_stm file containing speakers ids used to generate the stms for mdm/sdm case +awk '{print $1}' $tmpdir/segments | \ + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + print "$1$2$3 $1$2\n";' > $tmpdir/utt2spk_stm || exit 1; + +#check and correct case when segment timings for a given speaker overlap themself +#(important for simulatenous asclite scoring to proceed). +#There is actually only one such case for devset and automatic segmentetions +join $tmpdir/utt2spk_stm $tmpdir/segments | \ + perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; + if ($pu eq $_[1] && $pt > $_[3]) { + print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + } + $pu=$_[1]; $pt=$_[4]; + }' > $tmpdir/segments_to_fix +if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then + echo "$0. 
Applying following fixes to segments" + cat $tmpdir/segments_to_fix + while read line; do + p1=`echo $line | awk -F'>' '{print $1}'` + p2=`echo $line | awk -F'>' '{print $2}'` + sed -ir "s!$p1!$p2!" $tmpdir/segments + done < $tmpdir/segments_to_fix +fi + +# Copy stuff into its final locations [this has been moved from the format_data +# script] +mkdir -p $dir +for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do + cp $tmpdir/$f $dir/$f || exit 1; +done + +cp local/english.glm $dir/glm +#note, although utt2spk contains mappings to the whole meetings for simulatenous scoring +#we need to know which speakers overlap at meeting level, hence we generate an extra utt2spk_stm file +local/convert2stm.pl $dir utt2spk_stm > $dir/stm + +utils/validate_data_dir.sh --no-feats $dir + +echo AMI $SET set data preparation succeeded. + diff --git a/egs/ami/s5/local/ami_prepare_dict.sh b/egs/ami/s5/local/ami_prepare_dict.sh new file mode 100755 index 000000000..2f5c675f1 --- /dev/null +++ b/egs/ami/s5/local/ami_prepare_dict.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +#adapted from fisher dict preparation script, Author: Pawel Swietojanski + +dir=data/local/dict +mkdir -p $dir +echo "Getting CMU dictionary" +svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict + +# silence phones, one per line. +for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt +echo sil > $dir/optional_silence.txt + +# For this setup we're discarding stress. +cat $dir/cmudict/cmudict.0.7a.symbols | sed s/[0-9]//g | \ + perl -ane 's:\r::; print;' | sort | uniq > $dir/nonsilence_phones.txt + +# An extra question will be added by including the silence phones in one class. 
+cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; + +grep -v ';;;' $dir/cmudict/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; s: : :; print; }' | \ + sed s/[0-9]//g | sort | uniq > $dir/lexicon1_raw_nosil.txt || exit 1; + +#cat eddie_data/rt09.ami.ihmtrain09.v3.dct | sort > $dir/lexicon1_raw_nosil.txt + +# limit the vocabulary to the predefined 50k words +wget -nv -O $dir/wordlist.50k.gz http://www.openslr.org/resources/9/wordlist.50k.gz +gunzip -c $dir/wordlist.50k.gz > $dir/wordlist.50k +join $dir/lexicon1_raw_nosil.txt $dir/wordlist.50k > $dir/lexicon1_raw_nosil_50k.txt + +# Add prons for laughter, noise, oov +for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" +done | cat - $dir/lexicon1_raw_nosil_50k.txt > $dir/lexicon2_raw_50k.txt || exit 1; + +# add some specific words, those are only with 100 missing occurences or more +( echo "MM M"; \ + echo "HMM HH M"; \ + echo "MM-HMM M HH M"; \ + echo "COLOUR K AH L ER"; \ + echo "COLOURS K AH L ER Z"; \ + echo "REMOTES R IH M OW T Z"; \ + echo "FAVOURITE F EY V ER IH T"; \ + echo " oov" ) | cat - $dir/lexicon2_raw_50k.txt \ + | sort -u > $dir/lexicon3_extra_50k.txt + +cp $dir/lexicon3_extra_50k.txt $dir/lexicon.txt + +[ ! 
-f $dir/lexicon.txt ] && exit 1; + +# This is just for diagnostics: +cat data/ihm/train/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $dir/word_counts + +awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.txt + +echo "*Highest-count OOVs are:" +head -n 20 $dir/oov_counts.txt + +utils/validate_dict_dir.pl $dir diff --git a/egs/ami/s5/local/ami_sdm_data_prep.sh b/egs/ami/s5/local/ami_sdm_data_prep.sh new file mode 100755 index 000000000..8eda00f1d --- /dev/null +++ b/egs/ami/s5/local/ami_sdm_data_prep.sh @@ -0,0 +1,100 @@ +#!/bin/bash + +# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) +# AMI Corpus dev/eval data preparation + +. path.sh + +#check existing directories +if [ $# != 2 ]; then + echo "Usage: ami_sdm_data_prep.sh " + exit 1; +fi + +AMI_DIR=$1 +MICNUM=$2 +DSET="sdm$MICNUM" + +SEGS=data/local/annotations/train.txt +dir=data/local/$DSET/train +mkdir -p $dir + +# Audio data directory check +if [ ! -d $AMI_DIR ]; then + echo "Error: run.sh requires a directory argument" + exit 1; +fi + +# And transcripts check +if [ ! -f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." + exit 1; +fi + +# as the sdm we treat first mic from the array +find $AMI_DIR -iname "*.Array1-0$MICNUM.wav" | sort > $dir/wav.flist + +n=`cat $dir/wav.flist | wc -l` + +echo "In total, $n files were found." 
+[ $n -ne 169 ] && \ + echo Warning: expected 169 data files, found $n + +# (1a) Transcriptions preparation +# here we start with already normalised transcripts, just make the ids +# Note, we set here SDM rather than, for example, SDM1 as we want to easily use +# the same alignments across different mics + +awk '{meeting=$1; channel="SDM"; speaker=$3; stime=$4; etime=$5; + printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); + for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort | uniq > $dir/text + +# (1c) Make segment files from transcript +#segments file format is: utt-id side-id start-time end-time, e.g.: +#AMI_ES2011a_H00_FEE041_0003415_0003484 +awk '{ + segment=$1; + split(segment,S,"[_]"); + audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; + print segment " " audioname " " startf/100 " " endf/100 " " +}' < $dir/text > $dir/segments + +#EN2001a.Array1-01.wav + +sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ + perl -ne 'split; $_ =~ m/(.*)\..*/; print "AMI_$1_SDM\n"' | \ + paste - $dir/wav.flist > $dir/wav1.scp + +#Keep only training part of waves +awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp | sort -o $dir/wav2.scp +#Two distant recordings are missing, agree segments with wav.scp +awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ + awk '{print $2" "$1" "$3" "$4" "$5}' > $dir/s; mv $dir/s $dir/segments +#...and text with segments +awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text + +#replace path with an appropriate sox command that selects a single channel only +awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp + +# this file reco2file_and_channel maps recording-id +cat $dir/wav.scp | \ + perl -ane '$_ =~ m:^(\S+SDM)\s+.*\/([IETB].*)\.wav.*$: || die "bad label $_"; + print "$1 $2 A\n"; ' > $dir/reco2file_and_channel || exit 1; + +# Assumption, for sdm we adapt to the session only +awk 
'{print $1}' $dir/segments | \ + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + print "$1$2$3 $1\n";' | sort > $dir/utt2spk || exit 1; + +sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; + +# Copy stuff into its final locations +mkdir -p data/$DSET/train +for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do + cp $dir/$f data/$DSET/train/$f || exit 1; +done + +utils/validate_data_dir.sh --no-feats data/$DSET/train + +echo AMI $DSET data preparation succeeded. + diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh new file mode 100755 index 000000000..90690731e --- /dev/null +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) +# AMI Corpus dev/eval data preparation + +. path.sh + +#check existing directories +if [ $# != 3 ]; then + echo "Usage: ami_sdm_scoring_data_prep.sh " + exit 1; +fi + +AMI_DIR=$1 +MICNUM=$2 +SET=$3 +DSET="sdm$MICNUM" + +SEGS=data/local/annotations/$SET.txt +tmpdir=data/local/$DSET/$SET +dir=data/$DSET/$SET + +mkdir -p $tmpdir + +# Audio data directory check +if [ ! -d $AMI_DIR ]; then + echo "Error: run.sh requires a directory argument" + exit 1; +fi + +# And transcripts check +if [ ! -f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." + exit 1; +fi + +# find headset wav audio files only, here we again get all +# the files in the corpora and filter only specific sessions +# while building segments + +find $AMI_DIR -iname "*.Array1-0$MICNUM.wav" | sort > $tmpdir/wav.flist + +n=`cat $tmpdir/wav.flist | wc -l` +echo "In total, $n files were found." 
+ +# (1a) Transcriptions preparation +# here we start with normalised transcripts + +awk '{meeting=$1; channel="SDM"; speaker=$3; stime=$4; etime=$5; + printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); + for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort | uniq > $tmpdir/text + +# (1c) Make segment files from transcript +#segments file format is: utt-id side-id start-time end-time, e.g.: +#AMI_ES2011a_H00_FEE041_0003415_0003484 +awk '{ + segment=$1; + split(segment,S,"[_]"); + audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; + print segment " " audioname " " startf/100 " " endf/100 " " +}' < $tmpdir/text > $tmpdir/segments + +#EN2001a.Array1-01.wav +#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \ +# > $dir/wav.scp + +sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \ + perl -ne 'split; $_ =~ m/(.*)\..*/; print "AMI_$1_SDM\n"' | \ + paste - $tmpdir/wav.flist > $tmpdir/wav1.scp + +#Keep only devset part of waves +awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp + +#replace path with an appropriate sox command that select single channel only +awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp + +#prep reco2file_and_channel +cat $tmpdir/wav.scp | \ + perl -ane '$_ =~ m:^(\S+SDM).*\/([IETB].*)\.wav.*$: || die "bad label $_"; + print "$1 $2 A\n"; '\ + > $tmpdir/reco2file_and_channel || exit 1; + +# we assume we adapt to the session only +awk '{print $1}' $tmpdir/segments | \ + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + print "$1$2$3 $1\n";' \ + > $tmpdir/utt2spk || exit 1; + +sort -k 2 $tmpdir/utt2spk | utils/utt2spk_to_spk2utt.pl > $tmpdir/spk2utt || exit 1; + +# but we want to properly score the overlapped segments, hence we generate the extra +# utt2spk_stm file containing speakers ids used to generate the stms for mdm/sdm case +awk '{print 
$1}' $tmpdir/segments | \ + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + print "$1$2$3 $1$2\n";' \ + > $tmpdir/utt2spk_stm || exit 1; + +#check and correct the case when segment timings for given speaker overlap themself +#(important for simulatenous asclite scoring to proceed). +#There is actually only one such case for devset and automatic segmentetions +join $tmpdir/utt2spk_stm $tmpdir/segments | \ + perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; + if ($pu eq $_[1] && $pt > $_[3]) { + print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + } + $pu=$_[1]; $pt=$_[4]; + }' > $tmpdir/segments_to_fix +if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then + echo "$0. Applying following fixes to segments" + cat $tmpdir/segments_to_fix + while read line; do + p1=`echo $line | awk -F'>' '{print $1}'` + p2=`echo $line | awk -F'>' '{print $2}'` + sed -ir "s!$p1!$p2!" $tmpdir/segments + done < $tmpdir/segments_to_fix +fi + +# Copy stuff into its final locations [this has been moved from the format_data +# script] +mkdir -p $dir +for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do + cp $tmpdir/$f $dir/$f || exit 1; +done + +local/convert2stm.pl $dir utt2spk_stm > $dir/stm +cp local/english.glm $dir/glm + +utils/validate_data_dir.sh --no-feats $dir + +echo AMI $DSET scenario and $SET set data preparation succeeded. 
+ diff --git a/egs/ami/s5/local/ami_split_segments.pl b/egs/ami/s5/local/ami_split_segments.pl new file mode 100755 index 000000000..0bdc98196 --- /dev/null +++ b/egs/ami/s5/local/ami_split_segments.pl @@ -0,0 +1,218 @@ +#!/usr/bin/perl + +# Copyright 2014 University of Edinburgh (Author: Pawel Swietojanski) + +# The script - based on punctuation times - splits segments longer than #words (input parameter) +# and produces bit more more normalised form of transcripts, as follows +# MeetID Channel Spkr stime etime transcripts + +#use List::MoreUtils 'indexes'; +use strict; +use warnings; + +sub split_transcripts; +sub normalise_transcripts; + +sub merge_hashes { + my ($h1, $h2) = @_; + my %hash1 = %$h1; my %hash2 = %$h2; + foreach my $key2 ( keys %hash2 ) { + if( exists $hash1{$key2} ) { + warn "Key [$key2] is in both hashes!"; + next; + } else { + $hash1{$key2} = $hash2{$key2}; + } + } + return %hash1; +} + +sub print_hash { + my ($h) = @_; + my %hash = %$h; + foreach my $k (sort keys %hash) { + print "$k : $hash{$k}\n"; + } +} + +sub get_name { + #no warnings; + my $sname = sprintf("%07d_%07d", $_[0]*100, $_[1]*100) || die 'Input undefined!'; + #use warnings; + return $sname; +} + +sub split_on_comma { + + my ($text, $comma_times, $btime, $etime, $max_words_per_seg)= @_; + my %comma_hash = %$comma_times; + + print "Btime, Etime : $btime, $etime\n"; + + my $stime = ($etime+$btime)/2; #split time + my $skey = ""; + my $otime = $btime; + foreach my $k (sort {$comma_hash{$a} cmp $comma_hash{$b} } keys %comma_hash) { + print "Key : $k : $comma_hash{$k}\n"; + my $ktime = $comma_hash{$k}; + if ($ktime==$btime) { next; } + if ($ktime==$etime) { last; } + if (abs($stime-$ktime)/20) { + $st=$comma_hash{$skey}; + $et = $etime; + } + my (@utts) = split (' ', $utts1[$i]); + if ($#utts < $max_words_per_seg) { + my $nm = get_name($st, $et); + print "SplittedOnComma[$i]: $nm : $utts1[$i]\n"; + $transcripts{$nm} = $utts1[$i]; + } else { + print 'Continue splitting!'; + my 
%transcripts2 = split_on_comma($utts1[$i], \%comma_hash, $st, $et, $max_words_per_seg); + %transcripts = merge_hashes(\%transcripts, \%transcripts2); + } + } + return %transcripts; +} + +sub split_transcripts { + @_ == 4 || die 'split_transcripts: transcript btime etime max_word_per_seg'; + + my ($text, $btime, $etime, $max_words_per_seg) = @_; + my (@transcript) = @$text; + + my (@punct_indices) = grep { $transcript[$_] =~ /^[\.,\?\!\:]$/ } 0..$#transcript; + my (@time_indices) = grep { $transcript[$_] =~ /^[0-9]+\.[0-9]*/ } 0..$#transcript; + my (@puncts_times) = delete @transcript[@time_indices]; + my (@puncts) = @transcript[@punct_indices]; + + if ($#puncts_times != $#puncts) { + print 'Ooops, different number of punctuation signs and timestamps! Skipping.'; + return (); + } + + #first split on full stops + my (@full_stop_indices) = grep { $puncts[$_] =~ /[\.\?]/ } 0..$#puncts; + my (@full_stop_times) = @puncts_times[@full_stop_indices]; + + unshift (@full_stop_times, $btime); + push (@full_stop_times, $etime); + + my %comma_puncts = (); + for (my $i=0, my $j=0;$i<=$#punct_indices; $i++) { + my $lbl = "$transcript[$punct_indices[$i]]$j"; + if ($lbl =~ /[\.\?].+/) { next; } + $transcript[$punct_indices[$i]] = $lbl; + $comma_puncts{$lbl} = $puncts_times[$i]; + $j++; + } + + #print_hash(\%comma_puncts); + + print "InpTrans : @transcript\n"; + print "Full stops: @full_stop_times\n"; + + my @utts1 = split (/[\.\?]/, uc join(' ', @transcript)); + my %transcripts = (); + for (my $i=0; $i<=$#utts1; $i++) { + my (@utts) = split (' ', $utts1[$i]); + if ($#utts < $max_words_per_seg) { + print "ReadyTrans: $utts1[$i]\n"; + $transcripts{get_name($full_stop_times[$i], $full_stop_times[$i+1])} = $utts1[$i]; + } else { + print "TransToSplit: $utts1[$i]\n"; + my %transcripts2 = split_on_comma($utts1[$i], \%comma_puncts, $full_stop_times[$i], $full_stop_times[$i+1], $max_words_per_seg); + print "Hash TR2:\n"; print_hash(\%transcripts2); + print "Hash TR:\n"; 
print_hash(\%transcripts); + %transcripts = merge_hashes(\%transcripts, \%transcripts2); + print "Hash TR_NEW : \n"; print_hash(\%transcripts); + } + } + return %transcripts; +} + +sub normalise_transcripts { + my $text = $_[0]; + + #DO SOME ROUGH AND OBVIOUS PRELIMINARY NORMALISATION, AS FOLLOWS + #remove the remaining punctuation labels e.g. some text ,0 some text ,1 + $text =~ s/[\.\,\?\!\:][0-9]+//g; + #there are some extra spurious punctuations without spaces, e.g. UM,I, replace with space + $text =~ s/[A-Z']+,[A-Z']+/ /g; + #split word combinations, i.e. ANTI-TRUST to ANTI TRUST (None of them appears in cmudict anyway) + #$text =~ s/(.*)([A-Z])\s+(\-)(.*)/$1$2$3$4/g; + $text =~ s/\-/ /g; + #substitute X_M_L with X. M. L. etc. + $text =~ s/\_/. /g; + #normalise and trim spaces + $text =~ s/^\s*//g; + $text =~ s/\s*$//g; + $text =~ s/\s+/ /g; + #some transcripts are empty with -, nullify (and ignore) them + $text =~ s/^\-$//g; + $text =~ s/\s+\-$//; + # apply a few exceptions for dashed phrases, Mm-Hmm, Uh-Huh, etc. 
those are frequent in AMI + # and will be added to dictionary + $text =~ s/MM HMM/MM\-HMM/g; + $text =~ s/UH HUH/UH\-HUH/g; + + return $text; +} + +if (@ARGV != 2) { + print STDERR "Usage: ami_split_segments.pl \n"; + exit(1); +} + +my $meet_file = shift @ARGV; +my $out_file = shift @ARGV; +my %transcripts = (); + +open(W, ">$out_file") || die "opening output file $out_file"; +open(S, "<$meet_file") || die "opening meeting file $meet_file"; + +while() { + + my @A = split(" ", $_); + if (@A < 9) { print "Skipping line @A"; next; } + + my ($meet_id, $channel, $spk, $channel2, $trans_btime, $trans_etime, $aut_btime, $aut_etime) = @A[0..7]; + my @transcript = @A[8..$#A]; + my %transcript = split_transcripts(\@transcript, $aut_btime, $aut_etime, 30); + + for my $key (keys %transcript) { + my $value = $transcript{$key}; + my $segment = normalise_transcripts($value); + my @times = split(/\_/, $key); + if ($times[0] >= $times[1]) { + print "Warning, $meet_id, $spk, $times[0] > $times[1]. Skipping. \n"; next; + } + if (length($segment)>0) { + print W join " ", $meet_id, "H0${channel2}", $spk, $times[0]/100.0, $times[1]/100.0, $segment, "\n"; + } + } + +} +close(S); +close(W); + +print STDERR "Finished." diff --git a/egs/ami/s5/local/ami_text_prep.sh b/egs/ami/s5/local/ami_text_prep.sh new file mode 100755 index 000000000..2fbda5bb6 --- /dev/null +++ b/egs/ami/s5/local/ami_text_prep.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski), 2014, Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + exit 1; +fi + +amidir=$1 +wdir=data/local/annotations + +#extract text from AMI XML annotations +local/ami_xml2text.sh $amidir + +[ ! -f $wdir/transcripts1 ] && echo "$0: File $wdir/transcripts1 not found." && exit 1; + +echo "Preprocessing transcripts..." +local/ami_split_segments.pl $wdir/transcripts1 $wdir/transcripts2 &> $wdir/log/split_segments.log + +#make final train/dev/eval splits +for dset in train eval dev; do + [ ! 
-f local/split_$dset.final ] && cp local/split_$dset.orig local/split_$dset.final + grep -f local/split_$dset.final $wdir/transcripts2 > $wdir/$dset.txt +done + + + + + + + diff --git a/egs/ami/s5/local/ami_train_lms.sh b/egs/ami/s5/local/ami_train_lms.sh new file mode 100755 index 000000000..be2da466d --- /dev/null +++ b/egs/ami/s5/local/ami_train_lms.sh @@ -0,0 +1,176 @@ +#!/bin/bash + +# Copyright 2013 Arnab Ghoshal, Pawel Swietojanski + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# To be run from one directory above this script. + +# Begin configuration section. +fisher= +order=3 +swbd= +google= +web_sw= +web_fsh= +web_mtg= +# end configuration sections + +help_message="Usage: "`basename $0`" [options] +Train language models for AMI and optionally for Switchboard, Fisher and web-data from University of Washington.\n +options: + --help # print this message and exit + --fisher DIR # directory for Fisher transcripts + --order N # N-gram order (default: '$order') + --swbd DIR # Directory for Switchboard transcripts + --web-sw FILE # University of Washington (191M) Switchboard web data + --web-fsh FILE # University of Washington (525M) Fisher web data + --web-mtg FILE # University of Washington (150M) CMU+ICSI+NIST meeting data +"; + +. 
utils/parse_options.sh + +if [ $# -ne 4 ]; then + printf "$help_message\n"; + exit 1; +fi + +train=$1 # data/ihm/train/text +dev=$2 # data/ihm/dev/text +lexicon=$3 # data/ihm/dict/lexicon.txt +dir=$4 # data/local/lm + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +set -o errexit +mkdir -p $dir +export LC_ALL=C + +cut -d' ' -f2- $train | gzip -c > $dir/train.gz +cut -d' ' -f2- $dev | gzip -c > $dir/dev.gz + +awk '{print $1}' $lexicon | sort -u > $dir/wordlist.lex +gunzip -c $dir/train.gz | tr ' ' '\n' | grep -v ^$ | sort -u > $dir/wordlist.train +sort -u $dir/wordlist.lex $dir/wordlist.train > $dir/wordlist + +ngram-count -text $dir/train.gz -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "" -kndiscount -interpolate -lm $dir/ami.o${order}g.kn.gz +echo "PPL for AMI LM:" +ngram -unk -lm $dir/ami.o${order}g.kn.gz -ppl $dir/dev.gz +ngram -unk -lm $dir/ami.o${order}g.kn.gz -ppl $dir/dev.gz -debug 2 >& $dir/ppl2 +mix_ppl="$dir/ppl2" +mix_tag="ami" +mix_lms=( "$dir/ami.o${order}g.kn.gz" ) +num_lms=1 + +if [ ! -z "$swbd" ]; then + mkdir -p $dir/swbd + + find $swbd -iname '*-trans.text' -exec cat {} \; | cut -d' ' -f4- \ + | gzip -c > $dir/swbd/text0.gz + gunzip -c $dir/swbd/text0.gz | swbd_map_words.pl | gzip -c \ + > $dir/swbd/text1.gz + ngram-count -text $dir/swbd/text1.gz -order $order -limit-vocab \ + -vocab $dir/wordlist -unk -map-unk "" -kndiscount -interpolate \ + -lm $dir/swbd/swbd.o${order}g.kn.gz + echo "PPL for SWBD LM:" + ngram -unk -lm $dir/swbd/swbd.o${order}g.kn.gz -ppl $dir/dev.gz + ngram -unk -lm $dir/swbd/swbd.o${order}g.kn.gz -ppl $dir/dev.gz -debug 2 \ + >& $dir/swbd/ppl2 + + mix_ppl="$mix_ppl $dir/swbd/ppl2" + mix_tag="${mix_tag}_swbd" + mix_lms=("${mix_lms[@]}" "$dir/swbd/swbd.o${order}g.kn.gz") + num_lms=$[ num_lms + 1 ] +fi + +if [ ! -z "$fisher" ]; then + [ ! 
-d "$fisher/part1/data/trans" ] \ + && echo "Cannot find transcripts in Fisher directory: '$fisher'" \ + && exit 1; + mkdir -p $dir/fisher + + find $fisher -path '*/trans/*fe*.txt' -exec cat {} \; | grep -v ^# | grep -v ^$ \ + | cut -d' ' -f4- | gzip -c > $dir/fisher/text0.gz + gunzip -c $dir/fisher/text0.gz | fisher_map_words.pl \ + | gzip -c > $dir/fisher/text1.gz + ngram-count -debug 0 -text $dir/fisher/text1.gz -order $order -limit-vocab \ + -vocab $dir/wordlist -unk -map-unk "" -kndiscount -interpolate \ + -lm $dir/fisher/fisher.o${order}g.kn.gz + echo "PPL for Fisher LM:" + ngram -unk -lm $dir/fisher/fisher.o${order}g.kn.gz -ppl $dir/dev.gz + ngram -unk -lm $dir/fisher/fisher.o${order}g.kn.gz -ppl $dir/dev.gz -debug 2 \ + >& $dir/fisher/ppl2 + + mix_ppl="$mix_ppl $dir/fisher/ppl2" + mix_tag="${mix_tag}_fsh" + mix_lms=("${mix_lms[@]}" "$dir/fisher/fisher.o${order}g.kn.gz") + num_lms=$[ num_lms + 1 ] +fi + +if [ ! -z "$google1B" ]; then + mkdir -p $dir/google + wget -O $dir/google/cantab.lm3.bz2 http://vm.cantabresearch.com:6080/demo/cantab.lm3.bz2 + wget -O $dir/google/150000.lex http://vm.cantabresearch.com:6080/demo/150000.lex + + ngram -unk -limit-vocab -vocab $dir/wordlist -lm $dir/google.cantab.lm3.bz3 \ + -write-lm $dir/google/google.o${order}g.kn.gz + + mix_ppl="$mix_ppl $dir/goog1e/ppl2" + mix_tag="${mix_tag}_fsh" + mix_lms=("${mix_lms[@]}" "$dir/google/google.o${order}g.kn.gz") + num_lms=$[ num_lms + 1 ] +fi + +## The University of Washington conversational web data can be obtained as: +## wget --no-check-certificate http://ssli.ee.washington.edu/data/191M_conversational_web-filt+periods.gz +if [ ! -z "$web_sw" ]; then + echo "Interpolating web-LM not implemented yet" +fi + +## The University of Washington Fisher conversational web data can be obtained as: +## wget --no-check-certificate http://ssli.ee.washington.edu/data/525M_fisher_conv_web-filt+periods.gz +if [ ! 
-z "$web_fsh" ]; then + echo "Interpolating web-LM not implemented yet" +fi + +## The University of Washington meeting web data can be obtained as: +## wget --no-check-certificate http://ssli.ee.washington.edu/data/150M_cmu+icsi+nist-meetings.gz +if [ ! -z "$web_mtg" ]; then + echo "Interpolating web-LM not implemented yet" +fi + +if [ $num_lms -gt 1 ]; then + echo "Computing interpolation weights from: $mix_ppl" + compute-best-mix $mix_ppl >& $dir/mix.log + grep 'best lambda' $dir/mix.log \ + | perl -e '$_=<>; s/.*\(//; s/\).*//; @A = split; for $i (@A) {print "$i\n";}' \ + > $dir/mix.weights + weights=( `cat $dir/mix.weights` ) + cmd="ngram -lm ${mix_lms[0]} -lambda 0.715759 -mix-lm ${mix_lms[1]}" + for i in `seq 2 $((num_lms-1))`; do + cmd="$cmd -mix-lm${i} ${mix_lms[$i]} -mix-lambda${i} ${weights[$i]}" + done + cmd="$cmd -unk -write-lm $dir/${mix_tag}.o${order}g.kn.gz" + echo "Interpolating LMs with command: \"$cmd\"" + $cmd + echo "PPL for the interpolated LM:" + ngram -unk -lm $dir/${mix_tag}.o${order}g.kn.gz -ppl $dir/dev.gz +fi + +#save the lm name for further use +echo "${mix_tag}.o${order}g.kn" > $dir/final_lm + diff --git a/egs/ami/s5/local/ami_xml2text.sh b/egs/ami/s5/local/ami_xml2text.sh new file mode 100755 index 000000000..4d5431c6a --- /dev/null +++ b/egs/ami/s5/local/ami_xml2text.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Copyright, University of Edinburgh (Pawel Swietojanski and Jonathan Kilgour) + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + exit 1; +fi + +adir=$1 +wdir=data/local/annotations + +[ ! -f $adir/annotations/AMI-metadata.xml ] && echo "$0: File $adir/annotations/AMI-metadata.xml not found." && exit 1; + +mkdir -p $wdir/log + +JAVA_VER=$(java -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q') + +if [ "$JAVA_VER" -ge 15 ]; then + if [ ! -d $wdir/nxt ]; then + echo "Downloading NXT annotation tool..." 
+ wget -O $wdir/nxt.zip http://sourceforge.net/projects/nite/files/nite/nxt_1.4.4/nxt_1.4.4.zip &> /dev/null + unzip -d $wdir/nxt $wdir/nxt.zip &> /dev/null + fi + + if [ ! -f $wdir/transcripts0 ]; then + echo "Parsing XML files (can take several minutes)..." + nxtlib=$wdir/nxt/lib + java -cp $nxtlib/nxt.jar:$nxtlib/xmlParserAPIs.jar:$nxtlib/xalan.jar:$nxtlib \ + FunctionQuery -c $adir/annotations/AMI-metadata.xml -q '($s segment)(exists $w1 w):$s^$w1' -atts obs who \ + '@extract(($sp speaker)($m meeting):$m@observation=$s@obs && $m^$sp & $s@who==$sp@nxt_agent,global_name, 0)'\ + '@extract(($sp speaker)($m meeting):$m@observation=$s@obs && $m^$sp & $s@who==$sp@nxt_agent, channel, 0)' \ + transcriber_start transcriber_end starttime endtime '$s' '@extract(($w w):$s^$w & $w@punc="true", starttime,0,0)' \ + 1> $wdir/transcripts0 2> $wdir/log/nxt_export.log + fi +else + echo "$0. Java not found. Will download exported version of transcripts." + annots=ami_manual_annotations_v1.6.1_export + wget -O $wdir/$annots.gzip http://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/$annots.gzip + gunzip -c $wdir/${annots}.gzip > $wdir/transcripts0 +fi + +#remove NXT logs dumped to stdio +grep -e '^Found' -e '^Obs' -i -v $wdir/transcripts0 > $wdir/transcripts1 + +exit 0; + diff --git a/egs/ami/s5/local/beamformit.sh b/egs/ami/s5/local/beamformit.sh new file mode 100755 index 000000000..fcef166a4 --- /dev/null +++ b/egs/ami/s5/local/beamformit.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Copyright 2014, University of Edibnurgh (Author: Pawel Swietojanski) + +. 
./path.sh + +nj=$1 +job=$2 +numch=$3 +meetings=$4 +sdir=$5 +odir=$6 +wdir=data/local/beamforming + +utils/split_scp.pl -j $nj $((job-1)) $meetings $meetings.$job + +while read line; do + + mkdir -p $odir/$line + BeamformIt -s $line -c $wdir/channels_$numch \ + --config_file `pwd`/conf/ami.cfg \ + --source_dir $sdir \ + --result_dir $odir/$line + mkdir -p $odir/$line + mv $odir/$line/${line}.del $odir/$line/${line}_MDM$numch.del + mv $odir/$line/${line}.del2 $odir/$line/${line}_MDM$numch.del2 + mv $odir/$line/${line}.info $odir/$line/${line}_MDM$numch.info + mv $odir/$line/${line}.ovl $odir/$line/${line}_MDM$numch.ovl + mv $odir/$line/${line}.weat $odir/$line/${line}_MDM$numch.weat + mv $odir/$line/${line}.wav $odir/$line/${line}_MDM$numch.wav + +done < $meetings.$job + diff --git a/egs/ami/s5/local/convert2stm.pl b/egs/ami/s5/local/convert2stm.pl new file mode 100755 index 000000000..703504344 --- /dev/null +++ b/egs/ami/s5/local/convert2stm.pl @@ -0,0 +1,101 @@ +#!/usr/bin/perl + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
+# 2013 University of Edinburgh (Author: Pawel Swietojanski) + +# This takes as standard input path to directory containing all the usual +# data files - segments, text, utt2spk and reco2file_and_channel and creates stm + +if (@ARGV < 1 || @ARGV > 2) { + print STDERR "Usage: convert2stm.pl [] > stm-file\n"; + exit(1); +} + +$dir=shift @ARGV; +$utt2spk_file=shift @ARGV || 'utt2spk'; + +$segments = "$dir/segments"; +$reco2file_and_channel = "$dir/reco2file_and_channel"; +$text = "$dir/text"; +$utt2spk_file = "$dir/$utt2spk_file"; + +open(S, "<$segments") || die "opening segments file $segments"; +while() { + @A = split(" ", $_); + @A > 3 || die "convert2stm: Bad line in segments file: $_"; + ($utt, $recording_id, $begin_time, $end_time) = @A[0..3]; + $utt2reco{$utt} = $recording_id; + $begin{$utt} = $begin_time; + $end{$utt} = $end_time; +} +close(S); + +open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel"; +while() { + @A = split(" ", $_); + @A == 3 || die "convert2stm: Bad line in reco2file_and_channel file: $_"; + ($recording_id, $file, $channel) = @A; + $reco2file{$recording_id} = $file; + $reco2channel{$recording_id} = $channel; +} +close(R); + +open(T, "<$text") || die "open text file $text"; +while() { + @A = split(" ", $_); + $utt = shift @A; + $utt2text{$utt} = "@A"; +} +close(T); + +open(U, "<$utt2spk_file") || die "open utt2spk file $utt2spk_file"; +while() { + @A = split(" ", $_); + @A == 2 || die "convert2stm: Bad line in utt2spk file: $_"; + ($utt, $spk) = @A; + $utt2spk{$utt} = $spk; +} +close(U); + +# Now generate the stm file +foreach $utt (sort keys(%utt2reco)) { + + # lines look like: + # [