From c903457f3cf72d749e0f73fd7b2625fbbab8a833 Mon Sep 17 00:00:00 2001 From: Pawel Swietojanski Date: Wed, 25 Jun 2014 09:09:59 +0000 Subject: [PATCH] sandbox/pawel: ready data prep. stages, buiding lms git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/pawel@4081 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8 --- egs/ami/s5/conf/ami.cfg | 50 + egs/ami/s5/local/ami_beamform.sh | 2 +- egs/ami/s5/local/ami_ihm_data_prep.sh | 90 + egs/ami/s5/local/ami_ihm_data_prep_edin.sh | 103 - ...p_edin.sh => ami_ihm_scoring_data_prep.sh} | 73 +- egs/ami/s5/local/ami_mdm_data_prep_edin.sh | 99 - egs/ami/s5/local/ami_prepare_dict.sh | 59 +- ...data_prep_edin.sh => ami_sdm_data_prep.sh} | 48 +- ...p_edin.sh => ami_sdm_scoring_data_prep.sh} | 34 +- egs/ami/s5/local/ami_split_segments.pl | 4 + egs/ami/s5/local/ami_text_prep.sh | 2 - egs/ami/s5/local/ami_train_lms.sh | 157 ++ egs/ami/s5/local/beamformit.sh | 8 +- egs/ami/s5/local/convert2stm.pl | 98 + egs/ami/s5/local/english.glm | 2023 +++++++++++++++++ egs/ami/s5/local/fisher_map_words.pl | 83 + egs/ami/s5/path.sh | 2 +- egs/ami/s5/run_ihm.sh | 8 +- egs/ami/s5/run_mdm.sh | 5 +- egs/ami/s5/run_sdm.sh | 13 +- 20 files changed, 2623 insertions(+), 338 deletions(-) create mode 100644 egs/ami/s5/conf/ami.cfg create mode 100755 egs/ami/s5/local/ami_ihm_data_prep.sh delete mode 100755 egs/ami/s5/local/ami_ihm_data_prep_edin.sh rename egs/ami/s5/local/{ami_ihm_scoring_data_prep_edin.sh => ami_ihm_scoring_data_prep.sh} (51%) delete mode 100755 egs/ami/s5/local/ami_mdm_data_prep_edin.sh rename egs/ami/s5/local/{ami_sdm_data_prep_edin.sh => ami_sdm_data_prep.sh} (64%) rename egs/ami/s5/local/{ami_sdm_scoring_data_prep_edin.sh => ami_sdm_scoring_data_prep.sh} (82%) create mode 100755 egs/ami/s5/local/ami_train_lms.sh create mode 100755 egs/ami/s5/local/convert2stm.pl create mode 100644 egs/ami/s5/local/english.glm create mode 100755 egs/ami/s5/local/fisher_map_words.pl diff --git a/egs/ami/s5/conf/ami.cfg b/egs/ami/s5/conf/ami.cfg new file mode 
100644 index 000000000..70fdd8586 --- /dev/null +++ b/egs/ami/s5/conf/ami.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/ami/s5/local/ami_beamform.sh b/egs/ami/s5/local/ami_beamform.sh index fb154485e..5efdb486a 100755 --- a/egs/ami/s5/local/ami_beamform.sh +++ b/egs/ami/s5/local/ami_beamform.sh @@ -64,7 +64,7 @@ done < $meetings echo -e "Beamforming\n" -$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ +$cmd JOB=0:$nj $wdir/log/beamform.JOB.log \ local/beamformit.sh $nj JOB $numch $meetings $sdir $odir : << "C" diff --git a/egs/ami/s5/local/ami_ihm_data_prep.sh b/egs/ami/s5/local/ami_ihm_data_prep.sh new file 
mode 100755 index 000000000..c94bd58b2 --- /dev/null +++ b/egs/ami/s5/local/ami_ihm_data_prep.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) +# AMI Corpus training data preparation +# Apache 2.0 + +# To be run from one directory above this script. + +. path.sh + +#check existing directories +if [ $# != 1 ]; then + echo "Usage: ami_ihm_data_prep.sh /path/to/AMI" + exit 1; +fi + +AMI_DIR=$1 + +SEGS=data/local/annotations/train.txt +dir=data/local/ihm/train +mkdir -p $dir + +# Audio data directory check +if [ ! -d $AMI_DIR ]; then + echo "Error: $AMI_DIR directory does not exists." + exit 1; +fi + +# And transcripts check +if [ ! -f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." + exit 1; +fi + + +# find headset wav audio files only +find $AMI_DIR -iname '*.Headset-*.wav' | sort > $dir/wav.flist +n=`cat $dir/wav.flist | wc -l` +echo "In total, $n headset files were found." +[ $n -ne 684 ] && \ + echo "Warning: expected 684 (171 meetings x 4 headsets) data files, found $n" + +# (1a) Transcriptions preparation +# here we start with normalised transcriptions, the utt ids follow the convention +# AMI_MEETING_CHAN_SPK_STIME_ETIME +# AMI_ES2011a_H00_FEE041_0003415_0003484 + +awk '{meeting=$1; channel=$2; speaker=$3; stime=$4; etime=$5; + printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); + for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort > $dir/text + +# (1b) Make segment files from transcript + +awk '{ + segment=$1; + split(segment,S,"[_]"); + audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; + print segment " " audioname " " startf*10/1000 " " endf*10/1000 " " 0 +}' < $dir/text > $dir/segments + +# (1c) Make wav.scp file. + +sed -e 's?.*/??' -e 's?.wav??' 
$dir/wav.flist | \ + perl -ne 'split; $_ =~ m/(.*)\..*\-([0-9])/; print "AMI_$1_H0$2\n"' | \ + paste - $dir/wav.flist > $dir/wav.scp + +#Keep only train part of waves +awk '{print $2}' $dir/segments | sort -u | join - $dir/wav.scp | sort -o $dir/wav.scp + +# (1d) reco2file_and_channel + +awk '{print $1}' $dir/wav.scp \ + | perl -ane '$_ =~ m:^(\S+)(H0[0-4])$: || die "bad label $_"; + print "$1$2 $1$2 $2\n"; ' > $dir/reco2file_and_channel || exit 1; + +awk '{print $1}' $dir/segments | \ + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + print "$1$2$3 $1$2\n";' \ + > $dir/utt2spk || exit 1; + +sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; + +# Copy stuff into its final location +mkdir -p data/ihm/train +for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do + cp $dir/$f data/ihm/train/$f || exit 1; +done + +echo AMI IHM data preparation succeeded. + diff --git a/egs/ami/s5/local/ami_ihm_data_prep_edin.sh b/egs/ami/s5/local/ami_ihm_data_prep_edin.sh deleted file mode 100755 index bf70a0ecf..000000000 --- a/egs/ami/s5/local/ami_ihm_data_prep_edin.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/bash - -# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) -# AMI Corpus training data preparation -# Apache 2.0 - -# To be run from one directory above this script. - -. path.sh - -#check existing directories -if [ $# != 2 ]; then - echo "Usage: ami_data_prep_edin.sh /path/to/AMI" - exit 1; -fi - -AMI_DIR=$1 -RT09_SEGS=$2 #assuming here all normalisation stuff was done - -dir=data/local/ihm/train -mkdir -p $dir - -# Audio data directory check -if [ ! -d $AMI_DIR ]; then - echo "Error: run.sh requires a directory argument" - exit 1; -fi - -# find headset wav audio files only -find $AMI_DIR -iname '*.Headset-*.wav' | sort > $dir/wav.flist - -n=`cat $dir/wav.flist | wc -l` - -echo "In total, $n headset files were found." 
-#[ $n -ne 2435 ] && \ -# echo Warning: expected 2435 data data files, found $n - -# (1a) Transcriptions preparation -# here we start with rt09 transcriptions, hence not much to do - -cut -d" " -f1,4- $RT09_SEGS | sort > $dir/text - -# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches -# case insensitive -#local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final transcripts - -# (1c) Make segment files from transcript -#segments file format is: utt-id side-id start-time end-time, e.g.: -#AMI_ES2011a_H00_FEE041_0003415_0003484 -awk '{ - segment=$1; - split(segment,S,"[_]"); - audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; - print segment " " audioname " " startf*10/1000 " " endf*10/1000 " " 0 -}' < $dir/text > $dir/segments - -#sw02001-A_000098-001156 sw02001-A 0.98 11.56 -#awk '{ -# segment=$1; -# split(segment,S,"[_-]"); -# side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4]; -# print segment " " audioname "-" side " " startf/100 " " endf/100 -#}' < $dir/text > $dir/segments - -#EN2001a.Headset-0.wav -#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \ -# > $dir/wav.scp - -sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ - perl -ne 'split; $_ =~ m/(.*)\..*\-([0-9])/; print "AMI_$1_H0$2\n"' | \ - paste - $dir/wav.flist > $dir/wav.scp - -# this file reco2file_and_channel maps recording-id (e.g. sw02001-A) -# to the file name sw02001 and the A, e.g. -# sw02001-A sw02001 A -# In this case it's trivial, but in other corpora the information might -# be less obvious. Later it will be needed for ctm scoring. 
- -awk '{print $1}' $dir/wav.scp \ - | perl -ane '$_ =~ m:^(\S+)(H0[0-4])$: || die "bad label $_"; - print "$1$2 $1$2 $2\n"; ' \ - > $dir/reco2file_and_channel || exit 1; - -awk '{print $1}' $dir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; - print "$1$2$3 $1$2\n";' \ - > $dir/utt2spk || exit 1; - -sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; - -# We assume each conversation side is a separate speaker. This is a very -# reasonable assumption for Switchboard. The actual speaker info file is at: -# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary - -# Copy stuff into its final locations [this has been moved from the format_data -# script] -mkdir -p data/ihm/train -for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do - cp $dir/$f data/ihm/train/$f || exit 1; -done - -echo AMI data preparation succeeded. - diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep_edin.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh similarity index 51% rename from egs/ami/s5/local/ami_ihm_scoring_data_prep_edin.sh rename to egs/ami/s5/local/ami_ihm_scoring_data_prep.sh index d9c709bd4..b01c8141b 100755 --- a/egs/ami/s5/local/ami_ihm_scoring_data_prep_edin.sh +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -6,19 +6,17 @@ . path.sh #check existing directories -if [ $# != 3 ]; then - echo "Usage: scoring_data_prep_edin.sh /path/to/SWBD rt09-seg-file set-name" +if [ $# != 2 ]; then + echo "Usage: ami_*_scoring_data_prep_edin.sh /path/to/AMI set-name" exit 1; fi AMI_DIR=$1 -RT09_SEGS=$2 #assuming here all normalisation stuff was done -SET=$3 +SET=$2 +SEGS=data/local/annotations/$SET.txt -tmpdir=data/local/ihm/$SET -dir=data/ihm/$SET - -mkdir -p $tmpdir +dir=data/local/ihm/$SET +mkdir -p $dir # Audio data directory check if [ ! -d $AMI_DIR ]; then @@ -26,39 +24,47 @@ if [ ! -d $AMI_DIR ]; then exit 1; fi +# And transcripts check +if [ ! 
-f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." + exit 1; +fi + # find headset wav audio files only, here we again get all # the files in the corpora and filter only specific sessions # while building segments -find $AMI_DIR -iname '*.Headset-*.wav' | sort > $tmpdir/wav.flist -n=`cat $tmpdir/wav.flist | wc -l` +find $AMI_DIR -iname '*.Headset-*.wav' | sort > $dir/wav.flist +n=`cat $dir/wav.flist | wc -l` echo "In total, $n headset files were found." +[ $n -ne 684 ] && \ + echo "Warning: expected 684 (171 meetings x 4 headsets) data files, found $n" # (1a) Transcriptions preparation -# here we start with rt09 transcriptions, hence not much to do +# here we start with normalised transcriptions, the utt ids follow the convention +# AMI_MEETING_CHAN_SPK_STIME_ETIME +# AMI_ES2011a_H00_FEE041_0003415_0003484 -cut -d" " -f1,4- $RT09_SEGS | sort > $tmpdir/text +awk '{meeting=$1; channel=$2; speaker=$3; stime=$4; etime=$5; + printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); + for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort > $dir/text # (1c) Make segment files from transcript #segments file format is: utt-id side-id start-time end-time, e.g.: -#AMI_ES2011a_H00_FEE041_0003415_0003484 + awk '{ segment=$1; split(segment,S,"[_]"); audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; print segment " " audioname " " startf*10/1000 " " endf*10/1000 " " 0 -}' < $tmpdir/text > $tmpdir/segments +}' < $dir/text > $dir/segments -#EN2001a.Headset-0.wav -#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \ -# > $dir/wav.scp - -sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \ +sed -e 's?.*/??' -e 's?.wav??' 
$dir/wav.flist | \ perl -ne 'split; $_ =~ m/(.*)\..*\-([0-9])/; print "AMI_$1_H0$2\n"' | \ - paste - $tmpdir/wav.flist > $tmpdir/wav.scp + paste - $dir/wav.flist > $dir/wav.scp #Keep only devset part of waves -awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav.scp | sort -o $tmpdir/wav.scp +awk '{print $2}' $dir/segments | sort -u | join - $dir/wav.scp | sort -o $dir/wav.scp # this file reco2file_and_channel maps recording-id (e.g. sw02001-A) # to the file name sw02001 and the A, e.g. @@ -66,29 +72,28 @@ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav.scp | sort -o $ # In this case it's trivial, but in other corpora the information might # be less obvious. Later it will be needed for ctm scoring. -awk '{print $1 $2}' $tmpdir/wav.scp | \ +awk '{print $1 $2}' $dir/wav.scp | \ perl -ane '$_ =~ m:^(\S+H0[0-4]).*\/([IETB].*)\.wav$: || die "bad label $_"; print "$1 $2 0\n"; '\ - > $tmpdir/reco2file_and_channel || exit 1; + > $dir/reco2file_and_channel || exit 1; -awk '{print $1}' $tmpdir/segments | \ +awk '{print $1}' $dir/segments | \ perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; print "$1$2$3 $1$2\n";' \ - > $tmpdir/utt2spk || exit 1; + > $dir/utt2spk || exit 1; -sort -k 2 $tmpdir/utt2spk | utils/utt2spk_to_spk2utt.pl > $tmpdir/spk2utt || exit 1; +sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; -# We assume each conversation side is a separate speaker. 
- -# Copy stuff into its final locations [this has been moved from the format_data -# script] -mkdir -p $dir +# Copy stuff into its final locations +fdir=data/ihm/$SET +mkdir -p $fdir for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do - cp $tmpdir/$f $dir/$f || exit 1; + cp $dir/$f $fdir/$f || exit 1; done -utils/convert2stm.pl $dir > $dir/stm -cp local/english.glm $dir/glm +#Produce STMs for sclite scoring +local/convert2stm.pl $dir > $fdir/stm +cp local/english.glm $fdir/glm echo AMI $SET set data preparation succeeded. diff --git a/egs/ami/s5/local/ami_mdm_data_prep_edin.sh b/egs/ami/s5/local/ami_mdm_data_prep_edin.sh deleted file mode 100755 index 101d2a783..000000000 --- a/egs/ami/s5/local/ami_mdm_data_prep_edin.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) -# AMI Corpus dev/eval data preparation - -# To be run from one directory above this script. - -. path.sh - -#check existing directories -if [ $# != 3 ]; then - echo "Usage: ami_data_prep_edin.sh /path/to/AMI rt09 mic" - exit 1; -fi - -AMI_DIR=$1 -SEGS=$2 #assuming here all normalisation stuff was done -mic=$3 - -dir=data/local/$mic/train -odir=data/$mic/train -mkdir -p $dir - -# Audio data directory check -if [ ! -d $AMI_DIR ]; then - echo "Error: run.sh requires a directory argument" - exit 1; -fi - -# as the sdm we treat first mic from the array -find $AMI_DIR -iname '*bmf[248].wav' | sort > $dir/wav.flist - -n=`cat $dir/wav.flist | wc -l` - -echo "In total, $n headset files were found." 
-#[ $n -ne 2435 ] && \ -# echo Warning: expected 2435 data data files, found $n - -# (1a) Transcriptions preparation -# here we start with rt09 transcriptions, hence not much to do - -awk '{meeting=$1; channel="MDM"; speaker=$3; stime=$4; etime=$5; - printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); - for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort > $dir/text - -# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches -# case insensitive -#local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final transcripts - -# (1c) Make segment files from transcript -#segments file format is: utt-id side-id start-time end-time, e.g.: -#AMI_ES2011a_H00_FEE041_0003415_0003484 -awk '{ - segment=$1; - split(segment,S,"[_]"); - audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; - print segment " " audioname " " startf/100 " " endf/100 " " 0 -}' < $dir/text > $dir/segments - -#EN2001a.Array1-01.wav -#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \ -# > $dir/wav.scp - -sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ - perl -ne 'split; $_ =~ m/(.*)_bmf[248].*/; print "AMI_$1_MDM\n"' | \ - paste - $dir/wav.flist > $dir/wav.scp - -# this file reco2file_and_channel maps recording-id (e.g. sw02001-A) -# to the file name sw02001 and the A, e.g. -# sw02001-A sw02001 A -# In this case it's trivial, but in other corpora the information might -# be less obvious. Later it will be needed for ctm scoring. 
- -awk '{print $1 $2}' $dir/wav.scp | \ - perl -ane '$_ =~ m:^(\S+MDM).*\/([IETB].*)\.wav$: || die "bad label $_"; - print "$1 $2 0\n"; '\ - > $dir/reco2file_and_channel || exit 1; - -# we assume we adapt to the session only -awk '{print $1}' $dir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; - print "$1$2$3 $1\n";' \ - > $dir/utt2spk || exit 1; - -sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; - -# We assume each conversation side is a separate speaker. This is a very -# reasonable assumption for Switchboard. The actual speaker info file is at: -# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary - -# Copy stuff into its final locations [this has been moved from the format_data -# script] -mkdir -p $odir -for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do - cp $dir/$f $odir/$f | exit 1; -done - -echo AMI data preparation succeeded. - diff --git a/egs/ami/s5/local/ami_prepare_dict.sh b/egs/ami/s5/local/ami_prepare_dict.sh index 45958fbe3..ed54ae3ce 100755 --- a/egs/ami/s5/local/ami_prepare_dict.sh +++ b/egs/ami/s5/local/ami_prepare_dict.sh @@ -10,20 +10,23 @@ #check existing directories [ $# != 0 ] && echo "Usage: local/ami_ihm_data_prep_edin.sh" && exit 1; -srcdir=data/local/train # This is where we downloaded some stuff.. -dir=data/local/dict -mkdir -p $dir +sdir=data/local/annotations +wdir=data/local/dict +cmuurl=http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/ +cmuver=cmudict.0.7a -rt09_train=data/local/rt09.ami.ihmtrain09.v3.dct +req="$sdir/transcripts2 local/wordlist.50k" +[ ! -f "$sdir/transcripts2" ] && echo "No such file $sdir/transcripts2 (need to run ami_text_prep.sh first)" && exit 1; -#as rt09_train is a superset of rt09_test, including some words -#fitting training transcription, we will use the training dict -#in Kaldi both for decoding and decoding +mkdir -p $wdir -# assume swbd_p1_data_prep.sh was done already. 
-#[ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1; +if [ ! -f $wdir/$cmuver ]; then + wget -O $wdir/$cmuver $cmuurl/$cmuver + wget -O $wdir/$cmuver.phones $cmuurl/$cmuver.phones + wget -O $wdir/$cmuver.symbols $cmuurl/$cmuver.symbols +fi -cat $rt09_train | sort > $dir/lexicon1.txt +dir=$wdir; grep -e "^;;;" -v $wdir/$cmuver | sort > $dir/lexicon1.txt cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ grep -v sil > $dir/nonsilence_phones.txt || exit 1; @@ -41,41 +44,7 @@ echo -n >$dir/extra_questions.txt echo '[laughter] lau'; echo '<unk> spn' ) \ | cat - $dir/lexicon1.txt > $dir/lexicon2.txt || exit 1; -# Map the words in the lexicon. That is-- for each word in the lexicon, we map it -# to a new written form. The transformations we do are: -# remove laughter markings, e.g. -# [LAUGHTER-STORY] -> STORY -# Remove partial-words, e.g. -# -[40]1K W AH N K EY -# becomes -1K -# and -# -[AN]Y IY -# becomes -# -Y -# -[A]B[OUT]- B -# becomes -# -B- -# Also, curly braces, which appear to be used for "nonstandard" -# words or non-words, are removed, e.g. -# {WOLMANIZED} W OW L M AX N AY Z D -# -> WOLMANIZED -# Also, mispronounced words, e.g. -# [YEAM/YEAH] Y AE M -# are changed to just e.g. YEAM, i.e. the orthography -# of the mispronounced version. -# Note-- this is only really to be used in training. The main practical -# reason is to avoid having tons of disambiguation symbols, which -# we otherwise would get because there are many partial words with -# the same phone sequences (most problematic: S). -# Also, map -# THEM_1 EH M -> THEM -# so that multiple pronunciations just have alternate entries -# in the lexicon. - -#local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \ -# > $dir/lexicon3.txt || exit 1; - -pushd $dir >&/dev/null +pushd $wdir >&/dev/null ln -sf lexicon2.txt lexicon.txt # This is the final lexicon. 
popd >&/dev/null diff --git a/egs/ami/s5/local/ami_sdm_data_prep_edin.sh b/egs/ami/s5/local/ami_sdm_data_prep.sh similarity index 64% rename from egs/ami/s5/local/ami_sdm_data_prep_edin.sh rename to egs/ami/s5/local/ami_sdm_data_prep.sh index b17b14583..0b8c618c5 100755 --- a/egs/ami/s5/local/ami_sdm_data_prep_edin.sh +++ b/egs/ami/s5/local/ami_sdm_data_prep.sh @@ -6,17 +6,17 @@ . path.sh #check existing directories -if [ $# != 3 ]; then - echo "Usage: ami_data_prep_edin.sh /path/to/AMI" +if [ $# != 2 ]; then + echo "Usage: ami_sdm_data_prep.sh " exit 1; fi AMI_DIR=$1 -SEGS=$2 #assuming here all normalisation stuff was done -MICNUM=$3 -MICID="m$MICNUM" +MICNUM=$2 +DSET="sdm$MICNUM" -dir=data/local/sdm/$MICID/train +SEGS=data/local/annotations/train.txt +dir=data/local/$DSET/train mkdir -p $dir # Audio data directory check @@ -25,26 +25,30 @@ if [ ! -d $AMI_DIR ]; then exit 1; fi +# And transcripts check +if [ ! -f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." + exit 1; +fi + # as the sdm we treat first mic from the array find $AMI_DIR -iname "*.Array1-0$MICNUM.wav" | sort > $dir/wav.flist n=`cat $dir/wav.flist | wc -l` echo "In total, $n files were found." 
-#[ $n -ne 2435 ] && \ -# echo Warning: expected 2435 data data files, found $n +[ $n -ne 169 ] && \ + echo Warning: expected 169 data data files, found $n # (1a) Transcriptions preparation # here we start with already normalised transcripts, just make the ids +# Note, we set here SDM rather than, for example, SDM1 as we want to easily use +# the same alignments across different mics awk '{meeting=$1; channel="SDM"; speaker=$3; stime=$4; etime=$5; printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5)); for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort > $dir/text -# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches -# case insensitive -#local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final transcripts - # (1c) Make segment files from transcript #segments file format is: utt-id side-id start-time end-time, e.g.: #AMI_ES2011a_H00_FEE041_0003415_0003484 @@ -56,19 +60,15 @@ awk '{ }' < $dir/text > $dir/segments #EN2001a.Array1-01.wav -#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \ -# > $dir/wav.scp sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ perl -ne 'split; $_ =~ m/(.*)\..*/; print "AMI_$1_SDM\n"' | \ paste - $dir/wav.flist > $dir/wav.scp -# this file reco2file_and_channel maps recording-id (e.g. sw02001-A) -# to the file name sw02001 and the A, e.g. -# sw02001-A sw02001 A -# In this case it's trivial, but in other corpora the information might -# be less obvious. Later it will be needed for ctm scoring. 
+#Keep only train part of waves +awk '{print $2}' $dir/segments | sort -u | join - $dir/wav.scp | sort -o $dir/wav.scp +# this file reco2file_and_channel maps recording-id awk '{print $1 $2}' $dir/wav.scp | \ perl -ane '$_ =~ m:^(\S+SDM).*\/([IETB].*)\.wav$: || die "bad label $_"; print "$1 $2 0\n"; '\ @@ -82,16 +82,14 @@ awk '{print $1}' $dir/segments | \ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; -# We assume each conversation side is a separate speaker. This is a very -# reasonable assumption for Switchboard. The actual speaker info file is at: -# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary +# In distant scenario we assume no information (without diarisation) about speakers is available # Copy stuff into its final locations [this has been moved from the format_data # script] -mkdir -p data/sdm/$MICID/train +mkdir -p data/$DSET/train for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do - cp $dir/$f data/sdm/$MICID/train/$f || exit 1; + cp $dir/$f data/$DSET/train/$f || exit 1; done -echo AMI data preparation succeeded. +echo AMI $DSET data preparation succeeded. diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep_edin.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh similarity index 82% rename from egs/ami/s5/local/ami_sdm_scoring_data_prep_edin.sh rename to egs/ami/s5/local/ami_sdm_scoring_data_prep.sh index 30eecc2f0..63da9603e 100755 --- a/egs/ami/s5/local/ami_sdm_scoring_data_prep_edin.sh +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) # AMI Corpus dev/eval data preparation @@ -6,19 +6,19 @@ . 
path.sh #check existing directories -if [ $# != 4 ]; then - echo "Usage: ami_sdm_scoring_data_prep_edin.sh /path/to/AMI rt09-seg-file set-name" +if [ $# != 3 ]; then + echo "Usage: ami_sdm_scoring_data_prep.sh /path/to/AMI mic-number set-name" exit 1; fi AMI_DIR=$1 -SEGS=$2 #assuming here all normalisation stuff was done +MICNUM=$2 SET=$3 -MICNUM=$4 -MICID="m$MICNUM" +DSET="sdm$MICNUM" -tmpdir=data/local/sdm/$MICID/$SET -dir=data/sdm/$MICID/$SET +SEGS=data/local/annotations/$SET.txt +tmpdir=data/local/$DSET/$SET +dir=data/$DSET/$SET mkdir -p $tmpdir @@ -28,6 +28,12 @@ if [ ! -d $AMI_DIR ]; then exit 1; fi +# And transcripts check +if [ ! -f $SEGS ]; then + echo "Error: File $SEGS no found (run ami_text_prep.sh)." + exit 1; +fi + # find headset wav audio files only, here we again get all # the files in the corpora and filter only specific sessions # while building segments @@ -65,12 +71,7 @@ sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \ #Keep only devset part of waves awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav.scp | sort -o $tmpdir/wav.scp -# this file reco2file_and_channel maps recording-id (e.g. sw02001-A) -# to the file name sw02001 and the A, e.g. -# sw02001-A sw02001 A -# In this case it's trivial, but in other corpora the information might -# be less obvious. Later it will be needed for ctm scoring. - +#prep reco2file_and_channel awk '{print $1 $2}' $tmpdir/wav.scp | \ perl -ane '$_ =~ m:^(\S+SDM).*\/([IETB].*)\.wav$: || die "bad label $_"; print "$1 $2 0\n"; '\ @@ -91,9 +92,6 @@ awk '{print $1}' $tmpdir/segments | \ print "$1$2$3 $1$2\n";' \ > $tmpdir/utt2spk_stm || exit 1; - -# We assume each conversation side is a separate speaker. - # Copy stuff into its final locations [this has been moved from the format_data # script] mkdir -p $dir @@ -104,5 +102,5 @@ done utils/convert2stm.pl $dir utt2spk_stm > $dir/stm cp local/english.glm $dir/glm -echo AMI $SET set data preparation succeeded. +echo AMI $DSET scenario and $SET set data preparation succeeded. 
diff --git a/egs/ami/s5/local/ami_split_segments.pl b/egs/ami/s5/local/ami_split_segments.pl index d769145ba..57e2c2d60 100755 --- a/egs/ami/s5/local/ami_split_segments.pl +++ b/egs/ami/s5/local/ami_split_segments.pl @@ -170,6 +170,10 @@ sub normalise_transcripts { #some transcripts are empty with -, nullify (and ignore) them $text =~ s/^\-$//g; $text =~ s/\s+\-$//; + # apply few exception for dashed phrases, Mm-Hmm, Uh-Huh, etc. those are frequent in AMI + # and will be added to dictionary + $text =~ s/MM HMM/MM\-HMM/g; + $text =~ s/UH HUH/UH\-HUH/g; return $text; } diff --git a/egs/ami/s5/local/ami_text_prep.sh b/egs/ami/s5/local/ami_text_prep.sh index 4a7083307..ab58d42f1 100755 --- a/egs/ami/s5/local/ami_text_prep.sh +++ b/egs/ami/s5/local/ami_text_prep.sh @@ -16,12 +16,10 @@ local/ami_xml2text.sh $amidir echo "Preprocessing transcripts..." local/ami_split_segments.pl $wdir/transcripts1 $wdir/transcripts2 &> $wdir/log/split_segments.log - #HMM #MM HMM #MM UHM - grep -f local/split_train.orig $wdir/transcripts2 > $wdir/train.txt grep -f local/split_dev.orig $wdir/transcripts2 > $wdir/dev.txt grep -f local/split_eval.orig $wdir/transcripts2 > $wdir/eval.txt diff --git a/egs/ami/s5/local/ami_train_lms.sh b/egs/ami/s5/local/ami_train_lms.sh new file mode 100755 index 000000000..906248cff --- /dev/null +++ b/egs/ami/s5/local/ami_train_lms.sh @@ -0,0 +1,157 @@ +#!/bin/bash -v + +# Copyright 2013 Arnab Ghoshal, Pawel Swietojanski + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. 
+# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# To be run from one directory above this script. + +# Begin configuration section. +fisher= +order=3 +swbd= +web_sw= +web_fsh= +web_mtg= +# end configuration sections + +help_message="Usage: "`basename $0`" [options] <train-txt> <dev-txt> <dict> <out-dir> +Train language models for AMI and optionally for Switchboard, Fisher and web-data from University of Washington.\n +options: + --help # print this message and exit + --fisher DIR # directory for Fisher transcripts + --order N # N-gram order (default: '$order') + --swbd DIR # Directory for Switchboard transcripts + --web-sw FILE # University of Washington (191M) Switchboard web data + --web-fsh FILE # University of Washington (525M) Fisher web data + --web-mtg FILE # University of Washington (150M) CMU+ICSI+NIST meeting data +"; + +. utils/parse_options.sh + +if [ $# -ne 4 ]; then + printf "$help_message\n"; + exit 1; +fi + +train=$1 # data/local/train/text +dev=$2 # data/local/dev/text +lexicon=$3 # data/local/dict/lexicon.txt +dir=$4 # data/local/lm + +for f in "$train" "$dev" "$lexicon"; do + [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1; +done + +set -o errexit +mkdir -p $dir +export LC_ALL=C + +cut -d' ' -f2- $train | gzip -c > $dir/train.gz +cut -d' ' -f2- $dev | gzip -c > $dir/dev.gz + +awk '{print $1}' $lexicon | sort -u > $dir/wordlist.lex +gunzip -c $dir/train.gz | tr ' ' '\n' | grep -v ^$ | sort -u > $dir/wordlist.train +sort -u $dir/wordlist.lex $dir/wordlist.train > $dir/wordlist + +ngram-count -text $dir/train.gz -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "<unk>" -kndiscount -interpolate -lm $dir/ami.o${order}g.kn.gz +echo "PPL for AMI LM:" +ngram -unk -lm $dir/ami.o${order}g.kn.gz -ppl $dir/dev.gz +ngram -unk -lm $dir/ami.o${order}g.kn.gz -ppl $dir/dev.gz -debug 2 >& $dir/ppl2 +mix_ppl="$dir/ppl2" +mix_tag="ami" +mix_lms=( "$dir/ami.o${order}g.kn.gz" ) +num_lms=1 + +if [ ! 
-z "$swbd" ]; then + mkdir -p $dir/swbd + + find $swbd -iname '*-trans.text' -exec cat {} \; | cut -d' ' -f4- \ + | gzip -c > $dir/swbd/text0.gz + gunzip -c $dir/swbd/text0.gz | swbd_map_words.pl | gzip -c \ + > $dir/swbd/text1.gz + ngram-count -text $dir/swbd/text1.gz -order $order -limit-vocab \ + -vocab $dir/wordlist -unk -map-unk "<unk>" -kndiscount -interpolate \ + -lm $dir/swbd/swbd.o${order}g.kn.gz + echo "PPL for SWBD LM:" + ngram -unk -lm $dir/swbd/swbd.o${order}g.kn.gz -ppl $dir/dev.gz + ngram -unk -lm $dir/swbd/swbd.o${order}g.kn.gz -ppl $dir/dev.gz -debug 2 \ + >& $dir/swbd/ppl2 + + mix_ppl="$mix_ppl $dir/swbd/ppl2" + mix_tag="${mix_tag}_swbd" + mix_lms=("${mix_lms[@]}" "$dir/swbd/swbd.o${order}g.kn.gz") + num_lms=$[ num_lms + 1 ] +fi + +if [ ! -z "$fisher" ]; then + [ ! -d "$fisher/data/trans" ] \ + && echo "Cannot find transcripts in Fisher directory: '$fisher'" \ + && exit 1; + mkdir -p $dir/fisher + + find $fisher -path '*/trans/*fe*.txt' -exec cat {} \; | grep -v ^# | grep -v ^$ \ + | cut -d' ' -f4- | gzip -c > $dir/fisher/text0.gz + gunzip -c $dir/fisher/text0.gz | local/fisher_map_words.pl \ + | gzip -c > $dir/fisher/text1.gz + ngram-count -text $dir/fisher/text1.gz -order $order -limit-vocab \ + -vocab $dir/wordlist -unk -map-unk "<unk>" -kndiscount -interpolate \ + -lm $dir/fisher/fisher.o${order}g.kn.gz + echo "PPL for Fisher LM:" + ngram -unk -lm $dir/fisher/fisher.o${order}g.kn.gz -ppl $dir/dev.gz + ngram -unk -lm $dir/fisher/fisher.o${order}g.kn.gz -ppl $dir/dev.gz -debug 2 \ + >& $dir/fisher/ppl2 + + mix_ppl="$mix_ppl $dir/fisher/ppl2" + mix_tag="${mix_tag}_fsh" + mix_lms=("${mix_lms[@]}" "$dir/fisher/fisher.o${order}g.kn.gz") + num_lms=$[ num_lms + 1 ] +fi + +## The University of Washington conversational web data can be obtained as: +## wget --no-check-certificate http://ssli.ee.washington.edu/data/191M_conversational_web-filt+periods.gz +if [ ! 
-z "$web_sw" ]; then + echo "Interpolating web-LM not implemented yet" +fi + +## The University of Washington Fisher conversational web data can be obtained as: +## wget --no-check-certificate http://ssli.ee.washington.edu/data/525M_fisher_conv_web-filt+periods.gz +if [ ! -z "$web_fsh" ]; then + echo "Interpolating web-LM not implemented yet" +fi + +## The University of Washington meeting web data can be obtained as: +## wget --no-check-certificate http://ssli.ee.washington.edu/data/150M_cmu+icsi+nist-meetings.gz +if [ ! -z "$web_mtg" ]; then + echo "Interpolating web-LM not implemented yet" +fi + +echo "Computing interpolation weights from: $mix_ppl" +compute-best-mix $mix_ppl >& $dir/mix.log +grep 'best lambda' $dir/mix.log \ + | perl -e '$_=<>; s/.*\(//; s/\).*//; @A = split; for $i (@A) {print "$i\n";}' \ + > $dir/mix.weights +weights=( `cat $dir/mix.weights` ) +cmd="ngram -lm ${mix_lms[0]} -lambda ${weights[0]} -mix-lm ${mix_lms[1]}" +for i in `seq 2 $[ num_lms - 1 ]`; do + cmd="$cmd -mix-lm${i} ${mix_lms[$i]} -mix-lambda${i} ${weights[$i]}" +done +cmd="$cmd -unk -write-lm $dir/${mix_tag}.o${order}g.kn.gz" +echo "Interpolating LMs with command: \"$cmd\"" +$cmd +echo "PPL for the interolated LM:" +ngram -unk -lm $dir/${mix_tag}.o${order}g.kn.gz -ppl $dir/dev.gz + + diff --git a/egs/ami/s5/local/beamformit.sh b/egs/ami/s5/local/beamformit.sh index b9aa92e4b..d5171aa9d 100755 --- a/egs/ami/s5/local/beamformit.sh +++ b/egs/ami/s5/local/beamformit.sh @@ -16,11 +16,11 @@ utils/split_scp.pl -j $nj $job $meetings $meetings.$job while read line; do -# --config_file=`pwd`/conf/beamformit.cfg \ BeamformIt -s $line -c $wdir/channels_$numch \ - --source_dir=$sdir \ - --result_dir=$odir/temp_dir \ - --do_compute_reference=1 + --config_file `pwd`/conf/ami.cfg \ + --source_dir $sdir \ + --result_dir $odir/temp_dir + mkdir -p $odir/$line mv $odir/temp_dir/$line/${line}_seg.del $odir/$line/${line}_MDM$numch.del mv $odir/temp_dir/$line/${line}_seg.del2 
$odir/$line/${line}_MDM$numch.del2 diff --git a/egs/ami/s5/local/convert2stm.pl b/egs/ami/s5/local/convert2stm.pl new file mode 100755 index 000000000..a9baf84fc --- /dev/null +++ b/egs/ami/s5/local/convert2stm.pl @@ -0,0 +1,98 @@ +#!/usr/bin/perl + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# 2013 University of Edinburgh (Author: Pawel Swietojanski) + +# This takes as standard input path to directory containing all the usual +# data files - segments, text, utt2spk and reco2file_and_channel and creates stm + +if (@ARGV < 1 || @ARGV > 2) { + print STDERR "Usage: convert2stm.pl [] > stm-file\n"; + exit(1); +} + +$dir=shift @ARGV; +$utt2spk_file=shift @ARGV || 'utt2spk'; + +$segments = "$dir/segments"; +$reco2file_and_channel = "$dir/reco2file_and_channel"; +$text = "$dir/text"; +$utt2spk_file = "$dir/$utt2spk_file"; + +open(S, "<$segments") || die "opening segments file $segments"; +while() { + @A = split(" ", $_); + @A > 4 || die "Bad line in segments file: $_"; + ($utt, $recording_id, $begin_time, $end_time) = @A[0..3]; + $utt2reco{$utt} = $recording_id; + $begin{$utt} = $begin_time; + $end{$utt} = $end_time; +} +close(S); + +open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel"; +while() { + @A = split(" ", $_); + @A == 3 || die "Bad line in reco2file_and_channel file: $_"; + ($recording_id, $file, $channel) = @A; + $reco2file{$recording_id} = $file; + $reco2channel{$recording_id} = $channel; +} +close(R); + +open(T, "<$text") || die "open text file $text"; +while() { + @A = split(" ", $_); + $utt = shift @A; + $utt2text{$utt} = "@A"; +} +close(T); + +open(U, "<$utt2spk_file") || die "open utt2spk file $utt2spk_file"; +while() { + @A = split(" ", $_); + @A == 2 || die "Bad line in utt2spk file: $_"; + ($utt, $spk) = @A; + $utt2spk{$utt} = $spk; +} +close(U); + +# Now generate the stm file +foreach $utt (sort keys(%utt2reco)) { + + # lines look like: + # [