sandbox/pawel: download scripts, beamforming scripts, BeamformIt installation under tools, improved text normalisation

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/pawel@4075 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Pawel Swietojanski 2014-06-23 08:19:53 +00:00
Родитель 230cde5349
Коммит af0df729d5
17 изменённых файлов: 370 добавлений и 90 удалений

17
egs/ami/s5/cmd.sh Normal file
Просмотреть файл

@ -0,0 +1,17 @@
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of CPUs on your machine).
# On Eddie use:
#export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00"
#export decode_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4"
#export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4"
#export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00"
# To run locally, use:
# NOTE(review): the Eddie settings above also define scoring_cmd, which has no
# local counterpart below; add one if any script reads $scoring_cmd.
export train_cmd=run.pl
export decode_cmd=run.pl
export highmem_cmd=run.pl

Просмотреть файл

@ -0,0 +1,54 @@
# scrolling size to compute the delays
scroll_size = 250
# cross correlation computation window size
window_size = 500
#amount of maximum points for the xcorrelation taken into account
nbest_amount = 4
#flag whether to apply automatic noise thresholding
do_noise_threshold = 1
#Percentage of frames with lower xcorr taken as noisy
noise_percent = 10
######## acoustic modelling parameters
#transition probabilities weight for multichannel decoding
trans_weight_multi = 25
trans_weight_nbest = 25
###
#flag whether to print the features after setting them, or not
print_features = 1
#flag whether to use the bad frames in the sum process
do_avoid_bad_frames = 1
#flag to use the best channel (SNR) as a reference
#defined from command line
do_compute_reference = 1
#do_compute_reference = 0
#reference_channel = 0
#flag whether to use a uem file or not (process the whole file)
do_use_uem_file = 0
#flag whether to use an adaptive weights scheme or fixed weights
do_adapt_weights = 1
#flag whether to output the sph files or just run the system to create the auxiliary files
do_write_sph_files = 1
#selects the way that the files are read from the channels file
full_path = 1
####directories where to store/retrieve info####
channels_file = ./cfg-files/channels
#show needs to be passed as argument normally, here a default one is given just in case
show_id = Ttmp

Просмотреть файл

@ -0,0 +1,97 @@
#!/bin/bash
#Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
#Apache 2.0

# Beamforms the AMI array recordings: builds a BeamformIt channels file that
# lists the selected Array1 microphones for every meeting, then runs
# local/beamformit.sh as $nj parallel jobs.
#
# Arguments:
#   <num-mics>     number of array microphones to combine (1..8)
#   <ami-dir>      directory holding the per-meeting audio (<meeting>/audio/...)
#   <wav-out-dir>  directory that receives the beamformed output

# Begin configuration section
wiener_filtering=false # accepted as --wiener-filtering; not used further in this script
nj=4
cmd=run.pl
# End configuration section

echo "$0 $@" # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Wrong #arguments ($#, expected 3)"
  echo "Usage: $0 [options] <num-mics> <ami-dir> <wav-out-dir>"
  echo "main options (for others, see top of script file)"
  echo " --nj <nj> # number of parallel jobs"
  echo " --cmd <cmd> # Command to run in parallel with"
  echo " --wiener-filtering <true/false> # Cancel noise with Wiener filter prior to beamforming"
  exit 1;
fi

numch=$1
sdir=$2
odir=$3
wdir=data/local/beamforming

# The channel stride below is 8/<num-mics>: 0 would divide by zero and
# anything above 8 would give a zero stride, so reject both up front.
if ! [[ "$numch" =~ ^[1-8]$ ]]; then
  echo "$0: <num-mics> must be an integer between 1 and 8 (got '$numch')"
  exit 1;
fi

mkdir -p $odir
mkdir -p $wdir/log

# All meetings (train + dev + eval), sorted, one id per line.
meetings=$wdir/meetings.list
cat local/split_train.orig local/split_dev.orig local/split_eval.orig | sort > $meetings

# Pick every (8/numch)-th microphone of the 8-channel array, starting at 1.
ch_inc=$((8/$numch))
bmf=
for ch in `seq 1 $ch_inc 8`; do
  bmf="$bmf $ch"
done

echo "Will use the following channels: $bmf"

#make the channel file
# One line per meeting: "<meeting> <wav for channel a> <wav for channel b> ...".
# The file is rewritten from scratch on every run.
while read line; do
  channels="$line "
  for ch in $bmf; do
    channels="$channels $line/audio/$line.Array1-0$ch.wav"
  done
  # Unquoted on purpose: collapses the repeated blanks into single spaces.
  echo $channels
done < $meetings > $wdir/channels_$numch

######
#do beamforming
######

echo -e "Beamforming\n"

# Each job handles its own slice of $meetings; see local/beamformit.sh.
$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \
  local/beamformit.sh $nj JOB $numch $meetings $sdir $odir || exit 1;

Просмотреть файл

@ -11,45 +11,61 @@ fi
mic=$1
adir=$2
amiurl=http://groups.inf.ed.ac.uk/ami
annotver=ami_public_manual_1.6.1.zip
wdir=data/local/downloads
mkdir -p $adir/amicorpus
if [[ ! "$mic" =~ ^(ihm|sdm|mdm)$ ]]; then
echo "$0. Wrong <mic> option."
exit 1;
fi
mics="1 2 3 4 5 6 7 8"
if [ "$mic" == "sdm" ]; then
mics=1
fi
mkdir -p $adir
mkdir -p $wdir/log
#download annotations
annot="$adir/ami_public_manual_1.6.zip"
annot="$adir/$annotver"
if [[ ! -d $adir/annotations || ! -f "$annot" ]]; then
echo "Downloading annotiations..."
wget -O $annot $amiurl/AMICorpusAnnotations/ami_public_manual_1.6.zip
wget -O $annot $amiurl/AMICorpusAnnotations/$annotver &> $wdir/log/download_ami_annot.log
mkdir $adir/annotations
unzip -d $adir/annotations $annot &> /dev/null
fi
[ ! -f "$adir/annotations/AMI-metadata.xml" ] && echo "$0: File AMI-Metadata.xml not found under $adir/annotations." && exit 1;
#download waves
ihm_template="wget -P amicorpus/IB4011/audio http://groups.inf.ed.ac.uk/ami/AMICorpusMirror//amicorpus/IB4011/audio/IB4011.Headset-3.wav"
license="wget http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-04237-Sun-Jun-15-2014.manifest.txt
wget http://groups.inf.ed.ac.uk/ami/download/temp/Creative-Commons-Attribution-NonCommercial-ShareAlike-2.5.txt"
wgetfile=$adir/wget_$mic.sh
cat local/split_train.orig local/split_eval.orig local/split_dev.orig > $wdir/ami_meet_ids.flist
wgetfile=$wdir/wget_$mic.sh
manifest="wget -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-04237-Sun-Jun-15-2014.manifest.txt"
license="wget -O $adir/LICENCE.TXT http://groups.inf.ed.ac.uk/ami/download/temp/Creative-Commons-Attribution-NonCommercial-ShareAlike-2.5.txt"
echo "#!/bin/bash" > $wgetfile
echo $manifest >> $wgetfile
echo $license >> $wgetfile
cat local/split_train.orig local/split_eval.orig local/split_dev.orig > $adir/ami_file_ids.flist
if [ "$mic" == "ihm" ]; then
while read line; do
for hid in 0 1 2 3; do
echo "wget -P $adir/$line/audio $amiurl/AMICorpusMirror/amicorpus/$line/audio/$line.Headset-$hid.wav" >> $wgetfile
if [ "$mic" == "ihm" ]; then
for m in 0 1 2 3; do
echo "wget -c -P $adir/$line/audio $amiurl/AMICorpusMirror/amicorpus/$line/audio/$line.Headset-$m.wav" >> $wgetfile
done
done < $adir/ami_file_ids.flist
elif [ "$mic" == "sdm" ]; then
elif [ "$mic" == "mdm" ]; then
else
exit 1;
for m in $mics; do
echo "wget -c -P $adir/$line/audio $amiurl/AMICorpusMirror/amicorpus/$line/audio/$line.Array1-0$m.wav" >> $wgetfile
done
fi
done < $wdir/ami_meet_ids.flist
#chmod +x $wgetfile
#. $wgetfile &> $adir/log/download$mic.log
chmod +x $wgetfile
echo "Downloading audio files for $mic scenario."
echo "Look at $wdir/log/download_ami_$mic.log for download progress"
$wgetfile &> $wdir/log/download_ami_$mic.log
echo "Downloads of AMI corpus completed succesfully. License can be found under $adir/LICENSE.TXT"

Просмотреть файл

@ -1,20 +1,16 @@
#!/bin/bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
# AMI Corpus training data preparation
# Apache 2.0
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
. path.sh
#check existing directories
if [ $# != 2 ]; then
echo "Usage: ami_data_prep_edin.sh /path/to/SWBD"
echo "Usage: ami_data_prep_edin.sh /path/to/AMI"
exit 1;
fi

Просмотреть файл

@ -1,14 +1,7 @@
#!/bin/bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
# AMI Corpus dev/eval data preparation
. path.sh

Просмотреть файл

@ -1,7 +1,7 @@
#!/bin/bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
# AMI Corpus dev/eval data preparation
# To be run from one directory above this script.

Просмотреть файл

@ -1,14 +1,7 @@
#!/bin/bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
# AMI Corpus dev/eval data preparation
. path.sh

Просмотреть файл

@ -1,8 +1,7 @@
#!/bin/bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
# AMI Corpus dev/eval data preparation
. path.sh

Просмотреть файл

@ -1,14 +1,7 @@
#!/bin/bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
# AMI Corpus dev/eval data preparation
. path.sh

Просмотреть файл

@ -2,7 +2,7 @@
# Copyright 2014 University of Edinburgh (Author: Pawel Swietojanski)
# The script splits too long AMI segments based on punctuation signs
# The script - based on punctuation times - splits segments longer than #words (input parameter)
# and produces a slightly more normalised form of the transcripts, as follows
# MeetID Channel Spkr stime etime transcripts
@ -72,8 +72,6 @@ sub split_on_comma {
}
print "Splitting $text on $skey at time $otime (stime is $stime)\n";
my %transcripts = ();
my @utts1 = split(/$skey\s+/, $text);
for (my $i=0; $i<=$#utts1; $i++) {
my $st = $btime;
@ -102,13 +100,14 @@ sub split_transcripts {
my ($text, $btime, $etime, $max_words_per_seg) = @_;
my (@transcript) = @$text;
my (@punct_indices) = grep { $transcript[$_] =~ /^[\.,\?]$/ } 0..$#transcript;
my (@punct_indices) = grep { $transcript[$_] =~ /^[\.,\?\!\:]$/ } 0..$#transcript;
my (@time_indices) = grep { $transcript[$_] =~ /^[0-9]+\.[0-9]*/ } 0..$#transcript;
my (@puncts_times) = delete @transcript[@time_indices];
my (@puncts) = @transcript[@punct_indices];
if ($#puncts_times != $#puncts) {
die 'Ooops, different number of punctuation signs and timestamps!';
print 'Ooops, different number of punctuation signs and timestamps! Skipping.';
return ();
}
#first split on full stops
@ -156,13 +155,12 @@ sub normalise_transcripts {
#DO SOME ROUGH AND OBVIOUS PRELIMINARY NORMALISATION, AS FOLLOWS
#remove the remaining punctuation labels e.g. some text ,0 some text ,1
$text =~ s/[\.\,\?][0-9]+//g;
$text =~ s/[\.\,\?\!\:][0-9]+//g;
#there are some extra spurious punctuation marks without spaces, e.g. UM,I, replace with space
$text =~ s/[A-Z']+,[A-Z']+/ /g;
#normalise the standalone '-' signs, e.g. IS THERE D - to IS THERE D-
#some extra steps will be required to agree transcripts with dict as '-'
#also denotes not finished sentence and may be added to the fully pronounced words
$text =~ s/(.*)([A-Z])\s+(\-)(.*)/$1$2$3$4/g;
#split words combination, ie. ANTI-TRUST to ANTI TRUST (None of them appears in cmudict anyway)
#$text =~ s/(.*)([A-Z])\s+(\-)(.*)/$1$2$3$4/g;
$text =~ s/\-/ /g;
#substitute X_M_L with X. M. L. etc.
$text =~ s/\_/. /g;
#normalise and trim spaces
@ -170,13 +168,14 @@ sub normalise_transcripts {
$text =~ s/\s*$//g;
$text =~ s/\s+/ /g;
#some transcripts are empty with -, nullify (and ignore) them
$text =~ s/^\-$//;
$text =~ s/^\-$//g;
$text =~ s/\s+\-$//;
return $text;
}
if (@ARGV != 2) {
print STDERR "Usage: ami_prepare_meeting.pl <meet-file> <out-file>\n";
print STDERR "Usage: ami_split_segments.pl <meet-file> <out-file>\n";
exit(1);
}
@ -186,24 +185,27 @@ my %transcripts = ();
open(W, ">$out_file") || die "opening output file $out_file";
open(S, "<$meet_file") || die "opening meeting file $meet_file";
while(<S>) {
my @A = split(" ", $_);
@A > 8 || next;
my ($meet_id, $channel, $spk, $channel2, $btime, $etime, $btime2, $etime2) = @A[0..7];
if (@A < 9) { print "Skipping line @A"; next; }
my ($meet_id, $channel, $spk, $channel2, $trans_btime, $trans_etime, $aut_btime, $aut_etime) = @A[0..7];
my @transcript = @A[8..$#A];
my %transcript = split_transcripts(\@transcript, $btime, $etime, 25);
my %transcript = split_transcripts(\@transcript, $trans_btime, $trans_etime, 30);
for my $key (keys %transcript) {
my $value = $transcript{$key};
my $seg_name = "AMI_${meet_id}_H0${channel2}_${spk}_${key}";
my $text = normalise_transcripts($value);
my $segment = normalise_transcripts($value);
my @times = split(/\_/, $key);
if (length($text)>0) {
$transcripts{$seg_name}=$text;
print W join " ", $seg_name, $times[0]/100.0, $times[1]/100.0, $transcripts{$seg_name}, "\n";
if (length($segment)>0) {
print W join " ", $meet_id, "H0${channel2}", $spk, $times[0]/100.0, $times[1]/100.0, $segment, "\n";
}
}
}
close(S);
close(W);
print STDERR "Finished."

Просмотреть файл

@ -0,0 +1,34 @@
#!/bin/bash

# Extracts the AMI transcripts from the XML annotations, splits and normalises
# the segments, and writes per-set transcript files (train/dev/eval) under
# data/local/annotations.

if [ $# -ne 1 ]; then
  echo "Usage: $0 <ami-dir>"
  exit 1;
fi

amidir=$1
wdir=data/local/annotations

#extract text from AMI XML annotations
local/ami_xml2text.sh $amidir

[ ! -f $wdir/transcripts1 ] && echo "$0: File $wdir/transcripts1 not found." && exit 1;

echo "Preprocessing transcripts..."
# The log redirect below fails if the directory is missing, so create it here.
mkdir -p $wdir/log
local/ami_split_segments.pl $wdir/transcripts1 $wdir/transcripts2 &> $wdir/log/split_segments.log \
  || { echo "$0: Segment splitting failed, see $wdir/log/split_segments.log"; exit 1; }

# NOTE(review): the three tokens below were left by the author without
# explanation; they look like filler-word variants - confirm or remove.
#HMM
#MM HMM
#MM UHM

# Select the lines of each data set by matching against the meeting ids
# listed in the corresponding split file.
grep -f local/split_train.orig $wdir/transcripts2 > $wdir/train.txt
grep -f local/split_dev.orig $wdir/transcripts2 > $wdir/dev.txt
grep -f local/split_eval.orig $wdir/transcripts2 > $wdir/eval.txt

Просмотреть файл

@ -24,7 +24,7 @@ if [ ! -f $wdir/transcripts0 ]; then
echo "Parsing XML files (can take several minutes)..."
nxtlib=$wdir/nxt/lib
java -cp $nxtlib/nxt.jar:$nxtlib/xmlParserAPIs.jar:$nxtlib/xalan.jar:$nxtlib \
FunctionQuery -c $adir/annotations/AMI-metadata.xml -q '($s segment)' -atts obs who \
FunctionQuery -c $adir/annotations/AMI-metadata.xml -q '($s segment)(exists $w1 w):$s^$w1' -atts obs who \
'@extract(($sp speaker)($m meeting):$m@observation=$s@obs && $m^$sp & $s@who==$sp@nxt_agent,global_name, 0)'\
'@extract(($sp speaker)($m meeting):$m@observation=$s@obs && $m^$sp & $s@who==$sp@nxt_agent, channel, 0)' \
transcriber_start transcriber_end starttime endtime '$s' '@extract(($w w):$s^$w & $w@punc="true", starttime,0,0)' \

36
egs/ami/s5/local/beamformit.sh Executable file
Просмотреть файл

@ -0,0 +1,36 @@
#!/bin/bash

# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)

# Worker for one parallel beamforming job: takes this job's share of the
# meeting list, runs BeamformIt on each meeting and renames the results to
# <wav-out-dir>/<meeting>/<meeting>_MDM<num-mics>.*
#
# Usage: local/beamformit.sh <nj> <job> <num-mics> <meetings-list> <ami-dir> <wav-out-dir>

. ./path.sh

nj=$1
job=$2
numch=$3
meetings=$4
sdir=$5
odir=$6
wdir=data/local/beamforming

# Scratch directory private to this job. All jobs share $odir and run
# concurrently, so one shared scratch directory would let the cleanup at the
# end of each iteration delete output another job is still writing.
tmpdir=$odir/temp_dir.$job

# NOTE(review): the caller passes job ids 1..nj; confirm that
# utils/split_scp.pl -j accepts 1-based ids (it may expect 0..nj-1).
utils/split_scp.pl -j $nj $job $meetings $meetings.$job

while read line; do

  # Disabled option: --config_file=`pwd`/conf/beamformit.cfg
  BeamformIt -s $line -c $wdir/channels_$numch \
    --source_dir=$sdir \
    --result_dir=$tmpdir \
    --do_compute_reference=1

  mkdir -p $odir/$line

  # Auxiliary files keep their extension; only the stem changes.
  for ext in del del2 info ovl weat; do
    mv $tmpdir/$line/${line}_seg.$ext $odir/$line/${line}_MDM$numch.$ext
  done
  mv $tmpdir/$line/${line}_seg.wa* $odir/$line/${line}_MDM$numch.wav
  mv $tmpdir/$line/${line}_seg2.wa* $odir/$line/${line}_MDM${numch}_seg2.wav

  rm -r $tmpdir

done < $meetings.$job

32
egs/ami/s5/path.sh Normal file
Просмотреть файл

@ -0,0 +1,32 @@
# Environment setup sourced by the recipe scripts: adds the Kaldi binaries,
# OpenFst, IRSTLM, the recipe's script directories and BeamformIt to PATH.
export LC_ALL=C # For expected sorting and joining behaviour
# NOTE(review): site-specific absolute path; point it at your own Kaldi checkout.
KALDI_ROOT=/gpfs/scratch/s1136550/kaldi-code
KALDISRC=$KALDI_ROOT/src
# Colon-separated list of Kaldi binary directories, built up in three steps.
KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin
KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin
KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/tiedbin
# Third-party tools located under $KALDI_ROOT/tools.
FSTBIN=$KALDI_ROOT/tools/openfst/bin
LMBIN=$KALDI_ROOT/tools/irstlm/bin
BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt-3.5
# Working-directory checks: these only print a message, they do not abort.
[ -d $PWD/local ] || { echo "Error: 'local' subdirectory not found."; }
[ -d $PWD/utils ] || { echo "Error: 'utils' subdirectory not found."; }
[ -d $PWD/steps ] || { echo "Error: 'steps' subdirectory not found."; }
# Exported so child processes can locate the recipe's script directories.
export kaldi_local=$PWD/local
export kaldi_utils=$PWD/utils
export kaldi_steps=$PWD/steps
SCRIPTS=$kaldi_local:$kaldi_utils:$kaldi_steps
# New entries are appended, so commands already on PATH take precedence.
PATH=$PATH:$KALDIBIN:$FSTBIN:$LMBIN:$SCRIPTS:$BEAMFORMIT
# Optional CUDA setup (disabled).
#CUDA_VER='cuda-5.0.35'
#export PATH=$PATH:/opt/$CUDA_VER/bin
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/$CUDA_VER/lib64:/opt/$CUDA_VER/lib

Просмотреть файл

@ -4,14 +4,19 @@
. ./path.sh
#MDM - Multiple Distant Microphones
mic=$1
#AMI_DIR=/exports/work/inf_hcrc_cstr_nst/meetings/ami_corpus_wav/
AMI_DIR=/exports/work/inf_hcrc_cstr_nst/pawel/ami/bmf_wavs/$mic
nmics=8
mic=mdm$nmics
AMI_DIR=
AMI_DIR=/gpfs/scratch/s1136550/ami/amicorpus
local/ami_beamform.sh --nj 16 $nmics $AMI_DIR /disk/data1/ami
exit 1;
#PREPARE DATA STARTING FROM RT09 SEGMENTATIONS
#local/ami_${mic}_data_prep_edin.sh $AMI_DIR data/local/ami_train_v1_x.segs $mic
local/ami_text_prep.sh
local/ami_mdm_data_prep.sh $AMI_DIR
# We will keep the dict and lang the same as in IHM case
# local/ami_prepare_dict.sh

Просмотреть файл

@ -162,3 +162,16 @@ fortran_opt = $(shell gcc -v 2>&1 | perl -e '$$x = join(" ", <STDIN>); if($$x =~
# Clone OpenBLAS and build/install it into ./OpenBLAS/install.
# The leading `-` tells make to carry on even if the clone command fails
# (presumably so a re-run with an existing checkout still builds - confirm).
openblas_compiled:
-git clone git://github.com/xianyi/OpenBLAS
$(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=0 -C OpenBLAS all install
# `make beamformit` is an alias for the versioned target; both are command
# names rather than files, so both are declared phony.
.PHONY: beamformit beamformit-3.5
beamformit: beamformit-3.5

beamformit-3.5: beamformit-3.5.tgz

# Fetch the BeamformIt 3.5 source archive.
# The upstream file is named BeamformIt-3.5.tgz, so the download is written
# explicitly to the (lower-case) target name; otherwise the target would never
# exist on a case-sensitive filesystem and the download would be repeated on
# every invocation. It goes through a temporary file so that an interrupted
# download never leaves a truncated archive that looks up to date.
beamformit-3.5.tgz:
	wget -O $@.tmp http://www.xavieranguera.com/beamformit/releases/BeamformIt-3.5.tgz
	mv -f $@.tmp $@