From c903457f3cf72d749e0f73fd7b2625fbbab8a833 Mon Sep 17 00:00:00 2001
From: Pawel Swietojanski
Date: Wed, 25 Jun 2014 09:09:59 +0000
Subject: [PATCH] sandbox/pawel: ready data prep. stages, buiding lms
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/pawel@4081 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
---
egs/ami/s5/conf/ami.cfg | 50 +
egs/ami/s5/local/ami_beamform.sh | 2 +-
egs/ami/s5/local/ami_ihm_data_prep.sh | 90 +
egs/ami/s5/local/ami_ihm_data_prep_edin.sh | 103 -
...p_edin.sh => ami_ihm_scoring_data_prep.sh} | 73 +-
egs/ami/s5/local/ami_mdm_data_prep_edin.sh | 99 -
egs/ami/s5/local/ami_prepare_dict.sh | 59 +-
...data_prep_edin.sh => ami_sdm_data_prep.sh} | 48 +-
...p_edin.sh => ami_sdm_scoring_data_prep.sh} | 34 +-
egs/ami/s5/local/ami_split_segments.pl | 4 +
egs/ami/s5/local/ami_text_prep.sh | 2 -
egs/ami/s5/local/ami_train_lms.sh | 157 ++
egs/ami/s5/local/beamformit.sh | 8 +-
egs/ami/s5/local/convert2stm.pl | 98 +
egs/ami/s5/local/english.glm | 2023 +++++++++++++++++
egs/ami/s5/local/fisher_map_words.pl | 83 +
egs/ami/s5/path.sh | 2 +-
egs/ami/s5/run_ihm.sh | 8 +-
egs/ami/s5/run_mdm.sh | 5 +-
egs/ami/s5/run_sdm.sh | 13 +-
20 files changed, 2623 insertions(+), 338 deletions(-)
create mode 100644 egs/ami/s5/conf/ami.cfg
create mode 100755 egs/ami/s5/local/ami_ihm_data_prep.sh
delete mode 100755 egs/ami/s5/local/ami_ihm_data_prep_edin.sh
rename egs/ami/s5/local/{ami_ihm_scoring_data_prep_edin.sh => ami_ihm_scoring_data_prep.sh} (51%)
delete mode 100755 egs/ami/s5/local/ami_mdm_data_prep_edin.sh
rename egs/ami/s5/local/{ami_sdm_data_prep_edin.sh => ami_sdm_data_prep.sh} (64%)
rename egs/ami/s5/local/{ami_sdm_scoring_data_prep_edin.sh => ami_sdm_scoring_data_prep.sh} (82%)
create mode 100755 egs/ami/s5/local/ami_train_lms.sh
create mode 100755 egs/ami/s5/local/convert2stm.pl
create mode 100644 egs/ami/s5/local/english.glm
create mode 100755 egs/ami/s5/local/fisher_map_words.pl
diff --git a/egs/ami/s5/conf/ami.cfg b/egs/ami/s5/conf/ami.cfg
new file mode 100644
index 000000000..70fdd8586
--- /dev/null
+++ b/egs/ami/s5/conf/ami.cfg
@@ -0,0 +1,50 @@
+#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)
+
+# scrolling size to compute the delays
+scroll_size = 250
+
+# cross correlation computation window size
+window_size = 500
+
+#amount of maximum points for the xcorrelation taken into account
+nbest_amount = 4
+
+#flag whether to apply an automatic noise thresholding
+do_noise_threshold = 1
+
+#Percentage of frames with lower xcorr taken as noisy
+noise_percent = 10
+
+######## acoustic modelling parameters
+
+#transition probabilities weight for multichannel decoding
+trans_weight_multi = 25
+trans_weight_nbest = 25
+
+###
+
+#flag whether to print the features after setting them, or not
+print_features = 1
+
+#flag whether to use the bad frames in the sum process
+do_avoid_bad_frames = 1
+
+#flag to use the best channel (SNR) as a reference
+#defined from command line
+do_compute_reference = 1
+
+#flag whether to use a uem file or not (process all the file)
+do_use_uem_file = 0
+
+#flag whether to use an adaptive weights scheme or fixed weights
+do_adapt_weights = 1
+
+#flag whether to output the sph files or just run the system to create the auxiliary files
+do_write_sph_files = 1
+
+####directories where to store/retrieve info####
+#channels_file = ./cfg-files/channels
+
+#show needs to be passed as argument normally, here a default one is given just in case
+#show_id = Ttmp
+
diff --git a/egs/ami/s5/local/ami_beamform.sh b/egs/ami/s5/local/ami_beamform.sh
index fb154485e..5efdb486a 100755
--- a/egs/ami/s5/local/ami_beamform.sh
+++ b/egs/ami/s5/local/ami_beamform.sh
@@ -64,7 +64,7 @@ done < $meetings
echo -e "Beamforming\n"
-$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \
+$cmd JOB=0:$nj $wdir/log/beamform.JOB.log \
local/beamformit.sh $nj JOB $numch $meetings $sdir $odir
: << "C"
diff --git a/egs/ami/s5/local/ami_ihm_data_prep.sh b/egs/ami/s5/local/ami_ihm_data_prep.sh
new file mode 100755
index 000000000..c94bd58b2
--- /dev/null
+++ b/egs/ami/s5/local/ami_ihm_data_prep.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
+# AMI Corpus training data preparation
+# Apache 2.0
+
+# To be run from one directory above this script.
+
+. path.sh
+
+#check existing directories
+if [ $# != 1 ]; then
+ echo "Usage: ami_ihm_data_prep.sh /path/to/AMI"
+ exit 1;
+fi
+
+AMI_DIR=$1
+
+SEGS=data/local/annotations/train.txt
+dir=data/local/ihm/train
+mkdir -p $dir
+
+# Audio data directory check
+if [ ! -d $AMI_DIR ]; then
+ echo "Error: $AMI_DIR directory does not exist."
+ exit 1;
+fi
+
+# And transcripts check
+if [ ! -f $SEGS ]; then
+ echo "Error: File $SEGS not found (run ami_text_prep.sh)."
+ exit 1;
+fi
+
+
+# find headset wav audio files only
+find $AMI_DIR -iname '*.Headset-*.wav' | sort > $dir/wav.flist
+n=`cat $dir/wav.flist | wc -l`
+echo "In total, $n headset files were found."
+[ $n -ne 684 ] && \
+ echo "Warning: expected 684 (171 meetings x 4 headsets) data files, found $n"
+
+# (1a) Transcriptions preparation
+# here we start with normalised transcriptions, the utt ids follow the convention
+# AMI_MEETING_CHAN_SPK_STIME_ETIME
+# AMI_ES2011a_H00_FEE041_0003415_0003484
+
+awk '{meeting=$1; channel=$2; speaker=$3; stime=$4; etime=$5;
+ printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5));
+ for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort > $dir/text
+
+# (1b) Make segment files from transcript
+
+awk '{
+ segment=$1;
+ split(segment,S,"[_]");
+ audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6];
+ print segment " " audioname " " startf*10/1000 " " endf*10/1000 " " 0
+}' < $dir/text > $dir/segments
+
+# (1c) Make wav.scp file.
+
+sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \
+ perl -ne 'split; $_ =~ m/(.*)\..*\-([0-9])/; print "AMI_$1_H0$2\n"' | \
+ paste - $dir/wav.flist > $dir/wav.scp
+
+#Keep only train part of waves
+awk '{print $2}' $dir/segments | sort -u | join - $dir/wav.scp | sort -o $dir/wav.scp
+
+# (1d) reco2file_and_channel
+
+awk '{print $1}' $dir/wav.scp \
+ | perl -ane '$_ =~ m:^(\S+)(H0[0-4])$: || die "bad label $_";
+ print "$1$2 $1$2 $2\n"; ' > $dir/reco2file_and_channel || exit 1;
+
+awk '{print $1}' $dir/segments | \
+ perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_";
+ print "$1$2$3 $1$2\n";' \
+ > $dir/utt2spk || exit 1;
+
+sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
+
+# Copy stuff into its final location
+mkdir -p data/ihm/train
+for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
+ cp $dir/$f data/ihm/train/$f || exit 1;
+done
+
+echo AMI IHM data preparation succeeded.
+
diff --git a/egs/ami/s5/local/ami_ihm_data_prep_edin.sh b/egs/ami/s5/local/ami_ihm_data_prep_edin.sh
deleted file mode 100755
index bf70a0ecf..000000000
--- a/egs/ami/s5/local/ami_ihm_data_prep_edin.sh
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
-# AMI Corpus training data preparation
-# Apache 2.0
-
-# To be run from one directory above this script.
-
-. path.sh
-
-#check existing directories
-if [ $# != 2 ]; then
- echo "Usage: ami_data_prep_edin.sh /path/to/AMI"
- exit 1;
-fi
-
-AMI_DIR=$1
-RT09_SEGS=$2 #assuming here all normalisation stuff was done
-
-dir=data/local/ihm/train
-mkdir -p $dir
-
-# Audio data directory check
-if [ ! -d $AMI_DIR ]; then
- echo "Error: run.sh requires a directory argument"
- exit 1;
-fi
-
-# find headset wav audio files only
-find $AMI_DIR -iname '*.Headset-*.wav' | sort > $dir/wav.flist
-
-n=`cat $dir/wav.flist | wc -l`
-
-echo "In total, $n headset files were found."
-#[ $n -ne 2435 ] && \
-# echo Warning: expected 2435 data data files, found $n
-
-# (1a) Transcriptions preparation
-# here we start with rt09 transcriptions, hence not much to do
-
-cut -d" " -f1,4- $RT09_SEGS | sort > $dir/text
-
-# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches
-# case insensitive
-#local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final transcripts
-
-# (1c) Make segment files from transcript
-#segments file format is: utt-id side-id start-time end-time, e.g.:
-#AMI_ES2011a_H00_FEE041_0003415_0003484
-awk '{
- segment=$1;
- split(segment,S,"[_]");
- audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6];
- print segment " " audioname " " startf*10/1000 " " endf*10/1000 " " 0
-}' < $dir/text > $dir/segments
-
-#sw02001-A_000098-001156 sw02001-A 0.98 11.56
-#awk '{
-# segment=$1;
-# split(segment,S,"[_-]");
-# side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4];
-# print segment " " audioname "-" side " " startf/100 " " endf/100
-#}' < $dir/text > $dir/segments
-
-#EN2001a.Headset-0.wav
-#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \
-# > $dir/wav.scp
-
-sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \
- perl -ne 'split; $_ =~ m/(.*)\..*\-([0-9])/; print "AMI_$1_H0$2\n"' | \
- paste - $dir/wav.flist > $dir/wav.scp
-
-# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
-# to the file name sw02001 and the A, e.g.
-# sw02001-A sw02001 A
-# In this case it's trivial, but in other corpora the information might
-# be less obvious. Later it will be needed for ctm scoring.
-
-awk '{print $1}' $dir/wav.scp \
- | perl -ane '$_ =~ m:^(\S+)(H0[0-4])$: || die "bad label $_";
- print "$1$2 $1$2 $2\n"; ' \
- > $dir/reco2file_and_channel || exit 1;
-
-awk '{print $1}' $dir/segments | \
- perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_";
- print "$1$2$3 $1$2\n";' \
- > $dir/utt2spk || exit 1;
-
-sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
-
-# We assume each conversation side is a separate speaker. This is a very
-# reasonable assumption for Switchboard. The actual speaker info file is at:
-# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary
-
-# Copy stuff into its final locations [this has been moved from the format_data
-# script]
-mkdir -p data/ihm/train
-for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
- cp $dir/$f data/ihm/train/$f || exit 1;
-done
-
-echo AMI data preparation succeeded.
-
diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep_edin.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh
similarity index 51%
rename from egs/ami/s5/local/ami_ihm_scoring_data_prep_edin.sh
rename to egs/ami/s5/local/ami_ihm_scoring_data_prep.sh
index d9c709bd4..b01c8141b 100755
--- a/egs/ami/s5/local/ami_ihm_scoring_data_prep_edin.sh
+++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh
@@ -6,19 +6,17 @@
. path.sh
#check existing directories
-if [ $# != 3 ]; then
- echo "Usage: scoring_data_prep_edin.sh /path/to/SWBD rt09-seg-file set-name"
+if [ $# != 2 ]; then
+ echo "Usage: ami_ihm_scoring_data_prep.sh /path/to/AMI set-name"
exit 1;
fi
AMI_DIR=$1
-RT09_SEGS=$2 #assuming here all normalisation stuff was done
-SET=$3
+SET=$2
+SEGS=data/local/annotations/$SET.txt
-tmpdir=data/local/ihm/$SET
-dir=data/ihm/$SET
-
-mkdir -p $tmpdir
+dir=data/local/ihm/$SET
+mkdir -p $dir
# Audio data directory check
if [ ! -d $AMI_DIR ]; then
@@ -26,39 +24,47 @@ if [ ! -d $AMI_DIR ]; then
exit 1;
fi
+# And transcripts check
+if [ ! -f $SEGS ]; then
+ echo "Error: File $SEGS not found (run ami_text_prep.sh)."
+ exit 1;
+fi
+
# find headset wav audio files only, here we again get all
# the files in the corpora and filter only specific sessions
# while building segments
-find $AMI_DIR -iname '*.Headset-*.wav' | sort > $tmpdir/wav.flist
-n=`cat $tmpdir/wav.flist | wc -l`
+find $AMI_DIR -iname '*.Headset-*.wav' | sort > $dir/wav.flist
+n=`cat $dir/wav.flist | wc -l`
echo "In total, $n headset files were found."
+[ $n -ne 684 ] && \
+ echo "Warning: expected 684 (171 meetings x 4 headsets) data files, found $n"
# (1a) Transcriptions preparation
-# here we start with rt09 transcriptions, hence not much to do
+# here we start with normalised transcriptions, the utt ids follow the convention
+# AMI_MEETING_CHAN_SPK_STIME_ETIME
+# AMI_ES2011a_H00_FEE041_0003415_0003484
-cut -d" " -f1,4- $RT09_SEGS | sort > $tmpdir/text
+awk '{meeting=$1; channel=$2; speaker=$3; stime=$4; etime=$5;
+ printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5));
+ for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort > $dir/text
# (1c) Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
-#AMI_ES2011a_H00_FEE041_0003415_0003484
+
awk '{
segment=$1;
split(segment,S,"[_]");
audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6];
print segment " " audioname " " startf*10/1000 " " endf*10/1000 " " 0
-}' < $tmpdir/text > $tmpdir/segments
+}' < $dir/text > $dir/segments
-#EN2001a.Headset-0.wav
-#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \
-# > $dir/wav.scp
-
-sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \
+sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \
perl -ne 'split; $_ =~ m/(.*)\..*\-([0-9])/; print "AMI_$1_H0$2\n"' | \
- paste - $tmpdir/wav.flist > $tmpdir/wav.scp
+ paste - $dir/wav.flist > $dir/wav.scp
#Keep only devset part of waves
-awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav.scp | sort -o $tmpdir/wav.scp
+awk '{print $2}' $dir/segments | sort -u | join - $dir/wav.scp | sort -o $dir/wav.scp
# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
# to the file name sw02001 and the A, e.g.
@@ -66,29 +72,28 @@ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav.scp | sort -o $
# In this case it's trivial, but in other corpora the information might
# be less obvious. Later it will be needed for ctm scoring.
-awk '{print $1 $2}' $tmpdir/wav.scp | \
+awk '{print $1 $2}' $dir/wav.scp | \
perl -ane '$_ =~ m:^(\S+H0[0-4]).*\/([IETB].*)\.wav$: || die "bad label $_";
print "$1 $2 0\n"; '\
- > $tmpdir/reco2file_and_channel || exit 1;
+ > $dir/reco2file_and_channel || exit 1;
-awk '{print $1}' $tmpdir/segments | \
+awk '{print $1}' $dir/segments | \
perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_";
print "$1$2$3 $1$2\n";' \
- > $tmpdir/utt2spk || exit 1;
+ > $dir/utt2spk || exit 1;
-sort -k 2 $tmpdir/utt2spk | utils/utt2spk_to_spk2utt.pl > $tmpdir/spk2utt || exit 1;
+sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
-# We assume each conversation side is a separate speaker.
-
-# Copy stuff into its final locations [this has been moved from the format_data
-# script]
-mkdir -p $dir
+# Copy stuff into its final locations
+fdir=data/ihm/$SET
+mkdir -p $fdir
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
- cp $tmpdir/$f $dir/$f || exit 1;
+ cp $dir/$f $fdir/$f || exit 1;
done
-utils/convert2stm.pl $dir > $dir/stm
-cp local/english.glm $dir/glm
+#Produce STMs for sclite scoring
+local/convert2stm.pl $dir > $fdir/stm
+cp local/english.glm $fdir/glm
echo AMI $SET set data preparation succeeded.
diff --git a/egs/ami/s5/local/ami_mdm_data_prep_edin.sh b/egs/ami/s5/local/ami_mdm_data_prep_edin.sh
deleted file mode 100755
index 101d2a783..000000000
--- a/egs/ami/s5/local/ami_mdm_data_prep_edin.sh
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
-# AMI Corpus dev/eval data preparation
-
-# To be run from one directory above this script.
-
-. path.sh
-
-#check existing directories
-if [ $# != 3 ]; then
- echo "Usage: ami_data_prep_edin.sh /path/to/AMI rt09 mic"
- exit 1;
-fi
-
-AMI_DIR=$1
-SEGS=$2 #assuming here all normalisation stuff was done
-mic=$3
-
-dir=data/local/$mic/train
-odir=data/$mic/train
-mkdir -p $dir
-
-# Audio data directory check
-if [ ! -d $AMI_DIR ]; then
- echo "Error: run.sh requires a directory argument"
- exit 1;
-fi
-
-# as the sdm we treat first mic from the array
-find $AMI_DIR -iname '*bmf[248].wav' | sort > $dir/wav.flist
-
-n=`cat $dir/wav.flist | wc -l`
-
-echo "In total, $n headset files were found."
-#[ $n -ne 2435 ] && \
-# echo Warning: expected 2435 data data files, found $n
-
-# (1a) Transcriptions preparation
-# here we start with rt09 transcriptions, hence not much to do
-
-awk '{meeting=$1; channel="MDM"; speaker=$3; stime=$4; etime=$5;
- printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5));
- for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort > $dir/text
-
-# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches
-# case insensitive
-#local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final transcripts
-
-# (1c) Make segment files from transcript
-#segments file format is: utt-id side-id start-time end-time, e.g.:
-#AMI_ES2011a_H00_FEE041_0003415_0003484
-awk '{
- segment=$1;
- split(segment,S,"[_]");
- audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6];
- print segment " " audioname " " startf/100 " " endf/100 " " 0
-}' < $dir/text > $dir/segments
-
-#EN2001a.Array1-01.wav
-#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \
-# > $dir/wav.scp
-
-sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \
- perl -ne 'split; $_ =~ m/(.*)_bmf[248].*/; print "AMI_$1_MDM\n"' | \
- paste - $dir/wav.flist > $dir/wav.scp
-
-# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
-# to the file name sw02001 and the A, e.g.
-# sw02001-A sw02001 A
-# In this case it's trivial, but in other corpora the information might
-# be less obvious. Later it will be needed for ctm scoring.
-
-awk '{print $1 $2}' $dir/wav.scp | \
- perl -ane '$_ =~ m:^(\S+MDM).*\/([IETB].*)\.wav$: || die "bad label $_";
- print "$1 $2 0\n"; '\
- > $dir/reco2file_and_channel || exit 1;
-
-# we assume we adapt to the session only
-awk '{print $1}' $dir/segments | \
- perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_";
- print "$1$2$3 $1\n";' \
- > $dir/utt2spk || exit 1;
-
-sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
-
-# We assume each conversation side is a separate speaker. This is a very
-# reasonable assumption for Switchboard. The actual speaker info file is at:
-# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary
-
-# Copy stuff into its final locations [this has been moved from the format_data
-# script]
-mkdir -p $odir
-for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
- cp $dir/$f $odir/$f | exit 1;
-done
-
-echo AMI data preparation succeeded.
-
diff --git a/egs/ami/s5/local/ami_prepare_dict.sh b/egs/ami/s5/local/ami_prepare_dict.sh
index 45958fbe3..ed54ae3ce 100755
--- a/egs/ami/s5/local/ami_prepare_dict.sh
+++ b/egs/ami/s5/local/ami_prepare_dict.sh
@@ -10,20 +10,23 @@
#check existing directories
[ $# != 0 ] && echo "Usage: local/ami_ihm_data_prep_edin.sh" && exit 1;
-srcdir=data/local/train # This is where we downloaded some stuff..
-dir=data/local/dict
-mkdir -p $dir
+sdir=data/local/annotations
+wdir=data/local/dict
+cmuurl=http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/
+cmuver=cmudict.0.7a
-rt09_train=data/local/rt09.ami.ihmtrain09.v3.dct
+req="$sdir/transcripts2 local/wordlist.50k"
+[ ! -f "$sdir/transcripts2" ] && echo "No such file $sdir/transcripts2 (need to run ami_text_prep.sh first)" && exit 1;
-#as rt09_train is a superset of rt09_test, including some words
-#fitting training transcription, we will use the training dict
-#in Kaldi both for decoding and decoding
+mkdir -p $wdir
-# assume swbd_p1_data_prep.sh was done already.
-#[ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1;
+if [ ! -f $wdir/$cmuver ]; then
+ wget -O $wdir/$cmuver $cmuurl/$cmuver
+ wget -O $wdir/$cmuver.phones $cmuurl/$cmuver.phones
+ wget -O $wdir/$cmuver.symbols $cmuurl/$cmuver.symbols
+fi
-cat $rt09_train | sort > $dir/lexicon1.txt
+grep -e "^;;;" -v $wdir/$cmuver | sort > $dir/lexicon1.txt
cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
grep -v sil > $dir/nonsilence_phones.txt || exit 1;
@@ -41,41 +44,7 @@ echo -n >$dir/extra_questions.txt
echo '[laughter] lau'; echo ' spn' ) \
| cat - $dir/lexicon1.txt > $dir/lexicon2.txt || exit 1;
-# Map the words in the lexicon. That is-- for each word in the lexicon, we map it
-# to a new written form. The transformations we do are:
-# remove laughter markings, e.g.
-# [LAUGHTER-STORY] -> STORY
-# Remove partial-words, e.g.
-# -[40]1K W AH N K EY
-# becomes -1K
-# and
-# -[AN]Y IY
-# becomes
-# -Y
-# -[A]B[OUT]- B
-# becomes
-# -B-
-# Also, curly braces, which appear to be used for "nonstandard"
-# words or non-words, are removed, e.g.
-# {WOLMANIZED} W OW L M AX N AY Z D
-# -> WOLMANIZED
-# Also, mispronounced words, e.g.
-# [YEAM/YEAH] Y AE M
-# are changed to just e.g. YEAM, i.e. the orthography
-# of the mispronounced version.
-# Note-- this is only really to be used in training. The main practical
-# reason is to avoid having tons of disambiguation symbols, which
-# we otherwise would get because there are many partial words with
-# the same phone sequences (most problematic: S).
-# Also, map
-# THEM_1 EH M -> THEM
-# so that multiple pronunciations just have alternate entries
-# in the lexicon.
-
-#local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \
-# > $dir/lexicon3.txt || exit 1;
-
-pushd $dir >&/dev/null
+pushd $wdir >&/dev/null
ln -sf lexicon2.txt lexicon.txt # This is the final lexicon.
popd >&/dev/null
diff --git a/egs/ami/s5/local/ami_sdm_data_prep_edin.sh b/egs/ami/s5/local/ami_sdm_data_prep.sh
similarity index 64%
rename from egs/ami/s5/local/ami_sdm_data_prep_edin.sh
rename to egs/ami/s5/local/ami_sdm_data_prep.sh
index b17b14583..0b8c618c5 100755
--- a/egs/ami/s5/local/ami_sdm_data_prep_edin.sh
+++ b/egs/ami/s5/local/ami_sdm_data_prep.sh
@@ -6,17 +6,17 @@
. path.sh
#check existing directories
-if [ $# != 3 ]; then
- echo "Usage: ami_data_prep_edin.sh /path/to/AMI"
+if [ $# != 2 ]; then
+ echo "Usage: ami_sdm_data_prep.sh /path/to/AMI mic-num"
exit 1;
fi
AMI_DIR=$1
-SEGS=$2 #assuming here all normalisation stuff was done
-MICNUM=$3
-MICID="m$MICNUM"
+MICNUM=$2
+DSET="sdm$MICNUM"
-dir=data/local/sdm/$MICID/train
+SEGS=data/local/annotations/train.txt
+dir=data/local/$DSET/train
mkdir -p $dir
# Audio data directory check
@@ -25,26 +25,30 @@ if [ ! -d $AMI_DIR ]; then
exit 1;
fi
+# And transcripts check
+if [ ! -f $SEGS ]; then
+ echo "Error: File $SEGS not found (run ami_text_prep.sh)."
+ exit 1;
+fi
+
# as the sdm we treat first mic from the array
find $AMI_DIR -iname "*.Array1-0$MICNUM.wav" | sort > $dir/wav.flist
n=`cat $dir/wav.flist | wc -l`
echo "In total, $n files were found."
-#[ $n -ne 2435 ] && \
-# echo Warning: expected 2435 data data files, found $n
+[ $n -ne 169 ] && \
+ echo Warning: expected 169 data files, found $n
# (1a) Transcriptions preparation
# here we start with already normalised transcripts, just make the ids
+# Note, we set here SDM rather than, for example, SDM1 as we want to easily use
+# the same alignments across different mics
awk '{meeting=$1; channel="SDM"; speaker=$3; stime=$4; etime=$5;
printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5));
for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort > $dir/text
-# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches
-# case insensitive
-#local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final transcripts
-
# (1c) Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
#AMI_ES2011a_H00_FEE041_0003415_0003484
@@ -56,19 +60,15 @@ awk '{
}' < $dir/text > $dir/segments
#EN2001a.Array1-01.wav
-#sed -e 's?.*/??' -e 's?.sph??' $dir/wav.flist | paste - $dir/wav.flist \
-# > $dir/wav.scp
sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \
perl -ne 'split; $_ =~ m/(.*)\..*/; print "AMI_$1_SDM\n"' | \
paste - $dir/wav.flist > $dir/wav.scp
-# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
-# to the file name sw02001 and the A, e.g.
-# sw02001-A sw02001 A
-# In this case it's trivial, but in other corpora the information might
-# be less obvious. Later it will be needed for ctm scoring.
+#Keep only train part of waves
+awk '{print $2}' $dir/segments | sort -u | join - $dir/wav.scp | sort -o $dir/wav.scp
+# this file reco2file_and_channel maps recording-id
awk '{print $1 $2}' $dir/wav.scp | \
perl -ane '$_ =~ m:^(\S+SDM).*\/([IETB].*)\.wav$: || die "bad label $_";
print "$1 $2 0\n"; '\
@@ -82,16 +82,14 @@ awk '{print $1}' $dir/segments | \
sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
-# We assume each conversation side is a separate speaker. This is a very
-# reasonable assumption for Switchboard. The actual speaker info file is at:
-# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary
+# In the distant scenario we assume no information (without diarisation) about speakers is available
# Copy stuff into its final locations [this has been moved from the format_data
# script]
-mkdir -p data/sdm/$MICID/train
+mkdir -p data/$DSET/train
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
- cp $dir/$f data/sdm/$MICID/train/$f || exit 1;
+ cp $dir/$f data/$DSET/train/$f || exit 1;
done
-echo AMI data preparation succeeded.
+echo AMI $DSET data preparation succeeded.
diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep_edin.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh
similarity index 82%
rename from egs/ami/s5/local/ami_sdm_scoring_data_prep_edin.sh
rename to egs/ami/s5/local/ami_sdm_scoring_data_prep.sh
index 30eecc2f0..63da9603e 100755
--- a/egs/ami/s5/local/ami_sdm_scoring_data_prep_edin.sh
+++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
# AMI Corpus dev/eval data preparation
@@ -6,19 +6,19 @@
. path.sh
#check existing directories
-if [ $# != 4 ]; then
- echo "Usage: ami_sdm_scoring_data_prep_edin.sh /path/to/AMI rt09-seg-file set-name"
+if [ $# != 3 ]; then
+ echo "Usage: ami_sdm_scoring_data_prep.sh /path/to/AMI mic-num set-name"
exit 1;
fi
AMI_DIR=$1
-SEGS=$2 #assuming here all normalisation stuff was done
+MICNUM=$2
SET=$3
-MICNUM=$4
-MICID="m$MICNUM"
+DSET="sdm$MICNUM"
-tmpdir=data/local/sdm/$MICID/$SET
-dir=data/sdm/$MICID/$SET
+SEGS=data/local/annotations/$SET.txt
+tmpdir=data/local/$DSET/$SET
+dir=data/$DSET/$SET
mkdir -p $tmpdir
@@ -28,6 +28,12 @@ if [ ! -d $AMI_DIR ]; then
exit 1;
fi
+# And transcripts check
+if [ ! -f $SEGS ]; then
+ echo "Error: File $SEGS not found (run ami_text_prep.sh)."
+ exit 1;
+fi
+
# find headset wav audio files only, here we again get all
# the files in the corpora and filter only specific sessions
# while building segments
@@ -65,12 +71,7 @@ sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \
#Keep only devset part of waves
awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav.scp | sort -o $tmpdir/wav.scp
-# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
-# to the file name sw02001 and the A, e.g.
-# sw02001-A sw02001 A
-# In this case it's trivial, but in other corpora the information might
-# be less obvious. Later it will be needed for ctm scoring.
-
+#prep reco2file_and_channel
awk '{print $1 $2}' $tmpdir/wav.scp | \
perl -ane '$_ =~ m:^(\S+SDM).*\/([IETB].*)\.wav$: || die "bad label $_";
print "$1 $2 0\n"; '\
@@ -91,9 +92,6 @@ awk '{print $1}' $tmpdir/segments | \
print "$1$2$3 $1$2\n";' \
> $tmpdir/utt2spk_stm || exit 1;
-
-# We assume each conversation side is a separate speaker.
-
# Copy stuff into its final locations [this has been moved from the format_data
# script]
mkdir -p $dir
@@ -104,5 +102,5 @@ done
utils/convert2stm.pl $dir utt2spk_stm > $dir/stm
cp local/english.glm $dir/glm
-echo AMI $SET set data preparation succeeded.
+echo AMI $DSET scenario and $SET set data preparation succeeded.
diff --git a/egs/ami/s5/local/ami_split_segments.pl b/egs/ami/s5/local/ami_split_segments.pl
index d769145ba..57e2c2d60 100755
--- a/egs/ami/s5/local/ami_split_segments.pl
+++ b/egs/ami/s5/local/ami_split_segments.pl
@@ -170,6 +170,10 @@ sub normalise_transcripts {
#some transcripts are empty with -, nullify (and ignore) them
$text =~ s/^\-$//g;
$text =~ s/\s+\-$//;
+ # apply few exception for dashed phrases, Mm-Hmm, Uh-Huh, etc. those are frequent in AMI
+ # and will be added to dictionary
+ $text =~ s/MM HMM/MM\-HMM/g;
+ $text =~ s/UH HUH/UH\-HUH/g;
return $text;
}
diff --git a/egs/ami/s5/local/ami_text_prep.sh b/egs/ami/s5/local/ami_text_prep.sh
index 4a7083307..ab58d42f1 100755
--- a/egs/ami/s5/local/ami_text_prep.sh
+++ b/egs/ami/s5/local/ami_text_prep.sh
@@ -16,12 +16,10 @@ local/ami_xml2text.sh $amidir
echo "Preprocessing transcripts..."
local/ami_split_segments.pl $wdir/transcripts1 $wdir/transcripts2 &> $wdir/log/split_segments.log
-
#HMM
#MM HMM
#MM UHM
-
grep -f local/split_train.orig $wdir/transcripts2 > $wdir/train.txt
grep -f local/split_dev.orig $wdir/transcripts2 > $wdir/dev.txt
grep -f local/split_eval.orig $wdir/transcripts2 > $wdir/eval.txt
diff --git a/egs/ami/s5/local/ami_train_lms.sh b/egs/ami/s5/local/ami_train_lms.sh
new file mode 100755
index 000000000..906248cff
--- /dev/null
+++ b/egs/ami/s5/local/ami_train_lms.sh
@@ -0,0 +1,160 @@
+#!/bin/bash -v
+
+# Copyright 2013 Arnab Ghoshal, Pawel Swietojanski
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# To be run from one directory above this script.
+
+# Begin configuration section.
+fisher=
+order=3
+swbd=
+web_sw=
+web_fsh=
+web_mtg=
+# end configuration sections
+
+help_message="Usage: "`basename $0`" [options]
+Train language models for AMI and optionally for Switchboard, Fisher and web-data from University of Washington.\n
+options:
+ --help # print this message and exit
+ --fisher DIR # directory for Fisher transcripts
+ --order N # N-gram order (default: '$order')
+ --swbd DIR # Directory for Switchboard transcripts
+ --web-sw FILE # University of Washington (191M) Switchboard web data
+ --web-fsh FILE # University of Washington (525M) Fisher web data
+ --web-mtg FILE # University of Washington (150M) CMU+ICSI+NIST meeting data
+";
+
+. utils/parse_options.sh
+
+if [ $# -ne 4 ]; then
+ printf "$help_message\n";
+ exit 1;
+fi
+
+train=$1 # data/local/train/text
+dev=$2 # data/local/dev/text
+lexicon=$3 # data/local/dict/lexicon.txt
+dir=$4 # data/local/lm
+
+# Check that the required input files exist before doing any work.
+for f in "$train" "$dev" "$lexicon"; do
+ [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
+done
+
+set -o errexit
+mkdir -p $dir
+export LC_ALL=C
+
+cut -d' ' -f2- $train | gzip -c > $dir/train.gz
+cut -d' ' -f2- $dev | gzip -c > $dir/dev.gz
+
+awk '{print $1}' $lexicon | sort -u > $dir/wordlist.lex
+gunzip -c $dir/train.gz | tr ' ' '\n' | grep -v ^$ | sort -u > $dir/wordlist.train
+sort -u $dir/wordlist.lex $dir/wordlist.train > $dir/wordlist
+
+ngram-count -text $dir/train.gz -order $order -limit-vocab -vocab $dir/wordlist \
+ -unk -map-unk "<unk>" -kndiscount -interpolate -lm $dir/ami.o${order}g.kn.gz
+echo "PPL for AMI LM:"
+ngram -unk -lm $dir/ami.o${order}g.kn.gz -ppl $dir/dev.gz
+ngram -unk -lm $dir/ami.o${order}g.kn.gz -ppl $dir/dev.gz -debug 2 >& $dir/ppl2
+# Seed the interpolation state with the in-domain AMI LM; the optional
+# corpora below append to these lists as they are processed.
+mix_ppl="$dir/ppl2"
+mix_tag="ami"
+mix_lms=( "$dir/ami.o${order}g.kn.gz" )
+num_lms=1
+
+if [ ! -z "$swbd" ]; then
+ mkdir -p $dir/swbd
+
+ find $swbd -iname '*-trans.text' -exec cat {} \; | cut -d' ' -f4- \
+ | gzip -c > $dir/swbd/text0.gz
+ gunzip -c $dir/swbd/text0.gz | local/swbd_map_words.pl | gzip -c \
+ > $dir/swbd/text1.gz
+ ngram-count -text $dir/swbd/text1.gz -order $order -limit-vocab \
+ -vocab $dir/wordlist -unk -map-unk "<unk>" -kndiscount -interpolate \
+ -lm $dir/swbd/swbd.o${order}g.kn.gz
+ echo "PPL for SWBD LM:"
+ ngram -unk -lm $dir/swbd/swbd.o${order}g.kn.gz -ppl $dir/dev.gz
+ ngram -unk -lm $dir/swbd/swbd.o${order}g.kn.gz -ppl $dir/dev.gz -debug 2 \
+ >& $dir/swbd/ppl2
+
+ mix_ppl="$mix_ppl $dir/swbd/ppl2"
+ mix_tag="${mix_tag}_swbd"
+ mix_lms=("${mix_lms[@]}" "$dir/swbd/swbd.o${order}g.kn.gz")
+ num_lms=$((num_lms + 1))
+fi
+
+if [ ! -z "$fisher" ]; then
+ [ ! -d "$fisher/data/trans" ] \
+ && echo "Cannot find transcripts in Fisher directory: '$fisher'" \
+ && exit 1;
+ mkdir -p $dir/fisher
+
+ find $fisher -path '*/trans/*fe*.txt' -exec cat {} \; | grep -v ^# | grep -v ^$ \
+ | cut -d' ' -f4- | gzip -c > $dir/fisher/text0.gz
+ gunzip -c $dir/fisher/text0.gz | local/fisher_map_words.pl \
+ | gzip -c > $dir/fisher/text1.gz
+ ngram-count -text $dir/fisher/text1.gz -order $order -limit-vocab \
+ -vocab $dir/wordlist -unk -map-unk "<unk>" -kndiscount -interpolate \
+ -lm $dir/fisher/fisher.o${order}g.kn.gz
+ echo "PPL for Fisher LM:"
+ ngram -unk -lm $dir/fisher/fisher.o${order}g.kn.gz -ppl $dir/dev.gz
+ ngram -unk -lm $dir/fisher/fisher.o${order}g.kn.gz -ppl $dir/dev.gz -debug 2 \
+ >& $dir/fisher/ppl2
+
+ mix_ppl="$mix_ppl $dir/fisher/ppl2"
+ mix_tag="${mix_tag}_fsh"
+ mix_lms=("${mix_lms[@]}" "$dir/fisher/fisher.o${order}g.kn.gz")
+ num_lms=$((num_lms + 1))
+fi
+
+## The University of Washington conversational web data can be obtained as:
+## wget --no-check-certificate http://ssli.ee.washington.edu/data/191M_conversational_web-filt+periods.gz
+if [ ! -z "$web_sw" ]; then
+ echo "Interpolating web-LM not implemented yet"
+fi
+
+## The University of Washington Fisher conversational web data can be obtained as:
+## wget --no-check-certificate http://ssli.ee.washington.edu/data/525M_fisher_conv_web-filt+periods.gz
+if [ ! -z "$web_fsh" ]; then
+ echo "Interpolating web-LM not implemented yet"
+fi
+
+## The University of Washington meeting web data can be obtained as:
+## wget --no-check-certificate http://ssli.ee.washington.edu/data/150M_cmu+icsi+nist-meetings.gz
+if [ ! -z "$web_mtg" ]; then
+ echo "Interpolating web-LM not implemented yet"
+fi
+
+echo "Computing interpolation weights from: $mix_ppl"
+compute-best-mix $mix_ppl >& $dir/mix.log
+grep 'best lambda' $dir/mix.log \
+ | perl -e '$_=<>; s/.*\(//; s/\).*//; @A = split; for $i (@A) {print "$i\n";}' \
+ > $dir/mix.weights
+weights=( `cat $dir/mix.weights` )
+cmd="ngram -lm ${mix_lms[0]} -lambda ${weights[0]} -mix-lm ${mix_lms[1]}"
+# mix_lms/weights are 0-based arrays; entries 2..num_lms-1 are the extra LMs
+# beyond the two already named on the command line above.
+for i in `seq 2 $((num_lms-1))`; do
+ cmd="$cmd -mix-lm${i} ${mix_lms[$i]} -mix-lambda${i} ${weights[$i]}"
+done
+cmd="$cmd -unk -write-lm $dir/${mix_tag}.o${order}g.kn.gz"
+echo "Interpolating LMs with command: \"$cmd\""
+$cmd
+echo "PPL for the interpolated LM:"
+ngram -unk -lm $dir/${mix_tag}.o${order}g.kn.gz -ppl $dir/dev.gz
+
+
diff --git a/egs/ami/s5/local/beamformit.sh b/egs/ami/s5/local/beamformit.sh
index b9aa92e4b..d5171aa9d 100755
--- a/egs/ami/s5/local/beamformit.sh
+++ b/egs/ami/s5/local/beamformit.sh
@@ -16,11 +16,11 @@ utils/split_scp.pl -j $nj $job $meetings $meetings.$job
while read line; do
-# --config_file=`pwd`/conf/beamformit.cfg \
BeamformIt -s $line -c $wdir/channels_$numch \
- --source_dir=$sdir \
- --result_dir=$odir/temp_dir \
- --do_compute_reference=1
+ --config_file `pwd`/conf/ami.cfg \
+ --source_dir $sdir \
+ --result_dir $odir/temp_dir
+
mkdir -p $odir/$line
mv $odir/temp_dir/$line/${line}_seg.del $odir/$line/${line}_MDM$numch.del
mv $odir/temp_dir/$line/${line}_seg.del2 $odir/$line/${line}_MDM$numch.del2
diff --git a/egs/ami/s5/local/convert2stm.pl b/egs/ami/s5/local/convert2stm.pl
new file mode 100755
index 000000000..a9baf84fc
--- /dev/null
+++ b/egs/ami/s5/local/convert2stm.pl
@@ -0,0 +1,98 @@
+#!/usr/bin/perl
+
+# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
+# 2013 University of Edinburgh (Author: Pawel Swietojanski)
+
+# This takes as standard input path to directory containing all the usual
+# data files - segments, text, utt2spk and reco2file_and_channel and creates stm
+
+if (@ARGV < 1 || @ARGV > 2) {
+ print STDERR "Usage: convert2stm.pl <data-dir> [<utt2spk-file>] > stm-file\n";
+ exit(1);
+}
+
+$dir=shift @ARGV;
+$utt2spk_file=shift @ARGV || 'utt2spk';
+
+$segments = "$dir/segments";
+$reco2file_and_channel = "$dir/reco2file_and_channel";
+$text = "$dir/text";
+$utt2spk_file = "$dir/$utt2spk_file";
+
+open(S, "<$segments") || die "opening segments file $segments";
+while(<S>) {
+ @A = split(" ", $_);
+ @A > 4 || die "Bad line in segments file: $_";
+ ($utt, $recording_id, $begin_time, $end_time) = @A[0..3];
+ $utt2reco{$utt} = $recording_id;
+ $begin{$utt} = $begin_time;
+ $end{$utt} = $end_time;
+}
+close(S);
+
+open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel";
+while(<R>) {
+ @A = split(" ", $_);
+ @A == 3 || die "Bad line in reco2file_and_channel file: $_";
+ ($recording_id, $file, $channel) = @A;
+ $reco2file{$recording_id} = $file;
+ $reco2channel{$recording_id} = $channel;
+}
+close(R);
+
+open(T, "<$text") || die "open text file $text";
+while(<T>) {
+ @A = split(" ", $_);
+ $utt = shift @A;
+ $utt2text{$utt} = "@A";
+}
+close(T);
+
+open(U, "<$utt2spk_file") || die "open utt2spk file $utt2spk_file";
+while(<U>) {
+ @A = split(" ", $_);
+ @A == 2 || die "Bad line in utt2spk file: $_";
+ ($utt, $spk) = @A;
+ $utt2spk{$utt} = $spk;
+}
+close(U);
+
+# Now generate the stm file
+foreach $utt (sort keys(%utt2reco)) {
+
+ # lines look like:
+ # [