Mirror of https://github.com/mozilla/kaldi.git
adding aurora4 eg
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3292 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Parent: 94c4646aba
Commit: 6a4251460f
@@ -0,0 +1,29 @@
# "queue.pl" uses qsub.  The options to it are
# options to qsub.  If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine.)

#a) JHU cluster options
export train_cmd="queue.pl -l arch=*64"
export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"

#export cuda_cmd="..."


#b) BUT cluster options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"

#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"

#c) run it locally...
#export train_cmd=run.pl
#export decode_cmd=run.pl
export cuda_cmd=run.pl
#export mkgraph_cmd=run.pl
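The exported variables above are consumed by the step scripts through their
--cmd option; a minimal usage sketch (the training-script and directory names
here are illustrative, not part of this commit):

  . ./cmd.sh
  steps/train_mono.sh --cmd "$train_cmd" --nj 10 \
    data/train_si84_clean data/lang exp/mono0a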
@@ -0,0 +1,2 @@
beam=18.0 # beam for decoding.  Was 13.0 in the scripts.
latbeam=10.0 # this has most effect on size of the lattices.
@@ -0,0 +1,11 @@
# No non-default options for now.
--window-type=hamming # disable Dan's window, use the standard Hamming window
--use-energy=false # only fbank outputs
--sample-frequency=16000 # Aurora4 (WSJ) audio is sampled at 16kHz

--low-freq=64 # typical setup from Frantisek Grezl
--high-freq=8000
--dither=1

--num-mel-bins=40 # 40 filterbank bins for 16kHz audio
--htk-compat=true # try to make it compatible with HTK
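A sketch of how this config would typically be consumed, assuming the standard
steps/make_fbank.sh wrapper (the wrapper and directory names are not part of
this commit):

  steps/make_fbank.sh --fbank-config conf/fbank.conf --nj 10 --cmd "$train_cmd" \
    data/train_si84_multi exp/make_fbank/train_si84_multi fbank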
@@ -0,0 +1 @@
--use-energy=false # only non-default option.
@@ -0,0 +1,9 @@
momentum=0
l1_penalty=0
l2_penalty=0
max_iters=25
min_iters=
start_halving_inc=0.5
end_halving_inc=0.1
halving_factor=0.7
cache_size=32768

@@ -0,0 +1,9 @@
momentum=0
l1_penalty=0
l2_penalty=0
max_iters=30
min_iters=
start_halving_inc=0.5
end_halving_inc=0.1
halving_factor=0.5
cache_size=65536

@@ -0,0 +1,9 @@
momentum=0.9
l1_penalty=0
l2_penalty=0
max_iters=10
min_iters=10
start_halving_inc=0.5
end_halving_inc=0.1
halving_factor=0.7
cache_size=32768

@@ -0,0 +1,9 @@
momentum=0
l1_penalty=0
l2_penalty=0
max_iters=25
min_iters=
start_halving_inc=0.5
end_halving_inc=0.1
halving_factor=0.7
cache_size=32768
@@ -0,0 +1,55 @@
#!/usr/bin/perl

# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12

# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1

# and as a command-line argument it takes the name of the corpus disk location, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.

$wsj_dir = $ARGV[0];

while(<STDIN>){
  if(m/^;/){ next; } # Comment.  Ignore it.
  else {
    #m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
    @A=split(" ", $_);
    $filename = $A[0]; # as a subdirectory of the distributed disk.
    #if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }
    $filename = "$wsj_dir/$filename";
    if (-e $filename) {
      print "$filename\n";
    } else {
      print STDERR "File $filename found in the index but not on disk\n";
    }
  }
}
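As used in the Aurora4 data preparation later in this commit ($AURORA is the
top-level Aurora4 corpus directory):

  cat $AURORA/lists/training_clean_sennh_16k.list \
    | local/aurora2flist.pl $AURORA | sort -u > train_si84_clean.flist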
@@ -0,0 +1,401 @@
#!/bin/bash
set -e

# Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12

if [ $# -ne 2 ]; then
  printf "\nUSAGE: %s <aurora4-directory> <wsj-corpus-directory>\n\n" `basename $0`
  echo "The first argument should be the top-level AURORA4 directory, and the"
  echo "second the top-level WSJ corpus directory.  It is assumed that there"
  echo "will be a 'wsj0' subdirectory within the WSJ corpus directory."
  exit 1;
fi
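The file names are not shown in these hunks; assuming this is the Aurora4
data-preparation script (the path local/aurora4_data_prep.sh is a guess), it
would be invoked from the recipe's top-level directory roughly as:

  # both corpus paths below are placeholders for local install locations
  local/aurora4_data_prep.sh /export/corpora/AURORA4 /export/corpora/WSJ0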

AURORA=$1
CORPUS=$2

dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
  exit 1;
fi

cd $dir

# SI-84 clean training data
cat $AURORA/lists/training_clean_sennh_16k.list \
  | $local/aurora2flist.pl $AURORA | sort -u > train_si84_clean.flist

# SI-84 multi-condition training data
cat $AURORA/lists/training_multicondition_16k.list \
  | $local/aurora2flist.pl $AURORA | sort -u > train_si84_multi.flist

# Dev Set
for x in $(seq -f "%02g" 01 14); do
  # Dev-set 1 (330x14 utterances)
  cat $AURORA/lists/devtest${x}_0330_16k.list | perl -e '
    while(<STDIN>) {
      @A=split("/", $_);
      @B=split("_", $A[0]);
      print $B[0].$B[1]."_".$B[2]."/".$_;
    }
  ' | $local/aurora2flist.pl $AURORA | sort -u > dev_0330_${x}.flist
  # Dev-set 2 (1206x14 utterances)
  cat $AURORA/lists/devtest${x}_1206_16k.list | perl -e '
    while(<STDIN>) {
      @A=split("/", $_);
      @B=split("_", $A[0]);
      print $B[0].$B[1]."_".$B[2]."/".$_;
    }
  ' | $local/aurora2flist.pl $AURORA | sort -u > dev_1206_${x}.flist
done

# Test Set
for x in $(seq -f "%02g" 01 14); do
  # test set 1 (166x14 utterances)
  cat $AURORA/lists/test${x}_0166_16k.list \
    | $local/aurora2flist.pl $AURORA | sort -u > test_0166_${x}.flist
  cat $AURORA/lists/test${x}_0330_16k.list \
    | $local/aurora2flist.pl $AURORA | sort -u > test_eval92_${x}.flist
done

# Finding the transcript files:
find -L $CORPUS -iname '*.dot' > dot_files.flist

# Convert the transcripts into our format (no normalization yet),
# adding a condition suffix to each utt_id:
# 0 for the clean condition.

# Trans and sph for Train Set
x=train_si84_clean
$local/flist2scp_12.pl $x.flist | sort > ${x}_sph_tmp.scp
cat ${x}_sph_tmp.scp | awk '{print $1}' \
  | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
cat ${x}_sph_tmp.scp | awk '{printf("%s0 %s\n", $1, $2);}' > ${x}_sph.scp
cat ${x}_tmp.trans1 | awk '{printf("%s0 ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1

x=train_si84_multi
$local/flist2scp_12.pl $x.flist | sort > ${x}_sph_tmp.scp
cat ${x}_sph_tmp.scp | awk '{print $1}' \
  | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
cat ${x}_sph_tmp.scp | awk '{printf("%s1 %s\n", $1, $2);}' > ${x}_sph.scp
cat ${x}_tmp.trans1 | awk '{printf("%s1 ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1

# Trans and sph for Dev Set
for x in $(seq -f "%02g" 01 14); do
  $local/flist2scp_12.pl dev_0330_${x}.flist | sort > dev_0330_${x}_sph_tmp.scp
  $local/flist2scp_12.pl dev_1206_${x}.flist | sort > dev_1206_${x}_sph_tmp.scp
  cat dev_0330_${x}_sph_tmp.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > dev_0330_${x}_tmp.trans1
  cat dev_1206_${x}_sph_tmp.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > dev_1206_${x}_tmp.trans1
  cat dev_0330_${x}_sph_tmp.scp | perl -e \
    ' $condition="$ARGV[0]";
      if ($condition eq "01") {$suffix=0;}
      elsif ($condition eq "02") {$suffix=1;}
      elsif ($condition eq "03") {$suffix=2;}
      elsif ($condition eq "04") {$suffix=3;}
      elsif ($condition eq "05") {$suffix=4;}
      elsif ($condition eq "06") {$suffix=5;}
      elsif ($condition eq "07") {$suffix=6;}
      elsif ($condition eq "08") {$suffix=7;}
      elsif ($condition eq "09") {$suffix=8;}
      elsif ($condition eq "10") {$suffix=9;}
      elsif ($condition eq "11") {$suffix="a";}
      elsif ($condition eq "12") {$suffix="b";}
      elsif ($condition eq "13") {$suffix="c";}
      elsif ($condition eq "14") {$suffix="d";}
      else {print STDERR "error condition $condition\n";}
      while(<STDIN>) {
        @A=split(" ", $_);
        print $A[0].$suffix." ".$A[1]."\n";
      }
    ' $x > dev_0330_${x}_sph.scp

  cat dev_1206_${x}_sph_tmp.scp | perl -e \
    ' $condition="$ARGV[0]";
      if ($condition eq "01") {$suffix=0;}
      elsif ($condition eq "02") {$suffix=1;}
      elsif ($condition eq "03") {$suffix=2;}
      elsif ($condition eq "04") {$suffix=3;}
      elsif ($condition eq "05") {$suffix=4;}
      elsif ($condition eq "06") {$suffix=5;}
      elsif ($condition eq "07") {$suffix=6;}
      elsif ($condition eq "08") {$suffix=7;}
      elsif ($condition eq "09") {$suffix=8;}
      elsif ($condition eq "10") {$suffix=9;}
      elsif ($condition eq "11") {$suffix="a";}
      elsif ($condition eq "12") {$suffix="b";}
      elsif ($condition eq "13") {$suffix="c";}
      elsif ($condition eq "14") {$suffix="d";}
      else {print STDERR "error condition $condition\n";}
      while(<STDIN>) {
        @A=split(" ", $_);
        print $A[0].$suffix." ".$A[1]."\n";
      }
    ' $x > dev_1206_${x}_sph.scp

  cat dev_0330_${x}_tmp.trans1 | perl -e \
    ' $condition="$ARGV[0]";
      if ($condition eq "01") {$suffix=0;}
      elsif ($condition eq "02") {$suffix=1;}
      elsif ($condition eq "03") {$suffix=2;}
      elsif ($condition eq "04") {$suffix=3;}
      elsif ($condition eq "05") {$suffix=4;}
      elsif ($condition eq "06") {$suffix=5;}
      elsif ($condition eq "07") {$suffix=6;}
      elsif ($condition eq "08") {$suffix=7;}
      elsif ($condition eq "09") {$suffix=8;}
      elsif ($condition eq "10") {$suffix=9;}
      elsif ($condition eq "11") {$suffix="a";}
      elsif ($condition eq "12") {$suffix="b";}
      elsif ($condition eq "13") {$suffix="c";}
      elsif ($condition eq "14") {$suffix="d";}
      else {print STDERR "error condition $condition\n";}
      while(<STDIN>) {
        @A=split(" ", $_);
        print $A[0].$suffix;
        for ($i=1; $i < @A; $i++) {print " ".$A[$i];}
        print "\n";
      }
    ' $x > dev_0330_${x}.trans1

  cat dev_1206_${x}_tmp.trans1 | perl -e \
    ' $condition="$ARGV[0]";
      if ($condition eq "01") {$suffix=0;}
      elsif ($condition eq "02") {$suffix=1;}
      elsif ($condition eq "03") {$suffix=2;}
      elsif ($condition eq "04") {$suffix=3;}
      elsif ($condition eq "05") {$suffix=4;}
      elsif ($condition eq "06") {$suffix=5;}
      elsif ($condition eq "07") {$suffix=6;}
      elsif ($condition eq "08") {$suffix=7;}
      elsif ($condition eq "09") {$suffix=8;}
      elsif ($condition eq "10") {$suffix=9;}
      elsif ($condition eq "11") {$suffix="a";}
      elsif ($condition eq "12") {$suffix="b";}
      elsif ($condition eq "13") {$suffix="c";}
      elsif ($condition eq "14") {$suffix="d";}
      else {print STDERR "error condition $condition\n";}
      while(<STDIN>) {
        @A=split(" ", $_);
        print $A[0].$suffix;
        for ($i=1; $i < @A; $i++) {print " ".$A[$i];}
        print "\n";
      }
    ' $x > dev_1206_${x}.trans1

done
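The four inline Perl blocks above (and the four more in the test-set loop
below) all encode the same mapping from condition number 01-14 to a
one-character utterance-id suffix 0-9,a-d.  An equivalent table-driven sketch
of that mapping in bash would be:

  suffixes=(0 1 2 3 4 5 6 7 8 9 a b c d)
  suffix=${suffixes[$((10#$x - 1))]}   # e.g. x="07" -> index 6 -> suffix "6"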

cat dev_0330_*_sph.scp | sort -k1 > dev_0330_sph.scp
cat dev_1206_*_sph.scp | sort -k1 > dev_1206_sph.scp
cat dev_0330_??.trans1 | sort -k1 > dev_0330.trans1
cat dev_1206_??.trans1 | sort -k1 > dev_1206.trans1


# Trans and sph for Test Set
for x in $(seq -f "%02g" 01 14); do
  $local/flist2scp_12.pl test_0166_${x}.flist | sort > test_0166_${x}_sph_tmp.scp
  $local/flist2scp_12.pl test_eval92_${x}.flist | sort > test_eval92_${x}_sph_tmp.scp
  cat test_0166_${x}_sph_tmp.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > test_0166_${x}_tmp.trans1
  cat test_eval92_${x}_sph_tmp.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > test_eval92_${x}_tmp.trans1
  cat test_0166_${x}_sph_tmp.scp | perl -e \
    ' $condition="$ARGV[0]";
      if ($condition eq "01") {$suffix=0;}
      elsif ($condition eq "02") {$suffix=1;}
      elsif ($condition eq "03") {$suffix=2;}
      elsif ($condition eq "04") {$suffix=3;}
      elsif ($condition eq "05") {$suffix=4;}
      elsif ($condition eq "06") {$suffix=5;}
      elsif ($condition eq "07") {$suffix=6;}
      elsif ($condition eq "08") {$suffix=7;}
      elsif ($condition eq "09") {$suffix=8;}
      elsif ($condition eq "10") {$suffix=9;}
      elsif ($condition eq "11") {$suffix="a";}
      elsif ($condition eq "12") {$suffix="b";}
      elsif ($condition eq "13") {$suffix="c";}
      elsif ($condition eq "14") {$suffix="d";}
      else {print STDERR "error condition $condition\n";}
      while(<STDIN>) {
        @A=split(" ", $_);
        print $A[0].$suffix." ".$A[1]."\n";
      }
    ' $x > test_0166_${x}_sph.scp

  cat test_eval92_${x}_sph_tmp.scp | perl -e \
    ' $condition="$ARGV[0]";
      if ($condition eq "01") {$suffix=0;}
      elsif ($condition eq "02") {$suffix=1;}
      elsif ($condition eq "03") {$suffix=2;}
      elsif ($condition eq "04") {$suffix=3;}
      elsif ($condition eq "05") {$suffix=4;}
      elsif ($condition eq "06") {$suffix=5;}
      elsif ($condition eq "07") {$suffix=6;}
      elsif ($condition eq "08") {$suffix=7;}
      elsif ($condition eq "09") {$suffix=8;}
      elsif ($condition eq "10") {$suffix=9;}
      elsif ($condition eq "11") {$suffix="a";}
      elsif ($condition eq "12") {$suffix="b";}
      elsif ($condition eq "13") {$suffix="c";}
      elsif ($condition eq "14") {$suffix="d";}
      else {print STDERR "error condition $condition\n";}
      while(<STDIN>) {
        @A=split(" ", $_);
        print $A[0].$suffix." ".$A[1]."\n";
      }
    ' $x > test_eval92_${x}_sph.scp

  cat test_0166_${x}_tmp.trans1 | perl -e \
    ' $condition="$ARGV[0]";
      if ($condition eq "01") {$suffix=0;}
      elsif ($condition eq "02") {$suffix=1;}
      elsif ($condition eq "03") {$suffix=2;}
      elsif ($condition eq "04") {$suffix=3;}
      elsif ($condition eq "05") {$suffix=4;}
      elsif ($condition eq "06") {$suffix=5;}
      elsif ($condition eq "07") {$suffix=6;}
      elsif ($condition eq "08") {$suffix=7;}
      elsif ($condition eq "09") {$suffix=8;}
      elsif ($condition eq "10") {$suffix=9;}
      elsif ($condition eq "11") {$suffix="a";}
      elsif ($condition eq "12") {$suffix="b";}
      elsif ($condition eq "13") {$suffix="c";}
      elsif ($condition eq "14") {$suffix="d";}
      else {print STDERR "error condition $condition\n";}
      while(<STDIN>) {
        @A=split(" ", $_);
        print $A[0].$suffix;
        for ($i=1; $i < @A; $i++) {print " ".$A[$i];}
        print "\n";
      }
    ' $x > test_0166_${x}.trans1

  cat test_eval92_${x}_tmp.trans1 | perl -e \
    ' $condition="$ARGV[0]";
      if ($condition eq "01") {$suffix=0;}
      elsif ($condition eq "02") {$suffix=1;}
      elsif ($condition eq "03") {$suffix=2;}
      elsif ($condition eq "04") {$suffix=3;}
      elsif ($condition eq "05") {$suffix=4;}
      elsif ($condition eq "06") {$suffix=5;}
      elsif ($condition eq "07") {$suffix=6;}
      elsif ($condition eq "08") {$suffix=7;}
      elsif ($condition eq "09") {$suffix=8;}
      elsif ($condition eq "10") {$suffix=9;}
      elsif ($condition eq "11") {$suffix="a";}
      elsif ($condition eq "12") {$suffix="b";}
      elsif ($condition eq "13") {$suffix="c";}
      elsif ($condition eq "14") {$suffix="d";}
      else {print STDERR "error condition $condition\n";}
      while(<STDIN>) {
        @A=split(" ", $_);
        print $A[0].$suffix;
        for ($i=1; $i < @A; $i++) {print " ".$A[$i];}
        print "\n";
      }
    ' $x > test_eval92_${x}.trans1

done

cat test_0166_*_sph.scp | sort -k1 > test_0166_sph.scp
cat test_eval92_*_sph.scp | sort -k1 > test_eval92_sph.scp
cat test_0166_??.trans1 | sort -k1 > test_0166.trans1
cat test_eval92_??.trans1 | sort -k1 > test_eval92.trans1


# Do some basic normalization steps.  At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
    | sort > $x.txt || exit 1;
done

# Create scp's with wav's.  (The Aurora4 audio files are headerless 16-bit PCM,
# not wav, so we pipe them through sox.)
for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
  awk '{printf("%s sox -B -r 16k -e signed -b 16 -c 1 -t raw %s -t wav - |\n", $1, $2);}' < ${x}_sph.scp \
    > ${x}_wav.scp
done

# Make the utt2spk and spk2utt files.
for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
  cat ${x}_sph.scp | awk '{print $1}' \
    | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
  cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done
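The utt2spk line above keeps the first three characters of each utterance id
as the speaker id; a small worked example (the path is illustrative):

  echo '01ic02010 /path/to/01ic0201.wv1' | awk '{print $1}' \
    | perl -ane 'chop; m:^...:; print "$_ $&\n";'
  # -> 01ic02010 01i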

# In case we want to limit the LMs to the most frequent words, copy the
# LM training word-frequency list.
cp $CORPUS/wsj0/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.

# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations.  This is the most common test setup, I understand.

cp $CORPUS/wsj0/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz

# trigram would be:
cat $CORPUS/wsj0/doc/lng_modl/base_lm/tcb20onp.z | \
  perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' \
  | gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;

prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;

# repeat for 5k language models
cp $CORPUS/wsj0/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz

# trigram would be: !only closed vocabulary here!
cp $CORPUS/wsj0/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa

prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;


if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
  rm -f wsj0-train-spkrinfo.txt
  wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
    || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
         wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi

if [ ! -f wsj0-train-spkrinfo.txt ]; then
  echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
  echo "This is possibly omitted from the training disks; couldn't find it."
  echo "Everything else may have worked; we just may be missing gender info"
  echo "which is only needed for VTLN-related diagnostics anyway."
  exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web.  Perhaps it was accidentally omitted from the
# disks.

cat $CORPUS/wsj0/doc/spkrinfo.txt \
    ./wsj0-train-spkrinfo.txt | \
  perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
  awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender


echo "Data preparation succeeded"
@@ -0,0 +1,86 @@
#!/bin/bash

# Copyright 2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si284, data/train_si84, etc.

# Don't bother doing train_si84 separately (although we have the file lists
# in data/local/) because it's just the first 7138 utterances in train_si284.
# We'll create train_si84 after doing the feature extraction.

. ./path.sh || exit 1;

echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir

for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
  mkdir -p data/$x
  cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
  cp $srcdir/$x.txt data/$x/text || exit 1;
  cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
  cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
  utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
done


# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.

echo Preparing language models for test

for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
  test=data/lang_test_${lm_suffix}
  mkdir -p $test
  for f in phones.txt words.txt L.fst L_disambig.fst \
      phones/; do
    cp -r data/lang/$f $test
  done
  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
    utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt

  # grep -v '<s> <s>' because the LM seems to have some strange and useless
  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
  # which are supposed to occur only at the beginning/end of an utterance.  These can cause
  # determinization failures of CLG [ends up being epsilon cycles].
  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
    grep -v '<s> <s>' | \
    grep -v '</s> <s>' | \
    grep -v '</s> </s>' | \
    arpa2fst - | fstprint | \
    utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
      --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
    fstrmepsilon > $test/G.fst
  fstisstochastic $test/G.fst
  # The output is like:
  # 9.14233e-05 -0.259833
  # we do expect the first of these 2 numbers to be close to zero (the second is
  # nonzero because the backoff weights make the states sum to >1).
  # Because of the <s> fiasco for these particular LMs, the first number is not
  # as close to zero as it could be.

  # Everything below is only for diagnostics.
  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
  # this might cause determinization failure of CLG.
  # #0 is treated as an empty word.
  mkdir -p $tmpdir/g
  awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
    < "$lexicon" >$tmpdir/g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
    fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r $tmpdir/g
done

echo "Succeeded in formatting data."
rm -r $tmpdir
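After this script runs, each set listed in the first loop has a canonical data
directory, and each LM a lang_test directory, e.g.:

  data/test_eval92/
    wav.scp  text  utt2spk  spk2utt  spk2gender
  data/lang_test_tgpr/
    G.fst  L.fst  L_disambig.fst  words.txt  phones.txt  phones/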
@@ -0,0 +1,54 @@
#!/usr/bin/perl

# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12

# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1

# and as a command-line argument it takes the name of the WSJ disk location, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.

$wsj_dir = $ARGV[0];

while(<STDIN>){
  if(m/^;/){ next; } # Comment.  Ignore it.
  else {
    m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
    $filename = $2; # as a subdirectory of the distributed disk.
    if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }
    $filename = "$wsj_dir/$filename";
    if (-e $filename) {
      print "$filename\n";
    } else {
      print STDERR "File $filename found in the index but not on disk\n";
    }
  }
}
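As used in the WSJ data preparation later in this commit ($CORPUS is the
top-level WSJ directory):

  cat $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \
    | local/cstr_ndx2flist.pl $CORPUS | sort \
    | grep -v wsj0/si_tr_s/401 > train_si84.flist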
@@ -0,0 +1,187 @@
#!/bin/bash
set -e

# Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This is modified from the script in the standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12

if [ $# -ne 1 ]; then
  printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
  echo "The argument should be the top-level WSJ corpus directory."
  echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
  echo "within the top-level corpus directory."
  exit 1;
fi
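Assuming this is the WSJ data-preparation script (the path
local/cstr_wsj_data_prep.sh is a guess based on the companion
cstr_ndx2flist.pl), an invocation would look like:

  # the argument is a placeholder for the top-level WSJ directory
  # containing wsj0/ and wsj1/
  local/cstr_wsj_data_prep.sh /path/to/WSJ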

CORPUS=$1

dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
  exit 1;
fi

cd $dir

# This version for SI-84
cat $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \
  | $local/cstr_ndx2flist.pl $CORPUS | sort \
  | grep -v wsj0/si_tr_s/401 > train_si84.flist

# This version for SI-284
cat $CORPUS/wsj1/doc/indices/si_tr_s.ndx \
    $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \
  | $local/cstr_ndx2flist.pl $CORPUS | sort \
  | grep -v wsj0/si_tr_s/401 > train_si284.flist

# Now for the test sets.
# $CORPUS/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc.  We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.

# Nov'92 (333 utts)
# These index files have a slightly different format;
# have to add .wv1, which is done in cstr_ndx2flist.pl
cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92.flist

# Nov'92 (330 utts, 5k vocab)
cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92_5k.flist

# Nov'93: (213 utts)
# Have to replace a wrong disk-id.
cat $CORPUS/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93.flist

# Nov'93: (215 utts, 5k)
cat $CORPUS/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93_5k.flist

# Dev-set for Nov'93 (503 utts)
cat $CORPUS/wsj1/doc/indices/h1_p0.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93.flist

# Dev-set for Nov'93 (513 utts, 5k vocab)
cat $CORPUS/wsj1/doc/indices/h2_p0.ndx | \
  $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93_5k.flist


# Dev-set Hub 1,2 (503, 913 utterances)

# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find $CORPUS/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
find $CORPUS/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist


# Finding the transcript files:
find -L $CORPUS -iname '*.dot' > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
  $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
  cat ${x}_sph.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > $x.trans1
done

# Do some basic normalization steps.  At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
    | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
  awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \
    > ${x}_wav.scp
done

# Make the utt2spk and spk2utt files.
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
  cat ${x}_sph.scp | awk '{print $1}' \
    | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
  cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
done

# In case we want to limit the LMs to the most frequent words, copy the
# LM training word-frequency list.
cp $CORPUS/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.

# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations.  This is the most common test setup, I understand.

cp $CORPUS/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg.arpa.gz

# trigram would be:
cat $CORPUS/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
  perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' \
  | gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;

prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
gzip -f $lmdir/lm_tgpr.arpa || exit 1;

# repeat for 5k language models
cp $CORPUS/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_bg_5k.arpa.gz

# trigram would be: !only closed vocabulary here!
cp $CORPUS/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
chmod u+w $lmdir/lm_tg_5k.arpa.gz
gunzip $lmdir/lm_tg_5k.arpa.gz
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
rm $lmdir/lm_tg_5k.arpa

prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;


if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
  rm -f wsj0-train-spkrinfo.txt
  wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
    || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
         wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi

if [ ! -f wsj0-train-spkrinfo.txt ]; then
  echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
  echo "This is possibly omitted from the training disks; couldn't find it."
  echo "Everything else may have worked; we just may be missing gender info"
  echo "which is only needed for VTLN-related diagnostics anyway."
  exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web.  Perhaps it was accidentally omitted from the
# disks.

cat $CORPUS/wsj0/doc/spkrinfo.txt \
    $CORPUS/wsj1/doc/evl_spok/spkrinfo.txt \
    $CORPUS/wsj1/doc/dev_spok/spkrinfo.txt \
    $CORPUS/wsj1/doc/train/spkrinfo.txt \
    ./wsj0-train-spkrinfo.txt | \
  perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
  awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender


echo "Data preparation succeeded"
@@ -0,0 +1,172 @@
#!/bin/bash

# This script builds a larger word-list and dictionary
# than used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to generate pronunciations for words
# that appear in the LM training data but not in CMUdict.  One is
# to generate special prons for possible acronyms, that
# just consist of the constituent letters.  The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/

if [ $# -ne 1 ]; then
  echo "Usage: local/cstr_wsj_train_lms.sh WSJ1_doc_dir"
  exit 1
fi

export PATH=$PATH:`pwd`/local/dict/
srcdir=$1

if [ ! -d $srcdir/lng_modl ]; then
  echo "Expecting 'lng_modl' under WSJ doc directory '$srcdir'"
  exit 1
fi

mkdir -p data/local/dict_larger
dir=data/local/dict_larger
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
  # are there; we just want to copy them as the phoneset is the same.
rm data/local/dict_larger/lexicon.txt # we don't want this.
mincount=2 # Minimum count of an OOV we will try to generate a pron for.

[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;

# Remove comments from cmudict; print first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
  perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu

cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu

echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"

# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n".  Note: we found this by looking at
# oov.counts below (before adding this rule).

touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
  echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
  gunzip -c $srcdir/lng_modl/lm_train/np_data/{87,88,89}/*.z \
    | awk '/^</{next}{print toupper($0)}' | perl -e '
    open(F, "<$ARGV[0]")||die;
    while(<F>){ chop; $isword{$_} = 1; }
    while(<STDIN>) {
      @A = split(" ", $_);
      for ($n = 0; $n < @A; $n++) {
        $a = $A[$n];
        if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
          # and have no other "." in them: treat as period.
          print "$a";
          if ($n+1 < @A) { print "\n"; }
        } else { print "$a "; }
      }
      print "\n";
    }
  ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
fi

# get unigram counts
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
  awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams

cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
  'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
  > $dir/oov.counts
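A tiny, self-contained illustration of the unigram-counting idiom used above
(the sample words are invented):

  printf 'THE CAT SAT\nTHE CAT\n' | tr -s ' ' '\n' | \
    awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr
  # -> 2 CAT
  #    2 THE
  #    1 SAT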

echo "Most frequent unseen unigrams are: "
head $dir/oov.counts

# Prune away singleton counts, and remove things with numbers in
# (which should have been normalized) and with no letters at all.

cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
  | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist

# Automatic rule-finding...

# First make some prons for possible acronyms.
# Note: we don't do this for things like U.K or U.N,
# or A.B. (which doesn't exist anyway),
# as we consider these normalization/spelling errors.

cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms

mkdir $dir/f $dir/b # forward, backward directions of rules...
  # forward is normal suffix
  # rules, backward is reversed (prefix rules).  These
  # dirs contain stuff we create while making the rule-based
  # extensions to the dictionary.

# Remove ; and , from words, if they are present; these
# might crash our scripts, as they are used as separators there.
filter_dict.pl $dir/dict.cmu > $dir/f/dict
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
reverse_dict.pl $dir/f/dict > $dir/b/dict
reverse_dict.pl $dir/f/oovs > $dir/b/oovs

# The next stage takes a few minutes.
# Note: the forward stage takes longer, as English is
# mostly a suffix-based language, and there are more rules
# that it finds.
for d in $dir/f $dir/b; do
  (
    cd $d
    cat dict | get_rules.pl 2>get_rules.log >rules
    get_rule_hierarchy.pl rules >hierarchy
    awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
      limit_candidate_prons.pl hierarchy | \
      score_prons.pl dict | \
      count_rules.pl >rule.counts
    # the sort command below is just for convenience of reading.
    score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
    get_candidate_prons.pl rules.with_scores dict oovs | \
      limit_candidate_prons.pl hierarchy > oovs.candidates
  ) &
done
wait

# Merge the candidates.
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
  > $dir/dict.oovs

cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged

awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled


# add_counts.pl attaches the original counts to the lists of handled/not-handled OOVs
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts

echo "**Top OOVs we handled are:**";
head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
head $dir/oovlist.not_handled.counts


echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
  `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.

cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF

echo "Created $dir/lexicon.txt"
@@ -0,0 +1,31 @@
#!/usr/bin/perl


# Add counts to an oovlist.
# Reads in counts as output by uniq -c, and
# an oovlist, and prints out the counts of the oovlist.

(@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n";

$counts = shift @ARGV;

open(C, "<$counts") || die "Opening counts file $counts";

while(<C>) {
  @A = split(" ", $_);
  @A == 2 || die "Bad line in counts file: $_";
  ($count, $word) = @A;
  $count =~ m:^\d+$: || die "Bad count $A[0]\n";
  $counts{$word} = $count;
}

while(<>) {
  chop;
  $w = $_;
  $w =~ m:\S+: || die "Bad word $w";
  defined $counts{$w} || die "Word $w not present in counts file";
  print "\t$counts{$w}\t$w\n";
}

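As used in the dictionary-extension script earlier in this commit:

  add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts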
@@ -0,0 +1,44 @@
#!/usr/bin/perl

# This program takes the output of score_prons.pl and collates
# it for each (rule, destress) pair so that we get the
# counts of right/partial/wrong for each pair.

# The input is a 7-tuple on each line, like:
# word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong
#
# The output format is a 5-tuple like:
#
# rule;destress;right-count;partial-count;wrong-count
#

if (@ARGV != 0 && @ARGV != 1) {
  die "Usage: count_rules.pl < scored_candidate_prons > rule_counts";
}


while(<>) {
  chop;
  $line = $_;
  my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line);

  my $key = $rulename . ";" . $destress;

  if (!defined $counts{$key}) {
    $counts{$key} = [ 0, 0, 0 ]; # new anonymous array.
  }
  $ref = $counts{$key};
  if ($score eq "right") {
    $$ref[0]++;
  } elsif ($score eq "partial") {
    $$ref[1]++;
  } elsif ($score eq "wrong") {
    $$ref[2]++;
  } else {
    die "Bad score $score\n";
  }
}

while ( my ($key, $value) = each(%counts)) {
  print $key . ";" . join(";", @$value) . "\n";
}
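A hypothetical smoke test (the word, pronunciation and rule name below are
invented; the path assumes the script lives in local/dict/ as the extension
script's PATH setup suggests):

  echo 'REPORTS;R IH0 P AO1 R T S;REPORT;R IH0 P AO1 R T;S_SUFFIX;yes;right' \
    | local/dict/count_rules.pl
  # expected output:  S_SUFFIX;yes;1;0;0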
@@ -0,0 +1,19 @@
#!/usr/bin/perl


# This program reads and writes either a dictionary or just a list
# of words, and it removes any words containing ";" or ",", as these
# characters are used as separators in these programs.  It will warn about these.
# It will die if the pronunciations have these symbols in them.
while(<>) {
  chop;
  @A = split(" ", $_);
  $word = shift @A;

  if ($word =~ m:[;,]:) {
    print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n" ;
  } else {
    $_ =~ m:[;,]: && die "Phones cannot have ; or , in them.";
    print $_ . "\n";
  }
}
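As used in the dictionary-extension script earlier in this commit:

  filter_dict.pl $dir/dict.cmu > $dir/f/dict
  cat $dir/oovlist | filter_dict.pl > $dir/f/oovs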
@@ -0,0 +1,95 @@
#!/usr/bin/perl

# Reads a dictionary, and prints out a list of words that seem to be pronounced
# as acronyms (not including plurals of acronyms, just acronyms).  Uses
# the prons of the individual letters (A., B. and so on) to judge this.
# Note: this is somewhat dependent on the convention used in CMUdict, that
# the individual letters are spelled this way (e.g. "A.").

$max_length = 6; # Max length of words that might be
  # acronyms.

while(<>) { # Read the dict.
  chop;
  @A = split(" ", $_);
  $word = shift @A;
  $pron = join(" ", @A);
  if ($word =~ m/^([A-Z])\.$/ ) {
    chop $word; # Remove trailing "." to get just the letter
    $letter = $1;
    if (!defined $letter_prons{$letter} ) {
      $letter_prons{$letter} = [ ]; # new anonymous array
    }
    $arrayref = $letter_prons{$letter};
    push @$arrayref, $pron;
  } elsif( length($word) <= $max_length ) {
    $pronof{$word . "," . $pron} = 1;
    $isword{$word} = 1;
    #if (!defined $prons{$word} ) {
    #  $prons{$word} = [ ];
    #}
    # push @{$prons{$word}}, $pron;
  }
}

sub get_letter_prons;

foreach $word (keys %isword) {
  my @letter_prons = get_letter_prons($word);
  foreach $pron (@letter_prons) {
    if (defined $pronof{$word.",".$pron}) {
      print "$word $pron\n";
    }
  }
}


sub get_letter_prons {
  @acronym = split("", shift); # The letters in the word.
  my @prons = ( "" );

  while (@acronym > 0) {
    $l = shift @acronym;
    $n = 1; # num-repeats of letter $l.
    while (@acronym > 0 && $acronym[0] eq $l) {
      $n++;
      shift @acronym;
    }
    my $arrayref = $letter_prons{$l};
    my @prons_of_block = ();
    if ($n == 1) { # Just one repeat.
      foreach $lpron ( @$arrayref ) {
        push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
      }
    } elsif ($n == 2) { # Two repeats.  Can be "double a" or "a a"
      foreach $lpron ( @$arrayref ) {
        push @prons_of_block, "D AH1 B AH0 L " . $lpron;
        push @prons_of_block, $lpron . " " . $lpron;
      }
    } elsif ($n == 3) { # can be "triple a" or "a a a"
      foreach $lpron ( @$arrayref ) {
        push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
        push @prons_of_block, $lpron . " " . $lpron . " " . $lpron;
      }
    } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
      # not sure really.
      foreach $lpron ( @$arrayref ) {
        $nlpron = $lpron;
        for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; }
        push @prons_of_block, $nlpron;
      }
    }
    my @new_prons = ();
    foreach $pron (@prons) {
      foreach $pron_of_block(@prons_of_block) {
        if ($pron eq "") {
          push @new_prons, $pron_of_block;
        } else {
          push @new_prons, $pron . " " . $pron_of_block;
        }
      }
    }
    @prons = @new_prons;
  }
  return @prons;
}
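The acronym-pron generator is driven from the dictionary-extension script
earlier in this commit as:

  cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms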
@ -0,0 +1,123 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# Reads a dictionary (for prons of letters), and an OOV list,
|
||||||
|
# and puts out candidate pronunciations of words in that list
|
||||||
|
# that could plausibly be acronyms.
|
||||||
|
# We judge that a word can plausibly be an acronym if it is
|
||||||
|
# a sequence of just letters (no non-letter characters such
|
||||||
|
# as "'"), or something like U.K.,
|
||||||
|
# and the number of letters is four or less.
|
||||||
|
#
|
||||||
|
# If the text were not already pre-normalized, there would
|
||||||
|
# be other hints such as capitalization.
|
||||||
|
|
||||||
|
# This program appends
|
||||||
|
# the prons of the individual letters (A., B. and so on) to work out
|
||||||
|
# the pron of the acronym.
|
||||||
|
# Note: this is somewhat dependent on the convention used in CMUduct, that
|
||||||
|
# the individual letters are spelled this way (e.g. "A."). [it seems
|
||||||
|
# to also have the separated versions.]
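# As an illustrative sketch (the letter prons here are just typical values;
# the actual ones come from whatever dict is supplied): if the dict contains
# "U.  Y UW1" and "K.  K EY1", then for the OOV word "UK" this script would
# print the candidate line:
#   UK Y UW1 K EY1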
|
||||||
|
|
||||||
|
if (!(@ARGV == 1 || @ARGV == 2)) {
|
||||||
|
print "Usage: get_acronym_prons.pl dict [oovlist]";
|
||||||
|
}
|
||||||
|
|
||||||
|
$max_length = 4; # Max #letters in an acronym. (Longer
|
||||||
|
# acronyms tend to have "pseudo-pronunciations", e.g. think about UNICEF.)
|
||||||
|
|
||||||
|
$dict = shift @ARGV;
|
||||||
|
open(D, "<$dict") || die "Opening dictionary $dict";
|
||||||
|
|
||||||
|
while(<D>) { # Read the dict, to get the prons of the letters.
|
||||||
|
chop;
|
||||||
|
@A = split(" ", $_);
|
||||||
|
$word = shift @A;
|
||||||
|
$pron = join(" ", @A);
|
||||||
|
if ($word =~ m/^([A-Z])\.$/ ) {
|
||||||
|
chop $word; # Remove trailing "." to get just the letter
|
||||||
|
$letter = $1;
|
||||||
|
if (!defined $letter_prons{$letter} ) {
|
||||||
|
$letter_prons{$letter} = [ ]; # new anonymous array
|
||||||
|
}
|
||||||
|
$arrayref = $letter_prons{$letter};
|
||||||
|
push @$arrayref, $pron;
|
||||||
|
} elsif( length($word) <= $max_length ) {
|
||||||
|
$pronof{$word . "," . $pron} = 1;
|
||||||
|
$isword{$word} = 1;
|
||||||
|
#if (!defined $prons{$word} ) {
|
||||||
|
# $prons{$word} = [ ];
|
||||||
|
#}
|
||||||
|
# push @{$prons{$word}}, $pron;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sub get_letter_prons;
|
||||||
|
|
||||||
|
while(<>) { # Read OOVs.
|
||||||
|
# For now, just do the simple cases without "." in
|
||||||
|
# between... things with "." in the OOV list seem to
|
||||||
|
# be mostly errors.
|
||||||
|
chop;
|
||||||
|
$word = $_;
|
||||||
|
if ($word =~ m/^[A-Z]{1,5}$/) {
|
||||||
|
foreach $pron ( get_letter_prons($word) ) { # E.g. UNPO
|
||||||
|
print "$word $pron\n";
|
||||||
|
}
|
||||||
|
} elsif ($word =~ m:^(\w\.){1,4}\w\.?$:) { # E.g. U.K. Make the final "." optional.
|
||||||
|
$letters = $word;
|
||||||
|
$letters =~ s:\.::g;
|
||||||
|
foreach $pron ( get_letter_prons($letters) ) {
|
||||||
|
print "$word $pron\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sub get_letter_prons {
|
||||||
|
@acronym = split("", shift); # The letters in the word.
|
||||||
|
my @prons = ( "" );
|
||||||
|
|
||||||
|
while (@acronym > 0) {
|
||||||
|
$l = shift @acronym;
|
||||||
|
$n = 1; # num-repeats of letter $l.
|
||||||
|
while (@acronym > 0 && $acronym[0] eq $l) {
|
||||||
|
$n++;
|
||||||
|
shift @acronym;
|
||||||
|
}
|
||||||
|
my $arrayref = $letter_prons{$l};
|
||||||
|
my @prons_of_block = ();
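# Illustrative example (assuming the dict gives the letter A the single pron
# "EY1"): a block of two A's would yield both "D AH1 B AH0 L EY1" ("double a")
# and "EY1 EY1" ("a a").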
|
||||||
|
if ($n == 1) { # Just one repeat.
|
||||||
|
foreach $lpron ( @$arrayref ) {
|
||||||
|
push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
|
||||||
|
}
|
||||||
|
} elsif ($n == 2) { # Two repeats. Can be "double a" or "a a"
|
||||||
|
foreach $lpron ( @$arrayref ) {
|
||||||
|
push @prons_of_block, "D AH1 B AH0 L " . $lpron;
|
||||||
|
push @prons_of_block, $lpron . " " . $lpron;
|
||||||
|
}
|
||||||
|
} elsif ($n == 3) { # can be "triple a" or "a a a"
|
||||||
|
foreach $lpron ( @$arrayref ) {
|
||||||
|
push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
|
||||||
|
push @prons_of_block, "$lpron $lpron $lpron";
|
||||||
|
}
|
||||||
|
} elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
|
||||||
|
# not sure really.
|
||||||
|
foreach $lpron ( @$arrayref ) {
|
||||||
|
$nlpron = $lpron;
|
||||||
|
for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; }
|
||||||
|
push @prons_of_block, $nlpron;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
my @new_prons = ();
|
||||||
|
foreach $pron (@prons) {
|
||||||
|
foreach $pron_of_block(@prons_of_block) {
|
||||||
|
if ($pron eq "") {
|
||||||
|
push @new_prons, $pron_of_block;
|
||||||
|
} else {
|
||||||
|
push @new_prons, $pron . " " . $pron_of_block;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@prons = @new_prons;
|
||||||
|
}
|
||||||
|
return @prons;
|
||||||
|
}
|
|
@ -0,0 +1,187 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# This script takes three command-line arguments (typically files, or "-"):
|
||||||
|
# the suffix rules (as output by get_rules.pl), the rule-hierarchy
|
||||||
|
# (from get_rule_hierarchy.pl), and the words that we want prons to be
|
||||||
|
# generated for (one per line).
|
||||||
|
|
||||||
|
# The output consists of candidate generated pronunciations for those words,
|
||||||
|
# together with information about how we generated those pronunciations.
|
||||||
|
# This does not do pruning of the candidates using the restriction
|
||||||
|
# "you can't use a more general rule when a more specific one is applicable".
|
||||||
|
# That is done by limit_candidate_prons.pl.
|
||||||
|
|
||||||
|
# Each line of the output consists of a 4-tuple, separated by ";", of the
|
||||||
|
# form:
|
||||||
|
# word;pron;base-word;base-pron;rule-name;destress[;rule-score]
|
||||||
|
# [the last field is only present if you supplied rules with score information].
|
||||||
|
# where:
|
||||||
|
# - "word" is the input word that we queried for, e.g. WASTED
|
||||||
|
# - "pron" is the generated pronunciation, e.g. "W EY1 S T AH0 D"
|
||||||
|
# - rule-name is a 4-tuple separated by commas that describes the rule, e.g.
|
||||||
|
# "STED,STING,D,NG",
|
||||||
|
# - "base-word" is the base-word we're getting the pron from,
|
||||||
|
# e.g. WASTING
|
||||||
|
# - "base-pron" is the pron of the base-word, e.g. "W EY1 S T IH0 NG"
|
||||||
|
# - "destress" is either "yes" or "no" and corresponds to whether we destressed the
|
||||||
|
# base-word or not [de-stressing just corresponds to taking any 2's down to 1's,
|
||||||
|
# although we may extend this in future]...
|
||||||
|
# - "rule-score" is a numeric score of the rule (this field is only present
|
||||||
|
# if there was score information in your rules).
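# For example, using the WASTED/WASTING values above, an output line might look
# like this (illustrative; no rule score was supplied, so only six fields):
# WASTED;W EY1 S T AH0 D;WASTING;W EY1 S T IH0 NG;STED,STING,D,NG;no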
|
||||||
|
|
||||||
|
|
||||||
|
(@ARGV == 2 || @ARGV == 3) || die "Usage: get_candidate_prons.pl rules base-dict [ words ]";
|
||||||
|
|
||||||
|
$min_prefix_len = 3; # this should probably match with get_rules.pl
|
||||||
|
|
||||||
|
$rules = shift @ARGV; # Note: rules may be with destress "yes/no" indicators or without...
|
||||||
|
# if without, it's treated as if both "yes" and "no" are present.
|
||||||
|
$dict = shift @ARGV;
|
||||||
|
|
||||||
|
open(R, "<$rules") || die "Opening rules file: $rules";
|
||||||
|
|
||||||
|
sub process_word;
|
||||||
|
|
||||||
|
while(<R>) {
|
||||||
|
chop $_;
|
||||||
|
my ($rule, $destress, $rule_score) = split(";", $_); # We may have "destress" markings (yes|no),
|
||||||
|
# and scores, or we may have just rule, in which case
|
||||||
|
# $destress and $rule_score will be undefined.
|
||||||
|
|
||||||
|
my @R = split(",", $rule, 4); # "my" means new instance of @R each
|
||||||
|
# time we do this loop -> important because we'll be creating
|
||||||
|
# a reference to @R below.
|
||||||
|
# Note: the last arg to SPLIT tells it how many fields max to get.
|
||||||
|
# This stops it from omitting empty trailing fields.
|
||||||
|
@R == 4 || die "Bad rule $_";
|
||||||
|
$suffix = $R[0]; # Suffix of word we want pron for.
|
||||||
|
if (!defined $isrule{$rule}) {
|
||||||
|
$isrule{$rule} = 1; # make sure we do this only once for each rule
|
||||||
|
# (don't repeat for different stresses).
|
||||||
|
if (!defined $suffix2rule{$suffix}) {
|
||||||
|
# The syntax [ $x, $y, ... ] means a reference to a newly created array
|
||||||
|
# containing $x, $y, etc. \@R creates an array reference to R.
|
||||||
|
# so suffix2rule is a hash from suffix to ref to array of refs to
|
||||||
|
# 4-element arrays.
|
||||||
|
$suffix2rule{$suffix} = [ \@R ];
|
||||||
|
} else {
|
||||||
|
# Below, the syntax @{$suffix2rule{$suffix}} dereferences the array
|
||||||
|
# reference inside the hash; \@R pushes onto that array a new array
|
||||||
|
# reference pointing to @R.
|
||||||
|
push @{$suffix2rule{$suffix}}, \@R;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!defined $rule_score) { $rule_score = -1; } # -1 means we don't have the score info.
|
||||||
|
|
||||||
|
# Now store information on which destress markings (yes|no) this rule
|
||||||
|
# is valid for, and the associated scores (if supplied)
|
||||||
|
# If just the rule is given (i.e. no destress marking specified),
|
||||||
|
# assume valid for both.
|
||||||
|
if (!defined $destress) { # treat as if both "yes" and "no" are valid.
|
||||||
|
$rule_and_destress_to_rule_score{$rule.";yes"} = $rule_score;
|
||||||
|
$rule_and_destress_to_rule_score{$rule.";no"} = $rule_score;
|
||||||
|
} else {
|
||||||
|
$rule_and_destress_to_rule_score{$rule.";".$destress} = $rule_score;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
open(D, "<$dict") || die "Opening base dictionary: $dict";
|
||||||
|
while(<D>) {
|
||||||
|
@A = split(" ", $_);
|
||||||
|
$word = shift @A;
|
||||||
|
$pron = join(" ", @A);
|
||||||
|
if (!defined $word2prons{$word}) {
|
||||||
|
$word2prons{$word} = [ $pron ]; # Ref to new anonymous array containing just "pron".
|
||||||
|
} else {
|
||||||
|
push @{$word2prons{$word}}, $pron; # Push $pron onto array referred to (@$ref derefs array).
|
||||||
|
}
|
||||||
|
}
|
||||||
|
foreach $word (keys %word2prons) {
|
||||||
|
# Set up the hash "prefixcount", which says how many times a char-sequence
|
||||||
|
# is a prefix (not necessarily a strict prefix) of a word in the dict.
|
||||||
|
$len = length($word);
|
||||||
|
for ($l = 0; $l <= $len; $l++) {
|
||||||
|
$prefixcount{substr($word, 0, $l)}++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
open(R, "<$rules") || die "Opening rules file: $rules";
|
||||||
|
|
||||||
|
|
||||||
|
while(<>) {
|
||||||
|
chop;
|
||||||
|
m/^\S+$/ || die;
|
||||||
|
process_word($_);
|
||||||
|
}
|
||||||
|
|
||||||
|
sub process_word {
|
||||||
|
my $word = shift @_;
|
||||||
|
$len = length($word);
|
||||||
|
# $owncount is used in evaluating whether a particular prefix is a prefix
|
||||||
|
# of some other word in the dict... if a word itself may be in the dict
|
||||||
|
# (usually because we're running this on the dict itself), we need to
|
||||||
|
# correct for this.
|
||||||
|
if (defined $word2prons{$word}) { $owncount = 1; } else { $owncount = 0; }
|
||||||
|
|
||||||
|
for ($prefix_len = $min_prefix_len; $prefix_len <= $len; $prefix_len++) {
|
||||||
|
my $prefix = substr($word, 0, $prefix_len);
|
||||||
|
my $suffix = substr($word, $prefix_len);
|
||||||
|
if ($prefixcount{$prefix} - $owncount == 0) {
|
||||||
|
# This prefix is not a prefix of any word in the dict, so no point
|
||||||
|
# checking the rules below-- none of them can match.
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
$rules_array_ref = $suffix2rule{$suffix};
|
||||||
|
if (defined $rules_array_ref) {
|
||||||
|
foreach $R (@$rules_array_ref) { # @$rules_array_ref dereferences the array.
|
||||||
|
# $R is a reference to a 4-element array, whose elements we access with
|
||||||
|
# $$R[0], etc.
|
||||||
|
my $base_suffix = $$R[1];
|
||||||
|
my $base_word = $prefix . $base_suffix;
|
||||||
|
my $base_prons_ref = $word2prons{$base_word};
|
||||||
|
if (defined $base_prons_ref) {
|
||||||
|
my $psuffix = $$R[2];
|
||||||
|
my $base_psuffix = $$R[3];
|
||||||
|
if ($base_psuffix ne "") {
|
||||||
|
$base_psuffix = " " . $base_psuffix;
|
||||||
|
# Include " ", the space between phones, to prevent
|
||||||
|
# matching partial phones below.
|
||||||
|
}
|
||||||
|
my $base_psuffix_len = length($base_psuffix);
|
||||||
|
foreach $base_pron (@$base_prons_ref) { # @$base_prons_ref derefs
|
||||||
|
# that reference to an array.
|
||||||
|
my $base_pron_prefix_len = length($base_pron) - $base_psuffix_len;
|
||||||
|
# Note: these lengths are in characters, not phones.
|
||||||
|
if ($base_pron_prefix_len >= 0 &&
|
||||||
|
substr($base_pron, $base_pron_prefix_len) eq $base_psuffix) {
|
||||||
|
# The suffix of the base_pron is what it should be.
|
||||||
|
my $pron_prefix = substr($base_pron, 0, $base_pron_prefix_len);
|
||||||
|
my $rule = join(",", @$R); # we'll output this..
|
||||||
|
my $len = @R;
|
||||||
|
for ($destress = 0; $destress <= 1; $destress++) { # Two versions
|
||||||
|
# of each rule: with destressing and without.
|
||||||
|
# pron is the generated pron.
|
||||||
|
if ($destress) { $pron_prefix =~ s/2/1/g; }
|
||||||
|
my $pron;
|
||||||
|
if ($psuffix ne "") { $pron = $pron_prefix . " " . $psuffix; }
|
||||||
|
else { $pron = $pron_prefix; }
|
||||||
|
# Now print out the info about the generated pron.
|
||||||
|
my $destress_mark = ($destress ? "yes" : "no");
|
||||||
|
my $rule_score = $rule_and_destress_to_rule_score{$rule.";".$destress_mark};
|
||||||
|
if (defined $rule_score) { # Means that the (rule,destress) combination was
|
||||||
|
# seen [note: this if-statement may be pointless, as currently we don't
|
||||||
|
# do any pruning of rules].
|
||||||
|
my @output = ($word, $pron, $base_word, $base_pron, $rule, $destress_mark);
|
||||||
|
if ($rule_score != -1) { push @output, $rule_score; } # If scores were supplied,
|
||||||
|
# we also output the score info.
|
||||||
|
print join(";", @output) . "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,73 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
#This reads in rules, of the form put out by get_rules.pl, e.g.:
|
||||||
|
# ERT,,ER0 T,
|
||||||
|
# MENT,ING,M AH0 N T,IH0 NG
|
||||||
|
# S,TON,Z,T AH0 N
|
||||||
|
# ,ER,IH0 NG,IH0 NG ER0
|
||||||
|
# ,'S,M AH0 N,M AH0 N Z
|
||||||
|
#TIONS,TIVE,SH AH0 N Z,T IH0 V
|
||||||
|
|
||||||
|
# and it works out a hierarchy that says which rules are sub-cases
|
||||||
|
# of which rules: it outputs on each line a pair separated by ";", where
|
||||||
|
# each member of the pair is a rule, first one is the specialization, the
|
||||||
|
# second one being more general.
|
||||||
|
# E.g.:
|
||||||
|
# RED,RE,D,/ED,E,D,
|
||||||
|
# RED,RE,D,/D,,D,
|
||||||
|
# GING,GE,IH0 NG,/ING,I,IH0 NG,
|
||||||
|
# TOR,TING,T ER0,T IH0 NG/OR,OR,T ER0,T ER0
|
||||||
|
# ERED,ER,D,/RED,R,D,
|
||||||
|
# ERED,ER,D,/ED,,D,
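# For instance, in the first example pair above, stripping the shared leading
# "R" from the letter-suffixes of "RED,RE,D," gives the more general rule
# "ED,E,D,"; since that rule also appears in the input, the pair is printed.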
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
while(<>) {
|
||||||
|
chop;
|
||||||
|
$rule = $_;
|
||||||
|
$isrule{$rule} = 1;
|
||||||
|
push @rules, $rule;
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach my $rule (@rules) {
|
||||||
|
# Truncate the letters and phones in the rule, while we
|
||||||
|
# can, to get more general rules; if the more general rule
|
||||||
|
# exists, put out the pair.
|
||||||
|
@A = split(",", $rule);
|
||||||
|
@suffixa = split("", $A[0]);
|
||||||
|
@suffixb = split("", $A[1]);
|
||||||
|
@psuffixa = split(" ", $A[2]);
|
||||||
|
@psuffixb = split(" ", $A[3]);
|
||||||
|
for ($common_suffix_len = 0; $common_suffix_len < @suffixa && $common_suffix_len < @suffixb;) {
|
||||||
|
if ($suffixa[$common_suffix_len] eq $suffixb[$common_suffix_len]) {
|
||||||
|
$common_suffix_len++;
|
||||||
|
} else {
|
||||||
|
last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for ($common_psuffix_len = 0; $common_psuffix_len < @psuffixa && $common_psuffix_len < @psuffixb;) {
|
||||||
|
if ($psuffixa[$common_psuffix_len] eq $psuffixb[$common_psuffix_len]) {
|
||||||
|
$common_psuffix_len++;
|
||||||
|
} else {
|
||||||
|
last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
# Get all combinations of pairs of integers <= (common_suffix_len, common_psuffix_len),
|
||||||
|
# except (0,0), and print out this rule together with the corresponding rule (if it exists).
|
||||||
|
for ($m = 0; $m <= $common_suffix_len; $m++) {
|
||||||
|
$sa = join("", @suffixa[$m...$#suffixa]); # @x[a..b] is array slice notation.
|
||||||
|
$sb = join("", @suffixb[$m...$#suffixb]);
|
||||||
|
for ($n = 0; $n <= $common_psuffix_len; $n++) {
|
||||||
|
if (!($m == 0 && $n == 0)) {
|
||||||
|
$psa = join(" ", @psuffixa[$n...$#psuffixa]);
|
||||||
|
$psb = join(" ", @psuffixb[$n...$#psuffixb]);
|
||||||
|
$more_general_rule = join(",", ($sa, $sb, $psa, $psb));
|
||||||
|
if (defined $isrule{$more_general_rule}) {
|
||||||
|
print $rule . ";" . $more_general_rule . "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,204 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# This program creates suggested suffix rules from a dictionary.
|
||||||
|
# It outputs quadruples of the form:
|
||||||
|
# suffix,base-suffix,psuffix,base-psuffix
|
||||||
|
# where "suffix" is the suffix of the letters of a word, "base-suffix" is
|
||||||
|
# the suffix of the letters of the base-word, "psuffix" is the suffix of the
|
||||||
|
# pronunciation of the word (a space-separated list of phonemes), and
|
||||||
|
# "base-psuffix" is the suffix of the pronunciation of the baseword.
|
||||||
|
# As far as this program is concerned, there is no distinction between
|
||||||
|
# "word" and "base-word". To simplify things slightly, what it does
|
||||||
|
# is return all tuples (a,b,c,d) [with a != b] such that there are
|
||||||
|
# at least $min_suffix_count instances in the dictionary of
|
||||||
|
# a (word-prefix, pron-prefix) pair where there exists (word,pron)
|
||||||
|
# pairs of the form
|
||||||
|
# ( word-prefix . a, pron-prefix . c)
|
||||||
|
# and
|
||||||
|
# ( word-prefix . b, pron-prefix . d)
|
||||||
|
# For example if (a,b,c,d) equals (USLY,US,S L IY0,S)
|
||||||
|
# then this quadruple will be output as long as there are at least
|
||||||
|
# e.g. 30 instances of prefixes like (FAM, F EY1 M AH0)
|
||||||
|
# where there exist (word, pron) pairs like:
|
||||||
|
# FAMOUS, F EY1 M AH0 S
|
||||||
|
# FAMOUSLY F EY1 M AH0 S L IY0
|
||||||
|
#
|
||||||
|
# There are some modifications to the picture above, for efficiency.
|
||||||
|
# If $disallow_empty_suffix != 0, this program will not output 4-tuples where
|
||||||
|
# the first element (the own-word suffix) is empty, as this would cause
|
||||||
|
# efficiency problems in get_candidate_prons.pl. If
|
||||||
|
# $ignore_prefix_stress != 0, this program will ignore stress markings
|
||||||
|
# while evaluating whether prefixes are the same.
|
||||||
|
# The minimum count for a quadruple to be output is $min_suffix_count
|
||||||
|
# (e.g. 30).
|
||||||
|
#
|
||||||
|
# The function of this program is not to evaluate the accuracy of these rules;
|
||||||
|
# it is mostly a pruning step, where we suggest rules that have large enough
|
||||||
|
# counts to be suitable for our later procedure where we evaluate their
|
||||||
|
# accuracy in predicting prons.
|
||||||
|
|
||||||
|
$disallow_empty_suffix = 1; # Disallow rules where the suffix of the "own-word" is
|
||||||
|
# empty. This is for efficiency in later stages (e.g. get_candidate_prons.pl).
|
||||||
|
$min_prefix_len = 3; # this must match with get_candidate_prons.pl
|
||||||
|
$ignore_prefix_stress = 1; # or 0 to take account of stress in prefix.
|
||||||
|
$min_suffix_count = 20;
|
||||||
|
|
||||||
|
# Takes in dictionary.
|
||||||
|
|
||||||
|
print STDERR "Reading dict\n";
|
||||||
|
while(<>) {
|
||||||
|
@A = split(" ", $_);
|
||||||
|
my $word = shift @A;
|
||||||
|
my $pron = join(" ", @A);
|
||||||
|
if (!defined $prons{$word}) {
|
||||||
|
$prons{$word} = $pron;
|
||||||
|
push @words, $word;
|
||||||
|
} else {
|
||||||
|
$prons{$word} = $prons{$word} . ";" . $pron;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Get common suffixes (e.g., count >100). Include empty suffix.
|
||||||
|
|
||||||
|
print STDERR "Getting common suffix counts.\n";
|
||||||
|
{
|
||||||
|
foreach $word (@words) {
|
||||||
|
$len = length($word);
|
||||||
|
for ($x = $min_prefix_len; $x <= $len; $x++) {
|
||||||
|
$suffix_count{substr($word, $x)}++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach $suffix (keys %suffix_count) {
|
||||||
|
if ($suffix_count{$suffix} >= $min_suffix_count) {
|
||||||
|
$newsuffix_count{$suffix} = $suffix_count{$suffix};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
%suffix_count = %newsuffix_count;
|
||||||
|
undef %newsuffix_count;
|
||||||
|
|
||||||
|
foreach $suffix ( sort { $suffix_count{$b} <=> $suffix_count{$a} } keys %suffix_count ) {
|
||||||
|
print STDERR "$suffix_count{$suffix} $suffix\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
print STDERR "Getting common suffix pairs.\n";
|
||||||
|
|
||||||
|
{
|
||||||
|
print STDERR " Getting map from prefix -> suffix-set.\n";
|
||||||
|
|
||||||
|
# Create map from prefix -> suffix-set.
|
||||||
|
foreach $word (@words) {
|
||||||
|
$len = length($word);
|
||||||
|
for ($x = $min_prefix_len; $x <= $len; $x++) {
|
||||||
|
$prefix = substr($word, 0, $x);
|
||||||
|
$suffix = substr($word, $x);
|
||||||
|
if (defined $suffix_count{$suffix}) { # Suffix is common...
|
||||||
|
if (!defined $suffixes_of{$prefix}) {
|
||||||
|
$suffixes_of{$prefix} = [ $suffix ]; # Create a reference to a new array with
|
||||||
|
# one element.
|
||||||
|
} else {
|
||||||
|
push @{$suffixes_of{$prefix}}, $suffix; # Push $suffix onto array that the
|
||||||
|
# hash member is a reference to.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
my %suffix_set_count;
|
||||||
|
print STDERR " Getting map from suffix-set -> count.\n";
|
||||||
|
while ( my ($key, $value) = each(%suffixes_of) ) {
|
||||||
|
my @suffixes = sort ( @$value );
|
||||||
|
$suffix_set_count{join(";", @suffixes)}++;
|
||||||
|
}
|
||||||
|
print STDERR " Getting counts for suffix pairs.\n";
|
||||||
|
while ( my ($suffix_set, $count) = each (%suffix_set_count) ) {
|
||||||
|
my @suffixes = split(";", $suffix_set);
|
||||||
|
# Consider pairs to be ordered. This is more convenient
|
||||||
|
# later on.
|
||||||
|
foreach $suffix_a (@suffixes) {
|
||||||
|
foreach $suffix_b (@suffixes) {
|
||||||
|
if ($suffix_a ne $suffix_b) {
|
||||||
|
$suffix_pair = $suffix_a . "," . $suffix_b;
|
||||||
|
$suffix_pair_count{$suffix_pair} += $count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# To save memory, only keep pairs above threshold in the hash.
|
||||||
|
while ( my ($suffix_pair, $count) = each (%suffix_pair_count) ) {
|
||||||
|
if ($count >= $min_suffix_count) {
|
||||||
|
$new_hash{$suffix_pair} = $count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
%suffix_pair_count = %new_hash;
|
||||||
|
undef %new_hash;
|
||||||
|
|
||||||
|
# Print out the suffix pairs so the user can see.
|
||||||
|
foreach $suffix_pair (
|
||||||
|
sort { $suffix_pair_count{$b} <=> $suffix_pair_count{$a} } keys %suffix_pair_count ) {
|
||||||
|
print STDERR "$suffix_pair_count{$suffix_pair} $suffix_pair\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
print STDERR "Getting common suffix/suffix/psuffix/psuffix quadruples\n";
|
||||||
|
|
||||||
|
{
|
||||||
|
while ( my ($prefix, $suffixes_ref) = each(%suffixes_of) ) {
|
||||||
|
# Note: suffixes_ref is a reference to an array. We dereference with
|
||||||
|
# @$suffixes_ref.
|
||||||
|
# Consider each pair of suffixes (in each order).
|
||||||
|
foreach my $suffix_a ( @$suffixes_ref ) {
|
||||||
|
foreach my $suffix_b ( @$suffixes_ref ) {
|
||||||
|
# could just use "defined" in the next line, but this is for clarity.
|
||||||
|
$suffix_pair = $suffix_a.",".$suffix_b;
|
||||||
|
if ( $suffix_pair_count{$suffix_pair} >= $min_suffix_count ) {
|
||||||
|
foreach $pron_a_str (split(";", $prons{$prefix.$suffix_a})) {
|
||||||
|
@pron_a = split(" ", $pron_a_str);
|
||||||
|
foreach $pron_b_str (split(";", $prons{$prefix.$suffix_b})) {
|
||||||
|
@pron_b = split(" ", $pron_b_str);
|
||||||
|
$len_a = @pron_a; # evaluating array as scalar automatically gives length.
|
||||||
|
$len_b = @pron_b;
|
||||||
|
for (my $pos = 0; $pos <= $len_a && $pos <= $len_b; $pos++) {
|
||||||
|
# $pos is starting-pos of psuffix-pair.
|
||||||
|
$psuffix_a = join(" ", @pron_a[$pos...$#pron_a]);
|
||||||
|
$psuffix_b = join(" ", @pron_b[$pos...$#pron_b]);
|
||||||
|
$quadruple = $suffix_pair . "," . $psuffix_a . "," . $psuffix_b;
|
||||||
|
$quadruple_count{$quadruple}++;
|
||||||
|
|
||||||
|
my $pron_a_pos = $pron_a[$pos]; my $pron_b_pos = $pron_b[$pos];
|
||||||
|
if ($ignore_prefix_stress) {
|
||||||
|
$pron_a_pos =~ s/\d//; # e.g. convert IH0 to IH. Only affects
|
||||||
|
$pron_b_pos =~ s/\d//; # whether we exit the loop below.
|
||||||
|
}
|
||||||
|
if ($pron_a_pos ne $pron_b_pos) {
|
||||||
|
# This is important: we don't consider a pron suffix-pair to be
|
||||||
|
# valid unless the pron prefix is the same.
|
||||||
|
last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
# To save memory, only keep pairs above threshold in the hash.
|
||||||
|
while ( my ($quadruple, $count) = each (%quadruple_count) ) {
|
||||||
|
if ($count >= $min_suffix_count) {
|
||||||
|
$new_hash{$quadruple} = $count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
%quadruple_count = %new_hash;
|
||||||
|
undef %new_hash;
|
||||||
|
|
||||||
|
# Print out the quadruples for diagnostics.
|
||||||
|
foreach $quadruple (
|
||||||
|
sort { $quadruple_count{$b} <=> $quadruple_count{$a} } keys %quadruple_count ) {
|
||||||
|
print STDERR "$quadruple_count{$quadruple} $quadruple\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
# Now print out the quadruples; these are the output of this program.
|
||||||
|
foreach $quadruple (keys %quadruple_count) {
|
||||||
|
print $quadruple."\n";
|
||||||
|
}
|
|
@ -0,0 +1,103 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# This program enforces the rule that
|
||||||
|
# if a "more specific" rule applies, we cannot use the more general rule.
|
||||||
|
# It takes in tuples generated by get_candidate_prons (one per line, separated
|
||||||
|
# by ";"), of the form:
|
||||||
|
# word;pron;base-word;base-pron;rule-name;de-stress[;rule-score]
|
||||||
|
# [note: we mean that the last element, the numeric score of the rule, is optional]
|
||||||
|
# and it outputs a (generally shorter) list
|
||||||
|
# of the same form.
|
||||||
|
|
||||||
|
|
||||||
|
# For each word:
|
||||||
|
# For each (base-word,base-pron):
|
||||||
|
# Eliminate "more-general" rules as follows:
|
||||||
|
# For each pair of rules applying to this (base-word, base-pron):
|
||||||
|
# If pair is in more-general hash, disallow more general one.
|
||||||
|
# Let the output be: for each (base-word, base-pron, rule):
|
||||||
|
# for (destress-prefix) in [yes, no], do:
|
||||||
|
# print out the word input, the rule-name, [destressed:yes|no], and the new pron.
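# For example (illustrative): if, for the same (base-word, base-pron) and the
# same de-stress value, one candidate was generated by the rule "ERED,ER,D,"
# and another by "ED,,D,", and the hierarchy file contains the pair
# "ERED,ER,D,;ED,,D,", then only the line for the more specific rule
# "ERED,ER,D," is printed.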
|
||||||
|
|
||||||
|
|
||||||
|
if (@ARGV != 1 && @ARGV != 2) {
|
||||||
|
die "Usage: limit_candidate_prons.pl rule_hierarchy [candidate_prons] > limited_candidate_prons";
|
||||||
|
}
|
||||||
|
|
||||||
|
$hierarchy = shift @ARGV;
|
||||||
|
open(H, "<$hierarchy") || die "Opening rule hierarchy $hierarchy";
|
||||||
|
|
||||||
|
while(<H>) {
|
||||||
|
chop;
|
||||||
|
m:.+;.+: || die "Bad rule-hierarchy line $_";
|
||||||
|
$hierarchy{$_} = 1; # Format is: if $rule1 is the string form of the more specific rule
|
||||||
|
# and $rule2 is the string form of the more general rule, then $hierarchy{$rule1.";".$rule2}
|
||||||
|
# is defined, else undefined.
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
sub process_word;
|
||||||
|
|
||||||
|
undef $cur_word;
|
||||||
|
@cur_lines = ();
|
||||||
|
|
||||||
|
while(<>) {
|
||||||
|
# input, output is:
|
||||||
|
# word;pron;base-word;base-pron;rule-name;destress;score
|
||||||
|
chop;
|
||||||
|
m:^([^;]+);: || die "Unexpected input: $_";
|
||||||
|
$word = $1;
|
||||||
|
if (!defined $cur_word || $word eq $cur_word) {
|
||||||
|
if (!defined $cur_word) { $cur_word = $word; }
|
||||||
|
push @cur_lines, $_;
|
||||||
|
} else {
|
||||||
|
process_word(@cur_lines); # Process a series of suggested prons
|
||||||
|
# for a particular word.
|
||||||
|
$cur_word = $word;
|
||||||
|
@cur_lines = ( $_ );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
process_word(@cur_lines);
|
||||||
|
|
||||||
|
sub process_word {
|
||||||
|
my %pair2rule_list; # hash from $baseword.";".$baseword to ref
|
||||||
|
# to array of [ line1, line2, ... ].
|
||||||
|
my @cur_lines = @_;
|
||||||
|
foreach my $line (@cur_lines) {
|
||||||
|
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
|
||||||
|
my $key = $baseword.";".$basepron;
|
||||||
|
if (defined $pair2rule_list{$key}) {
|
||||||
|
push @{$pair2rule_list{$key}}, $line; # @{...} derefs the array pointed to
|
||||||
|
# by the array ref inside {}.
|
||||||
|
} else {
|
||||||
|
$pair2rule_list{$key} = [ $line ]; # [ $x ] is new anonymous array with 1 elem ($x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while ( my ($key, $value) = each(%pair2rule_list) ) {
|
||||||
|
my @lines = @$value; # array of lines that are for this (baseword,basepron).
|
||||||
|
my (@stress, @rules); # Arrays of stress markers and rule names, indexed by
|
||||||
|
# same index that indexes @lines.
|
||||||
|
for (my $n = 0; $n < @lines; $n++) {
|
||||||
|
my $line = $lines[$n];
|
||||||
|
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
|
||||||
|
$stress[$n] = $destress;
|
||||||
|
$rules[$n] = $rulename;
|
||||||
|
}
|
||||||
|
for (my $m = 0; $m < @lines; $m++) {
|
||||||
|
my $ok = 1; # if stays 1, this line is OK.
|
||||||
|
for (my $n = 0; $n < @lines; $n++) {
|
||||||
|
if ($m != $n && $stress[$m] eq $stress[$n]) {
|
||||||
|
if (defined $hierarchy{$rules[$n].";".$rules[$m]}) {
|
||||||
|
# Note: this "hierarchy" variable is defined if $rules[$n] is a more
|
||||||
|
# specific instance of $rules[$m], thus invalidating $rules[$m].
|
||||||
|
$ok = 0;
|
||||||
|
last; # no point iterating further.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ($ok != 0) {
|
||||||
|
print $lines[$m] . "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl,
|
||||||
|
# which is 7-tuples, one per line, of the form:
|
||||||
|
|
||||||
|
# word;pron;base-word;base-pron;rule-name;de-stress;rule-score
|
||||||
|
# (where rule-score is sometimes listed as optional, but this
|
||||||
|
# program does expect it, since we don't anticipate it being used
|
||||||
|
# without it).
|
||||||
|
# This program assumes that all the words and prons and rules have
|
||||||
|
# come from a reversed dictionary (reverse_dict.pl) where the order
|
||||||
|
# of the characters in the words, and the phones in the prons, have
|
||||||
|
# been reversed, and it un-reverses them. That is, the characters
|
||||||
|
# in "word" and "base-word", and the phones in "pron" and "base-pron"
|
||||||
|
# are reversed; and the rule ("rule-name") is parsed as a 4-tuple,
|
||||||
|
# like:
|
||||||
|
# suffix,base-suffix,psuffix,base-psuffix
|
||||||
|
# so this program reverses the characters in "suffix" and "base-suffix"
|
||||||
|
# and the phones (separated by spaces) in "psuffix" and "base-psuffix".
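# Illustrative example (the rule score 0.9 is made up): an input line such as
#   DETSAW;D AH0 T S EY1 W;GNITSAW;NG IH0 T S EY1 W;DETS,GNITS,D,NG;no;0.9
# would be un-reversed and printed as
#   WASTED;W EY1 S T AH0 D;WASTING;W EY1 S T IH0 NG;STED,STING,D,NG;no;0.9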
|
||||||
|
|
||||||
|
sub reverse_str {
|
||||||
|
$str = shift;
|
||||||
|
return join("", reverse(split("", $str)));
|
||||||
|
}
|
||||||
|
sub reverse_pron {
|
||||||
|
$str = shift;
|
||||||
|
return join(" ", reverse(split(" ", $str)));
|
||||||
|
}
|
||||||
|
|
||||||
|
while(<>){
|
||||||
|
chop;
|
||||||
|
@A = split(";", $_);
|
||||||
|
@A == 7 || die "Bad input line $_: found " . scalar(@A) . " fields, expected 7.";
|
||||||
|
|
||||||
|
($word,$pron,$baseword,$basepron,$rule,$destress,$score) = @A;
|
||||||
|
$word = reverse_str($word);
|
||||||
|
$pron = reverse_pron($pron);
|
||||||
|
$baseword = reverse_str($baseword);
|
||||||
|
$basepron = reverse_pron($basepron);
|
||||||
|
@R = split(",", $rule, 4);
|
||||||
|
@R == 4 || die "Bad rule $rule";
|
||||||
|
|
||||||
|
$R[0] = reverse_str($R[0]); # suffix.
|
||||||
|
$R[1] = reverse_str($R[1]); # base-suffix.
|
||||||
|
$R[2] = reverse_pron($R[2]); # pron.
|
||||||
|
$R[3] = reverse_pron($R[3]); # base-pron.
|
||||||
|
$rule = join(",", @R);
|
||||||
|
@A = ($word,$pron,$baseword,$basepron,$rule,$destress,$score);
|
||||||
|
print join(";", @A) . "\n";
|
||||||
|
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# Used in conjunction with get_rules.pl
|
||||||
|
# example input line: XANTHE Z AE1 N DH
|
||||||
|
# example output line: EHTNAX DH N AE1 Z
|
||||||
|
|
||||||
|
while(<>){
|
||||||
|
@A = split(" ", $_);
|
||||||
|
$word = shift @A;
|
||||||
|
$word = join("", reverse(split("", $word))); # Reverse letters of word.
|
||||||
|
@A = reverse(@A); # Reverse phones in pron.
|
||||||
|
unshift @A, $word;
|
||||||
|
print join(" ", @A) . "\n";
|
||||||
|
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# This program takes candidate prons from "get_candidate_prons.pl" or
|
||||||
|
# "limit_candidate_prons.pl", and a reference dictionary covering those words,
|
||||||
|
# and outputs the same format but with scoring information added (so we go from
|
||||||
|
# 6 to 7 fields). The scoring information says, for each generated pron,
|
||||||
|
# whether we have a match, a partial match, or no match, to some word in the
|
||||||
|
# dictionary. A partial match means it's correct except for stress.
|
||||||
|
|
||||||
|
# The input is a 6-tuple on each line, like:
|
||||||
|
# word;pron;base-word;base-pron;rule-name;de-stress
|
||||||
|
#
|
||||||
|
# The output is the same except with one more field, the score,
|
||||||
|
# which may be "right", "wrong", "partial".
|
||||||
|
|
||||||
|
if (@ARGV != 1 && @ARGV != 2) {
|
||||||
|
die "Usage: score_prons.pl reference_dict [candidate_prons] > scored_candidate_prons";
|
||||||
|
}
|
||||||
|
|
||||||
|
$dict = shift @ARGV;
|
||||||
|
open(D, "<$dict") || die "Opening dictionary $dict";
|
||||||
|
|
||||||
|
while(<D>) { # Set up some hashes that tell us when
|
||||||
|
# a (word,pron) pair is correct (and the same for
|
||||||
|
# prons with stress information removed).
|
||||||
|
chop;
|
||||||
|
@A = split(" ", $_);
|
||||||
|
$word = shift @A;
|
||||||
|
$pron = join(" ", @A);
|
||||||
|
$pron_nostress = $pron;
|
||||||
|
$pron_nostress =~ s:\d::g;
|
||||||
|
$word_and_pron{$word.";".$pron} = 1;
|
||||||
|
$word_and_pron_nostress{$word.";".$pron_nostress} = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
while(<>) {
|
||||||
|
chop;
|
||||||
|
$line = $_;
|
||||||
|
my ($word, $pron, $baseword, $basepron, $rulename, $destress) = split(";", $line);
|
||||||
|
$pron_nostress = $pron;
|
||||||
|
$pron_nostress =~ s:\d::g;
|
||||||
|
if (defined $word_and_pron{$word.";".$pron}) {
|
||||||
|
$score = "right";
|
||||||
|
} elsif (defined $word_and_pron_nostress{$word.";".$pron_nostress}) {
|
||||||
|
$score = "partial";
|
||||||
|
} else {
|
||||||
|
$score = "wrong";
|
||||||
|
}
|
||||||
|
print $line.";".$score."\n";
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# This program takes the output of count_rules.pl, which is tuples
|
||||||
|
# of the form
|
||||||
|
#
|
||||||
|
# rule;destress;right-count;partial-count;wrong-count
|
||||||
|
#
|
||||||
|
# and outputs lines of the form
|
||||||
|
#
|
||||||
|
# rule;de-stress;score
|
||||||
|
#
|
||||||
|
# where the score, between 0 and 1 (1 is better), is computed as:
#
# ((#correct) + $partial_score * (#partial)) / (#correct + #partial + #wrong + $ballast)
|
||||||
|
#
|
||||||
|
# where $partial_score (e.g. 0.8) is the score we assign to a "partial" match,
|
||||||
|
# and $ballast is a small number, e.g. 1, that is treated like "extra" wrong scores, to penalize
|
||||||
|
# rules with few observations.
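# For example, with the defaults ($partial_score = 0.8, $ballast = 1), a rule
# observed with counts right=8, partial=2, wrong=0 would get the score
# (8 + 0.8*2) / (8 + 2 + 0 + 1) = 9.6 / 11, i.e. about 0.873.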
|
||||||
|
#
|
||||||
|
# It outputs a score for every rule it reads in; it does not itself apply any threshold.
|
||||||
|
|
||||||
|
$ballast = 1;
|
||||||
|
$partial_score = 0.8;
|
||||||
|
$destress_penalty = 1.0e-05; # Give destressed rules a small
|
||||||
|
# penalty vs. their no-destress counterparts, so if we
|
||||||
|
# have to choose arbitrarily we won't destress (seems safer).
|
||||||
|
|
||||||
|
for ($n = 1; $n <= 4; $n++) {
|
||||||
|
if ($ARGV[0] eq "--ballast") {
|
||||||
|
shift @ARGV;
|
||||||
|
$ballast = shift @ARGV;
|
||||||
|
}
|
||||||
|
if ($ARGV[0] eq "--partial-score") {
|
||||||
|
shift @ARGV;
|
||||||
|
$partial_score = shift @ARGV;
|
||||||
|
($partial_score >= 0.0 && $partial_score <= 1.0) || die "Invalid partial_score: $partial_score";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
(@ARGV == 0 || @ARGV == 1) || die "Usage: score_rules.pl [--ballast ballast-count] [--partial-score partial-score] [input from count_rules.pl]";
|
||||||
|
|
||||||
|
while(<>) {
|
||||||
|
@A = split(";", $_);
|
||||||
|
@A == 5 || die "Bad input line; $_";
|
||||||
|
($rule,$destress,$right_count,$partial_count,$wrong_count) = @A;
|
||||||
|
$rule_score = ($right_count + $partial_score*$partial_count) /
|
||||||
|
($right_count+$partial_count+$wrong_count+$ballast);
|
||||||
|
if ($destress eq "yes") { $rule_score -= $destress_penalty; }
|
||||||
|
print join(";", $rule, $destress, sprintf("%.5f", $rule_score)) . "\n";
|
||||||
|
}
|
|
@ -0,0 +1,84 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl
|
||||||
|
# or reverse_candidates.pl, which is 7-tuples, one per line, of the form:
|
||||||
|
#
|
||||||
|
# word;pron;base-word;base-pron;rule-name;de-stress;rule-score
|
||||||
|
#
|
||||||
|
# and selects the most likely prons for the words based on rule
|
||||||
|
# score. It outputs in the same format as the input (thus, it is
|
||||||
|
# similar to limit_candidates.pl in its input and output format,
|
||||||
|
# except it has a different way of selecting the prons to put out).
|
||||||
|
#
|
||||||
|
# This script will select the $max_prons best pronunciations for
|
||||||
|
# each candidate word, subject to the constraint that no pron should
|
||||||
|
# have a rule score worse than $min_rule_score.
|
||||||
|
# It first merges the candidates by, if there are multiple candidates
|
||||||
|
# generating the same pron, selecting the candidate that had the
|
||||||
|
# best associated score. It then sorts the prons on score and
|
||||||
|
# selects the n best prons (but doesn't print out candidates with
|
||||||
|
# score beneath the threshold).
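# For example, with the defaults below ($max_prons = 4, $min_rule_score = 0.35):
# if a word's candidate prons have best scores 0.9, 0.6 and 0.3 after merging
# duplicates, only the lines for the first two prons are printed.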
|
||||||
|
|
||||||
|
|
||||||
|
$max_prons = 4;
|
||||||
|
$min_rule_score = 0.35;
|
||||||
|
|
||||||
|
|
||||||
|
for ($n = 1; $n <= 3; $n++) {
|
||||||
|
if ($ARGV[0] eq "--max-prons") {
|
||||||
|
shift @ARGV;
|
||||||
|
$max_prons = shift @ARGV;
|
||||||
|
}
|
||||||
|
if ($ARGV[0] eq "--min-rule-score") {
|
||||||
|
shift @ARGV;
|
||||||
|
$min_rule_score = shift @ARGV;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (@ARGV != 0 && @ARGV != 1) {
|
||||||
|
die "Usage: select_candidates_prons.pl [candidate_prons] > selected_candidate_prons";
|
||||||
|
}
|
||||||
|
|
||||||
|
sub process_word;
|
||||||
|
|
||||||
|
undef $cur_word;
|
||||||
|
@cur_lines = ();
|
||||||
|
|
||||||
|
while(<>) {
|
||||||
|
# input, output is:
|
||||||
|
# word;pron;base-word;base-pron;rule-name;destress;score
|
||||||
|
chop;
|
||||||
|
m:^([^;]+);: || die "Unexpected input: $_";
|
||||||
|
$word = $1;
|
||||||
|
if (!defined $cur_word || $word eq $cur_word) {
|
||||||
|
if (!defined $cur_word) { $cur_word = $word; }
|
||||||
|
push @cur_lines, $_;
|
||||||
|
} else {
|
||||||
|
process_word(@cur_lines); # Process a series of suggested prons
|
||||||
|
# for a particular word.
|
||||||
|
$cur_word = $word;
|
||||||
|
@cur_lines = ( $_ );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
process_word(@cur_lines);
|
||||||
|
|
||||||
|
|
||||||
|
sub process_word {
|
||||||
|
my %pron2rule_score; # hash from generated pron to rule score for that pron.
|
||||||
|
my %pron2line; # hash from generated pron to best line for that pron.
|
||||||
|
my @cur_lines = @_;
|
||||||
|
foreach my $line (@cur_lines) {
|
||||||
|
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
|
||||||
|
if (!defined $pron2rule_score{$pron} ||
|
||||||
|
$rule_score > $pron2rule_score{$pron}) {
|
||||||
|
$pron2rule_score{$pron} = $rule_score;
|
||||||
|
$pron2line{$pron} = $line;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
my @prons = sort { $pron2rule_score{$b} <=> $pron2rule_score{$a} } keys %pron2rule_score;
|
||||||
|
for (my $n = 0; $n < @prons && $n < $max_prons &&
|
||||||
|
$pron2rule_score{$prons[$n]} >= $min_rule_score; $n++) {
|
||||||
|
print $pron2line{$prons[$n]} . "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,64 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
# Copyright 2010-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# This program takes on its standard input a list of utterance
|
||||||
|
# id's, one for each line. (e.g. 4k0c030a is an utterance id).
|
||||||
|
# It takes as its command-line argument a file list of "dot" files, and
# extracts from the dot files the transcripts for a given
# dataset (represented by that file list).
|
||||||
|
#
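# For example (hypothetical data): given the utterance id 4k0c030a on stdin,
# the speaker prefix 4k0c03 is looked up in the dot-file list, and if that
# speaker's dot file contains a line like
#   THIS IS A TRANSCRIPT (4k0c030a)
# then the program prints "4k0c030a THIS IS A TRANSCRIPT".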
|
||||||
|
|
||||||
|
@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
|
||||||
|
$dot_flist = shift @ARGV;
|
||||||
|
|
||||||
|
open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
|
||||||
|
while(<L>){
|
||||||
|
chop;
|
||||||
|
m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
|
||||||
|
$spk = $1;
|
||||||
|
$spk2dot{$spk} = $_;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
while(<STDIN>){
|
||||||
|
chop;
|
||||||
|
$uttid = $_;
|
||||||
|
$uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
|
||||||
|
$spk = $1;
|
||||||
|
if($spk ne $curspk) {
|
||||||
|
%utt2trans = ( ); # Don't keep all the transcripts in memory...
|
||||||
|
$curspk = $spk;
|
||||||
|
$dotfile = $spk2dot{$spk};
|
||||||
|
defined $dotfile || die "No dot file for speaker $spk\n";
|
||||||
|
open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
|
||||||
|
while(<F>) {
|
||||||
|
$_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
|
||||||
|
$trans = $1;
|
||||||
|
$utt = $2;
|
||||||
|
$utt2trans{$utt} = $trans;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(!defined $utt2trans{$uttid}) {
|
||||||
|
print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
|
||||||
|
} else {
|
||||||
|
print "$uttid $utt2trans{$uttid}\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
# Copyright 2010-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
# takes in a file list with lines like
|
||||||
|
# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
|
||||||
|
# and outputs an scp in kaldi format with lines like
|
||||||
|
# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
|
||||||
|
# (the first thing is the utterance-id, which is the same as the basename of the file.)
|
||||||
|
|
||||||
|
|
||||||
|
while(<>){
|
||||||
|
m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_";
|
||||||
|
$id = $1;
|
||||||
|
$id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames)
|
||||||
|
print "$id $_";
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
# Copyright 2010-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
# takes in a file list with lines like
|
||||||
|
# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
|
||||||
|
# and outputs an scp in kaldi format with lines like
|
||||||
|
# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
|
||||||
|
# (the first thing is the utterance-id, which is the same as the basename of the file.)
|
||||||
|
|
||||||
|
|
||||||
|
while(<>){
|
||||||
|
m:^\S+/(\w+)\.[wW][vV][12]$: || die "Bad line $_";
|
||||||
|
$id = $1;
|
||||||
|
$id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames)
|
||||||
|
print "$id $_";
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,110 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
|
||||||
|
# Apache 2.0.
|
||||||
|
|
||||||
|
|
||||||
|
if [ $# -ne 2 ]; then
|
||||||
|
echo "Usage: local/generate_example_kws.sh <data-dir> <kws-data-dir>"
|
||||||
|
echo " e.g.: local/generate_example_kws.sh data/test_eval92/ <data/kws>"
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
datadir=$1;
|
||||||
|
kwsdatadir=$2;
|
||||||
|
text=$datadir/text;
|
||||||
|
|
||||||
|
mkdir -p $kwsdatadir;
|
||||||
|
|
||||||
|
# Generate keywords; we generate 20 unigram keywords with at least 20 counts,
|
||||||
|
# 20 bigram keywords with at least 4 counts and 10 trigram keywords with at
|
||||||
|
# least 3 counts.
|
||||||
|
cat $text | perl -e '
|
||||||
|
%unigram = ();
|
||||||
|
%bigram = ();
|
||||||
|
%trigram = ();
|
||||||
|
while(<>) {
|
||||||
|
chomp;
|
||||||
|
@col=split(" ", $_);
|
||||||
|
shift @col;
|
||||||
|
for($i = 0; $i < @col; $i++) {
|
||||||
|
# unigram case
|
||||||
|
if (!defined($unigram{$col[$i]})) {
|
||||||
|
$unigram{$col[$i]} = 0;
|
||||||
|
}
|
||||||
|
$unigram{$col[$i]}++;
|
||||||
|
|
||||||
|
# bigram case
|
||||||
|
if ($i < @col-1) {
|
||||||
|
$word = $col[$i] . " " . $col[$i+1];
|
||||||
|
if (!defined($bigram{$word})) {
|
||||||
|
$bigram{$word} = 0;
|
||||||
|
}
|
||||||
|
$bigram{$word}++;
|
||||||
|
}
|
||||||
|
|
||||||
|
# trigram case
|
||||||
|
if ($i < @col-2) {
|
||||||
|
$word = $col[$i] . " " . $col[$i+1] . " " . $col[$i+2];
|
||||||
|
if (!defined($trigram{$word})) {
|
||||||
|
$trigram{$word} = 0;
|
||||||
|
}
|
||||||
|
$trigram{$word}++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$max_count = 100;
|
||||||
|
$total = 20;
|
||||||
|
$current = 0;
|
||||||
|
$min_count = 20;
|
||||||
|
while ($current < $total && $min_count <= $max_count) {
|
||||||
|
foreach $x (keys %unigram) {
|
||||||
|
if ($unigram{$x} == $min_count) {
|
||||||
|
print "$x\n";
|
||||||
|
$unigram{$x} = 0;
|
||||||
|
$current++;
|
||||||
|
}
|
||||||
|
if ($current == $total) {
|
||||||
|
last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$min_count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
$total = 20;
|
||||||
|
$current = 0;
|
||||||
|
$min_count = 4;
|
||||||
|
while ($current < $total && $min_count <= $max_count) {
|
||||||
|
foreach $x (keys %bigram) {
|
||||||
|
if ($bigram{$x} == $min_count) {
|
||||||
|
print "$x\n";
|
||||||
|
$bigram{$x} = 0;
|
||||||
|
$current++;
|
||||||
|
}
|
||||||
|
if ($current == $total) {
|
||||||
|
last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$min_count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
$total = 10;
|
||||||
|
$current = 0;
|
||||||
|
$min_count = 3;
|
||||||
|
while ($current < $total && $min_count <= $max_count) {
|
||||||
|
foreach $x (keys %trigram) {
|
||||||
|
if ($trigram{$x} == $min_count) {
|
||||||
|
print "$x\n";
|
||||||
|
$trigram{$x} = 0;
|
||||||
|
$current++;
|
||||||
|
}
|
||||||
|
if ($current == $total) {
|
||||||
|
last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$min_count++;
|
||||||
|
}
|
||||||
|
' > $kwsdatadir/raw_keywords.txt
|
||||||
|
|
||||||
|
echo "Keywords generation succeeded"
|
|
@ -0,0 +1,60 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
|
||||||
|
# Apache 2.0.
|
||||||
|
|
||||||
|
|
||||||
|
if [ $# -ne 3 ]; then
|
||||||
|
echo "Usage: local/kws_data_prep.sh <lang-dir> <data-dir> <kws-data-dir>"
|
||||||
|
echo " e.g.: local/kws_data_prep.sh data/lang_test_bd_tgpr/ data/test_eval92/ data/kws/"
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
langdir=$1;
|
||||||
|
datadir=$2;
|
||||||
|
kwsdatadir=$3;
|
||||||
|
|
||||||
|
mkdir -p $kwsdatadir;
|
||||||
|
|
||||||
|
# Create keyword id for each keyword
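# (For illustration: if the first raw keyword were "hello world", the line
# written to keywords.txt would be "WSJ-0001 hello world".)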
|
||||||
|
cat $kwsdatadir/raw_keywords.txt | perl -e '
|
||||||
|
$idx=1;
|
||||||
|
while(<>) {
|
||||||
|
chomp;
|
||||||
|
printf "WSJ-%04d $_\n", $idx;
|
||||||
|
$idx++;
|
||||||
|
}' > $kwsdatadir/keywords.txt
|
||||||
|
|
||||||
|
# Map the keywords to integers; note that we remove the keywords that
|
||||||
|
# are not in our $langdir/words.txt, as we won't find them anyway...
|
||||||
|
cat $kwsdatadir/keywords.txt | \
|
||||||
|
sym2int.pl --map-oov 0 -f 2- $langdir/words.txt | \
|
||||||
|
grep -v " 0 " | grep -v " 0$" > $kwsdatadir/keywords.int
|
||||||
|
|
||||||
|
# Compile keywords into FSTs
|
||||||
|
transcripts-to-fsts ark:$kwsdatadir/keywords.int ark:$kwsdatadir/keywords.fsts
|
||||||
|
|
||||||
|
# Create utterance id for each utterance; Note that by "utterance" here I mean
|
||||||
|
# the keys that will appear in the lattice archive. You may have to modify this part.
|
||||||
|
cat $datadir/wav.scp | \
|
||||||
|
awk '{print $1}' | \
|
||||||
|
sort | uniq | perl -e '
|
||||||
|
$idx=1;
|
||||||
|
while(<>) {
|
||||||
|
chomp;
|
||||||
|
print "$_ $idx\n";
|
||||||
|
$idx++;
|
||||||
|
}' > $kwsdatadir/utter_id
|
||||||
|
|
||||||
|
# Map utterance to the names that will appear in the rttm file. You have
|
||||||
|
# to modify the commands below according to your rttm file. In the WSJ case
|
||||||
|
# since each file is an utterance, we assume that the actual file names will
|
||||||
|
# be the "names" in the rttm, so the utterance names map to themselves.
|
||||||
|
cat $datadir/wav.scp | \
|
||||||
|
awk '{print $1}' | \
|
||||||
|
sort | uniq | perl -e '
|
||||||
|
while(<>) {
|
||||||
|
chomp;
|
||||||
|
print "$_ $_\n";
|
||||||
|
}' > $kwsdatadir/utter_map;
|
||||||
|
echo "Kws data preparation succeeded"
|
|
@ -0,0 +1,62 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
# Copyright 2010-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
|
||||||
|
# like this:
|
||||||
|
#;; File: tr_s_wv1.ndx, updated 04/26/94
|
||||||
|
#;;
|
||||||
|
#;; Index for WSJ0 SI-short Sennheiser training data
|
||||||
|
#;; Data is read WSJ sentences, Sennheiser mic.
|
||||||
|
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
|
||||||
|
#;; per speaker TI) = 7236 utts
|
||||||
|
#;;
|
||||||
|
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
|
||||||
|
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
|
||||||
|
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1
|
||||||
|
|
||||||
|
#and as command-line arguments it takes the names of the WSJ disk locations, e.g.:
|
||||||
|
#/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc.
|
||||||
|
# It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with
|
||||||
|
# /mnt/matylda2/data/WSJ0/11-1.1.
|
||||||
|
# It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with
|
||||||
|
# uppercase rather than lower case filenames.
|
||||||
|
|
||||||
|
foreach $fn (@ARGV) {
|
||||||
|
$fn =~ m:.+/([0-9\.\-]+)/?$: || die "Bad command-line argument $fn\n";
|
||||||
|
$disk_id=$1;
|
||||||
|
$disk_id =~ tr/-\./__/; # replace - and . with _ so 11-10.1 becomes 11_10_1
|
||||||
|
$fn =~ s:/$::; # Remove final slash, just in case it is present.
|
||||||
|
$disk2fn{$disk_id} = $fn;
|
||||||
|
}
|
||||||
|
|
||||||
|
while(<STDIN>){
|
||||||
|
if(m/^;/){ next; } # Comment. Ignore it.
|
||||||
|
else {
|
||||||
|
m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
|
||||||
|
$disk=$1;
|
||||||
|
if(!defined $disk2fn{$disk}) {
|
||||||
|
die "Disk id $disk not found";
|
||||||
|
}
|
||||||
|
$filename = $2; # as a subdirectory of the distributed disk.
|
||||||
|
if($disk eq "13_16_1" && `hostname` =~ m/fit.vutbr.cz/) {
|
||||||
|
# The disk 13-16.1 has been uppercased for some reason, on the
|
||||||
|
# BUT system. This is a fix specifically for that case.
|
||||||
|
$filename =~ tr/a-z/A-Z/; # This disk contains all uppercase filenames. Why?
|
||||||
|
}
|
||||||
|
print "$disk2fn{$disk}/$filename\n";
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,69 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
|
||||||
|
stage=0
|
||||||
|
train_stage=-100
|
||||||
|
# This trains only unadapted (just cepstral mean normalized) features,
|
||||||
|
# and uses various combinations of VTLN warping factor and time-warping
|
||||||
|
# factor to artificially expand the amount of data.
|
||||||
|
|
||||||
|
. cmd.sh
|
||||||
|
|
||||||
|
. utils/parse_options.sh # to parse the --stage option, if given
|
||||||
|
|
||||||
|
[ $# != 0 ] && echo "Usage: local/run_4b.sh [--stage <stage> --train-stage <train-stage>]" && exit 1;
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
if [ $stage -le 0 ]; then
|
||||||
|
# Create the training data.
|
||||||
|
featdir=`pwd`/mfcc/nnet5b; mkdir -p $featdir
|
||||||
|
fbank_conf=conf/fbank_40.conf
|
||||||
|
echo "--num-mel-bins=40" > $fbank_conf
|
||||||
|
steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" \
|
||||||
|
$fbank_conf $featdir exp/perturbed_fbanks_si284 data/train_si284 data/train_si284_perturbed_fbank &
|
||||||
|
steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" --feature-type mfcc \
|
||||||
|
conf/mfcc.conf $featdir exp/perturbed_mfcc_si284 data/train_si284 data/train_si284_perturbed_mfcc &
|
||||||
|
wait
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 1 ]; then
|
||||||
|
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
||||||
|
data/train_si284_perturbed_mfcc data/lang exp/tri4b exp/tri4b_ali_si284_perturbed_mfcc
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 2 ]; then
|
||||||
|
steps/nnet2/train_block.sh --stage "$train_stage" \
|
||||||
|
--cleanup false \
|
||||||
|
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
|
||||||
|
--num-epochs 10 --num-epochs-extra 5 \
|
||||||
|
--cmd "$decode_cmd" \
|
||||||
|
--hidden-layer-dim 1536 \
|
||||||
|
--num-block-layers 3 --num-normal-layers 3 \
|
||||||
|
data/train_si284_perturbed_fbank data/lang exp/tri4b_ali_si284_perturbed_mfcc exp/nnet5b || exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 3 ]; then # create testing fbank data.
|
||||||
|
featdir=`pwd`/mfcc
|
||||||
|
fbank_conf=conf/fbank_40.conf
|
||||||
|
for x in test_eval92 test_eval93 test_dev93; do
|
||||||
|
cp -rT data/$x data/${x}_fbank
|
||||||
|
rm -r data/${x}_fbank/split* || true
|
||||||
|
steps/make_fbank.sh --fbank-config "$fbank_conf" --nj 8 \
|
||||||
|
--cmd "$train_cmd" data/${x}_fbank exp/make_fbank/$x $featdir || exit 1;
|
||||||
|
steps/compute_cmvn_stats.sh data/${x}_fbank exp/make_fbank/$x $featdir || exit 1;
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 4 ]; then
|
||||||
|
steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
|
||||||
|
exp/tri4b/graph_bd_tgpr data/test_dev93_fbank exp/nnet5b/decode_bd_tgpr_dev93
|
||||||
|
|
||||||
|
steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \
|
||||||
|
exp/tri4b/graph_bd_tgpr data/test_eval92_fbank exp/nnet5b/decode_bd_tgpr_eval92
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
exit 0;
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This is neural net training on top of adapted 40-dimensional features.
|
||||||
|
#
|
||||||
|
|
||||||
|
. ./cmd.sh
|
||||||
|
|
||||||
|
(
|
||||||
|
steps/nnet2/train_tanh.sh \
|
||||||
|
--mix-up 8000 \
|
||||||
|
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
|
||||||
|
--num-hidden-layers 4 --hidden-layer-dim 1024 \
|
||||||
|
--cmd "$decode_cmd" \
|
||||||
|
data/train_si284 data/lang exp/tri4b_ali_si284 exp/nnet5c || exit 1
|
||||||
|
|
||||||
|
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 10 \
|
||||||
|
--transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
|
||||||
|
exp/tri4b/graph_bd_tgpr data/test_dev93 exp/nnet5c/decode_bd_tgpr_dev93
|
||||||
|
|
||||||
|
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
|
||||||
|
--transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
|
||||||
|
exp/tri4b/graph_bd_tgpr data/test_eval92 exp/nnet5c/decode_bd_tgpr_eval92
|
||||||
|
)
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
# Copyright 2010-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
# This takes data from the standard input, which consists of unnormalized transcripts in the format
|
||||||
|
# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise]
|
||||||
|
# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam]
|
||||||
|
# and outputs normalized transcripts.
|
||||||
|
# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc
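# As a rough illustration (assuming the noise word passed in is "<NOISE>", which is an
# assumption here, not something this script enforces), the first example line above
# would come out as:
#   4k2c0308 OF COURSE THERE ISN'T ANY GUARANTEE THE COMPANY WILL KEEP ITS HOT HAND <NOISE>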
|
||||||
|
|
||||||
|
@ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2";
|
||||||
|
$noise_word = shift @ARGV;
|
||||||
|
|
||||||
|
while(<STDIN>) {
|
||||||
|
$_ =~ m:^(\S+) (.+): || die "bad line $_";
|
||||||
|
$utt = $1;
|
||||||
|
$trans = $2;
|
||||||
|
print "$utt";
|
||||||
|
foreach $w (split (" ",$trans)) {
|
||||||
|
$w =~ tr:a-z:A-Z:; # Upcase everything to match the CMU dictionary.
|
||||||
|
$w =~ s:\\::g; # Remove backslashes. We don't need the quoting.
|
||||||
|
$w =~ s:^\%PERCENT$:PERCENT:; # Normalization for Nov'93 test transcripts.
|
||||||
|
$w =~ s:^\.POINT$:POINT:; # Normalization for Nov'93 test transcripts.
|
||||||
|
if($w =~ m:^\[\<\w+\]$: || # E.g. [<door_slam], this means a door slammed in the preceding word. Delete.
|
||||||
|
$w =~ m:^\[\w+\>\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete.
|
||||||
|
$w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon.
|
||||||
|
$w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon.
|
||||||
|
$w eq "~" || # This is used to indicate truncation of an utterance. Not a word.
|
||||||
|
$w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much
|
||||||
|
# point including this in the transcript.
|
||||||
|
next; # we won't print this word.
|
||||||
|
} elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath].
|
||||||
|
print " $noise_word";
|
||||||
|
} elsif($w =~ m:^\<([\w\']+)\>$:) {
|
||||||
|
# e.g. replace <and> with and (the <> means verbal deletion of a word), but it's still pronounced.
|
||||||
|
print " $1";
|
||||||
|
} elsif($w eq "--DASH") {
|
||||||
|
print " -DASH"; # This is a common issue; the CMU dictionary has it as -DASH.
|
||||||
|
# } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word
|
||||||
|
# print " $1 -DASH";
|
||||||
|
} else {
|
||||||
|
print " $w";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print "\n";
|
||||||
|
}
|
|
@ -0,0 +1,42 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
. cmd.sh
|
||||||
|
|
||||||
|
mfccdir=mfcc
|
||||||
|
|
||||||
|
# Make "per-utterance" versions of the test sets where the speaker
|
||||||
|
# information corresponds to utterances-- to demonstrate adaptation on
|
||||||
|
# short utterances, particularly for basis fMLLR
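# (The awk command below maps every utterance to itself in utt2spk, and spk2utt is simply
#  a copy of it, so each utterance becomes its own "speaker" and the adaptation statistics
#  are estimated from that single utterance only.)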
|
||||||
|
for x in test_eval92 test_eval93 test_dev93 ; do
|
||||||
|
y=${x}_utt
|
||||||
|
rm -r data/$y
|
||||||
|
cp -r data/$x data/$y
|
||||||
|
cat data/$x/utt2spk | awk '{print $1, $1;}' > data/$y/utt2spk;
|
||||||
|
cp data/$y/utt2spk data/$y/spk2utt;
|
||||||
|
steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1;
|
||||||
|
done
|
||||||
|
|
||||||
|
|
||||||
|
# basis fMLLR experiments.
|
||||||
|
# First a baseline: decode per-utterance with normal fMLLR.
|
||||||
|
steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3b/graph_tgpr data/test_dev93_utt exp/tri3b/decode_tgpr_dev93_utt || exit 1;
|
||||||
|
steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3b/graph_tgpr data/test_eval92_utt exp/tri3b/decode_tgpr_eval92_utt || exit 1;
|
||||||
|
|
||||||
|
# get the fMLLR basis.
|
||||||
|
steps/get_fmllr_basis.sh --cmd "$train_cmd" data/train_si84 data/lang exp/tri3b
|
||||||
|
|
||||||
|
# decoding tri3b with basis fMLLR
|
||||||
|
steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3b/graph_tgpr data/test_dev93 exp/tri3b/decode_tgpr_dev93_basis || exit 1;
|
||||||
|
steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3b/graph_tgpr data/test_eval92 exp/tri3b/decode_tgpr_eval92_basis || exit 1;
|
||||||
|
|
||||||
|
# The same, per-utterance.
|
||||||
|
steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3b/graph_tgpr data/test_dev93_utt exp/tri3b/decode_tgpr_dev93_basis_utt || exit 1;
|
||||||
|
steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3b/graph_tgpr data/test_eval92_utt exp/tri3b/decode_tgpr_eval92_basis_utt || exit 1;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,181 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2012-2013 Brno University of Technology (Author: Karel Vesely)
|
||||||
|
# Apache 2.0
|
||||||
|
|
||||||
|
# In this recipe we build DNN in four stages:
|
||||||
|
# 1) Data preparation : the fMLLR features are stored to disk
# 2) RBM pre-training : in this unsupervised stage we train a stack of RBMs, a good starting point for cross-entropy training
# 3) Frame-level cross-entropy training : in this stage the objective is to classify frames correctly.
# 4) Sequence-criterion training : in this stage the objective is to classify the whole sequence correctly,
#    the idea is similar to the 'Discriminative training' in the context of GMM-HMMs.
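# In this particular script the stages correspond roughly to: steps/make_fmllr_feats.sh (1),
# steps/pretrain_dbn.sh (2), steps/train_nnet.sh (3), and steps/train_nnet_mpe.sh with
# --do-smbr true (4).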
|
||||||
|
|
||||||
|
|
||||||
|
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
|
||||||
|
## This relates to the queue.
|
||||||
|
|
||||||
|
. ./path.sh ## Source the tools/utils (import the queue.pl)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#false && \
|
||||||
|
{
|
||||||
|
gmmdir=exp/tri4b
|
||||||
|
|
||||||
|
###
|
||||||
|
### Generate the alignments of dev93
|
||||||
|
### (held-out set for Cross-entropy training)
|
||||||
|
###
|
||||||
|
steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \
|
||||||
|
data/test_dev93 data/lang $gmmdir exp/tri4b_ali_dev93 || exit 1
|
||||||
|
|
||||||
|
###
|
||||||
|
### Store the fMLLR features, so we can train on them easily
|
||||||
|
###
|
||||||
|
|
||||||
|
# train si284
|
||||||
|
# generate the features
|
||||||
|
dir=data-fmllr-tri4b/train_si284
|
||||||
|
steps/make_fmllr_feats.sh --nj 20 --cmd "$train_cmd" \
|
||||||
|
--transform-dir exp/tri4b_ali_si284 \
|
||||||
|
$dir data/train_si284 $gmmdir $dir/_log $dir/_data || exit 1
|
||||||
|
|
||||||
|
# eval92
|
||||||
|
dir=data-fmllr-tri4b/test_eval92
|
||||||
|
steps/make_fmllr_feats.sh --nj 8 --cmd "$train_cmd" \
|
||||||
|
--transform-dir exp/tri4b/decode_tgpr_eval92 \
|
||||||
|
$dir data/test_eval92 $gmmdir $dir/_log $dir/_data || exit 1
|
||||||
|
|
||||||
|
# dev93 (unsupervised fMLLR)
|
||||||
|
# held-out set of Cross-entropy training
|
||||||
|
dir=data-fmllr-tri4b/test_dev93
|
||||||
|
steps/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \
|
||||||
|
--transform-dir exp/tri4b/decode_tgpr_dev93 \
|
||||||
|
$dir data/test_dev93 $gmmdir $dir/_log $dir/_data || exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
###
|
||||||
|
### Now we can pre-train stack of RBMs
|
||||||
|
###
|
||||||
|
#false && \
|
||||||
|
{ # Pre-train the DBN
|
||||||
|
dir=exp/tri4b_pretrain-dbn
|
||||||
|
(tail --pid=$$ -F $dir/_pretrain_dbn.log 2>/dev/null)&
|
||||||
|
$cuda_cmd $dir/_pretrain_dbn.log \
|
||||||
|
steps/pretrain_dbn.sh --rbm-iter 3 data-fmllr-tri4b/train_si284 $dir
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
###
|
||||||
|
### Now we train the DNN optimizing cross-entropy.
|
||||||
|
### This will take quite some time.
|
||||||
|
###
|
||||||
|
|
||||||
|
#false && \
|
||||||
|
{ # Train the MLP
|
||||||
|
dir=exp/tri4b_pretrain-dbn_dnn
|
||||||
|
ali=exp/tri4b_ali
|
||||||
|
feature_transform=exp/tri4b_pretrain-dbn/final.feature_transform
|
||||||
|
dbn=exp/tri4b_pretrain-dbn/6.dbn
|
||||||
|
(tail --pid=$$ -F $dir/_train_nnet.log 2>/dev/null)&
|
||||||
|
$cuda_cmd $dir/_train_nnet.log \
|
||||||
|
steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
|
||||||
|
data-fmllr-tri4b/train_si284 data-fmllr-tri4b/test_dev93 data/lang ${ali}_si284 ${ali}_dev93 $dir || exit 1;
|
||||||
|
# decode with 'big-dictionary' (reuse HCLG graph)
|
||||||
|
steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \
|
||||||
|
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_dev93 $dir/decode_bd_tgpr_dev93 || exit 1;
|
||||||
|
steps/decode_nnet.sh --nj 8 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \
|
||||||
|
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_eval92 $dir/decode_bd_tgpr_eval92 || exit 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
###
|
||||||
|
### Finally we train using sMBR criterion.
|
||||||
|
### We do Stochastic-GD with per-utterance updates.
|
||||||
|
###
|
||||||
|
### To get faster convergence, we will re-generate
|
||||||
|
### the lattices after 1st epoch of sMBR.
|
||||||
|
###
|
||||||
|
|
||||||
|
dir=exp/tri4b_pretrain-dbn_dnn_smbr
|
||||||
|
srcdir=exp/tri4b_pretrain-dbn_dnn
|
||||||
|
acwt=0.10
|
||||||
|
|
||||||
|
# First we need to generate lattices and alignments:
|
||||||
|
#false && \
|
||||||
|
{
|
||||||
|
steps/align_nnet.sh --nj 100 --cmd "$train_cmd" \
|
||||||
|
data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_ali_si284 || exit 1;
|
||||||
|
steps/make_denlats_nnet.sh --nj 100 --cmd "$decode_cmd" \
|
||||||
|
--config conf/decode_dnn.config --acwt $acwt \
|
||||||
|
data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_denlats_si284 || exit 1;
|
||||||
|
}
|
||||||
|
# Now we re-train the hybrid by a single iteration of sMBR
|
||||||
|
#false && \
|
||||||
|
{
|
||||||
|
steps/train_nnet_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \
|
||||||
|
data-fmllr-tri4b/train_si284 data/lang $srcdir \
|
||||||
|
${srcdir}_ali_si284 ${srcdir}_denlats_si284 $dir || exit 1
|
||||||
|
}
|
||||||
|
# Decode
|
||||||
|
#false && \
|
||||||
|
{
|
||||||
|
for ITER in 1; do
|
||||||
|
# decode dev93 with big dict graph_bd_tgpr
|
||||||
|
steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config \
|
||||||
|
--nnet $dir/${ITER}.nnet --acwt $acwt \
|
||||||
|
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_dev93 $dir/decode_dev93_bd_tgpr_it${ITER} || exit 1
|
||||||
|
# decode eval92 with big dict graph_bd_tgpr
|
||||||
|
steps/decode_nnet.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
|
||||||
|
--nnet $dir/${ITER}.nnet --acwt $acwt \
|
||||||
|
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_eval92 $dir/decode_eval92_bd_tgpr_it${ITER} || exit 1
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
###
|
||||||
|
### Re-generate lattices and run several more iterations of sMBR
|
||||||
|
###
|
||||||
|
|
||||||
|
dir=exp/tri4b_pretrain-dbn_dnn_smbr_iter1-lats
|
||||||
|
srcdir=exp/tri4b_pretrain-dbn_dnn_smbr
|
||||||
|
acwt=0.10
|
||||||
|
|
||||||
|
# First we need to generate lattices and alignments:
|
||||||
|
#false && \
|
||||||
|
{
|
||||||
|
steps/align_nnet.sh --nj 100 --cmd "$train_cmd" \
|
||||||
|
data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_ali_si284 || exit 1;
|
||||||
|
steps/make_denlats_nnet.sh --nj 100 --cmd "$decode_cmd" \
|
||||||
|
--config conf/decode_dnn.config --acwt $acwt \
|
||||||
|
data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_denlats_si284 || exit 1;
|
||||||
|
}
|
||||||
|
# Now we re-train the hybrid by several iterations of sMBR
|
||||||
|
#false && \
|
||||||
|
{
|
||||||
|
steps/train_nnet_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \
|
||||||
|
data-fmllr-tri4b/train_si284 data/lang $srcdir \
|
||||||
|
${srcdir}_ali_si284 ${srcdir}_denlats_si284 $dir
|
||||||
|
}
|
||||||
|
# Decode
|
||||||
|
#false && \
|
||||||
|
{
|
||||||
|
for ITER in 1 2 3 4; do
|
||||||
|
# decode dev93 with big dict graph_bd_tgpr
|
||||||
|
steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config \
|
||||||
|
--nnet $dir/${ITER}.nnet --acwt $acwt \
|
||||||
|
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_dev93 $dir/decode_dev93_bd_tgpr_it${ITER} || exit 1
|
||||||
|
# decode eval92 with big dict graph_bd_tgpr
|
||||||
|
steps/decode_nnet.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
|
||||||
|
--nnet $dir/${ITER}.nnet --acwt $acwt \
|
||||||
|
exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_eval92 $dir/decode_eval92_bd_tgpr_it${ITER} || exit 1
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Getting results [see RESULTS file]
|
||||||
|
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
|
|
@ -0,0 +1,41 @@
|
||||||
|
# Prepare reverse lexicon and language model for backwards decoding.
|
||||||
|
utils/prepare_lang.sh --reverse true data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp.reverse data/lang.reverse || exit 1;
|
||||||
|
utils/reverse_lm.sh data/local/nist_lm/lm_bg_5k.arpa.gz data/lang.reverse data/lang_test_bg_5k.reverse || exit 1;
|
||||||
|
utils/reverse_lm_test.sh data/lang_test_bg_5k data/lang_test_bg_5k.reverse || exit 1;
|
||||||
|
|
||||||
|
# normal forward decoding
|
||||||
|
utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k
|
||||||
|
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 8 --cmd "$decode_cmd" \
|
||||||
|
exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_10 || exit 1;
|
||||||
|
|
||||||
|
# backward decoding
|
||||||
|
utils/mkgraph.sh --reverse data/lang_test_bg_5k.reverse exp/tri2a exp/tri2a/graph_bg5k_r
|
||||||
|
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 8 --cmd "$decode_cmd" \
|
||||||
|
exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_reverse10 || exit 1;
|
||||||
|
|
||||||
|
# pingpong decoding
|
||||||
|
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 8 --cmd "$decode_cmd" \
|
||||||
|
--first_pass exp/tri2a/decode_eval92_bg5k_10 exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_pingpong10 || exit 1;
|
||||||
|
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 8 --cmd "$decode_cmd" \
|
||||||
|
--first_pass exp/tri2a/decode_eval92_bg5k_reverse10 exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_pongping10 || exit 1;
|
||||||
|
|
||||||
|
# The same for bigger language models (on a machine with 8GB RAM, you can run the whole decoding in 3-4 minutes without SGE)
|
||||||
|
utils/prepare_lang.sh --reverse true data/local/dict_larger "<SPOKEN_NOISE>" data/local/lang_larger.reverse data/lang_bd.reverse || exit;
|
||||||
|
utils/reverse_lm.sh --lexicon data/local/dict_larger/lexicon.txt data/local/local_lm/3gram-mincount/lm_pr6.0.gz data/lang_bd.reverse data/lang_test_bd_tgpr.reverse || exit 1;
|
||||||
|
utils/reverse_lm_test.sh data/lang_test_bd_tgpr data/lang_test_bd_tgpr.reverse || exit 1;
|
||||||
|
|
||||||
|
utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri2a exp/tri2a/graph_bd_tgpr
|
||||||
|
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 4 --cmd run.pl \
|
||||||
|
exp/tri2a/graph_bd_tgpr data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_10 || exit 1;
|
||||||
|
|
||||||
|
utils/mkgraph.sh --reverse data/lang_test_bd_tgpr.reverse exp/tri2a exp/tri2a/graph_bd_tgpr_r
|
||||||
|
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 4 --cmd run.pl \
|
||||||
|
exp/tri2a/graph_bd_tgpr_r data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_reverse10 || exit 1;
|
||||||
|
|
||||||
|
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 4 --cmd run.pl \
|
||||||
|
--first_pass exp/tri2a/decode_eval92_bdtgpr4_10 exp/tri2a/graph_bd_tgpr_r data/test_eval92 \
|
||||||
|
exp/tri2a/decode_eval92_bdtgpr4_pingpong10 || exit 1;
|
||||||
|
|
||||||
|
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 4 --cmd run.pl \
|
||||||
|
--first_pass exp/tri2a/decode_eval92_bdtgpr4_reverse10 exp/tri2a/graph_bd_tgpr data/test_eval92 \
|
||||||
|
exp/tri2a/decode_eval92_bdtgpr4_pongping10 || exit 1;
|
|
@ -0,0 +1,60 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
. ./cmd.sh
|
||||||
|
|
||||||
|
# Train and test MMI (and boosted MMI) on tri2b system.
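# (MMI training below needs two inputs besides the data and lang directories: the numerator
#  alignments, here exp/tri2b_ali_si84, and the denominator lattices produced by
#  steps/make_denlats.sh, here exp/tri2b_denlats_si84.)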
|
||||||
|
steps/make_denlats.sh --sub-split 20 --nj 10 --cmd "$train_cmd" \
|
||||||
|
data/train_si84 data/lang exp/tri2b exp/tri2b_denlats_si84 || exit 1;
|
||||||
|
|
||||||
|
# train the basic MMI system.
|
||||||
|
steps/train_mmi.sh --cmd "$train_cmd" \
|
||||||
|
data/train_si84 data/lang exp/tri2b_ali_si84 \
|
||||||
|
exp/tri2b_denlats_si84 exp/tri2b_mmi || exit 1;
|
||||||
|
for iter in 3 4; do
|
||||||
|
steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_mmi/decode_tgpr_dev93_it$iter &
|
||||||
|
steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi/decode_tgpr_eval92_it$iter &
|
||||||
|
done
|
||||||
|
|
||||||
|
# MMI with 0.1 boosting factor.
|
||||||
|
steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
|
||||||
|
data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 \
|
||||||
|
exp/tri2b_mmi_b0.1 || exit 1;
|
||||||
|
|
||||||
|
for iter in 3 4; do
|
||||||
|
steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_mmi_b0.1/decode_tgpr_dev93_it$iter &
|
||||||
|
steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi_b0.1/decode_tgpr_eval92_it$iter &
|
||||||
|
done
|
||||||
|
|
||||||
|
|
||||||
|
# Train a UBM with 400 components, for fMMI.
|
||||||
|
steps/train_diag_ubm.sh --silence-weight 0.5 --nj 10 --cmd "$train_cmd" \
|
||||||
|
400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b
|
||||||
|
|
||||||
|
steps/train_mmi_fmmi.sh --boost 0.1 --cmd "$train_cmd" \
|
||||||
|
data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \
|
||||||
|
exp/tri2b_fmmi_b0.1
|
||||||
|
|
||||||
|
for iter in `seq 3 8`; do
|
||||||
|
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it$iter &
|
||||||
|
done
|
||||||
|
|
||||||
|
steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \
|
||||||
|
data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \
|
||||||
|
exp/tri2b_fmmi_b0.1_lr0.005 || exit 1;
|
||||||
|
for iter in `seq 3 8`; do
|
||||||
|
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it$iter &
|
||||||
|
done
|
||||||
|
|
||||||
|
steps/train_mmi_fmmi_indirect.sh --boost 0.1 --cmd "$train_cmd" \
|
||||||
|
data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \
|
||||||
|
exp/tri2b_fmmi_indirect_b0.1
|
||||||
|
for iter in `seq 3 8`; do
|
||||||
|
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it$iter &
|
||||||
|
done
|
|
@ -0,0 +1,50 @@
|
||||||
|
#!/bin/bash
|
||||||
|
. ./cmd.sh
|
||||||
|
|
||||||
|
steps/make_denlats.sh --nj 30 --sub-split 30 --cmd "$train_cmd" \
|
||||||
|
--transform-dir exp/tri4b_ali_si284 \
|
||||||
|
data/train_si284 data/lang exp/tri4b exp/tri4b_denlats_si284 || exit 1;
|
||||||
|
|
||||||
|
steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
|
||||||
|
data/train_si284 data/lang exp/tri4b_ali_si284 exp/tri4b_denlats_si284 \
|
||||||
|
exp/tri4b_mmi_b0.1 || exit 1;
|
||||||
|
|
||||||
|
steps/decode.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri3b/decode_tgpr_dev93 \
|
||||||
|
exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93
|
||||||
|
|
||||||
|
#first, train UBM for fMMI experiments.
|
||||||
|
steps/train_diag_ubm.sh --silence-weight 0.5 --nj 30 --cmd "$train_cmd" \
|
||||||
|
600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b
|
||||||
|
|
||||||
|
# Next, fMMI+MMI.
|
||||||
|
steps/train_mmi_fmmi.sh \
|
||||||
|
--boost 0.1 --cmd "$train_cmd" data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b exp/tri4b_denlats_si284 \
|
||||||
|
exp/tri4b_fmmi_a || exit 1;
|
||||||
|
|
||||||
|
for iter in 3 4 5 6 7 8; do
|
||||||
|
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri3b/decode_tgpr_dev93 exp/tri4b/graph_tgpr data/test_dev93 \
|
||||||
|
exp/tri4b_fmmi_a/decode_tgpr_dev93_it$iter &
|
||||||
|
done
|
||||||
|
# decode the last iter with the bd model.
|
||||||
|
for iter in 8; do
|
||||||
|
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri3b/decode_bd_tgpr_dev93 exp/tri4b/graph_bd_tgpr data/test_dev93 \
|
||||||
|
exp/tri4b_fmmi_a/decode_bd_tgpr_dev93_it$iter &
|
||||||
|
steps/decode_fmmi.sh --nj 8 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri3b/decode_bd_tgpr_eval92 exp/tri4b/graph_bd_tgpr data/test_eval92 \
|
||||||
|
exp/tri4b_fmmi_a/decode_tgpr_eval92_it$iter &
|
||||||
|
done
|
||||||
|
|
||||||
|
|
||||||
|
# fMMI + mmi with indirect differential.
|
||||||
|
steps/train_mmi_fmmi_indirect.sh \
|
||||||
|
--boost 0.1 --cmd "$train_cmd" data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b exp/tri4b_denlats_si284 \
|
||||||
|
exp/tri4b_fmmi_indirect || exit 1;
|
||||||
|
|
||||||
|
for iter in 3 4 5 6 7 8; do
|
||||||
|
steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri3b/decode_tgpr_dev93 exp/tri4b/graph_tgpr data/test_dev93 \
|
||||||
|
exp/tri4b_fmmi_indirect/decode_tgpr_dev93_it$iter &
|
||||||
|
done
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
#!/bin/bash

. ./cmd.sh


# ...

local/nnet2/run_5c.sh

@ -0,0 +1,66 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
|
||||||
|
steps/align_raw_fmllr.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
|
||||||
|
data/train_si84 data/lang exp/tri2b exp/tri2b_ali_si84_raw
|
||||||
|
|
||||||
|
steps/train_raw_sat.sh --cmd "$train_cmd" \
|
||||||
|
2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84_raw exp/tri3c || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
mfccdir=mfcc
|
||||||
|
for x in test_eval92 test_eval93 test_dev93 ; do
|
||||||
|
y=${x}_utt
|
||||||
|
cp -rT data/$x data/$y
|
||||||
|
cat data/$x/utt2spk | awk '{print $1, $1;}' > data/$y/utt2spk;
|
||||||
|
cp data/$y/utt2spk data/$y/spk2utt;
|
||||||
|
steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1;
|
||||||
|
done
|
||||||
|
|
||||||
|
(
|
||||||
|
utils/mkgraph.sh data/lang_test_tgpr exp/tri3c exp/tri3c/graph_tgpr || exit 1;
|
||||||
|
steps/decode_raw_fmllr.sh --nj 10 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3c/graph_tgpr data/test_dev93 exp/tri3c/decode_tgpr_dev93 || exit 1;
|
||||||
|
steps/decode_raw_fmllr.sh --nj 8 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3c/graph_tgpr data/test_eval92 exp/tri3c/decode_tgpr_eval92 || exit 1;
|
||||||
|
|
||||||
|
steps/decode_raw_fmllr.sh --nj 30 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3c/graph_tgpr data/test_dev93_utt exp/tri3c/decode_tgpr_dev93_utt || exit 1;
|
||||||
|
steps/decode_raw_fmllr.sh --nj 30 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3c/graph_tgpr data/test_eval92_utt exp/tri3c/decode_tgpr_eval92_utt || exit 1;
|
||||||
|
|
||||||
|
steps/decode_raw_fmllr.sh --use-normal-fmllr true --nj 10 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3c/graph_tgpr data/test_dev93 exp/tri3c/decode_tgpr_dev93_2fmllr || exit 1;
|
||||||
|
steps/decode_raw_fmllr.sh --use-normal-fmllr true --nj 8 --cmd "$decode_cmd" \
|
||||||
|
exp/tri3c/graph_tgpr data/test_eval92 exp/tri3c/decode_tgpr_eval92_2fmllr || exit 1;
|
||||||
|
)&
|
||||||
|
|
||||||
|
(
|
||||||
|
utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri3c exp/tri3c/graph_bd_tgpr || exit 1;
|
||||||
|
|
||||||
|
steps/decode_raw_fmllr.sh --cmd "$decode_cmd" --nj 8 exp/tri3c/graph_bd_tgpr \
|
||||||
|
data/test_eval92 exp/tri3c/decode_bd_tgpr_eval92
|
||||||
|
steps/decode_raw_fmllr.sh --cmd "$decode_cmd" --nj 10 exp/tri3c/graph_bd_tgpr \
|
||||||
|
data/test_dev93 exp/tri3c/decode_bd_tgpr_dev93
|
||||||
|
)&
|
||||||
|
|
||||||
|
steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
|
||||||
|
data/train_si284 data/lang exp/tri3c exp/tri3c_ali_si284 || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
steps/train_raw_sat.sh --cmd "$train_cmd" \
|
||||||
|
4200 40000 data/train_si284 data/lang exp/tri3c_ali_si284 exp/tri4d || exit 1;
|
||||||
|
(
|
||||||
|
utils/mkgraph.sh data/lang_test_tgpr exp/tri4d exp/tri4d/graph_tgpr || exit 1;
|
||||||
|
steps/decode_raw_fmllr.sh --nj 10 --cmd "$decode_cmd" \
|
||||||
|
exp/tri4d/graph_tgpr data/test_dev93 exp/tri4d/decode_tgpr_dev93 || exit 1;
|
||||||
|
steps/decode_raw_fmllr.sh --nj 8 --cmd "$decode_cmd" \
|
||||||
|
exp/tri4d/graph_tgpr data/test_eval92 exp/tri4d/decode_tgpr_eval92 || exit 1;
|
||||||
|
) &
|
||||||
|
|
||||||
|
|
||||||
|
wait
|
||||||
|
|
||||||
|
|
||||||
|
#for x in exp/tri3{b,c}/decode_tgpr*; do grep WER $x/wer_* | utils/best_wer.sh ; done
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
for test in dev93 eval92; do
|
||||||
|
|
||||||
|
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_fg \
|
||||||
|
data/test_${test} exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
# Note: for N-best-list generation, we choose the acoustic scale (12) that gave
|
||||||
|
# the best WER on this test set. Ideally we should do this on a dev set.
|
||||||
|
|
||||||
|
# This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM.
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
|
||||||
|
0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_${test} \
|
||||||
|
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm30_0.25 \
|
||||||
|
|| exit 1;
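# (Reading off the invocation above rather than any documented interface: the positional
#  arguments to steps/rnnlmrescore.sh appear to be the RNNLM interpolation weight, the
#  n-gram lang directory, the RNNLM directory, the test data directory, the input decode
#  directory and the output decode directory.)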
|
||||||
|
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
|
||||||
|
0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_${test} \
|
||||||
|
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm100_0.5 \
|
||||||
|
|| exit 1;
|
||||||
|
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
|
||||||
|
0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_${test} \
|
||||||
|
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm200_0.5 \
|
||||||
|
|| exit 1;
|
||||||
|
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
|
||||||
|
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \
|
||||||
|
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.5 \
|
||||||
|
|| exit 1;
|
||||||
|
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
|
||||||
|
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \
|
||||||
|
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.75 \
|
||||||
|
|| exit 1;
|
||||||
|
done
|
|
@ -0,0 +1,64 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
. cmd.sh
|
||||||
|
|
||||||
|
# This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM.
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
|
||||||
|
0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_eval92 \
|
||||||
|
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm30_0.25 \
|
||||||
|
|| exit 1;
|
||||||
|
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
|
||||||
|
0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_eval92 \
|
||||||
|
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm100_0.5 \
|
||||||
|
|| exit 1;
|
||||||
|
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
|
||||||
|
0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_eval92 \
|
||||||
|
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm200_0.5 \
|
||||||
|
|| exit 1;
|
||||||
|
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
|
||||||
|
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
|
||||||
|
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 \
|
||||||
|
|| exit 1;
|
||||||
|
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 1000 --cmd "$decode_cmd" --inv-acwt 17 \
|
||||||
|
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
|
||||||
|
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000
|
||||||
|
|
||||||
|
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75_N1000
|
||||||
|
rm -rf $dir
|
||||||
|
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 $dir
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--stage 7 --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \
|
||||||
|
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
|
||||||
|
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
|
||||||
|
|
||||||
|
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75
|
||||||
|
rm -rf $dir
|
||||||
|
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \
|
||||||
|
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
|
||||||
|
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
|
||||||
|
|
||||||
|
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.25
|
||||||
|
rm -rf $dir
|
||||||
|
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \
|
||||||
|
0.25 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
|
||||||
|
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
|
||||||
|
|
||||||
|
steps/rnnlmrescore.sh \
|
||||||
|
--N 10 --cmd "$decode_cmd" --inv-acwt 17 \
|
||||||
|
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
|
||||||
|
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N10 \
|
||||||
|
|| exit 1;
|
||||||
|
|
|
@ -0,0 +1,113 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script is invoked from ../run.sh
|
||||||
|
# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity.
|
||||||
|
|
||||||
|
. cmd.sh
|
||||||
|
|
||||||
|
# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
|
||||||
|
# training, but this shouldn't have much effect.
|
||||||
|
|
||||||
|
(
|
||||||
|
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
||||||
|
data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1;
|
||||||
|
|
||||||
|
steps/train_ubm.sh --cmd "$train_cmd" \
|
||||||
|
400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1;
|
||||||
|
|
||||||
|
steps/train_sgmm.sh --cmd "$train_cmd" \
|
||||||
|
3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \
|
||||||
|
exp/ubm5b/final.ubm exp/sgmm5a || exit 1;
|
||||||
|
|
||||||
|
(
|
||||||
|
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr
|
||||||
|
steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
|
||||||
|
exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93
|
||||||
|
) &
|
||||||
|
|
||||||
|
steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \
|
||||||
|
--use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1;
|
||||||
|
steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \
|
||||||
|
data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84
|
||||||
|
|
||||||
|
steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
|
||||||
|
data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1
|
||||||
|
|
||||||
|
for iter in 1 2 3 4; do
|
||||||
|
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
|
||||||
|
exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter &
|
||||||
|
done
|
||||||
|
|
||||||
|
steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
|
||||||
|
--update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9
|
||||||
|
|
||||||
|
for iter in 1 2 3 4; do
|
||||||
|
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
|
||||||
|
exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter &
|
||||||
|
done
|
||||||
|
|
||||||
|
) &
|
||||||
|
|
||||||
|
|
||||||
|
(
|
||||||
|
# The next commands are the same thing on all the si284 data.
|
||||||
|
|
||||||
|
# SGMM system on the si284 data [sgmm5b]
|
||||||
|
steps/train_ubm.sh --cmd "$train_cmd" \
|
||||||
|
600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1;
|
||||||
|
|
||||||
|
steps/train_sgmm.sh --cmd "$train_cmd" \
|
||||||
|
5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
|
||||||
|
exp/ubm5b/final.ubm exp/sgmm5b || exit 1;
|
||||||
|
|
||||||
|
(
|
||||||
|
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr
|
||||||
|
steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
|
||||||
|
exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93
|
||||||
|
steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
|
||||||
|
exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92
|
||||||
|
|
||||||
|
utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1;
|
||||||
|
steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
|
||||||
|
exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93
|
||||||
|
steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
|
||||||
|
exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92
|
||||||
|
) &
|
||||||
|
|
||||||
|
steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \
|
||||||
|
--use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284
|
||||||
|
|
||||||
|
steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
|
||||||
|
data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284
|
||||||
|
|
||||||
|
steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
|
||||||
|
data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1
|
||||||
|
|
||||||
|
for iter in 1 2 3 4; do
|
||||||
|
for test in dev93 eval92; do
|
||||||
|
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \
|
||||||
|
exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter &
|
||||||
|
|
||||||
|
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \
|
||||||
|
exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter &
|
||||||
|
done
|
||||||
|
done
|
||||||
|
) &
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Train quinphone SGMM system.
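# (--context-width=5 --central-position=2 means two phones of context on each side of the
#  central phone, i.e. quinphones instead of the default triphone context.)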
|
||||||
|
|
||||||
|
steps/train_sgmm.sh --cmd "$train_cmd" \
|
||||||
|
--context-opts "--context-width=5 --central-position=2" \
|
||||||
|
5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
|
||||||
|
exp/ubm5b/final.ubm exp/sgmm5c || exit 1;
|
||||||
|
|
||||||
|
# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93.
|
||||||
|
steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
|
||||||
|
data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93
|
||||||
|
|
|
@ -0,0 +1,148 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script is invoked from ../run.sh
|
||||||
|
# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity.
|
||||||
|
|
||||||
|
. cmd.sh
|
||||||
|
|
||||||
|
# Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh;
|
||||||
|
# this takes out the "symmetric SGMM" part which is not always helpful.
|
||||||
|
|
||||||
|
# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
|
||||||
|
# training, but this shouldn't have much effect.
|
||||||
|
|
||||||
|
(
|
||||||
|
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
||||||
|
data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1;
|
||||||
|
|
||||||
|
steps/train_ubm.sh --cmd "$train_cmd" \
|
||||||
|
400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1;
|
||||||
|
|
||||||
|
steps/train_sgmm2.sh --cmd "$train_cmd" \
|
||||||
|
7000 9000 data/train_si84 data/lang exp/tri4b_ali_si84 \
|
||||||
|
exp/ubm5a/final.ubm exp/sgmm2_5a || exit 1;
|
||||||
|
|
||||||
|
(
|
||||||
|
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm2_5a exp/sgmm2_5a/graph_tgpr
|
||||||
|
steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
|
||||||
|
exp/sgmm2_5a/graph_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93
|
||||||
|
) &
|
||||||
|
|
||||||
|
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \
|
||||||
|
--use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm2_5a exp/sgmm2_5a_ali_si84 || exit 1;
|
||||||
|
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \
|
||||||
|
data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84
|
||||||
|
|
||||||
|
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
|
||||||
|
data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 exp/sgmm2_5a_mmi_b0.1
|
||||||
|
|
||||||
|
for iter in 1 2 3 4; do
|
||||||
|
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 \
|
||||||
|
exp/sgmm2_5a_mmi_b0.1/decode_tgpr_dev93_it$iter &
|
||||||
|
done
|
||||||
|
|
||||||
|
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
|
||||||
|
--update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 exp/sgmm2_5a_mmi_b0.1_m0.9
|
||||||
|
|
||||||
|
for iter in 1 2 3 4; do
|
||||||
|
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 \
|
||||||
|
exp/sgmm2_5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter &
|
||||||
|
done
|
||||||
|
|
||||||
|
) &
|
||||||
|
|
||||||
|
|
||||||
|
(
|
||||||
|
# The next commands are the same thing on all the si284 data.
|
||||||
|
|
||||||
|
# SGMM system on the si284 data [sgmm5b]
|
||||||
|
steps/train_ubm.sh --cmd "$train_cmd" \
|
||||||
|
600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1;
|
||||||
|
|
||||||
|
steps/train_sgmm2.sh --cmd "$train_cmd" \
|
||||||
|
11000 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
|
||||||
|
exp/ubm5b/final.ubm exp/sgmm2_5b || exit 1;
|
||||||
|
|
||||||
|
(
|
||||||
|
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm2_5b exp/sgmm2_5b/graph_tgpr
|
||||||
|
steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
|
||||||
|
exp/sgmm2_5b/graph_tgpr data/test_dev93 exp/sgmm2_5b/decode_tgpr_dev93
|
||||||
|
steps/decode_sgmm2.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
|
||||||
|
exp/sgmm2_5b/graph_tgpr data/test_eval92 exp/sgmm2_5b/decode_tgpr_eval92
|
||||||
|
|
||||||
|
utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm2_5b exp/sgmm2_5b/graph_bd_tgpr || exit 1;
|
||||||
|
steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
|
||||||
|
exp/sgmm2_5b/graph_bd_tgpr data/test_dev93 exp/sgmm2_5b/decode_bd_tgpr_dev93
|
||||||
|
steps/decode_sgmm2.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
|
||||||
|
exp/sgmm2_5b/graph_bd_tgpr data/test_eval92 exp/sgmm2_5b/decode_bd_tgpr_eval92
|
||||||
|
) &
|
||||||
|
|
||||||
|
|
||||||
|
# This shows how you would build and test a quinphone SGMM2 system.
|
||||||
|
(
|
||||||
|
steps/train_sgmm2.sh --cmd "$train_cmd" \
|
||||||
|
--context-opts "--context-width=5 --central-position=2" \
|
||||||
|
11000 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
|
||||||
|
exp/ubm5b/final.ubm exp/sgmm2_5c || exit 1;
|
||||||
|
# Decode from lattices in exp/sgmm2_5b
|
||||||
|
steps/decode_sgmm2_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
|
||||||
|
data/test_dev93 data/lang_test_tgpr exp/sgmm2_5b/decode_tgpr_dev93 exp/sgmm2_5c/decode_tgpr_dev93
|
||||||
|
steps/decode_sgmm2_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
|
||||||
|
data/test_eval92 data/lang_test_tgpr exp/sgmm2_5b/decode_tgpr_eval92 exp/sgmm2_5c/decode_tgpr_eval92
|
||||||
|
) &
|
||||||
|
|
||||||
|
|
||||||
|
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \
|
||||||
|
--use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm2_5b exp/sgmm2_5b_ali_si284
|
||||||
|
|
||||||
|
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
|
||||||
|
data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284
|
||||||
|
|
||||||
|
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
|
||||||
|
data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 exp/sgmm2_5b_mmi_b0.1
|
||||||
|
|
||||||
|
for iter in 1 2 3 4; do
|
||||||
|
for test in eval92; do # dev93
|
||||||
|
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_fg data/test_${test} exp/sgmm2_5b/decode_bd_tgpr_${test} \
|
||||||
|
exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter &
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
|
||||||
|
--zero-if-disjoint true data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 exp/sgmm2_5b_mmi_b0.1_z
|
||||||
|
|
||||||
|
for iter in 1 2 3 4; do
|
||||||
|
for test in eval92 dev93; do
|
||||||
|
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
|
||||||
|
--transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_fg data/test_${test} exp/sgmm2_5b/decode_bd_tgpr_${test} \
|
||||||
|
exp/sgmm2_5b_mmi_b0.1_z/decode_bd_tgpr_${test}_it$iter &
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
) &
|
||||||
|
|
||||||
|
wait
|
||||||
|
|
||||||
|
# Examples of combining some of the best decodings: SGMM+MMI with
|
||||||
|
# MMI+fMMI on a conventional system.
|
||||||
|
|
||||||
|
local/score_combine.sh data/test_eval92 \
|
||||||
|
data/lang_test_bd_tgpr \
|
||||||
|
exp/tri4b_fmmi_a/decode_tgpr_eval92_it8 \
|
||||||
|
exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3 \
|
||||||
|
exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3
|
||||||
|
|
||||||
|
|
||||||
|
# %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11
|
||||||
|
# %WER 3.85 [ 217 / 5643, 35 ins, 11 del, 171 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3/wer_10
|
||||||
|
# combined to:
|
||||||
|
# %WER 3.76 [ 212 / 5643, 32 ins, 12 del, 168 sub ] exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3/wer_12
|
||||||
|
|
||||||
|
# Checking MBR decode of baseline:
|
||||||
|
cp -r -T exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3{,.mbr}
|
||||||
|
local/score_mbr.sh data/test_eval92 data/lang_test_bd_tgpr exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr
|
||||||
|
# MBR decoding did not seem to help (baseline was 3.85). I think this is normal at such low WERs.
|
||||||
|
# %WER 3.86 [ 218 / 5643, 35 ins, 11 del, 172 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr/wer_10
|
|
@ -0,0 +1,67 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
|
||||||
|
# Apache 2.0
|
||||||
|
|
||||||
|
[ -f ./path.sh ] && . ./path.sh
|
||||||
|
|
||||||
|
# begin configuration section.
|
||||||
|
cmd=run.pl
|
||||||
|
stage=0
|
||||||
|
decode_mbr=true
|
||||||
|
reverse=false
|
||||||
|
word_ins_penalty=0.0
|
||||||
|
min_lmwt=9
|
||||||
|
max_lmwt=20
|
||||||
|
#end configuration section.
|
||||||
|
|
||||||
|
[ -f ./path.sh ] && . ./path.sh
|
||||||
|
. parse_options.sh || exit 1;
|
||||||
|
|
||||||
|
if [ $# -ne 3 ]; then
|
||||||
|
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
|
||||||
|
echo " Options:"
|
||||||
|
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
|
||||||
|
echo " --stage (0|1|2) # start scoring script from part-way through."
|
||||||
|
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
|
||||||
|
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
|
||||||
|
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
|
||||||
|
echo " --reverse (true/false) # score with time reversed features "
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
data=$1
|
||||||
|
lang_or_graph=$2
|
||||||
|
dir=$3
|
||||||
|
|
||||||
|
symtab=$lang_or_graph/words.txt
|
||||||
|
|
||||||
|
for f in $symtab $dir/lat.1.gz $data/text; do
|
||||||
|
[ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
|
||||||
|
done
|
||||||
|
|
||||||
|
mkdir -p $dir/scoring/log
|
||||||
|
|
||||||
|
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
|
||||||
|
|
||||||
|
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
|
||||||
|
lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
|
||||||
|
lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \
|
||||||
|
lattice-best-path --word-symbol-table=$symtab \
|
||||||
|
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
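# (This writes one transcript per LM weight to scoring/LMWT.tra; after the scoring step
#  below, the best weight can be picked with e.g. "grep WER $dir/wer_* | utils/best_wer.sh",
#  as done elsewhere in this recipe.)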
|
||||||
|
|
||||||
|
if $reverse; then
|
||||||
|
for lmwt in `seq $min_lmwt $max_lmwt`; do
|
||||||
|
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
|
||||||
|
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
|
||||||
|
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Note: the double level of quoting for the sed command
|
||||||
|
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
|
||||||
|
cat $dir/scoring/LMWT.tra \| \
|
||||||
|
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
|
||||||
|
compute-wer --text --mode=present \
|
||||||
|
ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1;
|
||||||
|
|
||||||
|
exit 0;
|
|
@ -0,0 +1,95 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2013 Arnab Ghoshal
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
# Script for system combination using minimum Bayes risk decoding.
|
||||||
|
# This calls lattice-combine to create a union of lattices that have been
|
||||||
|
# normalized by removing the total forward cost from them. The resulting lattice
|
||||||
|
# is used as input to lattice-mbr-decode. This should not be put in steps/ or
|
||||||
|
# utils/ since the scores on the combined lattice must not be scaled.
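# A usage sketch, matching how this script is called from local/run_sgmm2.sh in this recipe:
#   local/score_combine.sh data/test_eval92 data/lang_test_bd_tgpr \
#     exp/tri4b_fmmi_a/decode_tgpr_eval92_it8 \
#     exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3 \
#     exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3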
|
||||||
|
|
||||||
|
# begin configuration section.
|
||||||
|
cmd=run.pl
|
||||||
|
min_lmwt=9
|
||||||
|
max_lmwt=20
|
||||||
|
lat_weights=
|
||||||
|
#end configuration section.
|
||||||
|
|
||||||
|
help_message="Usage: "$(basename $0)" [options] <data-dir> <graph-dir|lang-dir> <decode-dir1> <decode-dir2> [decode-dir3 ... ] <out-dir>
|
||||||
|
Options:
|
||||||
|
--cmd (run.pl|queue.pl...) # specify how to run the sub-processes.
|
||||||
|
--min-lmwt INT # minimum LM-weight for lattice rescoring
|
||||||
|
--max-lmwt INT # maximum LM-weight for lattice rescoring
|
||||||
|
--lat-weights STR # colon-separated string of lattice weights
|
||||||
|
";
|
||||||
|
|
||||||
|
[ -f ./path.sh ] && . ./path.sh
|
||||||
|
. parse_options.sh || exit 1;
|
||||||
|
|
||||||
|
if [ $# -lt 5 ]; then
|
||||||
|
printf "$help_message\n";
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
data=$1
|
||||||
|
graphdir=$2
|
||||||
|
odir=${@: -1} # last argument to the script
|
||||||
|
shift 2;
|
||||||
|
decode_dirs=( $@ ) # read the remaining arguments into an array
|
||||||
|
unset decode_dirs[${#decode_dirs[@]}-1] # 'pop' the last argument which is odir
|
||||||
|
num_sys=${#decode_dirs[@]} # number of systems to combine
|
||||||
|
|
||||||
|
symtab=$graphdir/words.txt
|
||||||
|
[ ! -f $symtab ] && echo "$0: missing word symbol table '$symtab'" && exit 1;
|
||||||
|
[ ! -f $data/text ] && echo "$0: missing reference '$data/text'" && exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
mkdir -p $odir/log
|
||||||
|
|
||||||
|
for i in `seq 0 $[num_sys-1]`; do
|
||||||
|
model=${decode_dirs[$i]}/../final.mdl # model one level up from decode dir
|
||||||
|
for f in $model ${decode_dirs[$i]}/lat.1.gz ; do
|
||||||
|
[ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
|
||||||
|
done
|
||||||
|
lats[$i]="\"ark:gunzip -c ${decode_dirs[$i]}/lat.*.gz |\""
|
||||||
|
done
|
||||||
|
|
||||||
|
mkdir -p $odir/scoring/log
|
||||||
|
|
||||||
|
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' \
|
||||||
|
> $odir/scoring/test_filt.txt
|
||||||
|
|
||||||
|
if [ -z "$lat_weights" ]; then
|
||||||
|
$cmd LMWT=$min_lmwt:$max_lmwt $odir/log/combine_lats.LMWT.log \
|
||||||
|
lattice-combine --inv-acoustic-scale=LMWT ${lats[@]} ark:- \| \
|
||||||
|
lattice-mbr-decode --word-symbol-table=$symtab ark:- \
|
||||||
|
ark,t:$odir/scoring/LMWT.tra || exit 1;
|
||||||
|
else
|
||||||
|
$cmd LMWT=$min_lmwt:$max_lmwt $odir/log/combine_lats.LMWT.log \
|
||||||
|
lattice-combine --inv-acoustic-scale=LMWT --lat-weights=$lat_weights \
|
||||||
|
${lats[@]} ark:- \| \
|
||||||
|
lattice-mbr-decode --word-symbol-table=$symtab ark:- \
|
||||||
|
ark,t:$odir/scoring/LMWT.tra || exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
$cmd LMWT=$min_lmwt:$max_lmwt $odir/scoring/log/score.LMWT.log \
|
||||||
|
cat $odir/scoring/LMWT.tra \| \
|
||||||
|
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
|
||||||
|
compute-wer --text --mode=present \
|
||||||
|
ark:$odir/scoring/test_filt.txt ark,p:- ">&" $odir/wer_LMWT || exit 1;
|
||||||
|
|
||||||
|
exit 0
|
|
@ -0,0 +1,58 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Script for minimum Bayes risk decoding.
|
||||||
|
|
||||||
|
[ -f ./path.sh ] && . ./path.sh;
|
||||||
|
|
||||||
|
# begin configuration section.
|
||||||
|
cmd=run.pl
|
||||||
|
min_lmwt=9
|
||||||
|
max_lmwt=20
|
||||||
|
#end configuration section.
|
||||||
|
|
||||||
|
[ -f ./path.sh ] && . ./path.sh
|
||||||
|
. parse_options.sh || exit 1;
|
||||||
|
|
||||||
|
if [ $# -ne 3 ]; then
|
||||||
|
echo "Usage: local/score_mbr.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
|
||||||
|
echo " Options:"
|
||||||
|
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
|
||||||
|
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
|
||||||
|
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
data=$1
|
||||||
|
lang_or_graph=$2
|
||||||
|
dir=$3
|
||||||
|
|
||||||
|
symtab=$lang_or_graph/words.txt
|
||||||
|
|
||||||
|
for f in $symtab $dir/lat.1.gz $data/text; do
|
||||||
|
[ ! -f $f ] && echo "score_mbr.sh: no such file $f" && exit 1;
|
||||||
|
done
|
||||||
|
|
||||||
|
mkdir -p $dir/scoring/log
|
||||||
|
|
||||||
|
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
|
||||||
|
|
||||||
|
# We submit the jobs separately, not as an array, because it's hard
|
||||||
|
# to get the inverse of the LM scales.
|
||||||
|
rm $dir/.error 2>/dev/null
|
||||||
|
for inv_acwt in `seq $min_lmwt $max_lmwt`; do
|
||||||
|
acwt=`perl -e "print (1.0/$inv_acwt);"`
|
||||||
|
$cmd $dir/scoring/rescore_mbr.${inv_acwt}.log \
|
||||||
|
lattice-mbr-decode --acoustic-scale=$acwt --word-symbol-table=$symtab \
|
||||||
|
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/${inv_acwt}.tra \
|
||||||
|
|| touch $dir/.error &
|
||||||
|
done
|
||||||
|
wait;
|
||||||
|
[ -f $dir/.error ] && echo "score_mbr.sh: error getting MBR output.";
|
||||||
|
|
||||||
|
|
||||||
|
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
|
||||||
|
cat $dir/scoring/LMWT.tra \| \
|
||||||
|
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
|
||||||
|
compute-wer --text --mode=present \
|
||||||
|
ark:$dir/scoring/test_filt.txt ark,p:- ">" $dir/wer_LMWT || exit 1;
|
||||||
|
|
|
@ -0,0 +1,201 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
|
||||||
|
# Apache 2.0.
|
||||||
|
|
||||||
|
|
||||||
|
if [ $# -le 3 ]; then
|
||||||
|
echo "Arguments should be a list of WSJ directories, see ../run.sh for example."
|
||||||
|
exit 1;
|
||||||
|
fi
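# Example invocation (hypothetical mount points; adjust to wherever your WSJ disks live):
#   local/wsj_data_prep.sh /export/corpora/WSJ0/11-13.1 /export/corpora/WSJ0/11-2.1 \
#     /export/corpora/WSJ1/13-34.1 ...   # one absolute path per WSJ disk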
|
||||||
|
|
||||||
|
|
||||||
|
dir=`pwd`/data/local/data
|
||||||
|
lmdir=`pwd`/data/local/nist_lm
|
||||||
|
mkdir -p $dir $lmdir
|
||||||
|
local=`pwd`/local
|
||||||
|
utils=`pwd`/utils
|
||||||
|
|
||||||
|
. ./path.sh # Needed for KALDI_ROOT
|
||||||
|
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
|
||||||
|
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
|
||||||
|
if [ ! -x $sph2pipe ]; then
|
||||||
|
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd $dir
|
||||||
|
|
||||||
|
# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command
|
||||||
|
# line arguments being absolute pathnames.
|
||||||
|
rm -r links/ 2>/dev/null
|
||||||
|
mkdir links/
|
||||||
|
ln -s $* links
|
||||||
|
|
||||||
|
# Do some basic checks that we have what we expected.
|
||||||
|
if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then
|
||||||
|
echo "wsj_data_prep.sh: Spot check of command line arguments failed"
|
||||||
|
echo "Command line arguments must be absolute pathnames to WSJ directories"
|
||||||
|
echo "with names like 11-13.1."
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
# This version for SI-84
|
||||||
|
|
||||||
|
cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
|
||||||
|
$local/ndx2flist.pl $* | sort | \
|
||||||
|
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist
|
||||||
|
|
||||||
|
nl=`cat train_si84.flist | wc -l`
|
||||||
|
[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl"
|
||||||
|
|
||||||
|
# This version for SI-284
|
||||||
|
cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \
|
||||||
|
links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
|
||||||
|
$local/ndx2flist.pl $* | sort | \
|
||||||
|
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist
|
||||||
|
|
||||||
|
nl=`cat train_si284.flist | wc -l`
|
||||||
|
[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl"
|
||||||
|
|
||||||
|
# Now for the test sets.
|
||||||
|
# links/13-34.1/wsj1/doc/indices/readme.doc
|
||||||
|
# describes all the different test sets.
|
||||||
|
# Note: each test-set seems to come in multiple versions depending
|
||||||
|
# on different vocabulary sizes, verbalized vs. non-verbalized
|
||||||
|
# pronunciations, etc. We use the largest vocab and non-verbalized
|
||||||
|
# pronunciations.
|
||||||
|
# The most normal one seems to be the "baseline 60k test set", which
|
||||||
|
# is h1_p0.
|
||||||
|
|
||||||
|
# Nov'92 (333 utts)
|
||||||
|
# These index files have a slightly different format;
|
||||||
|
# have to add .wv1
|
||||||
|
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
|
||||||
|
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
|
||||||
|
sort > test_eval92.flist
|
||||||
|
|
||||||
|
# Nov'92 (330 utts, 5k vocab)
|
||||||
|
cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
|
||||||
|
$local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \
|
||||||
|
sort > test_eval92_5k.flist
|
||||||
|
|
||||||
|
# Nov'93: (213 utts)
|
||||||
|
# Have to replace a wrong disk-id.
|
||||||
|
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \
|
||||||
|
sed s/13_32_1/13_33_1/ | \
|
||||||
|
$local/ndx2flist.pl $* | sort > test_eval93.flist
|
||||||
|
|
||||||
|
# Nov'93: (213 utts, 5k)
|
||||||
|
cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \
|
||||||
|
sed s/13_32_1/13_33_1/ | \
|
||||||
|
$local/ndx2flist.pl $* | sort > test_eval93_5k.flist
|
||||||
|
|
||||||
|
# Dev-set for Nov'93 (503 utts)
|
||||||
|
cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
|
||||||
|
$local/ndx2flist.pl $* | sort > test_dev93.flist
|
||||||
|
|
||||||
|
# Dev-set for Nov'93 (513 utts, 5k vocab)
|
||||||
|
cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \
|
||||||
|
$local/ndx2flist.pl $* | sort > test_dev93_5k.flist
|
||||||
|
|
||||||
|
|
||||||
|
# Dev-set Hub 1,2 (503, 913 utterances)
|
||||||
|
|
||||||
|
# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
|
||||||
|
# Sometimes this gets copied from the CD's with upcasing, don't know
|
||||||
|
# why (could be older versions of the disks).
|
||||||
|
find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist
|
||||||
|
find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist
|
||||||
|
|
||||||
|
|
||||||
|
# Finding the transcript files:
|
||||||
|
for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist
|
||||||
|
|
||||||
|
# Convert the transcripts into our format (no normalization yet)
|
||||||
|
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
|
||||||
|
$local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
|
||||||
|
cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
|
||||||
|
done
|
||||||
|
|
||||||
|
# Do some basic normalization steps. At this point we don't remove OOVs--
|
||||||
|
# that will be done inside the training scripts, as we'd like to make the
|
||||||
|
# data-preparation stage independent of the specific lexicon used.
|
||||||
|
noiseword="<NOISE>";
|
||||||
|
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
|
||||||
|
cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
|
||||||
|
done
|
||||||
|
|
||||||
|
# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
|
||||||
|
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
|
||||||
|
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
|
||||||
|
done
|
||||||
|
|
||||||
|
# Make the utt2spk and spk2utt files.
|
||||||
|
for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
|
||||||
|
cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk
|
||||||
|
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
|
||||||
|
done
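# (Illustration, not part of the original script: WSJ utterance-ids begin with a 3-character
#  speaker code, so the perl one-liner above maps e.g. utterance 011c0201 to speaker 011.)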
|
||||||
|
|
||||||
|
|
||||||
|
# In case we want to limit LMs to the most frequent words, copy the LM training word frequency list.
|
||||||
|
cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
|
||||||
|
chmod u+w $lmdir/*.lst # had weird permissions on source.
|
||||||
|
|
||||||
|
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
|
||||||
|
# verbalized pronunciations. This is the most common test setup, I understand.
|
||||||
|
|
||||||
|
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1;
|
||||||
|
chmod u+w $lmdir/lm_bg.arpa.gz
|
||||||
|
|
||||||
|
# trigram would be:
|
||||||
|
cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \
|
||||||
|
perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \
|
||||||
|
gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1;
|
||||||
|
|
||||||
|
prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1;
|
||||||
|
gzip -f $lmdir/lm_tgpr.arpa || exit 1;
|
||||||
|
|
||||||
|
# repeat for 5k language models
|
||||||
|
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1;
|
||||||
|
chmod u+w $lmdir/lm_bg_5k.arpa.gz
|
||||||
|
|
||||||
|
# trigram would be: !only closed vocabulary here!
|
||||||
|
cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1;
|
||||||
|
chmod u+w $lmdir/lm_tg_5k.arpa.gz
|
||||||
|
gunzip $lmdir/lm_tg_5k.arpa.gz
|
||||||
|
tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz
|
||||||
|
rm $lmdir/lm_tg_5k.arpa
|
||||||
|
|
||||||
|
prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1;
|
||||||
|
gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
|
||||||
|
rm -f wsj0-train-spkrinfo.txt
|
||||||
|
! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \
|
||||||
|
echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
|
||||||
|
wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f wsj0-train-spkrinfo.txt ]; then
|
||||||
|
echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
|
||||||
|
echo "This is possibly omitted from the training disks; couldn't find it."
|
||||||
|
echo "Everything else may have worked; we just may be missing gender info"
|
||||||
|
echo "which is only needed for VTLN-related diagnostics anyway."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
|
||||||
|
# LDC put it on the web. Perhaps it was accidentally omitted from the
|
||||||
|
# disks.
|
||||||
|
|
||||||
|
cat links/11-13.1/wsj0/doc/spkrinfo.txt \
|
||||||
|
links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \
|
||||||
|
links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \
|
||||||
|
links/13-34.1/wsj1/doc/train/spkrinfo.txt \
|
||||||
|
./wsj0-train-spkrinfo.txt | \
|
||||||
|
perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
|
||||||
|
awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
|
||||||
|
|
||||||
|
|
||||||
|
echo "Data preparation succeeded"
|
|
@ -0,0 +1,173 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script builds a larger word-list and dictionary
|
||||||
|
# than used for the LMs supplied with the WSJ corpus.
|
||||||
|
# It uses a couple of strategies to fill-in words in
|
||||||
|
# the LM training data but not in CMUdict. One is
|
||||||
|
# to generate special prons for possible acronyms, that
|
||||||
|
# just consist of the constituent letters. The other
|
||||||
|
# is designed to handle derivatives of known words
|
||||||
|
# (e.g. deriving the pron of a plural from the pron of
|
||||||
|
# the base-word), but in a more general, learned-from-data
|
||||||
|
# way.
|
||||||
|
# It makes use of scripts in local/dict/
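# (Illustrative sketch only, with made-up entries: if a learned suffix rule says
#  "append phone S when the spelling gains an S", then an OOV like WORKSTATIONS can inherit
#  its pron from the known word WORKSTATION:
#    WORKSTATION  W ER K S T EY SH AH N   ->   WORKSTATIONS  W ER K S T EY SH AH N S
#  The get_rules.pl / get_candidate_prons.pl calls further down learn such rules from the data.)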
|
||||||
|
|
||||||
|
if [ $# -ne 1 ]; then
|
||||||
|
echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [ "`basename $1`" != 13-32.1 ]; then
|
||||||
|
echo "Expecting the argument to this script to end in 13-32.1"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# e.g.
|
||||||
|
#srcdir=/mnt/matylda2/data/WSJ1/13-32.1
|
||||||
|
export PATH=$PATH:`pwd`/local/dict/
|
||||||
|
srcdir=$1
|
||||||
|
mkdir -p data/local/dict_larger
|
||||||
|
dir=data/local/dict_larger
|
||||||
|
cp data/local/dict/* data/local/dict_larger # Various files describing phones etc.
|
||||||
|
# are there; we just want to copy them as the phoneset is the same.
|
||||||
|
rm data/local/dict_larger/lexicon.txt # we don't want this.
|
||||||
|
rm data/local/dict_larger/lexiconp.txt # we don't want this either.
|
||||||
|
mincount=2 # Minimum count of an OOV we will try to generate a pron for.
|
||||||
|
|
||||||
|
[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
|
||||||
|
|
||||||
|
# Remove comments from cmudict; print first field; remove
|
||||||
|
# words like FOO(1) which are alternate prons: our dict format won't
|
||||||
|
# include these markers.
|
||||||
|
grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a |
|
||||||
|
perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
|
||||||
|
|
||||||
|
cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
|
||||||
|
|
||||||
|
echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
|
||||||
|
|
||||||
|
# Convert to uppercase, remove XML-like markings.
|
||||||
|
# For words ending in "." that are not in CMUdict, we assume that these
|
||||||
|
# are periods that somehow remained in the data during data preparation,
|
||||||
|
# and we replace the "." with "\n". Note: we found this by looking at
|
||||||
|
# oov.counts below (before adding this rule).
|
||||||
|
|
||||||
|
touch $dir/cleaned.gz
|
||||||
|
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
|
||||||
|
echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
|
||||||
|
else
|
||||||
|
gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
|
||||||
|
| awk '/^</{next}{print toupper($0)}' | perl -e '
|
||||||
|
open(F, "<$ARGV[0]")||die;
|
||||||
|
while(<F>){ chop; $isword{$_} = 1; }
|
||||||
|
while(<STDIN>) {
|
||||||
|
@A = split(" ", $_);
|
||||||
|
for ($n = 0; $n < @A; $n++) {
|
||||||
|
$a = $A[$n];
|
||||||
|
if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
|
||||||
|
# and have no other "." in them: treat as period.
|
||||||
|
print "$a";
|
||||||
|
if ($n+1 < @A) { print "\n"; }
|
||||||
|
} else { print "$a "; }
|
||||||
|
}
|
||||||
|
print "\n";
|
||||||
|
}
|
||||||
|
' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
|
||||||
|
fi
|
||||||
|
|
||||||
|
# get unigram counts
|
||||||
|
echo "Getting unigram counts"
|
||||||
|
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
|
||||||
|
awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
|
||||||
|
|
||||||
|
cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
|
||||||
|
'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
|
||||||
|
> $dir/oov.counts
|
||||||
|
|
||||||
|
echo "Most frequent unseen unigrams are: "
|
||||||
|
head $dir/oov.counts
|
||||||
|
|
||||||
|
# Prune away singleton counts, and remove things with numbers in
|
||||||
|
# (which should have been normalized) and with no letters at all.
|
||||||
|
|
||||||
|
|
||||||
|
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
|
||||||
|
| awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
|
||||||
|
|
||||||
|
# Automatic rule-finding...
|
||||||
|
|
||||||
|
# First make some prons for possible acronyms.
|
||||||
|
# Note: we don't do this for things like U.K or U.N,
|
||||||
|
# or A.B. (which doesn't exist anyway),
|
||||||
|
# as we consider these to be normalization/spelling errors.
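# (Made-up illustration: an OOV acronym such as "ABN" would get a letter-by-letter pron
#  assembled from the letters' entries in dict.cmu, roughly "ABN  EY B IY EH N".)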
|
||||||
|
|
||||||
|
cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms
|
||||||
|
|
||||||
|
mkdir $dir/f $dir/b # forward, backward directions of rules...
|
||||||
|
# forward is normal suffix
|
||||||
|
# rules, backward is reversed (prefix rules). These
|
||||||
|
# dirs contain stuff we create while making the rule-based
|
||||||
|
# extensions to the dictionary.
|
||||||
|
|
||||||
|
# Remove ; and , from words, if they are present; these
|
||||||
|
# might crash our scripts, as they are used as separators there.
|
||||||
|
filter_dict.pl $dir/dict.cmu > $dir/f/dict
|
||||||
|
cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
|
||||||
|
reverse_dict.pl $dir/f/dict > $dir/b/dict
|
||||||
|
reverse_dict.pl $dir/f/oovs > $dir/b/oovs
|
||||||
|
|
||||||
|
# The next stage takes a few minutes.
|
||||||
|
# Note: the forward stage takes longer, as English is
|
||||||
|
# mostly a suffix-based language, and there are more rules
|
||||||
|
# that it finds.
|
||||||
|
for d in $dir/f $dir/b; do
|
||||||
|
(
|
||||||
|
cd $d
|
||||||
|
cat dict | get_rules.pl 2>get_rules.log >rules
|
||||||
|
get_rule_hierarchy.pl rules >hierarchy
|
||||||
|
awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
|
||||||
|
limit_candidate_prons.pl hierarchy | \
|
||||||
|
score_prons.pl dict | \
|
||||||
|
count_rules.pl >rule.counts
|
||||||
|
# the sort command below is just for convenience of reading.
|
||||||
|
score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
|
||||||
|
get_candidate_prons.pl rules.with_scores dict oovs | \
|
||||||
|
limit_candidate_prons.pl hierarchy > oovs.candidates
|
||||||
|
) &
|
||||||
|
done
|
||||||
|
wait
|
||||||
|
|
||||||
|
# Merge the candidates.
|
||||||
|
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
|
||||||
|
select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
|
||||||
|
> $dir/dict.oovs
|
||||||
|
|
||||||
|
cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
|
||||||
|
|
||||||
|
awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
|
||||||
|
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
|
||||||
|
|
||||||
|
|
||||||
|
# add_counts.pl attaches the original counts to the list of handled/not-handled OOVs
|
||||||
|
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
|
||||||
|
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
|
||||||
|
|
||||||
|
echo "**Top OOVs we handled are:**";
|
||||||
|
head $dir/oovlist.handled.counts
|
||||||
|
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**";
|
||||||
|
head $dir/oovlist.not_handled.counts
|
||||||
|
|
||||||
|
|
||||||
|
echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
|
||||||
|
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
|
||||||
|
echo "Count of OOVs we didn't handle due to low count is" \
|
||||||
|
`awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
|
||||||
|
# The two files created above are for humans to look at, as diagnostics.
|
||||||
|
|
||||||
|
cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
|
||||||
|
!SIL SIL
|
||||||
|
<SPOKEN_NOISE> SPN
|
||||||
|
<UNK> SPN
|
||||||
|
<NOISE> NSN
|
||||||
|
EOF
|
||||||
|
|
||||||
|
echo "Created $dir/lexicon.txt"
|
|
@ -0,0 +1,86 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
|
||||||
|
# Apache 2.0
|
||||||
|
|
||||||
|
# This script takes data prepared in a corpus-dependent way
|
||||||
|
# in data/local/, and converts it into the "canonical" form,
|
||||||
|
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
|
||||||
|
# data/train_si284, data/train_si84, etc.
|
||||||
|
|
||||||
|
# Don't bother doing train_si84 separately (although we have the file lists
|
||||||
|
# in data/local/) because it's just the first 7138 utterances in train_si284.
|
||||||
|
# We'll create train_si84 after doing the feature extraction.
|
||||||
|
|
||||||
|
. ./path.sh || exit 1;
|
||||||
|
|
||||||
|
echo "Preparing train and test data"
|
||||||
|
srcdir=data/local/data
|
||||||
|
lmdir=data/local/nist_lm
|
||||||
|
tmpdir=data/local/lm_tmp
|
||||||
|
lexicon=data/local/lang_tmp/lexiconp.txt
|
||||||
|
mkdir -p $tmpdir
|
||||||
|
|
||||||
|
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
|
||||||
|
mkdir -p data/$x
|
||||||
|
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
|
||||||
|
cp $srcdir/$x.txt data/$x/text || exit 1;
|
||||||
|
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
|
||||||
|
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
|
||||||
|
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
|
||||||
|
done
|
||||||
|
|
||||||
|
|
||||||
|
# Next, for each type of language model, create the corresponding FST
|
||||||
|
# and the corresponding lang_test_* directory.
|
||||||
|
|
||||||
|
echo Preparing language models for test
|
||||||
|
|
||||||
|
for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
|
||||||
|
test=data/lang_test_${lm_suffix}
|
||||||
|
mkdir -p $test
|
||||||
|
for f in phones.txt words.txt L.fst L_disambig.fst \
|
||||||
|
phones/; do
|
||||||
|
cp -r data/lang/$f $test
|
||||||
|
done
|
||||||
|
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||||
|
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
||||||
|
|
||||||
|
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||||
|
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||||
|
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||||
|
# which are supposed to occur only at the beginning/end of an utterance. These can cause
|
||||||
|
# determinization failures of CLG [ends up being epsilon cycles].
|
||||||
|
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||||
|
grep -v '<s> <s>' | \
|
||||||
|
grep -v '</s> <s>' | \
|
||||||
|
grep -v '</s> </s>' | \
|
||||||
|
arpa2fst - | fstprint | \
|
||||||
|
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||||
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||||
|
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
|
fstrmepsilon > $test/G.fst
|
||||||
|
fstisstochastic $test/G.fst
|
||||||
|
# The output is like:
|
||||||
|
# 9.14233e-05 -0.259833
|
||||||
|
# we do expect the first of these 2 numbers to be close to zero (the second is
|
||||||
|
# nonzero because the backoff weights make the states sum to >1).
|
||||||
|
# Because of the <s> fiasco for these particular LMs, the first number is not
|
||||||
|
# as close to zero as it could be.
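# (Optional sketch, not part of the original recipe: the diagnostic above could be turned
#  into a hard check, e.g.
#    fstisstochastic $test/G.fst | awk '{exit !($1 < 0.01 && $1 > -0.01)}' \
#      || echo "Warning: G.fst for $lm_suffix is far from stochastic"
#  left commented out here so the behaviour of the script is unchanged.)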
|
||||||
|
|
||||||
|
# Everything below is only for diagnostic.
|
||||||
|
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
||||||
|
# this might cause determinization failure of CLG.
|
||||||
|
# #0 is treated as an empty word.
|
||||||
|
mkdir -p $tmpdir/g
|
||||||
|
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
|
||||||
|
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
||||||
|
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
||||||
|
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
||||||
|
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||||
|
echo "Language model has cycles with empty words" && exit 1
|
||||||
|
rm -r $tmpdir/g
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Succeeded in formatting data."
|
||||||
|
rm -r $tmpdir
|
|
@ -0,0 +1,52 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
|
||||||
|
|
||||||
|
. ./path.sh
|
||||||
|
|
||||||
|
[ ! -d data/lang_bd ] && echo "Expect data/lang_bd to exist" && exit 1;
|
||||||
|
|
||||||
|
lm_srcdir_3g=data/local/local_lm/3gram-mincount
|
||||||
|
lm_srcdir_4g=data/local/local_lm/4gram-mincount
|
||||||
|
|
||||||
|
[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1;
|
||||||
|
[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1;
|
||||||
|
|
||||||
|
for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do
|
||||||
|
rm -r $d 2>/dev/null
|
||||||
|
cp -r data/lang_bd $d
|
||||||
|
done
|
||||||
|
|
||||||
|
lang=data/lang_bd
|
||||||
|
|
||||||
|
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
|
||||||
|
# not work for LMs generated from all toolkits.
|
||||||
|
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
|
||||||
|
arpa2fst - | fstprint | \
|
||||||
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||||
|
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
|
fstrmepsilon > data/lang_test_bd_tgpr/G.fst || exit 1;
|
||||||
|
fstisstochastic data/lang_test_bd_tgpr/G.fst
|
||||||
|
|
||||||
|
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
|
||||||
|
arpa2fst - | fstprint | \
|
||||||
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||||
|
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
|
fstrmepsilon > data/lang_test_bd_tg/G.fst || exit 1;
|
||||||
|
fstisstochastic data/lang_test_bd_tg/G.fst
|
||||||
|
|
||||||
|
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
|
||||||
|
arpa2fst - | fstprint | \
|
||||||
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||||
|
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
|
fstrmepsilon > data/lang_test_bd_fg/G.fst || exit 1;
|
||||||
|
fstisstochastic data/lang_test_bd_fg/G.fst
|
||||||
|
|
||||||
|
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
|
||||||
|
arpa2fst - | fstprint | \
|
||||||
|
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||||
|
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||||
|
fstrmepsilon > data/lang_test_bd_fgpr/G.fst || exit 1;
|
||||||
|
fstisstochastic data/lang_test_bd_fgpr/G.fst
|
||||||
|
|
||||||
|
exit 0;
|
|
@ -0,0 +1,83 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# Call this script from one level above, e.g. from the s3/ directory. It puts
|
||||||
|
# its output in data/local/.
|
||||||
|
|
||||||
|
# The parts of the output of this that will be needed are
|
||||||
|
# [in data/local/dict/ ]
|
||||||
|
# lexicon.txt
|
||||||
|
# extra_questions.txt
|
||||||
|
# nonsilence_phones.txt
|
||||||
|
# optional_silence.txt
|
||||||
|
# silence_phones.txt
|
||||||
|
|
||||||
|
# run this from ../
|
||||||
|
dir=data/local/dict
|
||||||
|
mkdir -p $dir
|
||||||
|
|
||||||
|
|
||||||
|
# (1) Get the CMU dictionary
|
||||||
|
svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \
|
||||||
|
$dir/cmudict || exit 1;
|
||||||
|
|
||||||
|
# can add -r 10966 for strict compatibility.
|
||||||
|
|
||||||
|
|
||||||
|
#(2) Dictionary preparation:
|
||||||
|
|
||||||
|
|
||||||
|
# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
|
||||||
|
# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
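# (Clarifying note, not in the original: Kaldi's word-position-dependent markers are _B
#  (word-begin), _I (word-internal), _E (word-end) and _S (singleton), so CAT would later
#  show up as K_B AE_I T_E; the markers are added by the lang-preparation scripts, not here.)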
|
||||||
|
|
||||||
|
# silence phones, one per line.
|
||||||
|
(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt
|
||||||
|
echo SIL > $dir/optional_silence.txt
|
||||||
|
|
||||||
|
# nonsilence phones; on each line is a list of phones that correspond
|
||||||
|
# really to the same base phone.
|
||||||
|
cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
|
||||||
|
perl -e 'while(<>){
|
||||||
|
chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
|
||||||
|
$phones_of{$1} .= "$_ "; }
|
||||||
|
foreach $list (values %phones_of) {print $list . "\n"; } ' \
|
||||||
|
> $dir/nonsilence_phones.txt || exit 1;
|
||||||
|
|
||||||
|
# A few extra questions that will be added to those obtained by automatically clustering
|
||||||
|
# the "real" phones. These ask about stress; there's also one for silence.
|
||||||
|
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
|
||||||
|
cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
|
||||||
|
$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
|
||||||
|
>> $dir/extra_questions.txt || exit 1;
|
||||||
|
|
||||||
|
grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
|
||||||
|
perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
|
||||||
|
> $dir/lexicon1_raw_nosil.txt || exit 1;
|
||||||
|
|
||||||
|
# Add to cmudict the silences, noises etc.
|
||||||
|
|
||||||
|
(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
|
||||||
|
cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
# lexicon.txt is without the _B, _E, _S, _I markers.
|
||||||
|
# This is the input to wsj_format_data.sh
|
||||||
|
cp $dir/lexicon2_raw.txt $dir/lexicon.txt
|
||||||
|
|
||||||
|
|
||||||
|
echo "Dictionary preparation succeeded"
|
||||||
|
|
|
@ -0,0 +1,202 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script trains LMs on the WSJ LM-training data.
|
||||||
|
# It requires that you have already run wsj_extend_dict.sh,
|
||||||
|
# to get the larger-size dictionary including all of CMUdict
|
||||||
|
# plus any OOVs and possible acronyms that we could easily
|
||||||
|
# derive pronunciations for.
|
||||||
|
|
||||||
|
# This script takes no command-line arguments
|
||||||
|
|
||||||
|
dir=data/local/local_lm
|
||||||
|
srcdir=data/local/dict_larger
|
||||||
|
mkdir -p $dir
|
||||||
|
. ./path.sh || exit 1; # for KALDI_ROOT
|
||||||
|
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH
|
||||||
|
( # First make sure the kaldi_lm toolkit is installed.
|
||||||
|
cd $KALDI_ROOT/tools || exit 1;
|
||||||
|
if [ -d kaldi_lm ]; then
|
||||||
|
echo Not installing the kaldi_lm toolkit since it is already there.
|
||||||
|
else
|
||||||
|
echo Downloading and installing the kaldi_lm tools
|
||||||
|
if [ ! -f kaldi_lm.tar.gz ]; then
|
||||||
|
wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
|
||||||
|
fi
|
||||||
|
tar -xvzf kaldi_lm.tar.gz || exit 1;
|
||||||
|
cd kaldi_lm
|
||||||
|
make || exit 1;
|
||||||
|
echo Done making the kaldi_lm tools
|
||||||
|
fi
|
||||||
|
) || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
|
||||||
|
echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
|
||||||
|
echo "You need to run local/wsj_extend_dict.sh before running this script."
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Get a wordlist-- keep everything but silence, which should not appear in
|
||||||
|
# the LM.
|
||||||
|
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt
|
||||||
|
|
||||||
|
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
|
||||||
|
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
|
||||||
|
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
|
||||||
|
'BEGIN{while((getline<w)>0) v[$1]=1;}
|
||||||
|
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
|
||||||
|
| gzip -c > $dir/train_nounk.gz
|
||||||
|
|
||||||
|
# Get unigram counts (without bos/eos, but this doesn't matter here, it's
|
||||||
|
# only to get the word-map, which treats them specially & doesn't need their
|
||||||
|
# counts).
|
||||||
|
# Add a 1-count for each word in word-list by including that in the data,
|
||||||
|
# so all words appear.
|
||||||
|
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
|
||||||
|
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
|
||||||
|
sort -nr > $dir/unigram.counts
|
||||||
|
|
||||||
|
# Get "mapped" words-- a character encoding of the words that makes the common words very short.
|
||||||
|
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map
|
||||||
|
|
||||||
|
gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
|
||||||
|
{ for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz
|
||||||
|
|
||||||
|
# To save disk space, remove the un-mapped training data. We could
|
||||||
|
# easily generate it again if needed.
|
||||||
|
rm $dir/train_nounk.gz
|
||||||
|
|
||||||
|
train_lm.sh --arpa --lmtype 3gram-mincount $dir
|
||||||
|
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
|
||||||
|
# 7.8 million N-grams.
|
||||||
|
|
||||||
|
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
|
||||||
|
# 1.45 million N-grams.
|
||||||
|
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
|
||||||
|
|
||||||
|
train_lm.sh --arpa --lmtype 4gram-mincount $dir
|
||||||
|
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
|
||||||
|
# 10.3 million N-grams.
|
||||||
|
|
||||||
|
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
|
||||||
|
# 1.50 million N-grams
|
||||||
|
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
|
||||||
|
|
||||||
|
|
||||||
|
exit 0
|
||||||
|
|
||||||
|
### Below here, this script is showing various commands that
|
||||||
|
## were run during LM tuning.
|
||||||
|
|
||||||
|
train_lm.sh --arpa --lmtype 3gram-mincount $dir
|
||||||
|
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
|
||||||
|
# 7.8 million N-grams.
|
||||||
|
|
||||||
|
prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
|
||||||
|
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
|
||||||
|
# 2.5 million N-grams.
|
||||||
|
|
||||||
|
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
|
||||||
|
# 1.45 million N-grams.
|
||||||
|
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
|
||||||
|
|
||||||
|
train_lm.sh --arpa --lmtype 4gram-mincount $dir
|
||||||
|
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
|
||||||
|
# 10.3 million N-grams.
|
||||||
|
|
||||||
|
prune_lm.sh --arpa 3.0 $dir/4gram-mincount
|
||||||
|
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
|
||||||
|
# 2.6 million N-grams.
|
||||||
|
|
||||||
|
prune_lm.sh --arpa 4.0 $dir/4gram-mincount
|
||||||
|
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
|
||||||
|
# 2.15 million N-grams.
|
||||||
|
|
||||||
|
prune_lm.sh --arpa 5.0 $dir/4gram-mincount
|
||||||
|
# 1.86 million N-grams
|
||||||
|
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023
|
||||||
|
|
||||||
|
prune_lm.sh --arpa 7.0 $dir/4gram-mincount
|
||||||
|
# 1.50 million N-grams
|
||||||
|
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
|
||||||
|
|
||||||
|
train_lm.sh --arpa --lmtype 3gram $dir
|
||||||
|
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
|
||||||
|
# 20.0 million N-grams
|
||||||
|
|
||||||
|
! which ngram-count \
|
||||||
|
&& echo "SRILM tools not installed so not doing the comparison" && exit 1;
|
||||||
|
|
||||||
|
#################
|
||||||
|
# You could finish the script here if you wanted.
|
||||||
|
# Below is to show how to do baselines with SRILM.
|
||||||
|
# You'd have to install the SRILM toolkit first.
|
||||||
|
|
||||||
|
heldout_sent=10000 # Don't change this if you want result to be comparable with
|
||||||
|
# kaldi_lm results
|
||||||
|
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
|
||||||
|
mkdir -p $sdir
|
||||||
|
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
|
||||||
|
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
|
||||||
|
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s
|
||||||
|
|
||||||
|
# 3-gram:
|
||||||
|
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
|
||||||
|
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
|
||||||
|
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
|
||||||
|
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
|
||||||
|
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
|
||||||
|
|
||||||
|
# Trying 4-gram:
|
||||||
|
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
|
||||||
|
-map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
|
||||||
|
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
|
||||||
|
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
|
||||||
|
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
|
||||||
|
|
||||||
|
#3-gram with pruning:
|
||||||
|
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
|
||||||
|
-prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
|
||||||
|
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
|
||||||
|
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
|
||||||
|
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
|
||||||
|
# Around 2.25M N-grams.
|
||||||
|
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
|
||||||
|
# above, which gave 2.5 million N-grams and a perplexity of 156.
|
||||||
|
|
||||||
|
# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams.
|
||||||
|
# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
|
||||||
|
# the kaldi_lm experiments above without "-mincount").
|
||||||
|
|
||||||
|
## From here is how to train with
|
||||||
|
# IRSTLM. This is not really working at the moment.
|
||||||
|
export IRSTLM=$KALDI_ROOT/tools/irstlm/
|
||||||
|
|
||||||
|
idir=$dir/irstlm
|
||||||
|
mkdir $idir
|
||||||
|
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
|
||||||
|
gzip -c > $idir/train.gz
|
||||||
|
|
||||||
|
$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
|
||||||
|
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
|
||||||
|
{print $0;}}' > vocab.irstlm.20k
|
||||||
|
|
||||||
|
|
||||||
|
$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
|
||||||
|
-n 3 -s improved-kneser-ney -b yes
|
||||||
|
# Testing perplexity with SRILM tools:
|
||||||
|
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
|
||||||
|
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
|
||||||
|
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
|
||||||
|
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
|
||||||
|
|
||||||
|
# Perplexity is very bad (it should be ~141 since we used the -p option, not 175),
|
||||||
|
# but adding -debug 3 to the command line shows that
|
||||||
|
# the IRSTLM LM does not seem to sum to one properly, so it seems that
|
||||||
|
# it produces an LM that isn't interpretable in the normal way as an ARPA
|
||||||
|
# LM.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,153 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson
|
||||||
|
|
||||||
|
# This script trains LMs on the WSJ LM-training data.
|
||||||
|
# It requires that you have already run wsj_extend_dict.sh,
|
||||||
|
# to get the larger-size dictionary including all of CMUdict
|
||||||
|
# plus any OOVs and possible acronyms that we could easily
|
||||||
|
# derive pronunciations for.
|
||||||
|
|
||||||
|
# This script takes no command-line arguments but takes the --cmd option.
|
||||||
|
|
||||||
|
# Begin configuration section.
|
||||||
|
rand_seed=0
|
||||||
|
cmd=run.pl
|
||||||
|
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
|
||||||
|
hidden=30
|
||||||
|
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
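# (Clarifying note: sqrt(10000) = 100, so class=200 satisfies the "somewhat larger than
#  sqrt of nwords" rule of thumb stated above.)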
|
||||||
|
direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections.
|
||||||
|
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
|
||||||
|
# End configuration section.
|
||||||
|
|
||||||
|
[ -f ./path.sh ] && . ./path.sh
|
||||||
|
. utils/parse_options.sh
|
||||||
|
|
||||||
|
if [ $# != 1 ]; then
|
||||||
|
echo "Usage: local/wsj_train_rnnlms.sh [options] <dest-dir>"
|
||||||
|
echo "For options, see top of script file"
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
dir=$1
|
||||||
|
srcdir=data/local/dict_larger
|
||||||
|
mkdir -p $dir
|
||||||
|
|
||||||
|
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH
|
||||||
|
|
||||||
|
|
||||||
|
( # First make sure the rnnlm toolkit is installed.
|
||||||
|
# Note: this didn't work out of the box for me, I had to
|
||||||
|
# change the g++ version to just "g++" (no cross-compilation
|
||||||
|
# needed for me as I ran on a machine that had been set up
# as 64 bit by default).
|
||||||
|
cd $KALDI_ROOT/tools || exit 1;
|
||||||
|
if [ -d $rnnlm_ver ]; then
|
||||||
|
echo Not installing the rnnlm toolkit since it is already there.
|
||||||
|
else
|
||||||
|
echo Downloading and installing the rnnlm tools
|
||||||
|
# http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz
|
||||||
|
if [ ! -f $rnnlm_ver.tgz ]; then
|
||||||
|
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1;
|
||||||
|
fi
|
||||||
|
mkdir $rnnlm_ver
|
||||||
|
cd $rnnlm_ver
|
||||||
|
tar -xvzf ../$rnnlm_ver.tgz || exit 1;
|
||||||
|
make CC=g++ || exit 1;
|
||||||
|
echo Done making the rnnlm tools
|
||||||
|
fi
|
||||||
|
) || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
|
||||||
|
echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist";
|
||||||
|
echo "You need to run local/wsj_extend_dict.sh before running this script."
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all
|
||||||
|
|
||||||
|
# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
|
||||||
|
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
|
||||||
|
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \
|
||||||
|
'BEGIN{while((getline<w)>0) v[$1]=1;}
|
||||||
|
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
|
||||||
|
| gzip -c > $dir/all.gz
|
||||||
|
|
||||||
|
echo "Splitting data into train and validation sets."
|
||||||
|
heldout_sent=10000
|
||||||
|
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
|
||||||
|
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
|
||||||
|
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
|
||||||
|
> $dir/train.in # training data
|
||||||
|
|
||||||
|
|
||||||
|
# The rest will consist of a word-class represented by <RNN_UNK>, that
|
||||||
|
# maps (with probabilities) to a whole class of words.
|
||||||
|
|
||||||
|
# Get unigram counts from our training data, and use this to select word-list
|
||||||
|
# for RNNLM training; e.g. the 10k most frequent words. The rest will go into a class
# for which we (manually, at the shell level) assign probabilities to its member words.
# Note: this word-list doesn't need to include </s>; this
|
||||||
|
# automatically gets added inside the rnnlm program.
|
||||||
|
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
|
||||||
|
# smoothing of the counts.
|
||||||
|
|
||||||
|
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
|
||||||
|
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
|
||||||
|
sort -nr > $dir/unigram.counts
|
||||||
|
|
||||||
|
head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn
|
||||||
|
|
||||||
|
tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts
|
||||||
|
|
||||||
|
tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
|
||||||
|
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs
|
||||||
|
|
||||||
|
|
||||||
|
for type in train valid; do
|
||||||
|
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
|
||||||
|
'BEGIN{while((getline<w)>0) v[$1]=1;}
|
||||||
|
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
|
||||||
|
> $dir/$type
|
||||||
|
done
|
||||||
|
rm $dir/train.in # no longer needed-- and big.
|
||||||
|
|
||||||
|
# Now randomize the order of the training data.
|
||||||
|
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
|
||||||
|
sort | cut -f 2 > $dir/foo
|
||||||
|
mv $dir/foo $dir/train
|
||||||
|
|
||||||
|
# OK we'll train the RNNLM on this data.
|
||||||
|
|
||||||
|
# todo: change 100 to 320.
|
||||||
|
# using 100 classes as square root of 10k.
|
||||||
|
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
|
||||||
|
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
|
||||||
|
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
|
||||||
|
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &
|
||||||
|
|
||||||
|
$cmd $dir/rnnlm.log \
|
||||||
|
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \
|
||||||
|
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \
|
||||||
|
-direct-order 4 -direct $direct -binary || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
# make it like a Kaldi table format, with fake utterance-ids.
|
||||||
|
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids
|
||||||
|
|
||||||
|
utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \
|
||||||
|
$dir/valid.scores
|
||||||
|
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes the utterance-ids, which add
# one extra token per sentence; this accounts for the </s> at the end of each sentence, so
# it is the correct count to normalize by.
|
||||||
|
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
|
||||||
|
echo Perplexity is $p | tee $dir/perplexity.log
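# (Explanatory note, assuming the second field of valid.scores is the total natural-log cost
#  of each utterance as written by rnnlm_compute_scores.sh: the two lines above compute
#    perplexity = exp( (sum of per-utterance costs) / (number of words, incl. </s>) ).)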
|
||||||
|
|
||||||
|
rm $dir/train $dir/all.gz
|
||||||
|
|
||||||
|
# This is a better setup, but takes a long time to train:
|
||||||
|
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
|
||||||
|
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
|
||||||
|
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
|
||||||
|
# -direct-order 4 -direct 2000 -binary
|
|
@ -0,0 +1,3 @@
export KALDI_ROOT=`pwd`/../../..
export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet-cpubin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH
export LC_ALL=C
@ -0,0 +1,152 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
|
||||||
|
## This relates to the queue.
|
||||||
|
|
||||||
|
# This is a shell script, but it's recommended that you run the commands one by
|
||||||
|
# one by copying and pasting into the shell.
|
||||||
|
|
||||||
|
case 0 in #goto here
|
||||||
|
1)
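# (Note, not in the original: "case 0 in ... 1)" is being used as a crude goto -- the
#  pattern "1)" never matches the word 0, so everything up to the matching ";;"/esac is
#  skipped. To actually run this block, change "case 0" to "case 1".)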
|
||||||
|
|
||||||
|
|
||||||
|
aurora4=/mnt/spdb/aurora4
|
||||||
|
#we need lm, trans, from WSJ0 CORPUS
|
||||||
|
wsj0=/mnt/spdb/wall_street_journal
|
||||||
|
|
||||||
|
local/aurora4_data_prep.sh $aurora4 $wsj0
|
||||||
|
|
||||||
|
local/wsj_prepare_dict.sh || exit 1;
|
||||||
|
|
||||||
|
utils/prepare_lang.sh data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp data/lang || exit 1;
|
||||||
|
|
||||||
|
local/aurora4_format_data.sh || exit 1;
|
||||||
|
|
||||||
|
# Now make MFCC features.
|
||||||
|
# mfccdir should be some place with a largish disk where you
|
||||||
|
# want to store MFCC features.
|
||||||
|
mfccdir=mfcc
|
||||||
|
for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
|
||||||
|
steps/make_mfcc.sh --nj 10 \
|
||||||
|
data/$x exp/make_mfcc/$x $mfccdir || exit 1;
|
||||||
|
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
|
||||||
|
done
|
||||||
|
|
||||||
|
# make fbank features
|
||||||
|
fbankdir=fbank
|
||||||
|
mkdir -p data-fbank
|
||||||
|
for x in train_si84_clean train_si84_multi dev_0330 dev_1206 test_eval92 test_0166; do
|
||||||
|
cp -r data/$x data-fbank/$x
|
||||||
|
steps/make_fbank.sh --nj 10 \
|
||||||
|
data-fbank/$x exp/make_fbank/$x $fbankdir || exit 1;
|
||||||
|
done
|
||||||
|
|
||||||
|
# Note: the --boost-silence option should probably be omitted by default
|
||||||
|
# for normal setups. It doesn't always help. [it's to discourage non-silence
|
||||||
|
# models from modeling silence.]
|
||||||
|
#steps/train_mono.sh --boost-silence 1.25 --nj 10 \
|
||||||
|
# data/train_si84_clean data/lang exp/mono0a || exit 1;
|
||||||
|
|
||||||
|
steps/train_mono.sh --boost-silence 1.25 --nj 10 \
|
||||||
|
data/train_si84_multi data/lang exp/mono0a_multi || exit 1;
|
||||||
|
#(
|
||||||
|
# utils/mkgraph.sh --mono data/lang_test_tgpr exp/mono0a exp/mono0a/graph_tgpr && \
|
||||||
|
# steps/decode.sh --nj 8 \
|
||||||
|
# exp/mono0a/graph_tgpr data/test_eval92 exp/mono0a/decode_tgpr_eval92
|
||||||
|
#) &
|
||||||
|
|
||||||
|
#steps/align_si.sh --boost-silence 1.25 --nj 10 \
|
||||||
|
# data/train_si84_clean data/lang exp/mono0a exp/mono0a_ali || exit 1;
|
||||||
|
|
||||||
|
steps/align_si.sh --boost-silence 1.25 --nj 10 \
|
||||||
|
data/train_si84_multi data/lang exp/mono0a_multi exp/mono0a_multi_ali || exit 1;
|
||||||
|
|
||||||
|
#steps/train_deltas.sh --boost-silence 1.25 \
|
||||||
|
# 2000 10000 data/train_si84_clean data/lang exp/mono0a_ali exp/tri1 || exit 1;
|
||||||
|
|
||||||
|
steps/train_deltas.sh --boost-silence 1.25 \
|
||||||
|
2000 10000 data/train_si84_multi data/lang exp/mono0a_multi_ali exp/tri1_multi || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
while [ ! -f data/lang_test_tgpr/tmp/LG.fst ] || \
|
||||||
|
[ ! -s data/lang_test_tgpr/tmp/LG.fst ]; do  # i.e. wait until the file exists and is non-empty
|
||||||
|
sleep 20;
|
||||||
|
done
|
||||||
|
sleep 30;
|
||||||
|
# or the mono mkgraph.sh might be writing
|
||||||
|
# data/lang_test_tgpr/tmp/LG.fst which will cause this to fail.
|
||||||
|
|
||||||
|
steps/align_si.sh --nj 10 \
|
||||||
|
data/train_si84_multi data/lang exp/tri1_multi exp/tri1_multi_ali_si84 || exit 1;
|
||||||
|
|
||||||
|
steps/train_deltas.sh \
|
||||||
|
2500 15000 data/train_si84_multi data/lang exp/tri1_multi_ali_si84 exp/tri2a_multi || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
steps/train_lda_mllt.sh \
|
||||||
|
--splice-opts "--left-context=3 --right-context=3" \
|
||||||
|
2500 15000 data/train_si84_multi data/lang exp/tri1_multi_ali_si84 exp/tri2b_multi || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri2b_multi exp/tri2b_multi/graph_tgpr_5k || exit 1;
|
||||||
|
steps/decode.sh --nj 8 \
|
||||||
|
exp/tri2b_multi/graph_tgpr_5k data/test_eval92 exp/tri2b_multi/decode_tgpr_5k_eval92 || exit 1;
|
||||||
|
|
||||||
|
# Align tri2b system with si84 multi-condition data.
|
||||||
|
steps/align_si.sh --nj 10 \
|
||||||
|
--use-graphs true data/train_si84_multi data/lang exp/tri2b_multi exp/tri2b_multi_ali_si84 || exit 1;
|
||||||
|
|
||||||
|
steps/align_si.sh --nj 10 \
|
||||||
|
data/dev_0330 data/lang exp/tri2b_multi exp/tri2b_multi_ali_dev_0330 || exit 1;
|
||||||
|
|
||||||
|
steps/align_si.sh --nj 10 \
|
||||||
|
data/dev_1206 data/lang exp/tri2b_multi exp/tri2b_multi_ali_dev_1206 || exit 1;
|
||||||
|
|
||||||
|
#Now begin train DNN systems on multi data
|
||||||
|
. ./path.sh
|
||||||
|
#RBM pretrain
|
||||||
|
dir=exp/tri3a_dnn_pretrain
|
||||||
|
$cuda_cmd $dir/_pretrain_dbn.log \
|
||||||
|
steps/pretrain_dbn.sh --use-gpu-id 0 --nn-depth 7 --rbm-iter 3 data-fbank/train_si84_multi $dir
|
||||||
|
|
||||||
|
dir=exp/tri3a_dnn
|
||||||
|
ali=exp/tri2b_multi_ali_si84
|
||||||
|
ali_dev=exp/tri2b_multi_ali_dev_0330
|
||||||
|
feature_transform=exp/tri3a_dnn_pretrain/final.feature_transform
|
||||||
|
dbn=exp/tri3a_dnn_pretrain/7.dbn
|
||||||
|
$cuda_cmd $dir/_train_nnet.log \
|
||||||
|
steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \
|
||||||
|
data-fbank/train_si84_multi data-fbank/dev_0330 data/lang $ali $ali_dev $dir || exit 1;
|
||||||
|
|
||||||
|
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3a_dnn exp/tri3a_dnn/graph_tgpr_5k || exit 1;
|
||||||
|
dir=exp/tri3a_dnn
|
||||||
|
steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
|
||||||
|
exp/tri3a_dnn/graph_tgpr_5k data-fbank/test_eval92 $dir/decode_tgpr_5k_eval92 || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
#realignments
|
||||||
|
srcdir=exp/tri3a_dnn
|
||||||
|
steps/align_nnet.sh --nj 10 \
|
||||||
|
data-fbank/train_si84_multi data/lang $srcdir ${srcdir}_ali_si84_multi || exit 1;
|
||||||
|
steps/align_nnet.sh --nj 10 \
|
||||||
|
data-fbank/dev_0330 data/lang $srcdir ${srcdir}_ali_dev_0330 || exit 1;
|
||||||
|
|
||||||
|
#train system again
|
||||||
|
|
||||||
|
dir=exp/tri4a_dnn
|
||||||
|
ali=exp/tri3a_dnn_ali_si84_multi
|
||||||
|
ali_dev=exp/tri3a_dnn_ali_dev_0330
|
||||||
|
feature_transform=exp/tri3a_dnn_pretrain/final.feature_transform
|
||||||
|
dbn=exp/tri3a_dnn_pretrain/7.dbn
|
||||||
|
$cuda_cmd $dir/_train_nnet.log \
|
||||||
|
steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \
|
||||||
|
data-fbank/train_si84_multi data-fbank/dev_0330 data/lang $ali $ali_dev $dir || exit 1;
|
||||||
|
|
||||||
|
utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri4a_dnn exp/tri4a_dnn/graph_tgpr_5k || exit 1;
|
||||||
|
dir=exp/tri4a_dnn
|
||||||
|
steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \
|
||||||
|
exp/tri4a_dnn/graph_tgpr_5k data-fbank/test_eval92 $dir/decode_tgpr_5k_eval92 || exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
# DNN Sequential DT training
|
||||||
|
#......
|
|
@ -0,0 +1 @@
../../wsj/s5/steps
@ -0,0 +1 @@
../../wsj/s5/utils