зеркало из https://github.com/mozilla/kaldi.git
sandbox/akirkedal: Refactored data preparation scripts and the lexicon is now downloaded from openslr.org
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/akirkedal@4269 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
cdfaba14c6
Коммит
7e6af54b2d
|
@ -22,48 +22,16 @@ exproot=$(pwd)
|
||||||
dir=data/local/dict
|
dir=data/local/dict
|
||||||
mkdir -p $dir
|
mkdir -p $dir
|
||||||
|
|
||||||
|
|
||||||
# Dictionary preparation:
|
# Dictionary preparation:
|
||||||
|
# This lexicon was created using eSpeak.
|
||||||
# Normalise transcripts and create a transcript file
|
# To extend the setup, see local/dict_prep.sh
|
||||||
# Removes '.,:;?' and removes '\' before '\Komma' (dictated ',')
|
|
||||||
# outputs a normalised transcript without utterance ids and a list of utterance ids
|
|
||||||
echo "Normalising"
|
|
||||||
trainsrc=data/local/trainsrc
|
|
||||||
rm -rf $trainsrc
|
|
||||||
mkdir $trainsrc
|
|
||||||
mv data/train/text1 $trainsrc/text1
|
|
||||||
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $trainsrc/text1 $trainsrc/onlyids $dir/transcripts.tmp
|
|
||||||
|
|
||||||
# Additional normalisation, uppercasing, writing numbers etc.
|
|
||||||
# and recombine with
|
|
||||||
local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am
|
|
||||||
cp $dir/transcripts.am $trainsrc/onlytext
|
|
||||||
paste -d ' ' $trainsrc/onlyids $trainsrc/onlytext > data/train/text
|
|
||||||
utils/validate_data_dir.sh --no-feat data/train || exit 1;
|
|
||||||
|
|
||||||
# lmsents is output by sprak_data_prep.sh and contains
|
|
||||||
# sentences that are disjoint from the test and dev set
|
|
||||||
python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm
|
|
||||||
wait
|
|
||||||
|
|
||||||
# Create wordlist from the AM transcripts
|
|
||||||
cat $dir/transcripts.am | tr [:blank:] '\n' | sort -u > $dir/wlist.txt &
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Because training data is read aloud, there are many occurences of the same
|
|
||||||
# sentence and bias towards the domain. Make a version where
|
|
||||||
# the sentences are unique to reduce bias.
|
|
||||||
local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt
|
|
||||||
sort -u $dir/transcripts.txt > $dir/transcripts.uniq
|
|
||||||
|
|
||||||
# Copy pre-made phone table
|
# Copy pre-made phone table
|
||||||
cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
|
cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
|
||||||
|
|
||||||
# Copy pre-made lexicon
|
# Copy pre-made lexicon
|
||||||
cp local/dictsrc/lexicon.txt $dir/lexicon.txt
|
wget http://www.openslr.org/resources/8/lexicon-da.tar.gz --directory-prefix=data/local/data/download
|
||||||
|
tar -xzf data/local/data/download/lexicon-da.tar.gz -C $dir
|
||||||
|
|
||||||
|
|
||||||
# silence phones, one per line.
|
# silence phones, one per line.
|
||||||
|
@ -72,30 +40,7 @@ echo SIL > $dir/optional_silence.txt
|
||||||
|
|
||||||
touch $dir/extra_questions.txt
|
touch $dir/extra_questions.txt
|
||||||
|
|
||||||
# Repeat text preparation on test set, but do not add to dictionary
|
wait
|
||||||
testsrc=data/local/testsrc
|
|
||||||
rm -rf $testsrc
|
|
||||||
mkdir $testsrc
|
|
||||||
mv data/test/text1 $testsrc/text1
|
|
||||||
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $testsrc/text1 $testsrc/onlyids $testsrc/transcripts.am
|
|
||||||
local/norm_dk/format_text.sh am $testsrc/transcripts.am > $testsrc/onlytext
|
|
||||||
paste -d ' ' $testsrc/onlyids $testsrc/onlytext > data/test/text
|
|
||||||
utils/validate_data_dir.sh --no-feat data/test || exit 1;
|
|
||||||
|
|
||||||
# Repeat text preparation on dev set, but do not add to dictionary
|
|
||||||
devsrc=data/local/devsrc
|
|
||||||
rm -rf $devsrc
|
|
||||||
mkdir $devsrc
|
|
||||||
mv data/dev/text1 $devsrc/text1
|
|
||||||
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $devsrc/text1 $devsrc/onlyids $devsrc/transcripts.tmp
|
|
||||||
local/norm_dk/format_text.sh am $devsrc/transcripts.tmp > $devsrc/onlytext
|
|
||||||
paste -d ' ' $devsrc/onlyids $devsrc/onlytext > data/dev/text
|
|
||||||
|
|
||||||
# Also create a file that can be used for reranking using LMs
|
|
||||||
local/norm_dk/format_text.sh lm $devsrc/transcripts.tmp > data/dev/transcripts.txt
|
|
||||||
sort -u data/dev/transcripts.txt > data/dev/transcripts.uniq
|
|
||||||
|
|
||||||
utils/validate_data_dir.sh --no-feat data/dev || exit 1;
|
|
||||||
|
|
||||||
|
|
||||||
## TODO: add cleanup commands
|
## TODO: add cleanup commands
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
if [ $# != 2 ]; then
|
||||||
|
echo "Usage: create_dataset.sh <src-data-dir> <dest-dir> "
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
src=$1
|
||||||
|
dest=$2
|
||||||
|
mkdir $dest
|
||||||
|
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
|
||||||
|
local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
|
||||||
|
paste -d ' ' $src/onlyids $src/onlytext > $dest/text
|
||||||
|
for f in wav.scp utt2spk; do
|
||||||
|
cp $src/$f $dest/$f
|
||||||
|
done
|
||||||
|
utils/utt2spk_to_spk2utt.pl $dest/utt2spk > $dest/spk2utt
|
||||||
|
utils/validate_data_dir.sh --no-feats $dest || exit 1;
|
|
@ -131,7 +131,7 @@ if __name__ == '__main__':
|
||||||
else:
|
else:
|
||||||
traindata = create_parallel_kaldi(flist, "")
|
traindata = create_parallel_kaldi(flist, "")
|
||||||
|
|
||||||
textout = codecs.open(os.path.join(outpath, "text1"), "w", "utf8")
|
textout = codecs.open(os.path.join(outpath, "text.unnormalised"), "w", "utf8")
|
||||||
wavout = codecs.open(os.path.join(outpath, "wav.scp"), "w")
|
wavout = codecs.open(os.path.join(outpath, "wav.scp"), "w")
|
||||||
utt2spkout = codecs.open(os.path.join(outpath, "utt2spk"), "w")
|
utt2spkout = codecs.open(os.path.join(outpath, "utt2spk"), "w")
|
||||||
textout.writelines(traindata[0])
|
textout.writelines(traindata[0])
|
||||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -21,14 +21,16 @@
|
||||||
|
|
||||||
mode=$1
|
mode=$1
|
||||||
|
|
||||||
|
tmp="$(mktemp -d)"
|
||||||
|
|
||||||
dir=$(pwd)/local/norm_dk
|
dir=$(pwd)/local/norm_dk
|
||||||
|
|
||||||
src=$dir/src.tmp
|
src=$tmp/src.tmp
|
||||||
abbr=$dir/anot.tmp
|
abbr=$tmp/anot.tmp
|
||||||
rem=$dir/rem.tmp
|
rem=$tmp/rem.tmp
|
||||||
line=$dir/line.tmp
|
line=$tmp/line.tmp
|
||||||
num=$dir/num.tmp
|
num=$tmp/num.tmp
|
||||||
nonum=$dir/nonum.tmp
|
nonum=$tmp/nonum.tmp
|
||||||
|
|
||||||
cat $2 | tr -d '\r' > $src
|
cat $2 | tr -d '\r' > $src
|
||||||
|
|
||||||
|
@ -50,4 +52,4 @@ PERLIO=:utf8 perl -pe '$_=uc'
|
||||||
|
|
||||||
# Comment this line for debugging
|
# Comment this line for debugging
|
||||||
wait
|
wait
|
||||||
rm -f $abbr $rem $line
|
rm -rf $tmp
|
||||||
|
|
|
@ -6,10 +6,11 @@
|
||||||
|
|
||||||
|
|
||||||
dir=`pwd`/data/local/data
|
dir=`pwd`/data/local/data
|
||||||
lmdir=`pwd`/data/local/arpa_lm
|
lmdir=`pwd`/data/local/transcript_lm
|
||||||
traindir=`pwd`/data/train
|
traindir=`pwd`/data/local/trainsrc
|
||||||
testdir=`pwd`/data/test
|
testdir=`pwd`/data/local/testsrc
|
||||||
devdir=`pwd`/data/dev
|
devdir=`pwd`/data/local/devsrc
|
||||||
|
rm -rf $lmdir $traindir $testdir $devdir
|
||||||
mkdir -p $dir $lmdir $traindir $testdir $devdir
|
mkdir -p $dir $lmdir $traindir $testdir $devdir
|
||||||
local=`pwd`/local
|
local=`pwd`/local
|
||||||
utils=`pwd`/utils
|
utils=`pwd`/utils
|
||||||
|
@ -18,7 +19,7 @@ utils=`pwd`/utils
|
||||||
|
|
||||||
# Checks if python3 is available on the system and install python3 in userspace if not
|
# Checks if python3 is available on the system and install python3 in userspace if not
|
||||||
# This recipe currently relies on version 3 because python3 uses utf8 as internal
|
# This recipe currently relies on version 3 because python3 uses utf8 as internal
|
||||||
# representation string representation
|
# string representation
|
||||||
|
|
||||||
if ! which python3 >&/dev/null; then
|
if ! which python3 >&/dev/null; then
|
||||||
echo "Installing python3 since not on your path."
|
echo "Installing python3 since not on your path."
|
||||||
|
@ -60,7 +61,7 @@ if [ ! -d $dir/download/0611 ]; then
|
||||||
echo "Corpus unpacked succesfully."
|
echo "Corpus unpacked succesfully."
|
||||||
fi
|
fi
|
||||||
|
|
||||||
. ./path.sh # Needed for KALDI_ROOT
|
|
||||||
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
|
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
|
||||||
if [ ! -x $sph2pipe ]; then
|
if [ ! -x $sph2pipe ]; then
|
||||||
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
|
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
|
||||||
|
@ -76,62 +77,73 @@ mkdir -p $dir/corpus_processed/training/0565-1 $dir/corpus_processed/training/05
|
||||||
|
|
||||||
# Create parallel file lists and text files, but keep sound files in the same location to save disk space
|
# Create parallel file lists and text files, but keep sound files in the same location to save disk space
|
||||||
# Writes the lists to data/local/data (~ 310h)
|
# Writes the lists to data/local/data (~ 310h)
|
||||||
|
echo "Creating parallel data for training data."
|
||||||
python3 $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 & # ~130h
|
python3 $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 & # ~130h
|
||||||
python3 $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 & # ~115h
|
python3 $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 & # ~115h
|
||||||
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 & # ~51h
|
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 & # ~51h
|
||||||
|
|
||||||
|
(
|
||||||
# Ditto dev set (~ 16h)
|
# Ditto dev set (~ 16h)
|
||||||
rm -rf $dir/corpus_processed/dev03
|
echo "Creating parallel data for test data."
|
||||||
mkdir -p $dir/corpus_processed/dev03
|
rm -rf $dir/corpus_processed/dev03
|
||||||
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 &
|
mkdir -p $dir/corpus_processed/dev03
|
||||||
|
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 &
|
||||||
|
) &
|
||||||
|
|
||||||
|
(
|
||||||
# Ditto test set (about 9 hours)
|
# Ditto test set (about 9 hours)
|
||||||
rm -rf $dir/corpus_processed/test06
|
echo "Creating parallel data for development data."
|
||||||
mkdir -p $dir/corpus_processed/test06
|
rm -rf $dir/corpus_processed/test06
|
||||||
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1;
|
mkdir -p $dir/corpus_processed/test06
|
||||||
|
python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1;
|
||||||
|
) &
|
||||||
|
|
||||||
wait
|
wait
|
||||||
|
|
||||||
|
# Create the LM training data
|
||||||
|
# Test and dev data are disjoint from the training data, so the training transcripts are used.
|
||||||
|
|
||||||
|
# Because training data is read aloud, there are many occurrences of the same
|
||||||
|
# sentence and bias towards the domain. Make a version where
|
||||||
|
# the sentences are unique to reduce bias.
|
||||||
|
|
||||||
|
(
|
||||||
|
echo "Writing the LM text to file and normalising."
|
||||||
|
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist | while read l; do cat $l; done > $lmdir/lmsents
|
||||||
|
python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl $lmdir/lmsents $lmdir/lmsents.norm
|
||||||
|
local/norm_dk/format_text.sh lm $lmdir/lmsents.norm > $lmdir/transcripts.txt
|
||||||
|
sort -u $lmdir/transcripts.txt > $lmdir/transcripts.uniq
|
||||||
|
) &
|
||||||
|
|
||||||
# Combine training file lists
|
# Combine training file lists
|
||||||
echo "Combine file lists."
|
echo "Combine file lists."
|
||||||
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist $dir/corpus_processed/training/0611_Stasjon05/txtlist > $dir/traintxtfiles
|
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist $dir/corpus_processed/training/0611_Stasjon05/txtlist > $dir/traintxtfiles
|
||||||
cat $dir/corpus_processed/training/0565-1/sndlist $dir/corpus_processed/training/0565-2/sndlist $dir/corpus_processed/training/0611_Stasjon05/sndlist > $dir/trainsndfiles
|
cat $dir/corpus_processed/training/0565-1/sndlist $dir/corpus_processed/training/0565-2/sndlist $dir/corpus_processed/training/0611_Stasjon05/sndlist > $dir/trainsndfiles
|
||||||
|
|
||||||
# LM training files (test data is disjoint from training data)
|
# Copy dev file lists to the right location
|
||||||
echo "Write file list with LM text files. (This will take a while)"
|
cp $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
|
||||||
cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist > $dir/lmtxtfiles
|
cp $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
|
||||||
cat $dir/lmtxtfiles | while read l; do cat $l; done > $dir/lmsents &
|
|
||||||
|
|
||||||
# Move test file lists to the right location
|
# Copy test file lists to the right location
|
||||||
mv $dir/corpus_processed/dev03/txtlist $dir/devtxtfiles
|
cp $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
|
||||||
mv $dir/corpus_processed/dev03/sndlist $dir/devsndfiles
|
cp $dir/corpus_processed/test06/sndlist $dir/testsndfiles
|
||||||
|
|
||||||
|
# Write wav.scp, utt2spk and text.unnormalised for the train, test and dev sets.
|
||||||
# Move test file lists to the right location
|
|
||||||
mv $dir/corpus_processed/test06/txtlist $dir/testtxtfiles
|
|
||||||
mv $dir/corpus_processed/test06/sndlist $dir/testsndfiles
|
|
||||||
|
|
||||||
# Write wav.scp, utt2spk and text1 for train, test and dev sets with
|
|
||||||
# Use sph2pipe because the wav files are actually sph files
|
# Use sph2pipe because the wav files are actually sph files
|
||||||
echo "Creating wav.scp, utt2spk and text1 for train, test and dev dirs."
|
echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev"
|
||||||
python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe &
|
python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe &
|
||||||
python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe &
|
python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe &
|
||||||
python3 $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe &
|
python3 $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe &
|
||||||
|
|
||||||
wait
|
wait
|
||||||
|
|
||||||
# Create spk2utt file
|
# Create the main data sets
|
||||||
utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt &
|
local/create_datasets.sh $testdir data/test &
|
||||||
utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt &
|
local/create_datasets.sh $devdir data/dev &
|
||||||
utils/utt2spk_to_spk2utt.pl data/dev/utt2spk > data/dev/spk2utt
|
local/create_datasets.sh $traindir data/train &
|
||||||
|
|
||||||
wait
|
wait
|
||||||
|
|
||||||
for d in train test dev; do
|
|
||||||
utils/validate_data_dir.sh --no-feats --no-text data/$d || exit 1;
|
|
||||||
done
|
|
||||||
|
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
|
|
||||||
# Extract gender from spl files
|
# Extract gender from spl files
|
||||||
|
|
|
@ -8,15 +8,13 @@
|
||||||
# Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh;
|
# Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh;
|
||||||
# this takes out the "symmetric SGMM" part which is not always helpful.
|
# this takes out the "symmetric SGMM" part which is not always helpful.
|
||||||
|
|
||||||
# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
|
|
||||||
# training, but this shouldn't have much effect.
|
|
||||||
|
|
||||||
test=$1
|
test=$1
|
||||||
|
|
||||||
|
if [ ! -d xxp/tri4b_ali ]; then
|
||||||
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
|
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
||||||
data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
|
data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
|
||||||
|
fi
|
||||||
steps/train_ubm.sh --cmd "$train_cmd" \
|
steps/train_ubm.sh --cmd "$train_cmd" \
|
||||||
400 data/train data/lang exp/tri4b_ali exp/ubm5a || exit 1;
|
400 data/train data/lang exp/tri4b_ali exp/ubm5a || exit 1;
|
||||||
|
|
||||||
|
@ -30,9 +28,9 @@ test=$1
|
||||||
exp/sgmm2_5a/graph_3g data/${test} exp/sgmm2_5a/decode_3g_${test}
|
exp/sgmm2_5a/graph_3g data/${test} exp/sgmm2_5a/decode_3g_${test}
|
||||||
) &
|
) &
|
||||||
|
|
||||||
steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
|
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
|
||||||
--use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5a exp/sgmm2_5a_ali || exit 1;
|
--use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5a exp/sgmm2_5a_ali || exit 1;
|
||||||
steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
|
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
|
||||||
data/train data/lang exp/sgmm2_5a_ali exp/sgmm2_5a_denlats
|
data/train data/lang exp/sgmm2_5a_ali exp/sgmm2_5a_denlats
|
||||||
|
|
||||||
wait
|
wait
|
||||||
|
@ -104,10 +102,10 @@ test=$1
|
||||||
|
|
||||||
wait
|
wait
|
||||||
|
|
||||||
steps/align_sgmm2.sh --nj 50 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
|
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali \
|
||||||
--use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5b exp/sgmm2_5b_ali
|
--use-graphs true --use-gselect true data/train data/lang exp/sgmm2_5b exp/sgmm2_5b_ali
|
||||||
|
|
||||||
steps/make_denlats_sgmm2.sh --nj 50 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
|
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 2 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali \
|
||||||
data/train data/lang exp/sgmm2_5b_ali exp/sgmm2_5b_denlats
|
data/train data/lang exp/sgmm2_5b_ali exp/sgmm2_5b_denlats
|
||||||
|
|
||||||
wait
|
wait
|
||||||
|
@ -136,8 +134,6 @@ wait
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
wait
|
wait
|
||||||
|
|
||||||
# Examples of combining some of the best decodings: SGMM+MMI with
|
# Examples of combining some of the best decodings: SGMM+MMI with
|
||||||
|
@ -149,14 +145,3 @@ local/score_combine.sh data/${test} \
|
||||||
exp/sgmm2_5b_mmi_b0.1/decode_4g_${test}_it3 \
|
exp/sgmm2_5b_mmi_b0.1/decode_4g_${test}_it3 \
|
||||||
exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_4g_${test}_it8_3
|
exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_4g_${test}_it8_3
|
||||||
|
|
||||||
|
|
||||||
# %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11
|
|
||||||
# %WER 3.85 [ 217 / 5643, 35 ins, 11 del, 171 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3/wer_10
|
|
||||||
# combined to:
|
|
||||||
# %WER 3.76 [ 212 / 5643, 32 ins, 12 del, 168 sub ] exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3/wer_12
|
|
||||||
|
|
||||||
# Checking MBR decode of baseline:
|
|
||||||
cp -r -T exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3{,.mbr}
|
|
||||||
local/score_mbr.sh data/test_eval92 data/lang_test_bd_tgpr exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr
|
|
||||||
# MBR decoding did not seem to help (baseline was 3.85). I think this is normal at such low WERs.
|
|
||||||
%WER 3.86 [ 218 / 5643, 35 ins, 11 del, 172 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr/wer_10
|
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
local/sprak_data_prep.sh || exit 1;
|
local/sprak_data_prep.sh || exit 1;
|
||||||
|
|
||||||
# Perform text normalisation, prepare dict folder and LM data transcriptions
|
# Perform text normalisation, prepare dict folder and LM data transcriptions
|
||||||
|
# This setup uses previously prepared data. eSpeak must be installed and in PATH to use dict_prep.sh
|
||||||
#local/dict_prep.sh || exit 1;
|
#local/dict_prep.sh || exit 1;
|
||||||
local/copy_dict.sh || exit 1;
|
local/copy_dict.sh || exit 1;
|
||||||
|
|
||||||
|
@ -29,33 +30,40 @@ mfccdir=mfcc
|
||||||
# p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
|
# p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
|
||||||
# wave files are corrupt
|
# wave files are corrupt
|
||||||
# Will return a warning message because of the corrupt audio files, but compute them anyway
|
# Will return a warning message because of the corrupt audio files, but compute them anyway
|
||||||
steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/train exp/make_mfcc/train mfcc
|
# If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
|
||||||
steps/make_mfcc.sh --nj 30 --cmd $train_cmd data/test exp/make_mfcc/test mfcc
|
|
||||||
|
|
||||||
|
steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/test exp/make_mfcc/test mfcc &
|
||||||
|
steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/dev exp/make_mfcc/dev mfcc &
|
||||||
|
steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/train exp/make_mfcc/train mfcc || exit 1;
|
||||||
|
wait
|
||||||
|
|
||||||
# Compute cepstral mean and variance normalisation
|
# Compute cepstral mean and variance normalisation
|
||||||
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc && \
|
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc &
|
||||||
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc
|
steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev mfcc &
|
||||||
|
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc
|
||||||
|
|
||||||
|
wait
|
||||||
|
|
||||||
# Repair data set (remove corrupt data points with corrupt audio)
|
# Repair data set (remove corrupt data points with corrupt audio)
|
||||||
utils/fix_data_dir.sh data/train && utils/fix_data_dir.sh data/test
|
|
||||||
utils/fix_data_dir.sh data/dev
|
utils/fix_data_dir.sh data/test &
|
||||||
|
utils/fix_data_dir.sh data/dev &
|
||||||
|
utils/fix_data_dir.sh data/train
|
||||||
|
wait
|
||||||
|
|
||||||
# Train LM with CMUCLMTK
|
# Train LM with CMUCLMTK
|
||||||
|
# This setup uses IRSTLM
|
||||||
#local/sprak_train_lm.sh &> data/local/cmuclmtk/lm.log
|
#local/sprak_train_lm.sh &> data/local/cmuclmtk/lm.log
|
||||||
|
|
||||||
# Train LM with irstlm
|
# Train LM with irstlm
|
||||||
local/train_irstlm.sh data/local/dict/transcripts.txt 3 "b3g" data/lang data/local/trainb3_lm &> data/local/b3g.log &
|
local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
|
||||||
local/train_irstlm.sh data/local/dict/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
|
local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
|
||||||
#local/train_irstlm.sh data/local/dict/transcripts.txt b4 "b4g" data/lang data/local/trainb4_lm &> data/local/b4g.log &
|
|
||||||
#local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log &
|
|
||||||
|
|
||||||
# Make subset with 1k utterances for rapid testing
|
# Make subset with 1k utterances for rapid testing
|
||||||
# Randomly selects 980 utterances from 7 speakers
|
# Randomly selects 980 utterances from 7 speakers
|
||||||
utils/subset_data_dir.sh --per-spk data/test 140 data/test1k &
|
utils/subset_data_dir.sh --per-spk data/test 140 data/test1k &
|
||||||
|
|
||||||
# Now make subset with the shortest 120k utterances.
|
# Now make subset of the training data with the shortest 120k utterances.
|
||||||
utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1;
|
utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1;
|
||||||
|
|
||||||
# Train monophone model on short utterances
|
# Train monophone model on short utterances
|
||||||
|
@ -66,24 +74,14 @@ steps/train_mono.sh --nj 30 --cmd "$train_cmd" \
|
||||||
wait
|
wait
|
||||||
|
|
||||||
utils/mkgraph.sh --mono data/lang_test_3g exp/mono0a exp/mono0a/graph_3g &
|
utils/mkgraph.sh --mono data/lang_test_3g exp/mono0a exp/mono0a/graph_3g &
|
||||||
#utils/mkgraph.sh --mono data/lang_test_b3g exp/mono0a exp/mono0a/graph_b3g &
|
utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
|
||||||
#utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
|
|
||||||
#utils/mkgraph.sh --mono data/lang_test_b4g exp/mono0a exp/mono0a/graph_b4g
|
|
||||||
|
|
||||||
# Ensure that all graphs are constructed
|
# Ensure that all graphs are constructed
|
||||||
wait
|
wait
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#(
|
|
||||||
#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
|
||||||
# exp/mono0a/graph_b3g data/test1k exp/mono0a/decode_b3g_test1k
|
|
||||||
#) &
|
|
||||||
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
||||||
exp/mono0a/graph_3g data/test1k exp/mono0a/decode_3g_test1k
|
exp/mono0a/graph_3g data/test1k exp/mono0a/decode_3g_test1k
|
||||||
|
|
||||||
exit 0;
|
|
||||||
|
|
||||||
# steps/align_si.sh --boost-silence 1.25 --nj 42 --cmd "$train_cmd" \
|
# steps/align_si.sh --boost-silence 1.25 --nj 42 --cmd "$train_cmd" \
|
||||||
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
|
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
|
||||||
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
|
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
|
||||||
|
@ -96,19 +94,19 @@ wait
|
||||||
|
|
||||||
|
|
||||||
utils/mkgraph.sh data/lang_test_3g exp/tri1 exp/tri1/graph_3g &
|
utils/mkgraph.sh data/lang_test_3g exp/tri1 exp/tri1/graph_3g &
|
||||||
utils/mkgraph.sh data/lang_test_b3g exp/tri1 exp/tri1/graph_b3g || exit 1;#
|
utils/mkgraph.sh data/lang_test_4g exp/tri1 exp/tri1/graph_4g || exit 1;
|
||||||
|
|
||||||
#(
|
(
|
||||||
#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
||||||
# exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
|
exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
|
||||||
#) &
|
) &
|
||||||
|
|
||||||
(
|
(
|
||||||
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
||||||
exp/tri1/graph_3g data/test1k exp/tri1/decode_3g_test1k || exit 1;
|
exp/tri1/graph_3g data/test1k exp/tri1/decode_3g_test1k || exit 1;
|
||||||
) &
|
) &
|
||||||
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
|
||||||
exp/tri1/graph_b3g data/test1k exp/tri1/decode_b3g_test1k || exit 1;
|
wait
|
||||||
|
|
||||||
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
|
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
|
||||||
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
|
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
|
||||||
|
@ -120,14 +118,12 @@ steps/train_deltas.sh --cmd "$train_cmd" \
|
||||||
|
|
||||||
utils/mkgraph.sh data/lang_test_3g exp/tri2a exp/tri2a/graph_3g || exit 1;
|
utils/mkgraph.sh data/lang_test_3g exp/tri2a exp/tri2a/graph_3g || exit 1;
|
||||||
|
|
||||||
#steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
|
||||||
# exp/tri2a/graph_b3g data/test1k exp/tri2a/decode_b3g_test1k || exit 1;
|
|
||||||
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
||||||
exp/tri2a/graph_3g data/test1k exp/tri2a/decode_3g_test1k || exit 1;
|
exp/tri2a/graph_3g data/test1k exp/tri2a/decode_3g_test1k || exit 1;
|
||||||
|
|
||||||
|
|
||||||
steps/train_lda_mllt.sh --cmd "$train_cmd" \
|
steps/train_lda_mllt.sh --cmd "$train_cmd" \
|
||||||
--splice-opts "--left-context=3 --right-context=3" \
|
--splice-opts "--left-context=5 --right-context=5" \
|
||||||
2500 15000 data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
|
2500 15000 data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
|
||||||
|
|
||||||
utils/mkgraph.sh data/lang_test_3g exp/tri2b exp/tri2b/graph_3g || exit 1;
|
utils/mkgraph.sh data/lang_test_3g exp/tri2b exp/tri2b/graph_3g || exit 1;
|
||||||
|
@ -135,7 +131,6 @@ steps/decode.sh --nj 7 --cmd "$decode_cmd" \
|
||||||
exp/tri2b/graph_3g data/test1k exp/tri2b/decode_3g_test1k || exit 1;
|
exp/tri2b/graph_3g data/test1k exp/tri2b/decode_3g_test1k || exit 1;
|
||||||
|
|
||||||
|
|
||||||
# Align tri2b system with si84 data.
|
|
||||||
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
|
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
|
||||||
--use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
|
--use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
|
||||||
|
|
||||||
|
@ -151,18 +146,17 @@ steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
|
||||||
|
|
||||||
|
|
||||||
# Trying 4-gram language model
|
# Trying 4-gram language model
|
||||||
local/train_irstlm.sh data/local/dict/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
|
|
||||||
utils/mkgraph.sh data/lang_test_4g exp/tri3b exp/tri3b/graph_4g || exit 1;
|
utils/mkgraph.sh data/lang_test_4g exp/tri3b exp/tri3b/graph_4g || exit 1;
|
||||||
|
|
||||||
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 \
|
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 \
|
||||||
exp/tri3b/graph_4g data/test1k exp/tri3b/decode_4g_test1k || exit 1;
|
exp/tri3b/graph_4g data/test1k exp/tri3b/decode_4g_test1k || exit 1;
|
||||||
|
|
||||||
|
|
||||||
# Train RNN for reranking
|
# Train RNN for reranking
|
||||||
local/sprak_train_rnnlms.sh data/local/dict data/dev/transcripts.uniq data/local/rnnlms/g_c380_d1k_h100_v130k
|
local/sprak_train_rnnlms.sh data/local/dict data/dev/transcripts.uniq data/local/rnnlms/g_c380_d1k_h100_v130k
|
||||||
# Consumes a lot of memory! Do not run in parallel
|
# Consumes a lot of memory! Do not run in parallel
|
||||||
local/sprak_run_rnnlms_tri3b.sh data/lang_test_3g data/local/rnnlms/g_c380_d1k_h100_v130k data/test1k exp/tri3b/decode_3g_test1k
|
local/sprak_run_rnnlms_tri3b.sh data/lang_test_3g data/local/rnnlms/g_c380_d1k_h100_v130k data/test1k exp/tri3b/decode_3g_test1k
|
||||||
|
|
||||||
|
|
||||||
# From 3b system
|
# From 3b system
|
||||||
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
||||||
data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
|
data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
|
||||||
|
@ -175,9 +169,6 @@ steps/train_sat.sh --cmd "$train_cmd" \
|
||||||
utils/mkgraph.sh data/lang_test_3g exp/tri4a exp/tri4a/graph_3g || exit 1;
|
utils/mkgraph.sh data/lang_test_3g exp/tri4a exp/tri4a/graph_3g || exit 1;
|
||||||
steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
|
steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
|
||||||
exp/tri4a/graph_3g data/test1k exp/tri4a/decode_3g_test1k || exit 1;
|
exp/tri4a/graph_3g data/test1k exp/tri4a/decode_3g_test1k || exit 1;
|
||||||
# steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
|
|
||||||
# exp/tri4a/graph_tgpr data/test_eval92 exp/tri4a/decode_tgpr_eval92 || exit 1;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
steps/train_quick.sh --cmd "$train_cmd" \
|
steps/train_quick.sh --cmd "$train_cmd" \
|
||||||
|
@ -195,9 +186,7 @@ steps/train_quick.sh --cmd "$train_cmd" \
|
||||||
|
|
||||||
wait
|
wait
|
||||||
|
|
||||||
|
# alignment used to train nnets and sgmms
|
||||||
# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
|
|
||||||
# all the data). Use 30 jobs.
|
|
||||||
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
|
||||||
data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
|
data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
|
||||||
|
|
||||||
|
@ -207,9 +196,6 @@ local/sprak_run_nnet_cpu.sh 3g test1k
|
||||||
## Works
|
## Works
|
||||||
local/sprak_run_sgmm2.sh test1k
|
local/sprak_run_sgmm2.sh test1k
|
||||||
|
|
||||||
# You probably want to run the hybrid recipe as it is complementary:
|
|
||||||
#local/run_hybrid.sh
|
|
||||||
|
|
||||||
|
|
||||||
# Getting results [see RESULTS file]
|
# Getting results [see RESULTS file]
|
||||||
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
|
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
|
||||||
|
|
Загрузка…
Ссылка в новой задаче