sandbox/online: merging changes from trunk

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/online@4243 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-08-03 01:07:56 +00:00
Parent c45978c068 3ef595c743
Commit 3b2a6582b6
61 changed files with 2488 additions and 324 deletions

View File

@ -25,6 +25,8 @@ do
utils/fix_data_dir.sh $data_dir/$split
utils/validate_data_dir.sh $data_dir/$split
rm $data_dir/$split/*.tmp
if ls $data_dir/$split/*.tmp &> /dev/null; then
rm $data_dir/$split/*.tmp
fi
done
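Note on the guard above: checking with ls first keeps rm from failing (and from aborting the script if it is ever run under set -e) when no *.tmp files are present. A roughly equivalent one-liner under bash's default globbing, sketched here purely for comparison, is:

    rm -f $data_dir/$split/*.tmp    # rm -f stays silent and exits 0 for nonexistent operands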

View File

@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Gets lattice oracles
if [ $# -lt 3 ]; then
echo "Specify lattice dir, symbol table and text file for partition"
exit 1;
fi
latticeDir=$1
textFile=$3
symTable=$2
oracleDir=$latticeDir/oracle
echo $latticeDir
echo $oracleDir
. path.sh
if [ ! -f $textFile -o ! -f $symTable -o ! -d $latticeDir ]; then
echo "Required files not found"
exit 1;
fi
mkdir -p $oracleDir
cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | \
utils/sym2int.pl -f 2- $symTable | \
$KALDI_ROOT/src/latbin/lattice-oracle --word-symbol-table=$symTable "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- ark,t:$oracleDir/oracle.tra 2>$oracleDir/oracle.log
sort -k1,1 -u $oracleDir/oracle.tra -o $oracleDir/oracle.tra
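For reference, the run script later in this commit calls this as, for example:

    local/get_oracle.sh exp/tri5a/decode_dev exp/tri5a/graph/words.txt data/dev/text

The per-utterance oracle transcripts end up in $oracleDir/oracle.tra, and lattice-oracle's log output (including its overall error summary) is redirected to $oracleDir/oracle.log.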

View File

@ -10,7 +10,8 @@ if [ $# -lt 3 ]; then
exit 1
fi
prunebeam=2
prunebeam=50
maxProcesses=10
latdir=$1
decode_dir=$2
@ -33,6 +34,7 @@ then
mkdir -p $latdir/$compiledLatDir
mkdir -p $latdir/$preplfLatDir
runningProcesses=0
for l in $decode_dir/lat.*.gz
do
(
@ -69,11 +71,19 @@ then
continue
fi
# Replace laugh, unk, oov, noise with eps
echo "$line" | awk '{if ($3 == 2038 || $3 == 2039 || $3 == 2040) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat"
echo "$line" | awk '{if ($3 == 1157 || $3 == 5327 || $3 == 5328 || $3 == 5329 || $3 ==5326) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat"
done < $bname.ark.fst
echo "Done isolating lattices"
fi
) &
runningProcesses=$((runningProcesses+1))
echo "#### Processes running = " $runningProcesses " ####"
if [ $runningProcesses -eq $maxProcesses ]; then
echo "#### Waiting for slot ####"
wait
runningProcesses=0
echo "#### Done waiting ####"
fi
done
wait
rm $latdir/*.bin
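A note on the runningProcesses/maxProcesses logic added throughout this file: wait with no arguments blocks until all outstanding background jobs have finished, so the loop runs in batches of at most $maxProcesses jobs (a new batch only starts once the whole previous batch completes) rather than keeping a constant number of jobs in flight. A minimal sketch of the pattern, with illustrative names:

    runningProcesses=0
    for f in $inputs; do
      ( process "$f" ) &
      runningProcesses=$((runningProcesses+1))
      if [ $runningProcesses -eq $maxProcesses ]; then
        wait; runningProcesses=0
      fi
    done
    wait   # pick up the final, possibly partial batch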
@ -82,6 +92,7 @@ then
if [ $stage -le 2 ]; then
#Compile lattices
runningProcesses=0
for l in $latdir/$rawLatDir/*.lat
do
(
@ -89,6 +100,14 @@ then
bname=${l##*/}
fstcompile --arc_type=log $latdir/$rawLatDir/$bname $latdir/$compiledLatDir/$bname
) &
runningProcesses=$((runningProcesses+1))
echo "#### Processes running = " $runningProcesses " ####"
if [ $runningProcesses -eq $maxProcesses ]; then
echo "#### Waiting for slot ####"
wait
runningProcesses=0
echo "#### Done waiting ####"
fi
done
wait
echo "Done compiling lattices."
@ -99,6 +118,7 @@ then
# Create a dummy FST with one state and no arcs first
echo 0 | fstcompile --arc_type=log - $latdir/$preplfLatDir/dummy.fst
# Push Lattice weights towards initial state
runningProcesses=0
for l in $latdir/$compiledLatDir/*.lat
do
(
@ -112,6 +132,14 @@ then
fstrmepsilon - | \
fstreverse - $latdir/$preplfLatDir/$bname
) &
runningProcesses=$((runningProcesses+1))
echo "#### Processes running = " $runningProcesses " ####"
if [ $runningProcesses -eq $maxProcesses ]; then
echo "#### Waiting for slot ####"
wait
runningProcesses=0
echo "#### Done waiting ####"
fi
done
wait
# Let's take a moment to thank the dummy FST for playing its

View File

@ -0,0 +1,32 @@
#!/usr/bin/env bash
outDir=exp/lat
mkdir -p $outDir
stage=1
if [ $stage -lt 1 ]; then
# First convert all lattices into the pruned, minimized version
decodeDir=exp/tri5a/decode_dev
acousticScale=0.8333
local/latconvert.sh $outDir $decodeDir $acousticScale
decodeDir=exp/tri5a/decode_test
acousticScale=0.8333
local/latconvert.sh $outDir $decodeDir $acousticScale
fi
if [ $stage -lt 2 ]; then
# Get oracles
latticeDir=exp/tri5a/decode_dev
textFile=data/dev/text
symTable=exp/tri5a/graph/words.txt
local/get_oracle.sh $latticeDir $symTable $textFile
latticeDir=exp/tri5a/decode_test
textFile=data/test/text
symTable=exp/tri5a/graph/words.txt
local/get_oracle.sh $latticeDir $symTable $textFile
fi

View File

@ -56,14 +56,14 @@ steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
# ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random
# utterances from those.
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
data/train data/lang exp/mono0a
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph

View File

@ -153,9 +153,14 @@ steps/train_sat.sh --cmd "$train_cmd" \
(
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/dev exp/tri5a/decode_dev
exp/tri5a/graph data/dev exp/tri5a/decode_dev
)&
#
# steps/cleanup/find_bad_utts.sh --nj 200 --cmd "$train_cmd" data/train data/lang \
# exp/tri5a exp/tri5a_cleanup
# local/run_for_spkid.sh
# we don't have the results for the step below yet.

View File

@ -118,14 +118,17 @@ exit 0
%WER 1.80 [ 226 / 12533, 29 ins, 44 del, 153 sub ] exp/nnet4c/decode/wer_4
%WER 8.49 [ 1064 / 12533, 80 ins, 175 del, 809 sub ] exp/nnet4c/decode_ug/wer_11
%WER 1.61 [ 202 / 12533, 25 ins, 47 del, 130 sub ] exp/nnet4d/decode/wer_5
%WER 8.17 [ 1024 / 12533, 83 ins, 179 del, 762 sub ] exp/nnet4d/decode_ug/wer_11
%WER 1.68 [ 211 / 12533, 29 ins, 39 del, 143 sub ] exp/nnet4d/decode/wer_4
%WER 8.40 [ 1053 / 12533, 101 ins, 153 del, 799 sub ] exp/nnet4d/decode_ug/wer_10
%WER 1.63 [ 204 / 12533, 29 ins, 42 del, 133 sub ] exp/nnet4d_gpu/decode/wer_4
%WER 8.11 [ 1016 / 12533, 80 ins, 168 del, 768 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.74 [ 218 / 12533, 25 ins, 48 del, 145 sub ] exp/nnet4d_gpu/decode/wer_6
%WER 8.39 [ 1051 / 12533, 106 ins, 149 del, 796 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.63 [ 204 / 12533, 29 ins, 42 del, 133 sub ] exp/nnet4d_gpu/decode/wer_4
%WER 8.11 [ 1016 / 12533, 80 ins, 168 del, 768 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.53 [ 192 / 12533, 22 ins, 42 del, 128 sub ] exp/nnet4d2/decode/wer_3
%WER 8.06 [ 1010 / 12533, 79 ins, 152 del, 779 sub ] exp/nnet4d2/decode_ug/wer_8
%WER 1.51 [ 189 / 12533, 25 ins, 34 del, 130 sub ] exp/nnet4d2_gpu/decode/wer_3
%WER 7.97 [ 999 / 12533, 78 ins, 152 del, 769 sub ] exp/nnet4d2_gpu/decode_ug/wer_8
%WER 1.37 [ 172 / 12533, 14 ins, 36 del, 122 sub ] exp/nnet4e_gpu/decode/wer_3
%WER 8.03 [ 1006 / 12533, 61 ins, 179 del, 766 sub ] exp/nnet4e_gpu/decode_ug/wer_8
@ -153,8 +156,8 @@ exit 0
# Discriminatively trained system (using p-norm rather than tanh nonlinearities, using SMBR, on GPU)
%WER 1.56 [ 195 / 12533, 28 ins, 31 del, 136 sub ] exp/nnet5d_mpe_gpu/decode_epoch2/wer_2
%WER 8.35 [ 1047 / 12533, 77 ins, 171 del, 799 sub ] exp/nnet5d_mpe_gpu/decode_ug_epoch4/wer_10
%WER 1.74 [ 218 / 12533, 25 ins, 48 del, 145 sub ] exp/nnet5d_mpe_gpu/decode_epoch1/wer_6
%WER 8.40 [ 1053 / 12533, 108 ins, 148 del, 797 sub ] exp/nnet5d_mpe_gpu/decode_ug_epoch1/wer_10
# Discriminatively trained system on top of ensemble trained p-norm network (using SMBR, on GPU)
%WER 1.36 [ 170 / 12533, 15 ins, 34 del, 121 sub ] exp/nnet5e_mpe_gpu/decode_epoch2/wer_3

View File

@ -0,0 +1,62 @@
#!/bin/bash
# 4d2 is as 4d but adding perturbed training with multiplier=1.0
train_stage=-10
use_gpu=true
. cmd.sh
. ./path.sh
. utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet4d2_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet4d2
fi
if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--target-multiplier 1.0 \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--pnorm-input-dim 1000 \
--pnorm-output-dim 200 \
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode &
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug
wait

egs/rm/s5/local/nnet2/run_5d.sh (new executable file, 126 lines)
View File

@ -0,0 +1,126 @@
#!/bin/bash
# This script demonstrates discriminative training of p-norm neural nets.
# It's on top of run_4d_gpu.sh which uses adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.
use_gpu=true
stage=0
transform_dir=exp/tri3b_ali
. cmd.sh
. ./path.sh
. utils/parse_options.sh
[ ! -f $transform_dir/num_jobs ] && \
echo "Expected $transform_dir/num_jobs to exist" && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
# The queue options in this script are for the CLSP network, and might not work
# for you.
if $use_gpu; then
. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
align_gpu_opts="-l gpu=1"
use_gpu_flag="--use-gpu yes"
train_parallel_opts="-l gpu=1"
train_num_threads=1
srcdir=exp/nnet4d_gpu
dir=exp/nnet5d_mpe_gpu
nj=$nj_orig
else
align_gpu_opts=
use_gpu_flag="--use-gpu no"
train_parallel_opts="-pe smp 6"
train_num_threads=6
srcdir=exp/nnet4d
dir=exp/nnet5d_mpe
if [ "$decode_cmd" != "run.pl" ]; then
nj=$[$nj_orig*5]; # use more jobs, or it will be slow in the alignment
# phase. But if we are just running everything on
# one machine this won't help us
else
nj=$nj_orig
fi
fi
if [ ! -f $srcdir/final.mdl ]; then
echo "$0: expected $srcdir/final.mdl to exist."
exit 1;
fi
# The denominator lattice creation currently doesn't use GPUs; that would be
# wasteful since the lattice determinization and graph search use up a fair
# amount of CPU, and we'd be idling the GPU much of the time.
# We specify 1G each for mem_free and ram_free, which is per thread... it
# will likely be less than the default. Increase the beam relative to the
# defaults; this is just for this RM setup, where the default beams will likely
# generate very thin lattices.
# Note: the transform-dir is important to
# specify, since this system is on top of fMLLR features.
if [ $stage -le 0 ]; then
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G" \
--nj $nj --sub-split 20 --num-threads 6 --parallel-opts "-pe smp 6" \
--beam 20.0 --lattice-beam 10.0 \
--transform-dir $transform_dir \
data/train data/lang $srcdir ${srcdir}_denlats
fi
if [ $stage -le 1 ]; then
steps/nnet2/align.sh --cmd "$decode_cmd $align_gpu_opts" $use_gpu_flag \
--transform-dir $transform_dir \
--nj $nj data/train data/lang $srcdir ${srcdir}_ali
fi
if [ $stage -le 2 ]; then
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" \
--num-jobs-nnet 2 --transform-dir $transform_dir \
--num-threads "$train_num_threads" --parallel-opts "$train_parallel_opts" data/train data/lang \
${srcdir}_ali ${srcdir}_denlats $srcdir/final.mdl $dir
fi
if [ $stage -le 3 ]; then
for epoch in 1 2 3 4; do
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --iter epoch$epoch \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode_epoch$epoch &
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --iter epoch$epoch \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug_epoch$epoch &
done
wait
fi
exit 0;
# The following are some test commands that I ran in order to verify that
# the neural-net splitting and excising code was working as intended.
# (
# acoustic_scale=0.1
# for criterion in smbr mmi mpfe; do
# for drop_frames in true false; do
# nnet-get-egs-discriminative --drop-frames=$drop_frames --criterion=$criterion --excise=true exp/tri5c_mpe/0.mdl 'ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/train/split8/1/utt2spk scp:data/train/split8/1/cmvn.scp "scp:head -n 40 data/train/split8/1/feats.scp|" ark:- | splice-feats --left-context=3 --right-context=3 ark:- ark:- | transform-feats exp/tri5c_mpe/final.mat ark:- ark:- | transform-feats --utt2spk=ark:data/train/split8/1/utt2spk ark:$transform_dir/trans.1 ark:- ark:- |' 'ark,s,cs:gunzip -c exp/${dir}_ali/ali.1.gz |' 'ark,s,cs:gunzip -c exp/${dir}_denlats/lat.1.gz|' "ark:|nnet-combine-egs-discriminative ark:- ark:1.egs"
# nnet-get-egs-discriminative --drop-frames=$drop_frames --criterion=$criterion --split=false --excise=false exp/tri5c_mpe/0.mdl 'ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/train/split8/1/utt2spk scp:data/train/split8/1/cmvn.scp "scp:head -n 40 data/train/split8/1/feats.scp|" ark:- | splice-feats --left-context=3 --right-context=3 ark:- ark:- | transform-feats exp/tri5c_mpe/final.mat ark:- ark:- | transform-feats --utt2spk=ark:data/train/split8/1/utt2spk ark:$transform_dir/trans.1 ark:- ark:- |' 'ark,s,cs:gunzip -c exp/${dir}_ali/ali.1.gz |' 'ark,s,cs:gunzip -c exp/${dir}_denlats/lat.1.gz|' ark:2.egs
# nnet-compare-hash-discriminative --acoustic-scale=$acoustic_scale --drop-frames=$drop_frames --criterion=$criterion $dir/final.mdl ark:1.egs ark:2.egs || exit 1;
# done
# done
# )

View File

@ -21,12 +21,15 @@ if $use_gpu; then
# This one is for training pnorm nnets on top of 40-dim + fMLLR features
# **THIS IS THE PRIMARY RECIPE**
local/nnet2/run_4d.sh --use-gpu true
# as above with 'perturbed training'. A bit better results, a bit slower.
local/nnet2/run_4d2.sh --use-gpu true
# This is discriminative training on top of 4c.
# This is discriminative training on top of 4c. (hardly helps)
local/nnet2/run_5c_gpu.sh
# This is discriminative training on top of 4d.
local/nnet2/run_5d_gpu.sh
local/nnet2/run_5d.sh --use-gpu true
else
# This example runs on top of "raw-fMLLR" features;
# you have to run local/run_raw_fmllr.sh first.
@ -42,9 +45,15 @@ else
# **THIS IS THE PRIMARY RECIPE (40-dim + fMLLR + p-norm neural net)**
local/nnet2/run_4d.sh --use-gpu false
# as above with 'perturbed training'. A bit better results, a bit slower.
local/nnet2/run_4d2.sh --use-gpu false
# This is discriminative training on top of 4c.
local/nnet2/run_5c.sh
# This is discriminative training on top of 4d.
local/nnet2/run_5d.sh --use-gpu false
# This is p-norm on top of raw-fMLLR.
#local/nnet2/run_4e.sh

View File

@ -146,6 +146,15 @@ steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
steps/align_fmllr.sh --nj 8 --cmd "$train_cmd" --use-graphs true \
data/train data/lang exp/tri3b exp/tri3b_ali
# # We have now added a script that will help you find portions of your data that
# # have bad transcripts, so you can filter them out. Below we demonstrate how to
# # run this script.
# steps/cleanup/find_bad_utts.sh --nj 20 --cmd "$train_cmd" data/train data/lang \
# exp/tri3b_ali exp/tri3b_cleanup
# # The following command will show you some of the hardest-to-align utterances in the data.
# head exp/tri3b_cleanup/all_info.sorted.txt
## MMI on top of tri3b (i.e. LDA+MLLT+SAT+MMI)
steps/make_denlats.sh --config conf/decode.config \
--nj 8 --cmd "$train_cmd" --transform-dir exp/tri3b_ali \

View File

@ -20,6 +20,9 @@
. ./path.sh ## Source the tools/utils (import the queue.pl)
nj=80
decode_nj=8
# Config:
gmmdir=exp/tri3
data_fmllr=data-fmllr-tri3
@ -69,10 +72,10 @@ if [ $stage -le 2 ]; then
steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
$data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir || exit 1;
# Decode (reuse HCLG graph)
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
--num-threads 3 --parallel-opts "-pe smp 4" \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
--num-threads 3 --parallel-opts "-pe smp 4" \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;
fi
@ -87,9 +90,9 @@ acwt=0.1
if [ $stage -le 3 ]; then
# First we generate lattices and alignments:
steps/nnet/align.sh --nj 80 --cmd "$train_cmd" \
steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 3 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/make_denlats.sh --nj 6 --sub-split $nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--acwt $acwt $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
@ -99,11 +102,11 @@ if [ $stage -le 4 ]; then
$data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1; do
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;
@ -117,9 +120,9 @@ acwt=0.1
if [ $stage -le 5 ]; then
# First we generate lattices and alignments:
steps/nnet/align.sh --nj 80 --cmd "$train_cmd" \
steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 3 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/make_denlats.sh --nj 6 --sub-split $nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--acwt $acwt $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
@ -129,11 +132,11 @@ if [ $stage -le 6 ]; then
$data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1 2 3 4; do
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;

View File

@ -27,8 +27,9 @@ numGaussUBM=400
numLeavesSGMM=7000
numGaussSGMM=9000
decode_nj=5
feats_nj=10
train_nj=30
decode_nj=5
echo ============================================================================
echo " Data & Lexicon & Language Preparation "
@ -60,7 +61,7 @@ mfccdir=mfcc
for x in train dev test; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 data/$x exp/make_mfcc/$x $mfccdir
steps/make_mfcc.sh --cmd "$train_cmd" --nj $feats_nj data/$x exp/make_mfcc/$x $mfccdir
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
done

View File

@ -272,6 +272,8 @@ steps/train_sat.sh --cmd "$train_cmd" \
) &
# This step is just to demonstrate the train_quick.sh script, in which we
# initialize the GMMs from the old system's GMMs.
steps/train_quick.sh --cmd "$train_cmd" \
4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4b || exit 1;

View File

@ -56,6 +56,7 @@ echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.alimdl $dir 2>/dev/null
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.

View File

@ -42,6 +42,11 @@ lang=$2
srcdir=$3
dir=$4
for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
@ -57,6 +62,7 @@ cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

View File

@ -0,0 +1,165 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Computes training alignments using a model with delta or
# LDA+MLLT features. This version, rather than just using the
# text to align, computes mini-language models (unigram) from the text
# and a few common words in the LM, and allows
# Begin configuration section.
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=20.0
lattice_beam=10.0
transform_dir= # directory to find fMLLR transforms in.
top_n_words=100 # Number of common words that we compile into each graph (most frequent
# in $lang/text).
stage=0
cleanup=true
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \
$lang/L_disambig.fst $lang/phones/disambig.int; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \
awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \
sort -rn > $dir/word_counts.int || exit 1;
num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1;
# print top-n words with their unigram probabilities.
head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int
utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt
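# For illustration (counts hypothetical): if word_counts.int contains the line
# "250 1437" and the summed count over all words is 10000, the corresponding
# line of top_words.int is "0.025 1437"; top_words.txt is the same list with
# field 2 mapped back to the written word via words.txt.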
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $srcdir/full.mat $dir
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ -z "$transform_dir" ] && [ -f $srcdir/trans.1 ]; then
transform_dir=$srcdir
fi
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
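# (trans.scp keeps the original archive keys, so transform-feats can look the
# transforms up via --utt2spk regardless of how the data is now split)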
for n in $(seq $nj_orig); do cat $transform_dir/trans.$n; done | \
copy-feats ark:- ark,scp:$dir/trans.ark,$dir/trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
elif [ -f $srcdir/final.alimdl ]; then
echo "$0: **WARNING**: you seem to be using an fMLLR system as input,"
echo " but you are not providing the --transform-dir option during alignment."
fi
echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir"
if [ $stage -le 0 ]; then
rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \
steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \
compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
$dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$beam \
--lattice-beam=$lattice_beam --word-symbol-table=$lang/words.txt \
$dir/final.mdl ark:- "$feats" ark:- \| \
lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
ark,t:- ark,t:$dir/edits.JOB.txt \| \
utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1;
fi
if [ $stage -le 1 ]; then
if [ -f $dir/edits.1.txt ]; then
for x in $(seq $nj); do cat $dir/edits.$x.txt; done > $dir/edits.txt
for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done > $dir/aligned_ref.txt
else
echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present."
fi
# in case any utterances failed to align, get a filtered copy of $data/text.
utils/filter_scp.pl $dir/edits.txt < $data/text > $dir/text
cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt
n1=$(wc -l < $dir/edits.txt)
n2=$(wc -l < $dir/aligned_ref.txt)
n3=$(wc -l < $dir/text)
n4=$(wc -l < $dir/length.txt)
if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
echo "$0: mismatch in lengths of files:"
wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt
exit 1;
fi
# note: the format of all_info.txt is:
# <utterance-id> <number of errors> <reference-length> <decoded-output> <reference>
# with the fields separated by tabs, e.g.
# adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED
paste $dir/edits.txt \
<(awk '{print $2}' $dir/length.txt) \
<(awk '{$1="";print;}' <$dir/aligned_ref.txt) \
<(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt
sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt
if $cleanup; then
rm $dir/edits.*.txt $dir/aligned_ref.*.txt
fi
fi

View File

@ -0,0 +1,45 @@
#!/usr/bin/perl -w
# makes unigram decoding-graph FSTs specific to each utterance, where the
# supplied top-n-words list together with the supervision text of the utterance are
# combined.
if (@ARGV != 1) {
print STDERR "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" .
"e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" .
" make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ... \n";
}
($top_words_file) = @ARGV;
open(F, "<$top_words_file") || die "opening $top_words_file";
%top_word_probs = ( );
while(<F>) {
@A = split;
(@A == 2 && $A[0] > 0.0) || die "Bad line $_ in $top_words_file";
$A[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $_\n";
$top_word_probs{$A[1]} += $A[0];
}
while (<STDIN>) {
@A = split;
$utterance_id = shift @A;
print "$utterance_id\n";
$num_words = @A + 0; # length of array @A
%word_probs = %top_word_probs;
foreach $w (@A) {
$w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $_";
$word_probs{$w} += 1.0 / $num_words;
}
foreach $w (keys %word_probs) {
$prob = $word_probs{$w};
$prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n";
$cost = -log($prob);
print "0 0 $w $w $cost\n";
}
$final_cost = -log(1.0 / $num_words);
print "0 $final_cost\n";
print "\n"; # Empty line terminates the FST in the text-archive format.
}
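To make the emitted format concrete, a small hypothetical case (word-ids and probabilities invented for illustration): with top_words.int containing the single line "0.5 7" and the input line "utt1 10 11 10" (three words), the script prints, with arcs in whatever order the hash yields and costs rounded here:

    utt1
    0 0 7 7 0.693147      (-log 0.5)
    0 0 10 10 0.405465    (-log 2/3)
    0 0 11 11 1.098612    (-log 1/3)
    0 1.098612            (final cost, -log 1/num_words)

followed by the empty line that terminates this FST in the text-archive format.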

View File

@ -4,7 +4,8 @@
# Apache 2.0
# Begin configuration section.
transform_dir=
transform_dir= # this option won't normally be used, but it can be used if you want to
# supply existing fMLLR transforms when decoding.
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
stage=0

View File

@ -77,20 +77,31 @@ case $feat_type in
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
if ! [ $nj -eq `cat $transform_dir/num_jobs` ]; then
echo "$0: Number of jobs mismatch with transform-dir: $nj versus `cat $transform_dir/num_jobs`";
echo "$0: using transforms from $transform_dir"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
exit 1;
fi
if [ $feat_type == "lda" ]; then
[ ! -f $transform_dir/trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1;
echo "$0: using transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $feat_type == "raw" ]; then
[ ! -f $transform_dir/raw_trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1;
echo "$0: using raw-fMLLR transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi

View File

@ -145,6 +145,7 @@ fi
if [ $stage -le 0 ]; then
echo "$0: Accumulating LDA statistics."
rm $dir/lda.*.acc 2>/dev/null # in case any left over from before.
$cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
@ -157,11 +158,19 @@ echo $lda_dim > $dir/lda_dim
echo $ivector_dim > $dir/ivector_dim
if [ $stage -le 1 ]; then
nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
--within-class-factor=$within_class_factor --dim=$lda_dim \
$dir/lda.mat $dir/lda.*.acc \
2>$dir/log/lda_est.log || exit 1;
sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1;
rm $dir/lda.*.acc
fi
if [ $stage -le 2 ]; then
# There are various things that we sometimes (but not always) need
# the within-class covariance and its Cholesky factor for, and we
# write these to disk just in case.
nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
--write-within-covar=$dir/within_covar.spmat \
--within-class-factor=$within_class_factor --dim=$lda_dim \
$dir/lda.mat $dir/lda.acc \
2>$dir/log/lda_est.log || exit 1;
fi
echo "$0: Finished estimating LDA"

View File

@ -95,25 +95,39 @@ echo "align_si.sh: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
;;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "$0: using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist."
[ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
&& echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
[ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
if [ -f $srcdir/final.alimdl ]; then
echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option.";
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
exit 1;
fi
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi
if [ $sub_split -eq 1 ]; then

View File

@ -22,7 +22,7 @@ num_jobs_nnet=4 # Number of neural net jobs to run in parallel. Note: this
samples_per_iter=400000 # measured in frames, not in "examples"
spk_vecs_dir=
modify_learning_rates=false
modify_learning_rates=true
last_layer_factor=1.0 # relates to modify-learning-rates
first_layer_factor=1.0 # relates to modify-learning-rates
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
@ -140,17 +140,38 @@ case $feat_type in
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
[ -z "$transform_dir" ] && transform_dir=$alidir
if [ -z "$transform_dir" ]; then
if [ -f $transform_dir/trans.1 ] || [ -f $transform_dir/raw_trans.1 ]; then
transform_dir=$alidir
fi
fi
if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then
echo "$0: LDA transforms differ between $alidir and $transform_dir"
exit 1;
fi
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
echo "$0: using raw-fMLLR transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
fi
if [ -z "$degs_dir" ]; then
if [ $stage -le -8 ]; then

View File

@ -64,6 +64,10 @@ max_change_per_sample=0.075
precondition_rank_in=20 # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning
# this relates to perturbed training.
min_target_objf_change=0.1
target_multiplier=0 # Set this to e.g. 1.0 to enable perturbed training.
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
@ -262,24 +266,49 @@ echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."
function set_target_objf_change {
# nothing to do if $target_multiplier not set.
[ "$target_multiplier" == "0" -o "$target_multiplier" == "0.0" ] && return;
[ $x -le $finish_add_layers_iter ] && return;
wait=2 # the compute_prob_{train,valid} from 2 iterations ago should
# most likely be done even though we backgrounded them.
[ $[$x-$wait] -le 0 ] && return;
while true; do
# Note: awk 'some-expression' is the same as: awk '{if(some-expression) print;}'
train_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_train.$[$x-$wait].log)
valid_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_valid.$[$x-$wait].log)
if [ -z "$train_prob" ] || [ -z "$valid_prob" ]; then
echo "$0: waiting until $dir/log/compute_prob_{train,valid}.$[$x-$wait].log are done"
sleep 60
else
target_objf_change=$(perl -e '($train,$valid,$min_change,$multiplier)=@ARGV; if (!($train < 0.0) || !($valid < 0.0)) { print "0\n"; print STDERR "Error: invalid train or valid prob: $train_prob, $valid_prob\n"; exit(0); } else { print STDERR "train,valid=$train,$valid\n"; $proposed_target = $multiplier * ($train-$valid); if ($proposed_target < $min_change) { print "0"; } else { print $proposed_target; }}' -- "$train_prob" "$valid_prob" "$min_target_objf_change" "$target_multiplier")
echo "On iter $x, (train,valid) probs from iter $[$x-$wait] were ($train_prob,$valid_prob), and setting target-objf-change to $target_objf_change."
return;
fi
done
}
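# Worked example (numbers hypothetical): with target_multiplier=1.0 and
# min_target_objf_change=0.1, train/valid log-probs of -1.20 and -1.55 from
# iteration x-2 give a proposed target of 1.0 * (-1.20 - (-1.55)) = 0.35; since
# 0.35 >= 0.1 the target-objf-change is set to 0.35, while a proposed value
# below 0.1 would leave it at 0.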
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
if [ $num_threads -eq 1 ]; then
train_suffix="-simple" # this enables us to use GPU code if
parallel_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
parallel_train_opts=
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
train_suffix="-parallel --num-threads=$num_threads"
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
fi
x=0
target_objf_change=0 # relates to perturbed training.
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
@ -316,11 +345,19 @@ while [ $x -lt $num_iters ]; do
this_minibatch_size=$minibatch_size
do_average=true
fi
set_target_objf_change; # only has effect if target_multiplier != 0
if [ "$target_objf_change" != "0" ]; then
[ ! -f $dir/within_covar.spmat ] && \
echo "$0: expected $dir/within_covar.spmat to exist." && exit 1;
perturb_suffix="-perturbed"
perturb_opts="--target-objf-change=$target_objf_change --within-covar=$dir/within_covar.spmat"
fi
$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$train_suffix \
nnet-train$parallel_suffix$perturb_suffix $parallel_train_opts $perturb_opts \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;

View File

@ -12,7 +12,6 @@
# Begin configuration section.
stage=-5
fmllr_update_type=full
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
@ -197,9 +196,9 @@ while [ $x -lt $num_iters ]; do
if echo $fmllr_iters | grep -w $x >/dev/null; then
if [ $stage -le $x ]; then
echo Estimating fMLLR transforms
# We estimate a transform that's additional to the previous transform;
# we'll compose them.
# Note: it's not really necessary to re-estimate the basis each time
# but this is the way the script does it right now.
echo Estimating basis and fMLLR transforms
$cmd JOB=1:$nj $dir/log/fmllr_est.$x.JOB.log \
ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
@ -209,7 +208,7 @@ while [ $x -lt $num_iters ]; do
# Compute the basis matrices.
$cmd $dir/log/basis_training.log \
gmm-basis-fmllr-training $dir/$x.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;
gmm-basis-fmllr-training $dir/$x.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;
$cmd JOB=1:$nj $dir/log/fmllr_app.$x.JOB.log \
ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \

View File

@ -28,7 +28,7 @@ for($x = 0; $x < 2; $x++) {
}
}
if ($ARGV[0] eq "-f") {
shift @ARGV;
shift @ARGV;
$field_spec = shift @ARGV;
if ($field_spec =~ m/^\d+$/) {
$field_begin = $field_spec - 1; $field_end = $field_spec - 1;

View File

@ -46,6 +46,14 @@ done
! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
echo "$0: $data/utt2spk has wrong format." && exit;
ns=$(wc -l < $data/spk2utt)
if [ "$ns" == 1 ]; then
echo "$0: WARNING: you have only one speaker. This probably a bad idea."
echo " Search for the word 'bold' in http://kaldi.sourceforge.net/data_prep.html"
echo " for more information."
fi
tmpdir=$(mktemp -d kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

View File

@ -12,27 +12,39 @@ if(@ARGV != 1) {
}
$dict = shift @ARGV;
$dict =~ s:/$::;
$exit = 0;
$success = 1; # this is re-set each time we read a file.
sub set_to_fail { $exit = 1; $success = 0; }
# Checking silence_phones.txt -------------------------------
print "Checking $dict/silence_phones.txt ...\n";
if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
$idx = 1;
%silence = ();
$success = 1;
print "--> reading $dict/silence_phones.txt\n";
while(<S>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
if (@col == 0) {
set_to_fail();
print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
}
foreach(0 .. @col-1) {
my $p = $col[$_];
if($silence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; $success = 0;}
if($silence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; }
else {$silence{$p} = 1;}
if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){
$exit = 1;
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form";
$success = 0;
}
}
$idx ++;
@ -52,9 +64,9 @@ while(<OS>) {
chomp;
my @col = split(" ", $_);
if ($idx > 1 or @col > 1) {
$exit = 1; print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; $success = 0;
set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
} elsif (!$silence{$col[0]}) {
$exit = 1; print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; $success = 0;
set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
}
$idx ++;
}
@ -71,22 +83,29 @@ $idx = 1;
$success = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
while(<NS>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
if (@col == 0) {
set_to_fail();
print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
}
foreach(0 .. @col-1) {
my $p = $col[$_];
if($nonsilence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; $success = 0;}
if($nonsilence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; }
else {$nonsilence{$p} = 1;}
if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){
$exit = 1;
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form";
$success = 0;
}
}
$idx ++;
}
close(NS);
$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
print "\n";
# Checking disjoint -------------------------------
@ -106,37 +125,37 @@ sub intersect {
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if(@itset == 0) {print "--> disjoint property is OK.\n";}
else {$exit = 1; print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
print "\n";
sub check_lexicon {
my ($lexfn, $pron_probs) = @_;
print "Checking $lexfn\n";
if(-z "$lexfn") {$exit = 1; print "--> ERROR: $lexfn is empty or not exists\n";}
if(!open(L, "<$lexfn")) {$exit = 1; print "--> ERROR: fail to open $lexfn\n";}
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or not exists\n";}
if(!open(L, "<$lexfn")) {set_to_fail(); print "--> ERROR: fail to open $lexfn\n";}
$idx = 1;
$success = 1;
print "--> reading $lexfn\n";
while (<L>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $lexfn does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
$word = shift @col;
if (!defined $word) {
$exit = 1; print "--> ERROR: empty lexicon line in $lexfn\n";
$success = 0;
set_to_fail(); print "--> ERROR: empty lexicon line in $lexfn\n";
}
if ($pron_probs) {
$prob = shift @col;
if (!($prob > 0.0 && $prob <= 1.0)) {
$exit = 1; print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n";
$success = 0;
set_to_fail(); print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n";
}
}
foreach (0 .. @col-1) {
if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
$exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n";
$success = 0;
set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n";
}
}
$idx ++;
@ -150,7 +169,7 @@ if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1); }
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n";
$exit = 1;
set_to_fail();
}
# If both lexicon.txt and lexiconp.txt exist, we check that they correspond to
# each other. If not, it could be that the user overwrote one and we need to
@ -161,11 +180,21 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
die "Error opening lexicon.txt and/or lexiconp.txt"; # already checked, so would be code error.
}
while(<L>) {
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/lexicon.txt does not end in newline.\n";
set_to_fail();
last;
}
@A = split;
$x = <P>;
if ($x !~ s/\n$//) {
print "--> ERROR: last line '$x' of $dict/lexiconp.txt does not end in newline.\n";
set_to_fail();
last;
}
if (!defined $x) {
print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
$exit = 1;
set_to_fail();
last;
}
@B = split(" ", $x);
@ -175,13 +204,13 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
# now @A and @B should be the same.
if ($#A != $#B) {
print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
$exit = 1;
set_to_fail();
last;
}
for ($n = 0; $n < @A; $n++) {
if ($A[$n] ne $B[$n]) {
print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
$exit = 1;
set_to_fail();
last;
}
}
@ -189,32 +218,40 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
$x = <P>;
if (defined $x && $exit == 0) {
print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
$exit = 1;
set_to_fail();
}
}
# Checking extra_questions.txt -------------------------------
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
if(!open(EX, "<$dict/extra_questions.txt")) {$exit = 1; print "--> ERROR: fail to open $dict/extra_questions.txt\n";}
if (!open(EX, "<$dict/extra_questions.txt")) {
set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
}
$idx = 1;
$success = 1;
print "--> reading $dict/extra_questions.txt\n";
while(<EX>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
foreach(0 .. @col-1) {
if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
$exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n";
$success = 0;
}
if (@col == 0) {
set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
}
}
foreach(0 .. @col-1) {
if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n";
}
$idx ++;
}
}
close(EX);
$success == 0 || print "--> $dict/extra_questions.txt is OK\n";
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
if($exit == 1) { print " [Error detected ]\n"; exit 1;}
if ($exit == 1) { print "--> ERROR validating dictionary directory $dict (see detailed error messages above)\n"; exit 1;}
else { print "--> SUCCESS [validating dictionary directory $dict]\n"; }
exit 0;
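As context for the checks above, a minimal dictionary directory that passes this script looks roughly like the following (contents purely illustrative):

    silence_phones.txt       SIL
    optional_silence.txt     SIL
    nonsilence_phones.txt    AA
                             B
                             K
    lexicon.txt              CAB  K AA B
                             (or lexiconp.txt, with a pron-prob in (0,1] as field 2)
    extra_questions.txt      (may be empty)

Every phone used in the lexicon must appear in either silence_phones.txt or nonsilence_phones.txt, and the two phone lists must not overlap.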

View File

@ -1,6 +1,8 @@
#!/usr/bin/perl
# Guoguo Chen (guoguo@jhu.edu)
# Apache 2.0.
# Copyright 2012 Guoguo Chen
# 2014 Neil Nelson
#
# Validation script for data/lang
@ -132,7 +134,7 @@ sub check_txt_int_csl {
}
sub check_txt_int {
my ($cat, $symtab) = @_;
my ($cat, $symtab, $sym_check) = @_;
print "Checking $cat.\{txt, int\} ...\n";
if (-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";}
if (-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";}
@ -154,6 +156,7 @@ sub check_txt_int {
close(TXT); $idx1 --;
print "--> $idx1 entry/entries in $cat.txt\n";
my %used_syms = ();
$idx2 = 1;
while(<INT>) {
chomp;
@ -168,6 +171,8 @@ sub check_txt_int {
if (@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
foreach(0 .. @set-1) {
if ($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";}
if ($sym_check && defined $used_syms{@set[$_]}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int contain duplicate symbols (break at line $idx2, block " ,$_+1, ")\n";}
$used_syms{@set[$_]} = 1;
}
$idx2 ++;
}
@ -175,31 +180,16 @@ sub check_txt_int {
if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
print "--> $cat.int corresponds to $cat.txt\n";
return print "--> $cat.\{txt, int\} are OK\n";
}
if ($sym_check) {
while ( my ($key, $value) = each(%silence) ) {
if (!defined $used_syms{$key}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all silence phones\n";}
}
while ( my ($key, $value) = each(%nonsilence) ) {
if (!defined $used_syms{$key}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all non-silence phones\n";}
}
}
@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence");
@list2 = ("roots", "sets");
foreach(@list1) {
check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n";
}
foreach(@list2) {
check_txt_int("$lang/phones/$_", \%psymtab); print "\n";
}
if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
check_txt_int("$lang/phones/extra_questions", \%psymtab); print "\n";
} else {
print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n";
$warning = 1;
} else {
print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
$exit = 1;
}
}
if (-e "$lang/phones/word_boundary.txt") {
check_txt_int("$lang/phones/word_boundary", \%psymtab); print "\n";
return print "--> $cat.\{txt, int\} are OK\n";
}
# Check disjoint and summation -------------------------------
@ -217,7 +207,7 @@ sub intersect {
}
sub check_disjoint {
print "Checking disjoint: silence.txt, nosilenct.txt, disambig.txt ...\n";
print "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n";
if (!open(S, "<$lang/phones/silence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";}
if (!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";}
if (!open(D, "<$lang/phones/disambig.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";}
@ -336,6 +326,30 @@ sub check_summation {
check_disjoint; print "\n";
check_summation; print "\n";
@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence");
@list2 = ("roots", "sets");
foreach(@list1) {
check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n";
}
foreach(@list2) {
check_txt_int("$lang/phones/$_", \%psymtab, 1); print "\n";
}
if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "\n";
} else {
print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n";
$warning = 1;
} else {
print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
$exit = 1;
}
}
if (-e "$lang/phones/word_boundary.txt") {
check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n";
}
# Checking optional_silence.txt -------------------------------
print "Checking optional_silence.txt ...\n";
$idx = 1;
@ -550,7 +564,7 @@ if (-s "$lang/phones/word_boundary.int") {
}
# Check oov -------------------------------
check_txt_int("$lang/oov", \%wsymtab); print "\n";
check_txt_int("$lang/oov", \%wsymtab, 0); print "\n";
# Check determinizability of G.fst
@ -580,7 +594,6 @@ if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") {
if ($exit == 1) { print "--> ERROR (see error messages above)\n"; exit 1;}
else {
if ($warning == 1) { print "--> WARNING (check output above for warnings)\n"; exit 0; }
else { print "--> SUCCESS\n"; exit 0; }
else { print "--> SUCCESS [validating lang directory $lang]\n"; exit 0; }
}

Просмотреть файл

@ -43,8 +43,8 @@ int main(int argc, char *argv[]) {
"of disambiguation symbols.\n"
"Warning: you probably want to set the --transition-scale and --self-loop-scale\n"
"options; the defaults (zero) are probably not appropriate.\n"
"Usage: compile-train-graphs-fsts [options] tree-in model-in lexicon-fst-in "
" graphs-rspecifier graphs-wspecifier\n"
"Usage: compile-train-graphs-fsts [options] <tree-in> <model-in> <lexicon-fst-in> "
" <graphs-rspecifier> <graphs-wspecifier>\n"
"e.g.: \n"
" compile-train-graphs-fsts --read-disambig-syms=disambig.list\\\n"
" tree 1.mdl lex.fst ark:train.fsts ark:graphs.fsts\n";

Просмотреть файл

@ -37,7 +37,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Creates training graphs (without transition-probabilities, by default)\n"
"\n"
"Usage: compile-train-graphs [options] tree-in model-in lexicon-fst-in transcriptions-rspecifier graphs-wspecifier\n"
"Usage: compile-train-graphs [options] <tree-in> <model-in> <lexicon-fst-in> <transcriptions-rspecifier> <graphs-wspecifier>\n"
"e.g.: \n"
" compile-train-graphs tree 1.mdl lex.fst ark:train.tra ark:graphs.fsts\n";
ParseOptions po(usage);

Просмотреть файл

@ -1,6 +1,7 @@
// bin/get-post-on-ali.cc
// Copyright 2013 Brno University of Technology (Author: Karel Vesely)
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -31,18 +32,24 @@ int main(int argc, char *argv[]) {
typedef kaldi::int32 int32;
try {
const char *usage =
"This program extracts a vector of per-frame posteriors that are selected\n"
"by an alignment (ie. posteriors that are under the alignment path).\n"
"This can be used as a per-frame confidence measure.\n"
"Given input posteriors, e.g. derived from lattice-to-post, and an alignment\n"
"typically derived from the best path of a lattice, outputs the probability in\n"
"the posterior of the corresponding index in the alignment, or zero if it was\n"
"not there. These are output as a vector of weights, one per utterance.\n"
"While, by default, lattice-to-post (as a source of posteriors) and sources of\n"
"alignments such as lattice-best-path will output transition-ids as the index,\n"
"it will generally make sense to either convert these to pdf-ids using\n"
"post-to-pdf-post and ali-to-pdf respectively, or to phones using post-to-phone-post\n"
"and (ali-to-phones --per-frame=true). Since this program only sees the integer\n"
"indexes, it does not care what they represent-- but of course they should match\n"
"(e.g. don't input posteriors with transition-ids and alignments with pdf-ids).\n"
"See http://kaldi.sourceforge.net/hmm.html#transition_model_identifiers for an\n"
"explanation of these types of indexes.\n"
"\n"
"By intuition, it is better to use pdf-posteriors and pdf-alignments,\n"
"because the posteriors of competing hypothesis that are in the same frame\n"
"at same 'pdf-state' are summed up, which is in some sense similar\n"
"to what is done by C-max which sums the posteriors of overlapping words.\n"
"The difference here is that the granularity is per-frame.\n"
"See also: weight-post, post-to-weights, reverse-weights\n"
"\n"
"Usage: get-post-on-ali [options] <posteriors-rspecifier> <ali-rspecifier> <conf-wspecifier>\n"
"e.g.: get-post-on-ali ark:post.ark ark:ali.ark ark:conf.ark\n";
"Usage: get-post-on-ali [options] <posteriors-rspecifier> <ali-rspecifier> <weights-wspecifier>\n"
"e.g.: get-post-on-ali ark:post.ark ark,s,cs:ali.ark ark:weights.ark\n";
ParseOptions po(usage);
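// Illustrative sketch (not part of this file) of the per-utterance computation
// the usage message describes, assuming "post" is a Posterior
// (std::vector<std::vector<std::pair<int32, BaseFloat> > >) and "ali" is the
// alignment for the same utterance, both of the same length:
//   Vector<BaseFloat> weights(post.size());
//   for (size_t t = 0; t < post.size(); t++)
//     for (size_t i = 0; i < post[t].size(); i++)
//       if (post[t][i].first == ali[t])
//         weights(t) += post[t][i].second;  // stays 0.0 if ali[t] is absent.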

Просмотреть файл

@ -32,6 +32,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"This program turns per-frame posteriors, which have transition-ids as\n"
"the integers, into pdf-level posteriors\n"
"See also: post-to-phone-post, post-to-weights, get-post-on-ali\n"
"\n"
"Usage: post-to-pdf-post [options] <model-file> <posteriors-rspecifier> <posteriors-wspecifier>\n"
"e.g.: post-to-pdf-post 1.mdl ark:- ark:-\n";

Просмотреть файл

@ -30,6 +30,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Convert posteriors to phone-level posteriors\n"
"See also: post-to-pdf-post, post-to-weights, get-post-on-ali\n"
"\n"
"Usage: post-to-phone-post [options] <model> <post-rspecifier> <phone-post-wspecifier>\n"
" e.g.: post-to-phone-post --binary=false 1.mdl \"ark:ali-to-post 1.ali|\" ark,t:-\n";

Просмотреть файл

@ -31,7 +31,9 @@ int main(int argc, char *argv[]) {
const char *usage =
"Turn posteriors into per-frame weights (typically most useful after\n"
"weight-silence-post, to get silence weights)\n"
"Usage: post-to-weights post-rspecifier weights-wspecifier\n";
"See also: weight-silence-post, post-to-pdf-post, post-to-phone-post\n"
"get-post-on-ali\n"
"Usage: post-to-weights <post-rspecifier> <weights-wspecifier>\n";
ParseOptions po(usage);
po.Read(argc, argv);

Просмотреть файл

@ -1128,7 +1128,6 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
*tot_objf += weight * log(this_prob);
*tot_weight += weight;
(*this)(m, label) += weight / this_prob;
}
}
}

Просмотреть файл

@ -580,7 +580,7 @@ class CuMatrix: public CuMatrixBase<Real> {
void CompObjfAndDeriv(const std::vector<MatrixElement<Real> > &elements,
const CuMatrix<Real> &A,
Real *tot_objf,
Real* tot_weight);
Real *tot_weight);
private:
void Destroy();

Просмотреть файл

@ -28,28 +28,39 @@ int main(int argc, char *argv[]) {
const char *usage =
"Reads an archive of features and writes a corresponding archive\n"
"that maps utterance-id to utterance length in frames.\n"
"Usage: feat-to-len [options] in-rspecifier out-wspecifier\n"
"e.g.: feat-to-len scp:feats.scp ark,t:feats.lengths\n";
"that maps utterance-id to utterance length in frames, or (with\n"
"one argument) print to stdout the total number of frames in the\n"
"input archive.\n"
"Usage: feat-to-len [options] <in-rspecifier> [<out-wspecifier>]\n"
"e.g.: feat-to-len scp:feats.scp ark,t:feats.lengths\n"
"or: feat-to-len scp:feats.scp\n";
ParseOptions po(usage);
po.Read(argc, argv);
if (po.NumArgs() != 2) {
if (po.NumArgs() != 1 && po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
std::string rspecifier = po.GetArg(1);
std::string wspecifier = po.GetArg(2);
if (po.NumArgs() == 2) {
std::string rspecifier = po.GetArg(1);
std::string wspecifier = po.GetArg(2);
Int32Writer length_writer(wspecifier);
Int32Writer length_writer(wspecifier);
SequentialBaseFloatMatrixReader kaldi_reader(rspecifier);
for (; !kaldi_reader.Done(); kaldi_reader.Next())
length_writer.Write(kaldi_reader.Key(), kaldi_reader.Value().NumRows());
SequentialBaseFloatMatrixReader matrix_reader(rspecifier);
for (; !matrix_reader.Done(); matrix_reader.Next())
length_writer.Write(matrix_reader.Key(), matrix_reader.Value().NumRows());
} else {
int64 tot = 0;
std::string rspecifier = po.GetArg(1);
SequentialBaseFloatMatrixReader matrix_reader(rspecifier);
for (; !matrix_reader.Done(); matrix_reader.Next())
tot += matrix_reader.Value().NumRows();
std::cout << tot << std::endl;
}
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();

Просмотреть файл

@ -234,8 +234,8 @@ int main(int argc, char *argv[]) {
const char *usage =
"Finds the path having the smallest edit-distance between two lattices.\n"
"For efficiency put the smallest lattices first (for example reference strings).\n"
"Usage: lattice-oracle [options] test-lattice-rspecifier reference-rspecifier "
"transcriptions-wspecifier [edit-distance-wspecifier]\n"
"Usage: lattice-oracle [options] <test-lattice-rspecifier> <reference-rspecifier> "
"<transcriptions-wspecifier> [<edit-distance-wspecifier>]\n"
" e.g.: lattice-oracle ark:lat.1 'ark:sym2int.pl -f 2- data/lang/words.txt <data/test/text' ark,t:-\n";
ParseOptions po(usage);
@ -260,20 +260,21 @@ int main(int argc, char *argv[]) {
po.Read(argc, argv);
if (po.NumArgs() != 3) {
if (po.NumArgs() != 3 && po.NumArgs() != 4) {
po.PrintUsage();
exit(1);
}
std::string lats_rspecifier = po.GetArg(1),
reference_rspecifier = po.GetArg(2),
transcriptions_wspecifier = po.GetArg(3);
transcriptions_wspecifier = po.GetArg(3),
edit_distance_wspecifier = po.GetOptArg(4);
// will read input as lattices
SequentialLatticeReader lattice_reader(lats_rspecifier);
RandomAccessInt32VectorReader reference_reader(reference_rspecifier);
Int32VectorWriter transcriptions_writer(transcriptions_wspecifier);
Int32Writer edit_distance_writer(edit_distance_wspecifier);
// Guoguo Chen added the implementation for option "write-lattices".
CompactLatticeWriter lats_writer(lats_wspecifier);
@ -360,8 +361,10 @@ int main(int argc, char *argv[]) {
// count errors
int32 correct, substitutions, insertions, deletions, num_words;
CountErrors(best_path, &correct, &substitutions, &insertions, &deletions, &num_words);
int32 toterrs = substitutions + insertions + deletions;
KALDI_LOG << "%WER " << (100.*toterrs) / num_words << " [ " << toterrs
int32 tot_errs = substitutions + insertions + deletions;
if (edit_distance_wspecifier != "")
edit_distance_writer.Write(key, tot_errs);
KALDI_LOG << "%WER " << (100.*tot_errs) / num_words << " [ " << tot_errs
<< " / " << num_words << ", " << insertions << " insertions, " << deletions
<< " deletions, " << substitutions << " sub ]";
tot_correct += correct;
@ -397,7 +400,7 @@ int main(int argc, char *argv[]) {
}
// Guoguo Chen added the implementation for option "write-lattices".
// Currently it's just a naive implementation: traversal the original
// Currently it's just a naive implementation: traverse the original
// lattice and get the path corresponding to the oracle word sequence.
// Note that this new lattice has the alignment information.
if (lats_wspecifier != "") {

Просмотреть файл

@ -1002,53 +1002,52 @@ void MatrixBase<Real>::MulRowsVec(const VectorBase<Real> &scale) {
}
}
template<typename Real>
void MatrixBase<Real>::MulRowsGroupMat(const MatrixBase<Real> &src) {
KALDI_ASSERT(src.NumCols() > 0 && src.NumCols() <= this->NumCols());
KALDI_ASSERT(this->NumCols() % src.NumCols() == 0 ||
this->NumCols() % (src.NumCols() - 1) < this->NumCols() / (src.NumCols() - 1));
int group_size = 0;
if (this->NumCols() % src.NumCols() == 0) {
group_size = this->NumCols() / src.NumCols();
} else {
group_size = this->NumCols() / src.NumCols() + 1;
}
MatrixIndexT M = num_rows_, N = num_cols_;
KALDI_ASSERT(src.NumRows() == this->NumRows() &&
this->NumCols() % src.NumCols() == 0);
int32 group_size = this->NumCols() / src.NumCols(),
num_groups = this->NumCols() / group_size,
num_rows = this->NumRows();
for (MatrixIndexT i = 0; i < M; i++)
for (MatrixIndexT j = 0; j < N; j++)
(*this)(i, j) *= src(i, j / group_size);
for (MatrixIndexT i = 0; i < num_rows; i++) {
Real *data = this->RowData(i);
for (MatrixIndexT j = 0; j < num_groups; j++, data += group_size) {
Real scale = src(i, j);
cblas_Xscal(group_size, scale, data, 1);
}
}
}
template<typename Real>
void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &src1,
const MatrixBase<Real> &src2,
void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &input,
const MatrixBase<Real> &output,
Real power) {
KALDI_ASSERT(src2.NumCols() > 0 && src2.NumCols() <= this->NumCols());
KALDI_ASSERT(this->NumCols() % src2.NumCols() == 0 ||
this->NumCols() % (src2.NumCols() - 1) < this->NumCols() / (src2.NumCols() - 1));
int group_size = 0;
if (this->NumCols() % src2.NumCols() == 0) {
group_size = this->NumCols() / src2.NumCols();
} else {
group_size = this->NumCols() / src2.NumCols() + 1;
}
MatrixIndexT M = this->NumRows(), N = this->NumCols();
KALDI_ASSERT(input.NumCols() == this->NumCols() && input.NumRows() == this->NumRows());
KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 &&
this->NumRows() == output.NumRows());
int group_size = this->NumCols() / output.NumCols(),
num_rows = this->NumRows(), num_cols = this->NumCols();
if (power == 1.0) {
for (MatrixIndexT i = 0; i < M; i++)
for (MatrixIndexT j = 0; j < N; j++)
(*this)(i, j) = (src1(i, j) == 0 ? 0 : (src1(i, j) > 0 ? 1 : -1));
for (MatrixIndexT i = 0; i < num_rows; i++) {
for (MatrixIndexT j = 0; j < num_cols; j++) {
Real input_val = input(i, j);
(*this)(i, j) = (input_val == 0 ? 0 : (input_val > 0 ? 1 : -1));
}
}
} else {
for (MatrixIndexT i = 0; i < M; i++) {
for (MatrixIndexT j = 0; j < N; j++) {
if (src2(i, j / group_size) == 0) {
for (MatrixIndexT i = 0; i < num_rows; i++) {
for (MatrixIndexT j = 0; j < num_cols; j++) {
Real output_val = output(i, j / group_size),
input_val = input(i, j);
if (output_val == 0)
(*this)(i, j) = 0;
} else {
(*this)(i, j) = pow(std::abs(src1(i, j)), power - 1) *
(src2(i, j / group_size) > 0 ? pow(src2(i, j / group_size), 1 - power) : 1) *
(src1(i, j) >= 0 ? 1 : -1) ;
}
else
(*this)(i, j) = pow(std::abs(input_val), power - 1) *
pow(output_val, 1 - power) * (input_val >= 0 ? 1 : -1) ;
}
}
}
@ -2428,12 +2427,15 @@ void MatrixBase<Real>::SoftHinge(const MatrixBase<Real> &src) {
}
}
}
template<typename Real>
void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) {
int group_size = src.NumCols() / this->NumCols();
KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size);
for (MatrixIndexT i = 0; i < src.NumRows(); i++)
for (MatrixIndexT j = 0; j < this->NumCols(); j++)
KALDI_ASSERT(src.NumCols() % this->NumCols() == 0 &&
src.NumRows() == this->NumRows());
int group_size = src.NumCols() / this->NumCols(),
num_rows = this->NumRows(), num_cols = this->NumCols();
for (MatrixIndexT i = 0; i < num_rows; i++)
for (MatrixIndexT j = 0; j < num_cols; j++)
(*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power);
}

Просмотреть файл

@ -240,8 +240,9 @@ class MatrixBase {
/// each row by a scalar taken from that dimension of the vector.
void MulRowsVec(const VectorBase<Real> &scale);
/// divide each row into src.NumCols() groups,
/// and then scale i'th row's jth group of elements by src[i, j].
/// Divide each row into src.NumCols() equal groups, and then scale i'th row's
/// j'th group of elements by src(i, j). Requires src.NumRows() ==
/// this->NumRows() and this->NumCols() % src.NumCols() == 0.
void MulRowsGroupMat(const MatrixBase<Real> &src);
/// Returns logdet of matrix.
@ -418,8 +419,8 @@ class MatrixBase {
/// Set each element to y = log(1 + exp(x))
void SoftHinge(const MatrixBase<Real> &src);
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / p)
/// where G = x.NumCols() / y.NumCols() must be an integer.
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 / power).
/// Requires src.NumRows() == this->NumRows() and src.NumCols() % this->NumCols() == 0.
void GroupPnorm(const MatrixBase<Real> &src, Real power);
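// A short worked example of the convention above (illustrative only, not part
// of this header): with src of size 1 x 4 and *this of size 1 x 2, the group
// size G is 2, and with power = 2.0 each output element is the 2-norm of one
// group of the corresponding source row:
//   Matrix<BaseFloat> src(1, 4), dst(1, 2);
//   src(0, 0) = 3.0; src(0, 1) = 4.0;   // first group {3, 4}
//   src(0, 2) = 1.0; src(0, 3) = 0.0;   // second group {1, 0}
//   dst.GroupPnorm(src, 2.0);
//   // now dst(0, 0) == 5.0 (= sqrt(3^2 + 4^2)) and dst(0, 1) == 1.0.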

Просмотреть файл

@ -469,9 +469,9 @@ int32 LinearCgd(const LinearCgdOptions &opts,
residual_factor = opts.recompute_residual_factor *
opts.recompute_residual_factor;
// Note: although from a mathematical point of view the method should
// converge after M iterations, in practice it does not always converge
// to good precision after that many iterations so we let the maximum
// Note: although from a mathematical point of view the method should converge
// after M iterations, in practice (due to roundoff) it does not always
// converge to good precision after that many iterations so we let the maximum
// be 1.5 * M + 5 instead.
int32 k = 0;
for (; k < M + M / 2 + 5 && k != opts.max_iters; k++) {

Просмотреть файл

@ -86,8 +86,8 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
} else {
cblas_Xscal(dim, inv_v1, v, 1);
}
if (!KALDI_ISFINITE(inv_v1) || !KALDI_ISFINITE(x1)) {
KALDI_ERR << "NaN or inf encountered in HouseBackward";
if (KALDI_ISNAN(inv_v1)) {
KALDI_ERR << "NaN encountered in HouseBackward";
}
}
}
@ -142,8 +142,8 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
} else {
cblas_Xscal(dim, inv_v1, v, 1);
}
if (!KALDI_ISFINITE(inv_v1) || !KALDI_ISFINITE(x1)) {
KALDI_ERR << "NaN or inf encountered in HouseBackward";
if (KALDI_ISNAN(inv_v1)) {
KALDI_ERR << "NaN encountered in HouseBackward";
}
}
}

Просмотреть файл

@ -19,7 +19,8 @@ OBJFILES = nnet-component.o nnet-nnet.o train-nnet.o train-nnet-ensemble.o nnet-
nnet-fix.o nnet-stats.o rescale-nnet.o nnet-limit-rank.o nnet-example.o \
get-feature-transform.o widen-nnet.o nnet-precondition-online.o \
nnet-example-functions.o nnet-compute-discriminative.o \
nnet-compute-discriminative-parallel.o online-nnet2-decodable.o
nnet-compute-discriminative-parallel.o online-nnet2-decodable.o \
train-nnet-perturbed.o
LIBNAME = kaldi-nnet2

Просмотреть файл

@ -1595,6 +1595,9 @@ class FixedAffineComponent: public Component {
virtual Component* Copy() const;
virtual void Read(std::istream &is, bool binary);
virtual void Write(std::ostream &os, bool binary) const;
// Function to provide access to linear_params_.
const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
protected:
friend class AffineComponent;
CuMatrix<BaseFloat> linear_params_;

Просмотреть файл

@ -67,6 +67,40 @@ void NnetExample::Read(std::istream &is, bool binary) {
}
void ExamplesRepository::AcceptExamples(
std::vector<NnetExample> *examples) {
KALDI_ASSERT(!examples->empty());
empty_semaphore_.Wait();
KALDI_ASSERT(examples_.empty());
examples_.swap(*examples);
full_semaphore_.Signal();
}
void ExamplesRepository::ExamplesDone() {
empty_semaphore_.Wait();
KALDI_ASSERT(examples_.empty());
done_ = true;
full_semaphore_.Signal();
}
bool ExamplesRepository::ProvideExamples(
std::vector<NnetExample> *examples) {
full_semaphore_.Wait();
if (done_) {
KALDI_ASSERT(examples_.empty());
full_semaphore_.Signal(); // Increment the semaphore so
// the call by the next thread will not block.
return false; // no examples to return-- all finished.
} else {
KALDI_ASSERT(!examples_.empty() && examples->empty());
examples->swap(examples_);
empty_semaphore_.Signal();
return true;
}
}
void DiscriminativeNnetExample::Write(std::ostream &os,
bool binary) const {
// Note: weight, num_ali, den_lat, input_frames, left_context and spk_info are

Просмотреть файл

@ -23,6 +23,7 @@
#include "nnet2/nnet-nnet.h"
#include "util/table-types.h"
#include "lat/kaldi-lattice.h"
#include "thread/kaldi-semaphore.h"
namespace kaldi {
namespace nnet2 {
@ -64,6 +65,35 @@ typedef SequentialTableReader<KaldiObjectHolder<NnetExample > > SequentialNnetEx
typedef RandomAccessTableReader<KaldiObjectHolder<NnetExample > > RandomAccessNnetExampleReader;
/** This class stores neural net training examples to be used in
multi-threaded training. */
class ExamplesRepository {
public:
/// The following function is called by the code that reads in the examples,
/// with a batch of examples. (It will empty the vector "examples".)
void AcceptExamples(std::vector<NnetExample> *examples);
/// The following function is called by the code that reads in the examples,
/// when we're done reading examples.
void ExamplesDone();
/// This function is called by the code that does the training. It gets the
/// training examples, and if they are available, puts them in "examples" and
/// returns true. It returns false when there are no examples left and
/// ExamplesDone() has been called.
bool ProvideExamples(std::vector<NnetExample> *examples);
ExamplesRepository(): empty_semaphore_(1), done_(false) { }
private:
Semaphore full_semaphore_;
Semaphore empty_semaphore_;
std::vector<NnetExample> examples_;
bool done_;
KALDI_DISALLOW_COPY_AND_ASSIGN(ExamplesRepository);
};
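// Illustrative usage sketch (not part of this header): a single reader thread
// batches up examples and hands them to the repository, while one or more
// training threads consume them; the function names below are hypothetical.
//
//   void ReaderThreadSketch(SequentialNnetExampleReader *reader,
//                           int32 minibatch_size,
//                           ExamplesRepository *repository) {
//     std::vector<NnetExample> examples;
//     for (; !reader->Done(); reader->Next()) {
//       examples.push_back(reader->Value());
//       if (static_cast<int32>(examples.size()) == minibatch_size)
//         repository->AcceptExamples(&examples);   // empties "examples".
//     }
//     if (!examples.empty())
//       repository->AcceptExamples(&examples);     // partial minibatch.
//     repository->ExamplesDone();
//   }
//
//   void TrainingThreadSketch(ExamplesRepository *repository) {
//     std::vector<NnetExample> examples;
//     while (repository->ProvideExamples(&examples)) {
//       // ... train on "examples", e.g. via DoBackprop() ...
//       examples.clear();
//     }
//   }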
/**
This struct is used to store the information we need for discriminative training
(MMI or MPE). Each example corresponds to one chunk of a file (for better randomization
@ -116,7 +146,7 @@ struct DiscriminativeNnetExample {
void Read(std::istream &is, bool binary);
};
// Tes, the length of typenames is getting out of hand.
// Yes, the length of typenames is getting out of hand.
typedef TableWriter<KaldiObjectHolder<DiscriminativeNnetExample > >
DiscriminativeNnetExampleWriter;
typedef SequentialTableReader<KaldiObjectHolder<DiscriminativeNnetExample > >

Просмотреть файл

@ -432,6 +432,12 @@ void Nnet::RemovePreconditioning() {
*(dynamic_cast<AffineComponent*>(components_[i])));
delete components_[i];
components_[i] = ac;
} else if (dynamic_cast<AffineComponentPreconditionedOnline*>(
components_[i]) != NULL) {
AffineComponent *ac = new AffineComponent(
*(dynamic_cast<AffineComponent*>(components_[i])));
delete components_[i];
components_[i] = ac;
}
}
SetIndexes();

Просмотреть файл

@ -26,68 +26,6 @@
namespace kaldi {
namespace nnet2 {
/** This struct stores neural net training examples to be used in
multi-threaded training. */
class ExamplesRepository {
public:
/// The following function is called by the code that reads in the examples,
/// with a batch of examples. [It will empty the vector "examples").
void AcceptExamples(std::vector<NnetExample> *examples);
/// The following function is called by the code that reads in the examples,
/// when we're done reading examples.
void ExamplesDone();
/// This function is called by the code that does the training. It gets the
/// training examples, and if they are available, puts them in "examples" and
/// returns true. It returns false when there are no examples left and
/// ExamplesDone() has been called.
bool ProvideExamples(std::vector<NnetExample> *examples);
ExamplesRepository(): empty_semaphore_(1), done_(false) { }
private:
Semaphore full_semaphore_;
Semaphore empty_semaphore_;
std::vector<NnetExample> examples_;
bool done_;
KALDI_DISALLOW_COPY_AND_ASSIGN(ExamplesRepository);
};
void ExamplesRepository::AcceptExamples(
std::vector<NnetExample> *examples) {
KALDI_ASSERT(!examples->empty());
empty_semaphore_.Wait();
KALDI_ASSERT(examples_.empty());
examples_.swap(*examples);
full_semaphore_.Signal();
}
void ExamplesRepository::ExamplesDone() {
empty_semaphore_.Wait();
KALDI_ASSERT(examples_.empty());
done_ = true;
full_semaphore_.Signal();
}
bool ExamplesRepository::ProvideExamples(
std::vector<NnetExample> *examples) {
full_semaphore_.Wait();
if (done_) {
KALDI_ASSERT(examples_.empty());
full_semaphore_.Signal(); // Increment the semaphore so
// the call by the next thread will not block.
return false; // no examples to return-- all finished.
} else {
KALDI_ASSERT(!examples_.empty() && examples->empty());
examples->swap(examples_);
empty_semaphore_.Signal();
return true;
}
}
class DoBackpropParallelClass: public MultiThreadable {
public:

Просмотреть файл

@ -39,8 +39,8 @@ double NnetUpdater::ComputeForMinibatch(
CuMatrix<BaseFloat> tmp_deriv;
double ans = ComputeObjfAndDeriv(data, &tmp_deriv, tot_accuracy);
if (nnet_to_update_ != NULL)
Backprop(data, &tmp_deriv); // this is summed (after weighting), not
// averaged.
Backprop(&tmp_deriv); // this is summed (after weighting), not
// averaged.
return ans;
}
@ -133,9 +133,7 @@ double NnetUpdater::ComputeTotAccuracy(
}
void NnetUpdater::Backprop(const std::vector<NnetExample> &data,
CuMatrix<BaseFloat> *deriv) const {
int32 num_chunks = data.size();
void NnetUpdater::Backprop(CuMatrix<BaseFloat> *deriv) const {
// We assume ComputeObjfAndDeriv has already been called.
for (int32 c = nnet_.NumComponents() - 1; c >= 0; c--) {
const Component &component = nnet_.GetComponent(c);
@ -146,7 +144,7 @@ void NnetUpdater::Backprop(const std::vector<NnetExample> &data,
CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
const CuMatrix<BaseFloat> &output_deriv(*deriv);
component.Backprop(input, output, output_deriv, num_chunks,
component.Backprop(input, output, output_deriv, num_chunks_,
component_to_update, &input_deriv);
input_deriv.Swap(deriv);
}

Просмотреть файл

@ -29,22 +29,20 @@
namespace kaldi {
namespace nnet2 {
/* This header provides functionality for sample-by-sample stochastic
/** @file
This header provides functionality for sample-by-sample stochastic
gradient descent and gradient computation with a neural net.
See also nnet-compute.h which is the same thing but for
See also \ref nnet-compute.h which is the same thing but for
whole utterances.
This is the inner part of the training code; see nnet-train.h
which contains a wrapper for this, with functionality for
automatically keeping the learning rates for each layer updated
using a heuristic involving validation-set gradients.
*/
class NnetEnsembleTrainer;
// This class NnetUpdater contains functions for updating the neural net or
// computing its gradient, given a set of NnetExamples. We
// define it in the header file because it's needed by the ensemble training.
// But in normal cases its functionality should be used by calling DoBackprop(),
// and by ComputeNnetObjf()
class NnetEnsembleTrainer;
class NnetUpdater {
public:
// Note: in the case of training with SGD, "nnet" and "nnet_to_update" will
@ -84,8 +82,7 @@ class NnetUpdater {
/// contain, at input, the derivative w.r.t. the output layer (as computed by
/// ComputeObjfAndDeriv), but will be used as a temporary variable by this
/// function.
void Backprop(const std::vector<NnetExample> &data,
CuMatrix<BaseFloat> *deriv) const;
void Backprop(CuMatrix<BaseFloat> *deriv) const;
friend class NnetEnsembleTrainer;
private:
@ -100,10 +97,6 @@ class NnetUpdater {
std::vector<CuMatrix<BaseFloat> > forward_data_; // The forward data
// for the outputs of each of the components.
// These weights are one per parameter; they equal to the "weight"
// member variables in the NnetExample structures. These
// will typically be about one on average.
CuVector<BaseFloat> chunk_weights_;
};
/// This function computes the objective function and either updates the model

Просмотреть файл

@ -90,12 +90,13 @@ void NnetEnsembleTrainer::TrainOneMinibatch() {
post_mat[i].ApplyLog();
std::vector<BaseFloat> log_post_correct;
post_mat[i].Lookup(sv_labels_ind, &log_post_correct);
BaseFloat log_prob_this_net = std::accumulate(log_post_correct.begin(), log_post_correct.end(), static_cast<BaseFloat>(0));
BaseFloat log_prob_this_net = std::accumulate(log_post_correct.begin(),
log_post_correct.end(),
static_cast<BaseFloat>(0));
avg_logprob_this_phase_ += log_prob_this_net;
tmp_deriv.InvertElements();
tmp_deriv.MulElements(post_avg);
updater_ensemble_[i]->Backprop(buffer_, &tmp_deriv);
updater_ensemble_[i]->Backprop(&tmp_deriv);
}
count_this_phase_ += buffer_.size();
buffer_.clear();

Просмотреть файл

@ -0,0 +1,710 @@
// nnet2/train-nnet-perturbed.cc
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "nnet2/train-nnet-perturbed.h"
#include "nnet2/nnet-update.h"
#include "thread/kaldi-thread.h"
namespace kaldi {
namespace nnet2 {
class NnetPerturbedUpdater {
public:
// Note: in the case of training with SGD, "nnet" and "nnet_to_update" will be
// identical. They'd be different if we're accumulating the gradient for a
// held-out set and don't want to update the model, but this shouldn't happen
// for this "perturbed" update. nnet_to_update may be NULL if you don't
// want to do backprop, but this probably doesn't make sense.
// num_layers_before_input is the number of layers to ignore before what
// we consider to be the input (x) for purposes of this technique. This will
// likely equal 2: one for the feature-splicing layer (SpliceComponent) and
// one for the preconditioning layer (FixedAffineComponent).
//
// within_class_covar is the within-class covariance matrix in the space we
// regard as the input x, i.e. after the first num_layers_before_input layers.
NnetPerturbedUpdater(const Nnet &nnet,
int32 num_layers_before_input,
const CuMatrix<BaseFloat> &within_class_covar,
Nnet *nnet_to_update);
// This function does the entire forward and backward computation for this
// minibatch. Outputs to tot_objf_orig and tot_objf_perturbed the total
// objective function (including any weighting factors) over this minibatch,
// and the same after perturbing the data.
void ComputeForMinibatch(const std::vector<NnetExample> &data,
BaseFloat D,
double *tot_objf_orig,
double *tot_objf_perturbed);
protected:
/// takes the input and formats as a single matrix, in forward_data_[0].
void FormatInput(const std::vector<NnetExample> &data);
/// Do the forward propagation for layers 0 ... num_layers_before_input_ - 1,
/// typically the first two layers. This will be called once per minibatch.
void PropagateInitial() { Propagate(0, num_layers_before_input_); }
/// Do the forward propagation for layers num_layers_before_input_
/// ... num-layers-1, typically all but the first two layers. This will be
/// called twice per minibatch, once before and once after perturbing the
/// inputs.
void PropagateRemaining() { Propagate(num_layers_before_input_,
nnet_.NumComponents()); }
/// Internal Propagate function, does the forward computation for
/// layers begin_layer ... end_layer - 1.
void Propagate(int32 begin_layer, int32 end_layer);
/// Computes objective function and derivative at output layer, but does not
/// do the backprop [for that, see Backprop()]. This will be called twice per
/// minibatch, once before and once after perturbing the inputs.
void ComputeObjfAndDeriv(const std::vector<MatrixElement<BaseFloat> > &sv_labels,
CuMatrix<BaseFloat> *deriv,
BaseFloat *tot_objf,
BaseFloat *tot_weight) const;
/// Computes supervision labels from data.
void ComputeSupervisionLabels(const std::vector<NnetExample> &data,
std::vector<MatrixElement<BaseFloat> > *sv_labels);
/// Backprop must be called after ComputeObjfAndDeriv (it will be called
/// twice, the first time with a NULL nnet_to_update pointer). It does the
/// backpropagation (not including the first num_layers_before_input_ layers).
/// "nnet_to_update" is updated, if non-NULL. Note: "deriv" will contain, at
/// input, the derivative w.r.t. the output layer (as computed by
/// ComputeObjfAndDeriv), but will be used as a temporary variable by this
/// function, and at exit will contain the derivative of the objective function
/// w.r.t. the input of layer num_layers_before_input_.
void Backprop(Nnet *nnet_to_update,
CuMatrix<BaseFloat> *deriv) const;
/// Perturb the input features (actually, the features at the input of layer
/// num_layers_before_input_). This modifies the value of
/// forward_data_[num_layers_before_input_]. For the math, see \ref
/// train-nnet-perturbed.h
void PerturbInput(const CuMatrix<BaseFloat> &deriv_at_input,
BaseFloat D);
private:
const Nnet &nnet_;
Nnet *nnet_to_update_;
int32 num_layers_before_input_; // Number of layers before whichever layer we
// regard as the input for purposes of this
// method (normally 2, to include splicing
// layer and preconditioning layer)
const CuMatrix<BaseFloat> &within_class_covar_;
int32 num_chunks_; // same as the minibatch size.
std::vector<CuMatrix<BaseFloat> > forward_data_; // The forward data
// for the outputs of each of the components.
};
NnetPerturbedUpdater::NnetPerturbedUpdater(const Nnet &nnet,
int32 num_layers_before_input,
const CuMatrix<BaseFloat> &within_class_covar,
Nnet *nnet_to_update):
nnet_(nnet),
nnet_to_update_(nnet_to_update),
num_layers_before_input_(num_layers_before_input),
within_class_covar_(within_class_covar) {
KALDI_ASSERT(num_layers_before_input_ >= 0 &&
num_layers_before_input < nnet.NumComponents());
for (int32 c = 0; c < num_layers_before_input_; c++) {
const Component *comp = &(nnet.GetComponent(c));
const UpdatableComponent *uc = dynamic_cast<const UpdatableComponent*>(comp);
if (uc != NULL) {
KALDI_ERR << "One of the pre-input layers is updatable.";
}
}
}
void NnetPerturbedUpdater::PerturbInput(
const CuMatrix<BaseFloat> &deriv_at_input,
BaseFloat D) {
// The code doesn't handle the case where there is further splicing after the
// input.
KALDI_ASSERT(num_chunks_ == deriv_at_input.NumRows());
// For the math, see train-nnet-perturbed.h.
// deriv_at_input is \nabla in the math.
// "input" is the input features, currently unmodified, but we'll
// modify them.
CuMatrix<BaseFloat> &input(forward_data_[num_layers_before_input_]);
KALDI_ASSERT(SameDim(input, deriv_at_input));
// Each row of deriv_w will equal (W \nabla_t)', where ' is transpose.
CuMatrix<BaseFloat> deriv_w(input.NumRows(), input.NumCols());
// note: for the second transpose-ness argument below we can choose either
// kTrans or kNoTrans because the matrix is symmetric. I'm guessing that
// kTrans will be faster.
deriv_w.AddMatMat(1.0, deriv_at_input, kNoTrans,
within_class_covar_, kTrans, 0.0);
// k will be used to compute and store the gradient-scaling factor k_t.
CuVector<BaseFloat> k(deriv_at_input.NumRows());
// after the next call, each element of k will contain (\nabla_t^T W \nabla_t)
// We want k_t = D / sqrt(\nabla_t^T W \nabla_t)
// so we need to take this to the power -0.5.
// We can't do this if it's zero, so we first floor to a very small value.
k.AddDiagMatMat(1.0, deriv_w, kNoTrans, deriv_at_input, kTrans, 0.0);
int32 num_floored = k.ApplyFloor(1.0e-20);
if (num_floored > 0.0) {
// Should only happen at the very start of training,
KALDI_WARN << num_floored << " gradients floored (derivative at input was "
<< "close to zero); this should only happen at the start of training "
<< "or when adding a new layer.";
}
k.ApplyPow(-0.5);
// now we have k_t = 1.0 / sqrt(\nabla_t^T W \nabla_t).
// in the math, k_t contains an additional factor of D, but we'll
// add this later.
// Below, we will do x'_t = x_t - k_t W \nabla_t
// Here, each row of deriv_w contains the transpose of W \nabla_t.
// The factor of D is because it was missing in k.
input.AddDiagVecMat(-1.0 * D, k, deriv_w, kNoTrans, 1.0);
}
void NnetPerturbedUpdater::ComputeForMinibatch(
const std::vector<NnetExample> &data,
BaseFloat D,
double *tot_objf_orig,
double *tot_objf_perturbed) {
FormatInput(data);
PropagateInitial();
PropagateRemaining();
CuMatrix<BaseFloat> tmp_deriv;
std::vector<MatrixElement<BaseFloat> > sv_labels;
ComputeSupervisionLabels(data, &sv_labels);
BaseFloat tot_objf, tot_weight;
ComputeObjfAndDeriv(sv_labels, &tmp_deriv, &tot_objf, &tot_weight);
KALDI_VLOG(4) << "Objective function (original) is " << (tot_objf/tot_weight)
<< " per sample, over " << tot_weight << " samples (weighted).";
*tot_objf_orig = tot_objf;
// only backprops till layer number num_layers_before_input_,
// and derivative at that layer is in tmp_deriv.
Backprop(NULL, &tmp_deriv);
// perturb forward_data_[num_layers_before_input_].
PerturbInput(tmp_deriv, D);
// Now propagate forward again from that point.
PropagateRemaining();
ComputeObjfAndDeriv(sv_labels, &tmp_deriv, &tot_objf, &tot_weight);
KALDI_VLOG(4) << "Objective function (perturbed) is " << (tot_objf/tot_weight)
<< " per sample, over " << tot_weight << " samples (weighted).";
*tot_objf_perturbed = tot_objf;
// The actual model updating would happen in the next call.
if (nnet_to_update_ != NULL)
Backprop(nnet_to_update_, &tmp_deriv);
}
void NnetPerturbedUpdater::Propagate(int32 begin_layer, int32 end_layer) {
static int32 num_times_printed = 0;
for (int32 c = begin_layer; c < end_layer; c++) {
const Component &component = nnet_.GetComponent(c);
const CuMatrix<BaseFloat> &input = forward_data_[c];
CuMatrix<BaseFloat> &output = forward_data_[c+1];
// Note: the Propagate function will automatically resize the
// output.
component.Propagate(input, num_chunks_, &output);
KALDI_VLOG(4) << "Propagating: sum at output of " << c << " is " << output.Sum();
// If we won't need the output of the previous layer for
// backprop, delete it to save memory.
bool need_last_output =
(c>0 && nnet_.GetComponent(c-1).BackpropNeedsOutput()) ||
component.BackpropNeedsInput();
if (g_kaldi_verbose_level >= 3 && num_times_printed < 100) {
KALDI_VLOG(3) << "Stddev of data for component " << c
<< " for this minibatch is "
<< (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
(forward_data_[c].NumRows() * forward_data_[c].NumCols()));
num_times_printed++;
}
if (!need_last_output && c != num_layers_before_input_)
forward_data_[c].Resize(0, 0); // We won't need this data.
}
}
void NnetPerturbedUpdater::ComputeSupervisionLabels(
const std::vector<NnetExample> &data,
std::vector<MatrixElement<BaseFloat> > *sv_labels) {
sv_labels->clear();
sv_labels->reserve(num_chunks_); // We must have at least this many labels.
for (int32 m = 0; m < num_chunks_; m++) {
for (size_t i = 0; i < data[m].labels.size(); i++) {
MatrixElement<BaseFloat>
tmp = {m, data[m].labels[i].first, data[m].labels[i].second};
sv_labels->push_back(tmp);
}
}
}
void NnetPerturbedUpdater::ComputeObjfAndDeriv(
const std::vector<MatrixElement<BaseFloat> > &sv_labels,
CuMatrix<BaseFloat> *deriv,
BaseFloat *tot_objf,
BaseFloat *tot_weight) const {
int32 num_components = nnet_.NumComponents();
deriv->Resize(num_chunks_, nnet_.OutputDim()); // sets to zero.
const CuMatrix<BaseFloat> &output(forward_data_[num_components]);
KALDI_ASSERT(SameDim(output, *deriv));
deriv->CompObjfAndDeriv(sv_labels, output, tot_objf, tot_weight);
}
void NnetPerturbedUpdater::Backprop(Nnet *nnet_to_update,
CuMatrix<BaseFloat> *deriv) const {
// We assume ComputeObjfAndDeriv has already been called.
for (int32 c = nnet_.NumComponents() - 1; c >= num_layers_before_input_; c--) {
const Component &component = nnet_.GetComponent(c);
Component *component_to_update = (nnet_to_update == NULL ? NULL :
&(nnet_to_update->GetComponent(c)));
const CuMatrix<BaseFloat> &input = forward_data_[c],
&output = forward_data_[c+1];
CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
const CuMatrix<BaseFloat> &output_deriv(*deriv);
component.Backprop(input, output, output_deriv, num_chunks_,
component_to_update, &input_deriv);
input_deriv.Swap(deriv);
}
}
void NnetPerturbedUpdater::FormatInput(const std::vector<NnetExample> &data) {
KALDI_ASSERT(data.size() > 0);
int32 num_splice = nnet_.LeftContext() + 1 + nnet_.RightContext();
KALDI_ASSERT(data[0].input_frames.NumRows() >= num_splice);
int32 feat_dim = data[0].input_frames.NumCols(),
spk_dim = data[0].spk_info.Dim(),
tot_dim = feat_dim + spk_dim; // we append these at the neural net
// input... note, spk_dim might be 0.
KALDI_ASSERT(tot_dim == nnet_.InputDim());
KALDI_ASSERT(data[0].left_context >= nnet_.LeftContext());
int32 ignore_frames = data[0].left_context - nnet_.LeftContext(); // If
// the NnetExample has more left-context than we need, ignore some.
// this may happen in settings where we increase the amount of context during
// training, e.g. by adding layers that require more context.
num_chunks_ = data.size();
forward_data_.resize(nnet_.NumComponents() + 1);
// First copy to a single matrix on the CPU, so we can copy to
// GPU with a single copy command.
Matrix<BaseFloat> temp_forward_data(num_splice * num_chunks_,
tot_dim);
for (int32 chunk = 0; chunk < num_chunks_; chunk++) {
SubMatrix<BaseFloat> dest(temp_forward_data,
chunk * num_splice, num_splice,
0, feat_dim);
Matrix<BaseFloat> full_src(data[chunk].input_frames);
SubMatrix<BaseFloat> src(full_src, ignore_frames, num_splice, 0, feat_dim);
dest.CopyFromMat(src);
if (spk_dim != 0) {
SubMatrix<BaseFloat> spk_dest(temp_forward_data,
chunk * num_splice, num_splice,
feat_dim, spk_dim);
spk_dest.CopyRowsFromVec(data[chunk].spk_info);
}
}
forward_data_[0].Swap(&temp_forward_data); // Copy to GPU, if being used.
}
void DoBackpropPerturbed(const Nnet &nnet,
int32 num_layers_before_input,
const CuMatrix<BaseFloat> &within_class_covar,
BaseFloat D,
const std::vector<NnetExample> &examples,
Nnet *nnet_to_update,
double *tot_objf_orig,
double *tot_objf_perturbed) {
try {
NnetPerturbedUpdater updater(nnet, num_layers_before_input,
within_class_covar, nnet_to_update);
updater.ComputeForMinibatch(examples, D, tot_objf_orig, tot_objf_perturbed);
} catch (...) {
KALDI_LOG << "Error doing backprop, nnet info is: " << nnet.Info();
throw;
}
}
NnetPerturbedTrainer::NnetPerturbedTrainer(
const NnetPerturbedTrainerConfig &config,
const SpMatrix<BaseFloat> &within_class_covar,
Nnet *nnet):
config_(config), nnet_(nnet), logprob_this_phase_(0.0),
logprob_perturbed_this_phase_(0.0), weight_this_phase_(0.0),
logprob_total_(0.0), logprob_perturbed_total_(0.0),
weight_total_(0.0),
D_(config.initial_d) {
InitWithinClassCovar(within_class_covar);
num_phases_ = 0;
bool first_time = true;
BeginNewPhase(first_time);
}
// This function is used in class NnetPerturbedTrainer
// and the function DoBackpropPerturbedParallel.
void InitWithinClassCovar(
const SpMatrix<BaseFloat> &within_class_covar,
const Nnet &nnet,
int32 *num_layers_before_input,
CuMatrix<BaseFloat> *within_class_covar_out) {
CuSpMatrix<BaseFloat> orig_covar(within_class_covar);
*num_layers_before_input = 0;
KALDI_ASSERT(nnet.NumComponents() > *num_layers_before_input);
const Component *comp = &(nnet.GetComponent(*num_layers_before_input));
// Skip over any SpliceComponent that appears at the beginning of
// the network.
if (dynamic_cast<const SpliceComponent*>(comp) != NULL)
(*num_layers_before_input)++;
KALDI_ASSERT(nnet.NumComponents() > *num_layers_before_input);
comp = &(nnet.GetComponent(*num_layers_before_input));
const FixedAffineComponent *fa =
dynamic_cast<const FixedAffineComponent*>(comp);
if (fa != NULL) {
(*num_layers_before_input)++;
const CuMatrix<BaseFloat> &linear_params = fa->LinearParams();
if (linear_params.NumCols() != orig_covar.NumCols()) {
KALDI_ERR << "The neural network seems to expect a (spliced) feature "
<< "dimension of " << linear_params.NumCols() << ", but your "
<< "LDA stats have a dimension of " << orig_covar.NumCols();
}
CuMatrix<BaseFloat> temp(linear_params.NumRows(), orig_covar.NumRows());
// temp = linear_params . orig_covar
temp.AddMatSp(1.0, linear_params, kNoTrans, orig_covar, 0.0);
within_class_covar_out->Resize(linear_params.NumRows(),
linear_params.NumRows());
// temp = linear_params . orig_covar . linear_params^T
within_class_covar_out->AddMatMat(1.0, temp, kNoTrans,
linear_params, kTrans, 0.0);
// note: this should be symmetric, spot-test it like this:
KALDI_ASSERT(ApproxEqual(TraceMatMat(*within_class_covar_out,
*within_class_covar_out, kNoTrans),
TraceMatMat(*within_class_covar_out,
*within_class_covar_out, kTrans)));
} else {
if (comp->InputDim() != orig_covar.NumCols()) {
KALDI_ERR << "The neural network seems to expect a (spliced) feature "
<< "dimension of " << comp->InputDim() << ", but your "
<< "LDA stats have a dimension of " << orig_covar.NumCols();
}
within_class_covar_out->Resize(orig_covar.NumRows(), orig_covar.NumCols());
within_class_covar_out->CopyFromSp(orig_covar);
}
}
void NnetPerturbedTrainer::InitWithinClassCovar(
const SpMatrix<BaseFloat> &within_class_covar) {
kaldi::nnet2::InitWithinClassCovar(within_class_covar, *nnet_,
&num_layers_before_input_,
&within_class_covar_);
}
void NnetPerturbedTrainer::TrainOnExample(const NnetExample &value) {
buffer_.push_back(value);
if (static_cast<int32>(buffer_.size()) == config_.minibatch_size)
TrainOneMinibatch();
}
void NnetPerturbedTrainer::TrainOneMinibatch() {
KALDI_ASSERT(!buffer_.empty());
double tot_objf_orig, tot_objf_perturbed;
DoBackpropPerturbed(*nnet_, num_layers_before_input_, within_class_covar_, D_,
buffer_, nnet_, &tot_objf_orig, &tot_objf_perturbed);
logprob_this_phase_ += tot_objf_orig;
logprob_perturbed_this_phase_ += tot_objf_perturbed;
double weight = TotalNnetTrainingWeight(buffer_);
UpdateD(tot_objf_orig / weight, tot_objf_perturbed / weight);
weight_this_phase_ += weight;
buffer_.clear();
minibatches_seen_this_phase_++;
if (minibatches_seen_this_phase_ == config_.minibatches_per_phase) {
bool first_time = false;
BeginNewPhase(first_time);
}
}
void NnetPerturbedTrainer::UpdateD(BaseFloat orig_objf_per_example,
BaseFloat perturbed_objf_per_example) {
BaseFloat diff = orig_objf_per_example - perturbed_objf_per_example;
// note: diff should be positive in the normal case.
KALDI_ASSERT(config_.target_objf_change > 0.0 && config_.max_d_factor > 1.0);
BaseFloat objf_ratio = config_.target_objf_change /
std::max<BaseFloat>(1.0e-20, diff),
D_ratio = pow(objf_ratio, config_.tune_d_power);
if (D_ratio > config_.max_d_factor)
D_ratio = config_.max_d_factor;
else if (D_ratio < 1.0 / config_.max_d_factor)
D_ratio = 1.0 / config_.max_d_factor;
BaseFloat D_new = D_ * D_ratio;
KALDI_VLOG(3) << "Training objective function normal/perturbed is "
<< orig_objf_per_example << '/' << perturbed_objf_per_example
<< ", diff " << diff << " vs. target "
<< config_.target_objf_change
<< ", changing D by factor " << D_ratio << " to " << D_new;
D_ = D_new;
}
void NnetPerturbedTrainer::BeginNewPhase(bool first_time) {
if (!first_time) {
BaseFloat logprob = logprob_this_phase_/weight_this_phase_,
logprob_perturbed = logprob_perturbed_this_phase_/weight_this_phase_,
diff = logprob - logprob_perturbed;
KALDI_LOG << "Training objective function normal->perturbed is "
<< logprob << " -> " << logprob_perturbed << ", diff "
<< diff << " vs. target " << config_.target_objf_change
<< ", over " << weight_this_phase_ << " frames, D is "
<< D_;
}
logprob_total_ += logprob_this_phase_;
logprob_perturbed_total_ += logprob_perturbed_this_phase_;
weight_total_ += weight_this_phase_;
logprob_this_phase_ = 0.0;
logprob_perturbed_this_phase_ = 0.0;
weight_this_phase_ = 0.0;
minibatches_seen_this_phase_ = 0;
num_phases_++;
}
NnetPerturbedTrainer::~NnetPerturbedTrainer() {
if (!buffer_.empty()) {
KALDI_LOG << "Doing partial minibatch of size "
<< buffer_.size();
TrainOneMinibatch();
if (minibatches_seen_this_phase_ != 0) {
bool first_time = false;
BeginNewPhase(first_time);
}
}
if (weight_total_ == 0.0) {
KALDI_WARN << "No data seen.";
} else {
KALDI_LOG << "Did backprop on " << weight_total_
<< " examples, average log-prob normal->perturbed per frame is "
<< (logprob_total_ / weight_total_) << " -> "
<< (logprob_perturbed_total_ / weight_total_);
KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame="
<< (logprob_total_ / weight_total_);
}
}
// compare with DoBackpropParallelClass
class TrainParallelPerturbedClass: public MultiThreadable {
public:
// This constructor is only called for a temporary object
// that we pass to the RunMultiThreaded function.
TrainParallelPerturbedClass(const NnetPerturbedTrainerConfig &config,
const CuMatrix<BaseFloat> &within_class_covar,
int32 num_layers_before_input,
BaseFloat *D,
Nnet *nnet,
ExamplesRepository *repository,
double *log_prob_orig_ptr,
double *log_prob_perturbed_ptr,
double *tot_weight_ptr):
config_(config), within_class_covar_(within_class_covar),
num_layers_before_input_(num_layers_before_input), D_(D),
nnet_(nnet), repository_(repository),
log_prob_orig_ptr_(log_prob_orig_ptr),
log_prob_perturbed_ptr_(log_prob_perturbed_ptr),
tot_weight_ptr_(tot_weight_ptr),
log_prob_orig_(0.0),
log_prob_perturbed_(0.0),
tot_weight_(0.0) { }
// Use the default copy constructor.
// This does the main function of the class.
void operator () () {
std::vector<NnetExample> examples;
while (repository_->ProvideExamples(&examples)) {
double objf_orig, objf_perturbed,
weight = TotalNnetTrainingWeight(examples);
DoBackpropPerturbed(*nnet_, num_layers_before_input_,
within_class_covar_, *D_,
examples, nnet_,
&objf_orig, &objf_perturbed);
UpdateD(objf_orig / weight, objf_perturbed / weight);
tot_weight_ += weight;
log_prob_orig_ += objf_orig;
log_prob_perturbed_ += objf_perturbed;
KALDI_VLOG(4) << "Thread " << thread_id_ << " saw "
<< tot_weight_ << " frames so far (weighted); likelihood "
<< "per frame (orig->perturbed) so far is "
<< (log_prob_orig_ / tot_weight_) << " -> "
<< (log_prob_perturbed_ / tot_weight_);
examples.clear();
}
}
~TrainParallelPerturbedClass() {
*log_prob_orig_ptr_ += log_prob_orig_;
*log_prob_perturbed_ptr_ += log_prob_perturbed_;
*tot_weight_ptr_ += tot_weight_;
}
private:
void UpdateD(BaseFloat orig_logprob, BaseFloat perturbed_logprob) {
BaseFloat diff = orig_logprob - perturbed_logprob;
// note: diff should be positive in the normal case.
KALDI_ASSERT(config_.target_objf_change > 0.0 && config_.max_d_factor > 1.0);
// divide the power we raise the ratio to when tuning D, by the
// number of threads; this should ensure stability of the update.
BaseFloat tune_d_power = config_.tune_d_power / g_num_threads;
BaseFloat objf_ratio = config_.target_objf_change /
std::max<BaseFloat>(1.0e-20, diff),
D_ratio = pow(objf_ratio, tune_d_power);
if (D_ratio > config_.max_d_factor)
D_ratio = config_.max_d_factor;
else if (D_ratio < 1.0 / config_.max_d_factor)
D_ratio = 1.0 / config_.max_d_factor;
BaseFloat D_new = (*D_) * D_ratio;
*D_ = D_new;
// Note: we are accessing *D_ from multiple threads without
// locking, but the negative consequences of this contention are
// very small (at worst, some threads briefly use a slightly stale
// value of D).
KALDI_VLOG(3) << "Training objective function normal->perturbed is "
<< orig_logprob << " -> " << perturbed_logprob
<< ", diff " << diff << " vs. target "
<< config_.target_objf_change
<< ", changing D by factor " << D_ratio << " to " << D_new;
}
const NnetPerturbedTrainerConfig &config_;
const CuMatrix<BaseFloat> &within_class_covar_;
int32 num_layers_before_input_;
BaseFloat *D_; // Constant D that controls how much to perturb the data. We
// update this as well as use it.
Nnet *nnet_;
ExamplesRepository *repository_;
double *log_prob_orig_ptr_;
double *log_prob_perturbed_ptr_;
double *tot_weight_ptr_;
double log_prob_orig_; // log-like times num frames (before perturbing features)
double log_prob_perturbed_; // log-like times num frames (after perturbing features)
double tot_weight_; // normalizing factor for the above.
};
void DoBackpropPerturbedParallel(const NnetPerturbedTrainerConfig &config,
const SpMatrix<BaseFloat> &within_class_covar,
SequentialNnetExampleReader *example_reader,
double *tot_objf_orig,
double *tot_objf_perturbed,
double *tot_weight,
Nnet *nnet) {
// within_class_covar_processed is the within-class covar as CuMatrix, possibly
// projected by the preconditioning transform in any FixedAffineComponent.
CuMatrix<BaseFloat> within_class_covar_processed;
int32 num_layers_before_input;
InitWithinClassCovar(within_class_covar, *nnet,
&num_layers_before_input,
&within_class_covar_processed);
BaseFloat D = config.initial_d;
ExamplesRepository repository;  // handles parallel programming issues regarding the queue of example minibatches.
*tot_objf_orig = *tot_objf_perturbed = *tot_weight = 0.0;
TrainParallelPerturbedClass trainer_proto(config,
within_class_covar_processed,
num_layers_before_input, &D,
nnet, &repository,
tot_objf_orig,
tot_objf_perturbed,
tot_weight);
{
// The initialization of the following class spawns the threads that
// process the examples. They get re-joined in its destructor.
MultiThreader<TrainParallelPerturbedClass> m(g_num_threads, trainer_proto);
std::vector<NnetExample> examples;
for (; !example_reader->Done(); example_reader->Next()) {
examples.push_back(example_reader->Value());
if (examples.size() == config.minibatch_size)
repository.AcceptExamples(&examples);
}
if (!examples.empty()) // partial minibatch.
repository.AcceptExamples(&examples);
// Here, the destructor of "m" re-joins the threads, and
// does the summing of the gradients if we're doing gradient
// computation (i.e. &nnet != nnet_to_update). This gets
// done in the destructors of the objects of type
// DoBackpropParallelClass.
repository.ExamplesDone();
}
KALDI_LOG << "Did backprop on " << *tot_weight << " examples, average log-prob "
<< "per frame (orig->perturbed) is "
<< (*tot_objf_orig / *tot_weight) << " -> "
<< (*tot_objf_perturbed / *tot_weight) << " over "
<< *tot_weight << " samples (weighted).";
KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame="
<< (*tot_objf_orig / *tot_weight);
}
} // namespace nnet2
} // namespace kaldi

Просмотреть файл

@ -0,0 +1,327 @@
// nnet2/train-nnet-perturbed.h
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_NNET2_TRAIN_NNET_PERTURBED_H_
#define KALDI_NNET2_TRAIN_NNET_PERTURBED_H_
#include "nnet2/nnet-nnet.h"
#include "nnet2/nnet-example.h"
#include "itf/options-itf.h"
namespace kaldi {
namespace nnet2 {
/**
@file
This file was modified from train-nnet.h in order to implement an idea
about perturbing the training examples slightly, in a direction that's
opposite to the gradient of the objective function w.r.t. those examples.
It's a bit like the idea in "Intriguing properties of neural networks", the
training method they mention, except they have a more complicated formulation
with L-BFGS. We can justify our idea by approximating the neural network
plus objective-function evaluation as a linear function.
Note: before doing this, we want to make sure the input features have a
reasonable distribution, and our choice for this is to make the within-class
covariance matrix unit. [note: we don't have to normalize the mean to zero,
this won't matter.] Rather than explicitly transforming the features using
a transform T, it turns out that we have to multiply the gradients by something
like T T'. We'll describe this later.
Suppose the actual input features are x. Typically we do frame splicing
as part of the network, and it's more convenient to do the perturbation on
the spliced features, so x may actually be the output of the network's
first (splicing) layer. Suppose the within-class covariance matrix of
x is W. If we do the Cholesky transform
W = C C^T,
then C^{-1} W C^{-T} = I, so if we define
T =(def) C^{-1} and transformed features
\hat{x} =(def) T x,
then the within-class covariance matrix of the transformed features \hat{x}
is T W T^T = C^{-1} (C C^T) C^{-T} = I.
The way we formulate the perturbed-feature thing is somewhat similar to the
"Intriguing properties of neural networks" paper, except we're not in image
recognition so no need to keep features in the range [0, 1]. Given a training
example \hat{x}_t, we want to find a perturbed example
\hat{x}'_t = \hat{x}_t + d_t
that gives the worst possible loss-value, such that ||d_t|| <= D, where D is
a scalar length parameter (e.g. D = 0.1), and ||.|| is the 2-norm. This means
that we want to perturb the training example in the most damaging way possible,
given that it should not change by more than a certain amount. Because we've
normalized the within-class covariance we believe that using a normal 2-norm
on d_t, rather than a more general form of inner-product, is suitable.
Anyway, we make a simplifying assumption that the loss function for a particular
sample is just a linear function of the input, and when we get to the space of
\hat{x}, it just means we go a certain distance D down the gradient. How we
set a suitable value for D, we'll come to later.
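(Under that linear approximation, the worst-case perturbation in the whitened
space is just a step of length D against the gradient \hat{\nabla}_t of the
objective w.r.t. \hat{x}_t (defined just below):
   d_t = -D \hat{\nabla}_t / ||\hat{\nabla}_t||_2,
which is the form we now derive in terms of the original features x.)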
Suppose by backpropagating the
derivative to x we get a derivative \nabla_t of the objective function (e.g. a
log-probability) w.r.t. x_t. Then we can get the derivative \hat{\nabla}_t of
the objective function w.r.t. \hat{x}_t, by identifying
x_t^T \nabla_t = \hat{x}_t^T \hat{\nabla}_t
x_t^T \nabla_t = x_t^T T^T \hat{\nabla}_t
x_t^T \nabla_t = x_t^T T^T T^{-T} \nabla_t, since T^T T^{-T} = I.
[note, ^T is transpose and ^{-T} is inverse-of-transpose.]
so \hat{\nabla}_t = T^{-T} \nabla_t.
(this is not the formal way of getting these derivatives, it's just how I remember).
Anyway, we now have
\hat{x}'_t =(def) \hat{x}_t - k_t T^{-T} \nabla_t
where k_t is chosen to ensure that
k_t || T^{-T} \nabla_t ||_2 = D
k_t sqrt( \nabla_t^T T^{-1} T^{-T} \nabla_t ) = D
so
k_t = D / sqrt(\nabla_t^T T^{-1} T^{-T} \nabla_t)
= D / sqrt(\nabla_t^T C C^T \nabla_t)
= D / sqrt(\nabla_t^T W \nabla_t)
Now, we actually want the update in terms of the original variable x instead of \hat{x},
so multiplying the definition of \hat{x}'_t above by T^{-1} on the left, we have:
x'_t = x_t - k_t T^{-1} T^{-T} \nabla_t
= x_t - k_t W \nabla_t
(note: we can also use W \nabla_t for efficiently computing k_t).
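To make the update concrete, here is a minimal single-frame sketch of
x'_t = x_t - k_t W \nabla_t with k_t = D / sqrt(\nabla_t^T W \nabla_t). It is
illustrative only: the real implementation works on whole minibatches with
CuMatrix, and the function name and plain std::vector types here are assumptions.

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch only: perturb one example by (whitened-space) distance D in the most
// damaging direction, i.e. down the gradient of the objective function.
std::vector<double> PerturbExample(const std::vector<double> &x,     // x_t
                                   const std::vector<double> &grad,  // \nabla_t
                                   const std::vector<std::vector<double> > &W,  // within-class covariance
                                   double D) {
  std::size_t dim = x.size();
  std::vector<double> W_grad(dim, 0.0);  // W \nabla_t (reused below to compute k_t)
  for (std::size_t i = 0; i < dim; i++)
    for (std::size_t j = 0; j < dim; j++)
      W_grad[i] += W[i][j] * grad[j];
  double quad = 0.0;  // \nabla_t^T W \nabla_t
  for (std::size_t i = 0; i < dim; i++)
    quad += grad[i] * W_grad[i];
  double k = (quad > 0.0) ? D / std::sqrt(quad) : 0.0;  // k_t
  std::vector<double> x_perturbed(x);
  for (std::size_t i = 0; i < dim; i++)
    x_perturbed[i] -= k * W_grad[i];  // x'_t = x_t - k_t W \nabla_t
  return x_perturbed;
}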
It will actually be more efficient to do this after the FixedAffineTransform
layer that we used to "precondition" the features, so after the second layer
of the input rather than the first. All we need to do is to get the
within-class covariance matrix W in that space (after the
FixedAffineTransform) instead. We'll use the name x for that space, and forget
about the original input space.
Next, we want to discuss how we'll set the constant D. D is a proportion of
the within-class covariance. However, it's not clear a priori how to set
this, or that we can tune it just once and then leave it fixed for other
setups. For one thing, if the input features contain a lot of "nuisance"
dimensions that are not very informative about the class, it may be necessary
for D to be smaller (because hopefully the gradients will be small in those
nuisance directions). There is another issue that this whole method is
intended to improve generalization, so we only want to use it strongly if
generalization is actually a problem. For example, if we have so much
training data and so few parameters that we have no trouble generalizing, we
might not want to apply this method too strongly. Our method will be to set D
so that we get, on average, a certain per-frame degradation in the objective
function, which we'll call "target-objf-change". Each time we
apply this perturbation to a minibatch, we'll see whether the degradation in
objective is greater or less than "target-objf-change", and we'll change
D accordingly. We'll use a simple heuristic that D should change proportionally
to the 0.5'th power of the ratio between the "target-objf-change" and the
observed objective function change for this minibatch, but never by more than
a factor of two. Note: the only significance of 0.5 here is that 0.5 <= 1; a
smaller power means slower changes in D. With 0.5, D should converge to the
right value over about 2 minibatches; if this proves unstable, we'll change it.
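As an illustrative sketch of that heuristic (the trainer class below has an
UpdateD() method for this; the function name, default arguments, and handling of
a non-positive observed change here are assumptions, not the exact code):

#include <algorithm>
#include <cmath>

// Sketch only: scale D by (target / observed)^tune_power, clipped so that D
// never changes by more than a factor of max_factor per minibatch.
double TuneD(double D, double target_objf_change, double observed_objf_change,
             double tune_power = 0.5, double max_factor = 2.0) {
  double factor;
  if (observed_objf_change <= 0.0) {
    factor = max_factor;  // perturbation did no damage at all: grow D as fast as allowed
  } else {
    factor = std::pow(target_objf_change / observed_objf_change, tune_power);
    factor = std::max(1.0 / max_factor, std::min(max_factor, factor));
  }
  return D * factor;  // value of D to use for the next minibatch
}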
Next, it's not absolutely clear how we should set target-objf-change-- the
value which determines how much objective-function degradation we want the
perturbation to produce on average (per sample). To put this in perspective,
for speech tasks with small amounts of data (say, <30 hours) and a couple thousand
classes
we typically see objective values like: training-set -0.6 and validation-set -1.1.
These are average log-probabilities per frame of the correct class.
The two numbers are quite different because there is substantial overtraining. Note: for Karel's
nnet1 setup, the difference is typically smaller, more like -0.8 vs. -1.0, as
that setup monitors the validation-set objective and decreases the learning rate
when it starts to degrade. Now, for much larger training sets, we might
see smaller differences in training-set versus validation-set objective function:
for instance: say, -1.40 versus -1.45. (For larger training sets the objectives tend
to be more negative simply because we have more leaves). We measure these values each
iteration: see the files compute_prob_train.*.log and compute_prob_valid.*.log produced
by the example scripts. The reason why I discuss these values
is that if the training-set and validation-set objective functions are very close, then
it means that there is not much overtraining going on and we don't want to apply this
method too strongly; on the other hand, if they are very different, it means we are
overtraining badly and we may want to apply this method more.
So we plan to set target-objf-change to the following value, at the script level:
target-objf-change = target-multiplier * (training-objf - validation-objf)
(e.g. target-multiplier = 1.0).
Note that if target-objf-change is less than a specified min-target-objf-change
(e.g. 0.1) then we won't apply the perturbed training at all, which will save
time. The method is intended to help generalization, and if we're generalizing
well then we don't need to apply it.
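To make this concrete with the illustrative numbers above: with target-multiplier = 1.0,
the small setup (training -0.6, validation -1.1) would get target-objf-change = 0.5,
while the large setup (-1.40 versus -1.45) would get 0.05; the latter is below a
min-target-objf-change of 0.1, so perturbed training would simply be skipped there.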
The training and validation objective functions are computed over
different (randomly chosen) sets, each with about 3000 samples, and it can
sometimes happen that the validation objective function is better than the
training-set objective function. Also, the validation set is sampled from a
held-out subset of 300 utterances by default; this is done out of a concern
that the correlations within an utterance can be very high, so if we use the
same utterances for training and validation, then the validation set is not
really held-out. But the smallish number (300) of validation utterances
increases the randomness in the training and validation objectives.
*/
struct NnetPerturbedTrainerConfig {
int32 minibatch_size;
int32 minibatches_per_phase;
// target_objf_change will be set from the command line to a value >0.0.
BaseFloat target_objf_change;
BaseFloat initial_d;
// tune_d_power is not configurable from the command line.
BaseFloat tune_d_power;
// max_d_factor is not configurable from the command line.
BaseFloat max_d_factor;
NnetPerturbedTrainerConfig(): minibatch_size(500),
minibatches_per_phase(50),
target_objf_change(0.1),
initial_d(0.05),
tune_d_power(0.5),
max_d_factor(2.0){ }
void Register (OptionsItf *po) {
po->Register("minibatch-size", &minibatch_size,
"Number of samples per minibatch of training data.");
po->Register("minibatches-per-phase", &minibatches_per_phase,
"Number of minibatches to wait before printing training-set "
"objective.");
po->Register("target-objf-change", &target_objf_change, "Target objective "
"function change from feature perturbation, used to set "
"feature distance parameter D");
po->Register("initial-d", &initial_d, "Initial value of parameter D "
"It will ultimately be set according to --target-objf-change");
}
};
/// Class NnetPerturbedTrainer is as NnetSimpleTrainer but implements feature
/// perturbation; see the comment at the top of this file (\ref
/// train-nnet-perturbed.h) for more details.
class NnetPerturbedTrainer {
public:
NnetPerturbedTrainer(const NnetPerturbedTrainerConfig &config,
const SpMatrix<BaseFloat> &within_class_covar,
Nnet *nnet);
/// TrainOnExample will take the example and add it to a buffer;
/// if we've reached the minibatch size it will do the training.
void TrainOnExample(const NnetExample &value);
~NnetPerturbedTrainer();
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(NnetPerturbedTrainer);
void TrainOneMinibatch();
// This function initializes within_class_covar_ and num_layers_before_input_.
// The input within_class_covar is the within-class covariance on the original
// raw features, computed from LDA stats, but if this neural network has
// a data-preconditioning layer of type FixedAffineComponent then we will
// project the transform with that and treat the output of that transform
// as the input x (this is more efficient).
void InitWithinClassCovar(const SpMatrix<BaseFloat> &within_class_covar);
void UpdateD(BaseFloat orig_objf_per_example,
BaseFloat perturbed_objf_per_example);
// The following function is called by TrainOneMinibatch() when we enter a new
// phase. A phase is just a certain number of minibatches, and now matters only
// for diagnostics (originally it meant something more).
void BeginNewPhase(bool first_time);
// Things we were given in the initializer:
NnetPerturbedTrainerConfig config_;
Nnet *nnet_; // the nnet we're training.
// static information:
// num_layers_before_input_ is the number of initial layers before what we
// consider to be the input for this method: normally 2, for the splicing
// layer and the (FixedAffineComponent) data-preconditioning layer.
int32 num_layers_before_input_;
// The within_class_covar_ variable below is the within-class covariance; if
// we have a (FixedAffineComponent) data-preconditioning layer, we'd project
// the within-class-covariance with that and store it as within_class_covar_.
CuMatrix<BaseFloat> within_class_covar_;
// State information:
int32 num_phases_;
int32 minibatches_seen_this_phase_;
std::vector<NnetExample> buffer_;
double logprob_this_phase_; // Needed for accumulating train log-prob on each phase.
double logprob_perturbed_this_phase_; // same for perturbed log-prob
double weight_this_phase_; // count corresponding to the above.
double logprob_total_;
double logprob_perturbed_total_;
double weight_total_;
BaseFloat D_; // The distance factor D.
};
/// This function computes the objective function and either updates the model
/// or adds to parameter gradients. It returns the cross-entropy objective
/// function summed over all samples (normalize this by dividing by
/// TotalNnetTrainingWeight(examples)). It is mostly a wrapper for
/// a class NnetPerturbedUpdater that's defined in train-nnet-perturbed.cc, but we
/// don't want to expose that complexity at this level.
/// All these examples will be treated as one minibatch.
///
/// D is the distance factor that determines how much to perturb examples;
/// this is optimized in outer-level code (see class NnetPerturbedTrainer).
/// num_layers_before_input determines how many layers to skip before we find
/// the activation that we regard as the input x to the network, for purposes
/// of this method (e.g. we might skip over the splicing layer and a layer
/// that preconditions the input).
/// within_class_covar (actually a symmetric matrix, but represented as CuMatrix),
/// is the within-class covariance of the features, measured at that level,
/// which ultimately will be derived from LDA stats on the data.
void DoBackpropPerturbed(const Nnet &nnet,
int32 num_layers_before_input,
const CuMatrix<BaseFloat> &within_class_covar,
BaseFloat D,
const std::vector<NnetExample> &examples,
Nnet *nnet_to_update,
double *tot_objf_orig,
double *tot_objf_perturbed);
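// As a caller-side illustration (not an actual Kaldi call site): run one
// minibatch of perturbed training and return the per-frame degradation that
// feeds the D-tuning heuristic. The wrapper name is made up, and the include
// of nnet-update.h for TotalNnetTrainingWeight() is an assumption.
//
// #include "nnet2/nnet-update.h"  // for TotalNnetTrainingWeight()
//
// double OnePerturbedMinibatch(const std::vector<NnetExample> &examples,
//                              const CuMatrix<BaseFloat> &within_class_covar,
//                              int32 num_layers_before_input, BaseFloat D,
//                              Nnet *nnet) {
//   double tot_objf_orig, tot_objf_perturbed;
//   DoBackpropPerturbed(*nnet, num_layers_before_input, within_class_covar, D,
//                       examples, nnet, &tot_objf_orig, &tot_objf_perturbed);
//   double tot_weight = TotalNnetTrainingWeight(examples);
//   return (tot_objf_orig - tot_objf_perturbed) / tot_weight;  // per-frame degradation
// }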
/// This function is similar to "DoBackpropParallel" as declared in
/// nnet-update-parallel.h, but supports "perturbed" training. It's intended
/// for multi-threaded CPU-based training. The number of threads will be
/// set to g_num_threads.
/// within_class_covar is the within-class covariance after any splicing
/// but before preconditioning, as needed for the LDA computation.
/// All pointer arguments must be non-NULL.
void DoBackpropPerturbedParallel(const NnetPerturbedTrainerConfig &config,
const SpMatrix<BaseFloat> &within_class_covar,
SequentialNnetExampleReader *example_reader,
double *tot_objf_orig,
double *tot_objf_perturbed,
double *tot_weight,
Nnet *nnet);
} // namespace nnet2
} // namespace kaldi
#endif

Просмотреть файл

@ -48,7 +48,7 @@ struct NnetSimpleTrainerConfig {
// Class NnetSimpleTrainer doesn't do much apart from batching up the
// input into minibatches and giving it to the neural net code
// to call Update(), which will typically do stochastic gradient
// descent. It also reports training-set
// descent. It also reports training-set objective-function values.
// It takes in the training examples through the call
// "TrainOnExample()".
class NnetSimpleTrainer {
@ -66,8 +66,9 @@ class NnetSimpleTrainer {
void TrainOneMinibatch();
// The following function is called by TrainOneMinibatch()
// when we enter a new phase.
// The following function is called by TrainOneMinibatch() when we enter a new
// phase. A phase is just a certain number of minibatches, and now matters only
// for diagnostics (originally it meant something more).
void BeginNewPhase(bool first_time);
// Things we were given in the initializer:

Просмотреть файл

@ -25,7 +25,8 @@ BINFILES = nnet-randomize-frames nnet-am-info nnet-init \
nnet-train-discriminative-simple nnet-train-discriminative-parallel \
nnet-modify-learning-rates nnet-normalize-stddev nnet-perturb-egs \
nnet-perturb-egs-fmllr nnet-get-weighted-egs nnet-adjust-priors \
cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning
cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning \
nnet-train-simple-perturbed nnet-train-parallel-perturbed
OBJFILES =

Просмотреть файл

@ -36,12 +36,16 @@ int main(int argc, char *argv[]) {
bool binary = true;
FeatureTransformEstimateOptions opts;
std::string write_cholesky;
std::string write_within_covar;
ParseOptions po(usage);
po.Register("binary", &binary, "Write accumulators in binary mode.");
po.Register("binary", &binary, "Write outputs in binary mode.");
po.Register("write-cholesky", &write_cholesky, "If supplied, write to this "
"wxfilename the Cholesky factor of the within-class covariance."
"wxfilename the Cholesky factor of the within-class covariance. "
"Can be used for perturbing features. E.g. "
"--write-cholesky=exp/nnet5/cholesky.tpmat");
po.Register("write-within-covar", &write_within_covar, "If supplied, write "
"to this wxfilename the within-class covariance (as a symmetric "
"matrix). E.g. --write-within-covar=exp/nnet5/within_covar.mat");
opts.Register(&po);
po.Read(argc, argv);
@ -61,10 +65,18 @@ int main(int argc, char *argv[]) {
Matrix<BaseFloat> mat;
TpMatrix<BaseFloat> cholesky;
fte.Estimate(opts, &mat, write_cholesky != "" ? &cholesky : NULL);
fte.Estimate(opts, &mat,
(write_cholesky != "" || write_within_covar != "" ?
&cholesky : NULL));
WriteKaldiObject(mat, projection_wxfilename, binary);
if (write_cholesky != "")
if (write_cholesky != "") {
WriteKaldiObject(cholesky, write_cholesky, binary);
}
if (write_within_covar != "") {
SpMatrix<BaseFloat> within_var(cholesky.NumRows());
within_var.AddTp2(1.0, cholesky, kNoTrans, 0.0);
WriteKaldiObject(within_var, write_within_covar, binary);
}
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();

Просмотреть файл

@ -0,0 +1,127 @@
// nnet2bin/nnet-train-parallel-perturbed.cc
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet2/train-nnet-perturbed.h"
#include "nnet2/am-nnet.h"
#include "thread/kaldi-thread.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet2;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;
const char *usage =
"Train the neural network parameters with backprop and stochastic\n"
"gradient descent using minibatches. The training frames and labels\n"
"are read via a pipe from nnet-randomize-frames. This is like nnet-train-parallel,\n"
"using multiple threads in a Hogwild type of update, but also adding\n"
"perturbed training (see src/nnet2/train-nnet-perturbed.h for info)\n"
"\n"
"Usage: nnet-train-parallel-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
"\n"
"e.g.:\n"
"nnet-randomize-frames [args] | nnet-train-parallel-pertured \\\n"
" --within-covar=within.spmat --num-threads=8 --target-objf-change=0.2 1.nnet ark:- 2.nnet\n";
bool binary_write = true;
bool zero_stats = true;
int32 srand_seed = 0;
std::string within_covar_rxfilename;
NnetPerturbedTrainerConfig train_config;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Register("within-covar", &within_covar_rxfilename,
"rxfilename of within-class covariance-matrix, written as "
"SpMatrix. Must be specified.");
po.Register("zero-stats", &zero_stats, "If true, zero stats "
"stored with the neural net (only affects mixing up).");
po.Register("srand", &srand_seed,
"Seed for random number generator (e.g., for dropout)");
po.Register("num-threads", &g_num_threads, "Number of training threads to use "
"in the parallel update. [Note: if you use a parallel "
"implementation of BLAS, the actual number of threads may be larger.]");
train_config.Register(&po);
po.Read(argc, argv);
srand(srand_seed);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
std::string nnet_rxfilename = po.GetArg(1),
examples_rspecifier = po.GetArg(2),
nnet_wxfilename = po.GetArg(3);
if (within_covar_rxfilename == "") {
KALDI_ERR << "The option --within-covar is required.";
}
TransitionModel trans_model;
AmNnet am_nnet;
{
bool binary_read;
Input ki(nnet_rxfilename, &binary_read);
trans_model.Read(ki.Stream(), binary_read);
am_nnet.Read(ki.Stream(), binary_read);
}
KALDI_ASSERT(train_config.minibatch_size > 0);
SpMatrix<BaseFloat> within_covar;
ReadKaldiObject(within_covar_rxfilename, &within_covar);
if (zero_stats) am_nnet.GetNnet().ZeroStats();
SequentialNnetExampleReader example_reader(examples_rspecifier);
double tot_objf_orig, tot_objf_perturbed, tot_weight;
// logging info will be printed from within the next call.
DoBackpropPerturbedParallel(train_config,
within_covar,
&example_reader,
&tot_objf_orig,
&tot_objf_perturbed,
&tot_weight,
&(am_nnet.GetNnet()));
{
Output ko(nnet_wxfilename, binary_write);
trans_model.Write(ko.Stream(), binary_write);
am_nnet.Write(ko.Stream(), binary_write);
}
KALDI_LOG << "Finished training, processed " << tot_weight
<< " training examples (weighted). Wrote model to "
<< nnet_wxfilename;
return (tot_weight == 0 ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}

Просмотреть файл

@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
"Usage: nnet-train-parallel [options] <model-in> <training-examples-in> <model-out>\n"
"\n"
"e.g.:\n"
"nnet-randomize-frames [args] | nnet-train-simple 1.nnet ark:- 2.nnet\n";
"nnet-randomize-frames [args] | nnet-train-parallel --num-threads=8 1.nnet ark:- 2.nnet\n";
bool binary_write = true;
bool zero_stats = true;

Просмотреть файл

@ -0,0 +1,137 @@
// nnet2bin/nnet-train-perturbed.cc
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet2/nnet-randomize.h"
#include "nnet2/train-nnet-perturbed.h"
#include "nnet2/am-nnet.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet2;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;
const char *usage =
"Train the neural network parameters with backprop and stochastic\n"
"gradient descent using minibatches. The training frames and labels\n"
"are read via a pipe from nnet-randomize-frames. This version of the\n"
"training program does not update the learning rate, but uses\n"
"the learning rates stored in the neural nets.\n"
"\n"
"Usage: nnet-train-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
"note: the option --within-covar=<file> is needed\n"
"\n"
"e.g.:\n"
"nnet-randomize-frames [args] | nnet-train-perturbed --within-covar=within.spmat 1.nnet ark:- 2.nnet\n";
bool binary_write = true;
bool zero_stats = true;
int32 srand_seed = 0;
std::string use_gpu = "yes";
std::string within_covar_rxfilename;
NnetPerturbedTrainerConfig train_config;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Register("within-covar", &within_covar_rxfilename,
"rxfilename of within-class covariance-matrix, written as "
"SpMatrix. Must be specified.");
po.Register("zero-stats", &zero_stats, "If true, zero occupation "
"counts stored with the neural net (only affects mixing up).");
po.Register("srand", &srand_seed, "Seed for random number generator "
"(relevant if you have layers of type AffineComponentPreconditioned "
"with l2-penalty != 0.0");
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
train_config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
srand(srand_seed);
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
if (within_covar_rxfilename == "") {
KALDI_ERR << "The option --within-covar is required.";
}
std::string nnet_rxfilename = po.GetArg(1),
examples_rspecifier = po.GetArg(2),
nnet_wxfilename = po.GetArg(3);
int64 num_examples = 0;
{
TransitionModel trans_model;
AmNnet am_nnet;
{
bool binary_read;
Input ki(nnet_rxfilename, &binary_read);
trans_model.Read(ki.Stream(), binary_read);
am_nnet.Read(ki.Stream(), binary_read);
}
SpMatrix<BaseFloat> within_covar;
ReadKaldiObject(within_covar_rxfilename, &within_covar);
if (zero_stats) am_nnet.GetNnet().ZeroStats();
{ // want to make sure this object deinitializes before
// we write the model, as it does something in the destructor.
NnetPerturbedTrainer trainer(train_config,
within_covar,
&(am_nnet.GetNnet()));
SequentialNnetExampleReader example_reader(examples_rspecifier);
for (; !example_reader.Done(); example_reader.Next(), num_examples++)
trainer.TrainOnExample(example_reader.Value()); // It all happens here!
}
{
Output ko(nnet_wxfilename, binary_write);
trans_model.Write(ko.Stream(), binary_write);
am_nnet.Write(ko.Stream(), binary_write);
}
}
#if HAVE_CUDA==1
CuDevice::Instantiate().PrintProfile();
#endif
KALDI_LOG << "Finished training, processed " << num_examples
<< " training examples. Wrote model to "
<< nnet_wxfilename;
return (num_examples == 0 ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}

Просмотреть файл

@ -0,0 +1,138 @@
// nnet2bin/nnet-train-simple-perturbed.cc
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet2/nnet-randomize.h"
#include "nnet2/train-nnet-perturbed.h"
#include "nnet2/am-nnet.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet2;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;
const char *usage =
"Train the neural network parameters with backprop and stochastic\n"
"gradient descent using minibatches. The training frames and labels\n"
"are read via a pipe from nnet-randomize-frames. This is as nnet-train-simple\n"
"but implements perturbed training (see src/nnet2/train-nnet-perturbed.h for\n"
"details)\n"
"\n"
"Usage: nnet-train-simple-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
"note: the option --within-covar=<file> is needed\n"
"\n"
"e.g.:\n"
"nnet-randomize-frames [args] | nnet-train-simple-perturbed \\\n"
" --within-covar=within.spmat --target-objf-change=0.2 1.nnet ark:- 2.nnet\n";
bool binary_write = true;
bool zero_stats = true;
int32 srand_seed = 0;
std::string use_gpu = "yes";
std::string within_covar_rxfilename;
NnetPerturbedTrainerConfig train_config;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Register("within-covar", &within_covar_rxfilename,
"rxfilename of within-class covariance-matrix, written as "
"SpMatrix. Must be specified.");
po.Register("zero-stats", &zero_stats, "If true, zero occupation "
"counts stored with the neural net (only affects mixing up).");
po.Register("srand", &srand_seed, "Seed for random number generator "
"(relevant if you have layers of type AffineComponentPreconditioned "
"with l2-penalty != 0.0");
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
train_config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
srand(srand_seed);
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
if (within_covar_rxfilename == "") {
KALDI_ERR << "The option --within-covar is required.";
}
std::string nnet_rxfilename = po.GetArg(1),
examples_rspecifier = po.GetArg(2),
nnet_wxfilename = po.GetArg(3);
int64 num_examples = 0;
{
TransitionModel trans_model;
AmNnet am_nnet;
{
bool binary_read;
Input ki(nnet_rxfilename, &binary_read);
trans_model.Read(ki.Stream(), binary_read);
am_nnet.Read(ki.Stream(), binary_read);
}
SpMatrix<BaseFloat> within_covar;
ReadKaldiObject(within_covar_rxfilename, &within_covar);
if (zero_stats) am_nnet.GetNnet().ZeroStats();
{ // want to make sure this object deinitializes before
// we write the model, as it does something in the destructor.
NnetPerturbedTrainer trainer(train_config,
within_covar,
&(am_nnet.GetNnet()));
SequentialNnetExampleReader example_reader(examples_rspecifier);
for (; !example_reader.Done(); example_reader.Next(), num_examples++)
trainer.TrainOnExample(example_reader.Value()); // It all happens here!
}
{
Output ko(nnet_wxfilename, binary_write);
trans_model.Write(ko.Stream(), binary_write);
am_nnet.Write(ko.Stream(), binary_write);
}
}
#if HAVE_CUDA==1
CuDevice::Instantiate().PrintProfile();
#endif
KALDI_LOG << "Finished training, processed " << num_examples
<< " training examples. Wrote model to "
<< nnet_wxfilename;
return (num_examples == 0 ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}