Mirror of https://github.com/mozilla/kaldi.git
sandbox/online: merging changes from trunk
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/online@4243 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Commit
3b2a6582b6
@@ -25,6 +25,8 @@ do

utils/fix_data_dir.sh $data_dir/$split
utils/validate_data_dir.sh $data_dir/$split
rm $data_dir/$split/*.tmp
if ls $data_dir/$split/*.tmp &> /dev/null; then
rm $data_dir/$split/*.tmp
fi
done
@@ -0,0 +1,31 @@
#!/usr/bin/env bash

# Gets lattice oracles

if [ $# -lt 3 ]; then
echo "Specify lattice dir, symbol table and text file for partition"
exit 1;
fi

latticeDir=$1
textFile=$3
symTable=$2
oracleDir=$latticeDir/oracle

echo $latticeDir
echo $oracleDir

. path.sh

if [ ! -f $textFile -o ! -f $symTable -o ! -d $latticeDir ]; then
echo "Required files not found"
exit 1;
fi

mkdir -p $oracleDir

cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | \
utils/sym2int.pl -f 2- $symTable | \
$KALDI_ROOT/src/latbin/lattice-oracle --word-symbol-table=$symTable "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- ark,t:$oracleDir/oracle.tra 2>$oracleDir/oracle.log

sort -k1,1 -u $oracleDir/oracle.tra -o $oracleDir/oracle.tra
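The oracle.tra file written above holds, per utterance, the oracle (lowest-edit-distance) word sequence through the lattice as integer word-ids. A hypothetical follow-up step, not part of this commit, would be to score those oracle transcripts against the reference to get an oracle WER; a minimal sketch, assuming Kaldi's compute-wer binary is on the PATH and reusing the same example paths that appear in the run script further down:

# Hypothetical sketch (not in this commit): oracle WER from oracle.tra.
# The reference is mapped to integers with the same symbol table; ideally the
# same [laughter]/[noise] filtering as above would also be applied here.
latticeDir=exp/tri5a/decode_dev
symTable=exp/tri5a/graph/words.txt
textFile=data/dev/text
oracleDir=$latticeDir/oracle
compute-wer --mode=present \
  "ark:utils/sym2int.pl -f 2- $symTable $textFile |" \
  ark:$oracleDir/oracle.tra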
@@ -10,7 +10,8 @@ if [ $# -lt 3 ]; then
exit 1
fi

prunebeam=2
prunebeam=50
maxProcesses=10

latdir=$1
decode_dir=$2
@@ -33,6 +34,7 @@ then
mkdir -p $latdir/$compiledLatDir
mkdir -p $latdir/$preplfLatDir

runningProcesses=0
for l in $decode_dir/lat.*.gz
do
(
@@ -69,11 +71,19 @@ then
continue
fi
# Replace laugh, unk, oov, noise with eps
echo "$line" | awk '{if ($3 == 2038 || $3 == 2039 || $3 == 2040) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat"
echo "$line" | awk '{if ($3 == 1157 || $3 == 5327 || $3 == 5328 || $3 == 5329 || $3 ==5326) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat"
done < $bname.ark.fst
echo "Done isolating lattices"
fi
) &
runningProcesses=$((runningProcesses+1))
echo "#### Processes running = " $runningProcesses " ####"
if [ $runningProcesses -eq $maxProcesses ]; then
echo "#### Waiting for slot ####"
wait
runningProcesses=0
echo "#### Done waiting ####"
fi
done
wait
rm $latdir/*.bin
@@ -82,6 +92,7 @@ then

if [ $stage -le 2 ]; then
#Compile lattices
runningProcesses=0
for l in $latdir/$rawLatDir/*.lat
do
(
@@ -89,6 +100,14 @@ then
bname=${l##*/}
fstcompile --arc_type=log $latdir/$rawLatDir/$bname $latdir/$compiledLatDir/$bname
) &
runningProcesses=$((runningProcesses+1))
echo "#### Processes running = " $runningProcesses " ####"
if [ $runningProcesses -eq $maxProcesses ]; then
echo "#### Waiting for slot ####"
wait
runningProcesses=0
echo "#### Done waiting ####"
fi
done
wait
echo "Done compiling lattices."
@@ -99,6 +118,7 @@ then
# Create a dummy FST with one state and no arcs first
echo 0 | fstcompile --arc_type=log - $latdir/$preplfLatDir/dummy.fst
# Push Lattice weights towards initial state
runningProcesses=0
for l in $latdir/$compiledLatDir/*.lat
do
(
@@ -112,6 +132,14 @@ then
fstrmepsilon - | \
fstreverse - $latdir/$preplfLatDir/$bname
) &
runningProcesses=$((runningProcesses+1))
echo "#### Processes running = " $runningProcesses " ####"
if [ $runningProcesses -eq $maxProcesses ]; then
echo "#### Waiting for slot ####"
wait
runningProcesses=0
echo "#### Done waiting ####"
fi
done
wait
# Let's take a moment to thank the dummy FST for playing its
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

outDir=exp/lat
mkdir -p $outDir

stage=1

if [ $stage -lt 1 ]; then

# First convert all lattices into the pruned, minimized version
decodeDir=exp/tri5a/decode_dev
acousticScale=0.8333
local/latconvert.sh $outDir $decodeDir $acousticScale

decodeDir=exp/tri5a/decode_test
acousticScale=0.8333
local/latconvert.sh $outDir $decodeDir $acousticScale

fi

if [ $stage -lt 2 ]; then
# Get oracles
latticeDir=exp/tri5a/decode_dev
textFile=data/dev/text
symTable=exp/tri5a/graph/words.txt
local/get_oracle.sh $latticeDir $symTable $textFile

latticeDir=exp/tri5a/decode_test
textFile=data/test/text
symTable=exp/tri5a/graph/words.txt
local/get_oracle.sh $latticeDir $symTable $textFile
fi
@@ -56,14 +56,14 @@ steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
# ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random
# utterances from those.

steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
data/train data/lang exp/mono0a

steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;

steps/train_deltas.sh --cmd "$train_cmd" \
1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;

steps/train_deltas.sh --cmd "$train_cmd" \
1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;


(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
@@ -153,9 +153,14 @@ steps/train_sat.sh --cmd "$train_cmd" \
(
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/dev exp/tri5a/decode_dev
exp/tri5a/graph data/dev exp/tri5a/decode_dev
)&

#
# steps/cleanup/find_bad_utts.sh --nj 200 --cmd "$train_cmd" data/train data/lang \
# exp/tri5a exp/tri5a_cleanup


# local/run_for_spkid.sh

# we don't have to results for the step below yet.
@@ -118,14 +118,17 @@ exit 0
%WER 1.80 [ 226 / 12533, 29 ins, 44 del, 153 sub ] exp/nnet4c/decode/wer_4
%WER 8.49 [ 1064 / 12533, 80 ins, 175 del, 809 sub ] exp/nnet4c/decode_ug/wer_11

%WER 1.61 [ 202 / 12533, 25 ins, 47 del, 130 sub ] exp/nnet4d/decode/wer_5
%WER 8.17 [ 1024 / 12533, 83 ins, 179 del, 762 sub ] exp/nnet4d/decode_ug/wer_11
%WER 1.68 [ 211 / 12533, 29 ins, 39 del, 143 sub ] exp/nnet4d/decode/wer_4
%WER 8.40 [ 1053 / 12533, 101 ins, 153 del, 799 sub ] exp/nnet4d/decode_ug/wer_10

%WER 1.63 [ 204 / 12533, 29 ins, 42 del, 133 sub ] exp/nnet4d_gpu/decode/wer_4
%WER 8.11 [ 1016 / 12533, 80 ins, 168 del, 768 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.74 [ 218 / 12533, 25 ins, 48 del, 145 sub ] exp/nnet4d_gpu/decode/wer_6
%WER 8.39 [ 1051 / 12533, 106 ins, 149 del, 796 sub ] exp/nnet4d_gpu/decode_ug/wer_10

%WER 1.63 [ 204 / 12533, 29 ins, 42 del, 133 sub ] exp/nnet4d_gpu/decode/wer_4
%WER 8.11 [ 1016 / 12533, 80 ins, 168 del, 768 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.53 [ 192 / 12533, 22 ins, 42 del, 128 sub ] exp/nnet4d2/decode/wer_3
%WER 8.06 [ 1010 / 12533, 79 ins, 152 del, 779 sub ] exp/nnet4d2/decode_ug/wer_8

%WER 1.51 [ 189 / 12533, 25 ins, 34 del, 130 sub ] exp/nnet4d2_gpu/decode/wer_3
%WER 7.97 [ 999 / 12533, 78 ins, 152 del, 769 sub ] exp/nnet4d2_gpu/decode_ug/wer_8

%WER 1.37 [ 172 / 12533, 14 ins, 36 del, 122 sub ] exp/nnet4e_gpu/decode/wer_3
%WER 8.03 [ 1006 / 12533, 61 ins, 179 del, 766 sub ] exp/nnet4e_gpu/decode_ug/wer_8
@@ -153,8 +156,8 @@ exit 0


# Discriminatively trained system (using p-norm rather than tanh nonlinearities, using SMBR, on GPU)
%WER 1.56 [ 195 / 12533, 28 ins, 31 del, 136 sub ] exp/nnet5d_mpe_gpu/decode_epoch2/wer_2
%WER 8.35 [ 1047 / 12533, 77 ins, 171 del, 799 sub ] exp/nnet5d_mpe_gpu/decode_ug_epoch4/wer_10
%WER 1.74 [ 218 / 12533, 25 ins, 48 del, 145 sub ] exp/nnet5d_mpe_gpu/decode_epoch1/wer_6
%WER 8.40 [ 1053 / 12533, 108 ins, 148 del, 797 sub ] exp/nnet5d_mpe_gpu/decode_ug_epoch1/wer_10

# Discriminatively trained system on top of ensemble trained p-norm network (using SMBR, on GPU)
%WER 1.36 [ 170 / 12533, 15 ins, 34 del, 121 sub ] exp/nnet5e_mpe_gpu/decode_epoch2/wer_3
@@ -0,0 +1,62 @@
#!/bin/bash

# 4d2 is as 4d but adding perturbed training with multiplier=1.0

train_stage=-10
use_gpu=true

. cmd.sh
. ./path.sh
. utils/parse_options.sh


if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet4d2_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet4d2
fi



if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--target-multiplier 1.0 \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--pnorm-input-dim 1000 \
--pnorm-output-dim 200 \
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode &

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug

wait
@@ -0,0 +1,126 @@
#!/bin/bash


# This script demonstrates discriminative training of p-norm neural nets.
# It's on top of run_4d_gpu.sh which uses adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.


use_gpu=true
stage=0
transform_dir=exp/tri3b_ali

. cmd.sh
. ./path.sh
. utils/parse_options.sh


[ ! -f $transform_dir/num_jobs ] && \
echo "Expected $transform_dir/num_jobs to exist" && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)


# The queue options in this script are for the CLSP network, and might not work
# for you.

if $use_gpu; then
. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
align_gpu_opts="-l gpu=1"
use_gpu_flag="--use-gpu yes"
train_parallel_opts="-l gpu=1"
train_num_threads=1
srcdir=exp/nnet4d_gpu
dir=exp/nnet5d_mpe_gpu
nj=$nj_orig
else
align_gpu_opts=
use_gpu_flag="--use-gpu no"
train_parallel_opts="-pe smp 6"
train_num_threads=6
srcdir=exp/nnet4d
dir=exp/nnet5d_mpe
if [ "$decode_cmd" != "run.pl" ]; then
nj=$[$nj_orig*5]; # use more jobs, or it will be slow in the alignment
# phase. But if we are just running everything on
# one machine this won't help us
else
nj=$nj_orig
fi
fi

if [ ! -f $srcdir/final.mdl ]; then
echo "$0: expected $srcdir/final.mdl to exist."
exit 1;
fi

# The denominator lattice creation currently doesn't use GPUs; that would be
# wasteful since the lattice determinization and graph search use up a fair
# amount of CPU, and we'd be idling the GPU much of the time.

# We specify 1G each for the mem_free and ram_free which, is per thread... it
# will likely be less than the default. Increase the beam relative to the
# defaults; this is just for this RM setup, where the default beams will likely
# generate very thin lattices.

# Note: the transform-dir is important to
# specify, since this system is on top of fMLLR features.


if [ $stage -le 0 ]; then
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G" \
--nj $nj --sub-split 20 --num-threads 6 --parallel-opts "-pe smp 6" \
--beam 20.0 --lattice-beam 10.0 \
--transform-dir $transform_dir \
data/train data/lang $srcdir ${srcdir}_denlats
fi

if [ $stage -le 1 ]; then
steps/nnet2/align.sh --cmd "$decode_cmd $align_gpu_opts" $use_gpu_flag \
--transform-dir $transform_dir \
--nj $nj data/train data/lang $srcdir ${srcdir}_ali
fi
if [ $stage -le 2 ]; then
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" \
--num-jobs-nnet 2 --transform-dir $transform_dir \
--num-threads "$train_num_threads" --parallel-opts "$train_parallel_opts" data/train data/lang \
${srcdir}_ali ${srcdir}_denlats $srcdir/final.mdl $dir
fi
if [ $stage -le 3 ]; then
for epoch in 1 2 3 4; do
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --iter epoch$epoch \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode_epoch$epoch &

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --iter epoch$epoch \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug_epoch$epoch &
done
wait
fi

exit 0;



# The following is some test commands that I ran in order to verify that
# the neural-net splitting and excising code was working as intended.

# (
# acoustic_scale=0.1
# for criterion in smbr mmi mpfe; do
# for drop_frames in true false; do
# nnet-get-egs-discriminative --drop-frames=$drop_frames --criterion=$criterion --excise=true exp/tri5c_mpe/0.mdl 'ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/train/split8/1/utt2spk scp:data/train/split8/1/cmvn.scp "scp:head -n 40 data/train/split8/1/feats.scp|" ark:- | splice-feats --left-context=3 --right-context=3 ark:- ark:- | transform-feats exp/tri5c_mpe/final.mat ark:- ark:- | transform-feats --utt2spk=ark:data/train/split8/1/utt2spk ark:$transform_dir/trans.1 ark:- ark:- |' 'ark,s,cs:gunzip -c exp/${dir}_ali/ali.1.gz |' 'ark,s,cs:gunzip -c exp/${dir}_denlats/lat.1.gz|' "ark:|nnet-combine-egs-discriminative ark:- ark:1.egs"

# nnet-get-egs-discriminative --drop-frames=$drop_frames --criterion=$criterion --split=false --excise=false exp/tri5c_mpe/0.mdl 'ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/train/split8/1/utt2spk scp:data/train/split8/1/cmvn.scp "scp:head -n 40 data/train/split8/1/feats.scp|" ark:- | splice-feats --left-context=3 --right-context=3 ark:- ark:- | transform-feats exp/tri5c_mpe/final.mat ark:- ark:- | transform-feats --utt2spk=ark:data/train/split8/1/utt2spk ark:$transform_dir/trans.1 ark:- ark:- |' 'ark,s,cs:gunzip -c exp/${dir}_ali/ali.1.gz |' 'ark,s,cs:gunzip -c exp/${dir}_denlats/lat.1.gz|' ark:2.egs

# nnet-compare-hash-discriminative --acoustic-scale=$acoustic_scale --drop-frames=$drop_frames --criterion=$criterion $dir/final.mdl ark:1.egs ark:2.egs || exit 1;
# done
# done
# )
@@ -21,12 +21,15 @@ if $use_gpu; then
# This one is for training pnorm nnets on top of 40-dim + fMLLR features
# **THIS IS THE PRIMARY RECIPE**
local/nnet2/run_4d.sh --use-gpu true

# as above with 'perturbed training'. A bit better results, a bit slower.
local/nnet2/run_4d2.sh --use-gpu true

# This is discriminative training on top of 4c.
# This is discriminative training on top of 4c. (hardly helps)
local/nnet2/run_5c_gpu.sh

# This is discriminative training on top of 4d.
local/nnet2/run_5d_gpu.sh
local/nnet2/run_5d.sh --use-gpu true
else
# This example runs on top of "raw-fMLLR" features;
# you have to run local/run_raw_fmllr.sh first.
@@ -42,9 +45,15 @@ else
# **THIS IS THE PRIMARY RECIPE (40-dim + fMLLR + p-norm neural net)**
local/nnet2/run_4d.sh --use-gpu false

# as above with 'perturbed training'. A bit better results, a bit slower.
local/nnet2/run_4d2.sh --use-gpu false

# This is discriminative training on top of 4c.
local/nnet2/run_5c.sh

# This is discriminative training on top of 4d.
local/nnet2/run_5d.sh --use-gpu false

# This is p-norm on top of raw-fMLLR.
#local/nnet2/run_4e.sh

@@ -146,6 +146,15 @@ steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
steps/align_fmllr.sh --nj 8 --cmd "$train_cmd" --use-graphs true \
data/train data/lang exp/tri3b exp/tri3b_ali


# # We have now added a script that will help you find portions of your data that
# # has bad transcripts, so you can filter it out. Below we demonstrate how to
# # run this script.
# steps/cleanup/find_bad_utts.sh --nj 20 --cmd "$train_cmd" data/train data/lang \
# exp/tri3b_ali exp/tri3b_cleanup
# # The following command will show you some of the hardest-to-align utterances in the data.
# head exp/tri3b_cleanup/all_info.sorted.txt

## MMI on top of tri3b (i.e. LDA+MLLT+SAT+MMI)
steps/make_denlats.sh --config conf/decode.config \
--nj 8 --cmd "$train_cmd" --transform-dir exp/tri3b_ali \
@@ -20,6 +20,9 @@

. ./path.sh ## Source the tools/utils (import the queue.pl)

nj=80
decode_nj=8

# Config:
gmmdir=exp/tri3
data_fmllr=data-fmllr-tri3
@@ -69,10 +72,10 @@ if [ $stage -le 2 ]; then
steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
$data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir || exit 1;
# Decode (reuse HCLG graph)
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
--num-threads 3 --parallel-opts "-pe smp 4" \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
--num-threads 3 --parallel-opts "-pe smp 4" \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;
fi
@@ -87,9 +90,9 @@ acwt=0.1

if [ $stage -le 3 ]; then
# First we generate lattices and alignments:
steps/nnet/align.sh --nj 80 --cmd "$train_cmd" \
steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 3 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/make_denlats.sh --nj 6 --sub-split $nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--acwt $acwt $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi

@@ -99,11 +102,11 @@ if [ $stage -le 4 ]; then
$data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1; do
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;
@@ -117,9 +120,9 @@ acwt=0.1

if [ $stage -le 5 ]; then
# First we generate lattices and alignments:
steps/nnet/align.sh --nj 80 --cmd "$train_cmd" \
steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 3 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/make_denlats.sh --nj 6 --sub-split $nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--acwt $acwt $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi

@@ -129,11 +132,11 @@ if [ $stage -le 6 ]; then
$data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1 2 3 4; do
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;
@@ -27,8 +27,9 @@ numGaussUBM=400
numLeavesSGMM=7000
numGaussSGMM=9000

decode_nj=5
feats_nj=10
train_nj=30
decode_nj=5

echo ============================================================================
echo " Data & Lexicon & Language Preparation "
@@ -60,7 +61,7 @@ mfccdir=mfcc


for x in train dev test; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 data/$x exp/make_mfcc/$x $mfccdir
steps/make_mfcc.sh --cmd "$train_cmd" --nj $feats_nj data/$x exp/make_mfcc/$x $mfccdir
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
done

@@ -272,6 +272,8 @@ steps/train_sat.sh --cmd "$train_cmd" \
) &


# This step is just to demonstrate the train_quick.sh script, in which we
# initialize the GMMs from the old system's GMMs.
steps/train_quick.sh --cmd "$train_cmd" \
4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4b || exit 1;

@@ -56,6 +56,7 @@ echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.alimdl $dir 2>/dev/null
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
@@ -42,6 +42,11 @@ lang=$2
srcdir=$3
dir=$4


for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
@@ -57,6 +62,7 @@ cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;



if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

@@ -0,0 +1,165 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Computes training alignments using a model with delta or
# LDA+MLLT features. This version, rather than just using the
# text to align, computes mini-language models (unigram) from the text
# and a few common words in the LM, and allows

# Begin configuration section.
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=20.0
lattice_beam=10.0
transform_dir= # directory to find fMLLR transforms in.
top_n_words=100 # Number of common words that we compile into each graph (most frequent
# in $lang/text.
stage=0
cleanup=true
# End configuration options.

echo "$0 $@" # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
echo "usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \
$lang/L_disambig.fst $lang/phones/disambig.int; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.

[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;


utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \
awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \
sort -rn > $dir/word_counts.int || exit 1;
num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1;
# print top-n words with their unigram probabilities.

head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int
utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt

if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $srcdir/full.mat $dir
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ -z "$transform_dir" ] && [ -f $srcdir/trans.1 ]; then
transform_dir=$srcdir
fi
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/trans.$n; done | \
copy-feats ark:- ark,scp:$dir/trans.ark,$dir/trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
elif [ -f $srcdir/final.alimdl ]; then
echo "$0: **WARNING**: you seem to be using an fMLLR system as input,"
echo " but you are not providing the --transform-dir option during alignment."
fi


echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir"

if [ $stage -le 0 ]; then
rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null

$cmd JOB=1:$nj $dir/log/decode.JOB.log \
utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \
steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \
compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
$dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$beam \
--lattice-beam=$lattice_beam --word-symbol-table=$lang/words.txt \
$dir/final.mdl ark:- "$feats" ark:- \| \
lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
ark,t:- ark,t:$dir/edits.JOB.txt \| \
utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1;
fi


if [ $stage -le 1 ]; then
if [ -f $dir/edits.1.txt ]; then
for x in $(seq $nj); do cat $dir/edits.$x.txt; done > $dir/edits.txt
for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done > $dir/aligned_ref.txt
else
echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present."
fi

# in case any utterances failed to align, get filtered copy of $data/text that's filtered.
utils/filter_scp.pl $dir/edits.txt < $data/text > $dir/text
cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt

n1=$(wc -l < $dir/edits.txt)
n2=$(wc -l < $dir/aligned_ref.txt)
n3=$(wc -l < $dir/text)
n4=$(wc -l < $dir/length.txt)
if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
echo "$0: mismatch in lengths of files:"
wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt
exit 1;
fi

# note: the format of all_info.txt is:
# <utterance-id> <number of errors> <reference-length> <decoded-output> <reference>
# with the fields separated by tabs, e.g.
# adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED

paste $dir/edits.txt \
<(awk '{print $2}' $dir/length.txt) \
<(awk '{$1="";print;}' <$dir/aligned_ref.txt) \
<(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt

sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt

if $cleanup; then
rm $dir/edits.*.txt $dir/aligned_ref.*.txt
fi
fi
@@ -0,0 +1,45 @@
#!/usr/bin/perl -w

# makes unigram decoding-graph FSTs specific to each utterances, where the
# supplied top-n-words list together with the supervision text of the utterance are
# combined.

if (@ARGV != 1) {
print STDERR "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" .
"e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" .
" make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ... \n";
}

($top_words_file) = @ARGV;

open(F, "<$top_words_file") || die "opening $top_words_file";

%top_word_probs = ( );

while(<F>) {
@A = split;
(@A == 2 && $A[0] > 0.0) || die "Bad line $_ in $top_words_file";
$A[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $_\n";
$top_word_probs{$A[1]} += $A[0];
}

while (<STDIN>) {
@A = split;
$utterance_id = shift @A;
print "$utterance_id\n";
$num_words = @A + 0; # length of array @A
%word_probs = %top_word_probs;
foreach $w (@A) {
$w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $_";
$word_probs{$w} += 1.0 / $num_words;
}
foreach $w (keys %word_probs) {
$prob = $word_probs{$w};
$prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n";
$cost = -log($prob);
print "0 0 $w $w $cost\n";
}
$final_cost = -log(1.0 / $num_words);
print "0 $final_cost\n";
print "\n"; # Empty line terminates the FST in the text-archive format.
}
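To make the text-FST format this script emits concrete, consider a small hypothetical case (the word-ids, transcript, and top-words file contents below are invented for illustration): an utterance "utt1 8 8 42" has num_words = 3, and with a top-words file containing only "0.5 8" the unnormalized probabilities become p(8) = 0.5 + 2/3 and p(42) = 1/3, each printed as a self-loop arc on state 0 with cost -log(p), followed by a final cost of -log(1/num_words) and the blank line that terminates the FST:

# Hypothetical illustration (values rounded; arc order may vary because it
# follows Perl hash iteration order):
echo "utt1 8 8 42" | steps/cleanup/make_utterance_fsts.pl top_words.int
# utt1
# 0 0 8 8 -0.1542      (-log(0.5 + 2/3))
# 0 0 42 42 1.0986     (-log(1/3))
# 0 1.0986             (final cost, -log(1/3))
# <blank line>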
@@ -4,7 +4,8 @@
# Apache 2.0

# Begin configuration section.
transform_dir=
transform_dir= # this option won't normally be used, but it can be used if you want to
# supply existing fMLLR transforms when decoding.
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
stage=0

@@ -77,20 +77,31 @@ case $feat_type in
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac

if [ ! -z "$transform_dir" ]; then
if ! [ $nj -eq `cat $transform_dir/num_jobs` ]; then
echo "$0: Number of jobs mismatch with transform-dir: $nj versus `cat $transform_dir/num_jobs`";
echo "$0: using transforms from $transform_dir"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)

if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
exit 1;
fi
if [ $feat_type == "lda" ]; then
[ ! -f $transform_dir/trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1;
echo "$0: using transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $feat_type == "raw" ]; then
[ ! -f $transform_dir/raw_trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1;
echo "$0: using raw-fMLLR transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi

@@ -145,6 +145,7 @@ fi

if [ $stage -le 0 ]; then
echo "$0: Accumulating LDA statistics."
rm $dir/lda.*.acc 2>/dev/null # in case any left over from before.
$cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
@@ -157,11 +158,19 @@ echo $lda_dim > $dir/lda_dim
echo $ivector_dim > $dir/ivector_dim

if [ $stage -le 1 ]; then
nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
--within-class-factor=$within_class_factor --dim=$lda_dim \
$dir/lda.mat $dir/lda.*.acc \
2>$dir/log/lda_est.log || exit 1;
sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1;
rm $dir/lda.*.acc
fi

if [ $stage -le 2 ]; then
# There are various things that we sometimes (but not always) need
# the within-class covariance and its Cholesky factor for, and we
# write these to disk just in case.
nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
--write-within-covar=$dir/within_covar.spmat \
--within-class-factor=$within_class_factor --dim=$lda_dim \
$dir/lda.mat $dir/lda.acc \
2>$dir/log/lda_est.log || exit 1;
fi

echo "$0: Finished estimating LDA"

@@ -95,25 +95,39 @@ echo "align_si.sh: feature type is $feat_type"

case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
;;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac

if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "$0: using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist."
[ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
&& echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
[ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
if [ -f $srcdir/final.alimdl ]; then
echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option.";
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)

if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
exit 1;
fi
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi

if [ $sub_split -eq 1 ]; then

@@ -22,7 +22,7 @@ num_jobs_nnet=4 # Number of neural net jobs to run in parallel. Note: this
samples_per_iter=400000 # measured in frames, not in "examples"

spk_vecs_dir=
modify_learning_rates=false
modify_learning_rates=true
last_layer_factor=1.0 # relates to modify-learning-rates
first_layer_factor=1.0 # relates to modify-learning-rates
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
@@ -140,17 +140,38 @@ case $feat_type in
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac

[ -z "$transform_dir" ] && transform_dir=$alidir
if [ -z "$transform_dir" ]; then
if [ -f $transform_dir/trans.1 ] || [ -f $transform_dir/raw_trans.1 ]; then
transform_dir=$alidir
fi
fi

if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)

if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then
echo "$0: LDA transforms differ between $alidir and $transform_dir"
exit 1;
fi
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
echo "$0: using raw-fMLLR transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
fi


if [ -z "$degs_dir" ]; then
if [ $stage -le -8 ]; then
@@ -64,6 +64,10 @@ max_change_per_sample=0.075
precondition_rank_in=20 # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

# this relates to perturbed training.
min_target_objf_change=0.1
target_multiplier=0 # Set this to e.g. 1.0 to enable perturbed training.

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
@@ -262,24 +266,49 @@ echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."


function set_target_objf_change {
# nothing to do if $target_multiplier not set.
[ "$target_multiplier" == "0" -o "$target_multiplier" == "0.0" ] && return;
[ $x -le $finish_add_layers_iter ] && return;
wait=2 # the compute_prob_{train,valid} from 2 iterations ago should
# most likey be done even though we backgrounded them.
[ $[$x-$wait] -le 0 ] && return;
while true; do
# Note: awk 'some-expression' is the same as: awk '{if(some-expression) print;}'
train_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_train.$[$x-$wait].log)
valid_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_valid.$[$x-$wait].log)
if [ -z "$train_prob" ] || [ -z "$valid_prob" ]; then
echo "$0: waiting until $dir/log/compute_prob_{train,valid}.$[$x-$wait].log are done"
sleep 60
else
target_objf_change=$(perl -e '($train,$valid,$min_change,$multiplier)=@ARGV; if (!($train < 0.0) || !($valid < 0.0)) { print "0\n"; print STDERR "Error: invalid train or valid prob: $train_prob, $valid_prob\n"; exit(0); } else { print STDERR "train,valid=$train,$valid\n"; $proposed_target = $multiplier * ($train-$valid); if ($proposed_target < $min_change) { print "0"; } else { print $proposed_target; }}' -- "$train_prob" "$valid_prob" "$min_target_objf_change" "$target_multiplier")
echo "On iter $x, (train,valid) probs from iter $[$x-$wait] were ($train_prob,$valid_prob), and setting target-objf-change to $target_objf_change."
return;
fi
done
}

finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]

if [ $num_threads -eq 1 ]; then
train_suffix="-simple" # this enables us to use GPU code if
parallel_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
parallel_train_opts=
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
train_suffix="-parallel --num-threads=$num_threads"
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
fi

x=0
target_objf_change=0 # relates to perturbed training.

while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
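To make the rule implemented by set_target_objf_change above concrete: with target_multiplier=1.0 and min_target_objf_change=0.1, train/valid log-probs of, say, -2.5 and -2.9 from two iterations back give a proposed target of 1.0 * (-2.5 - (-2.9)) = 0.4; that exceeds the minimum, so it is passed on through --target-objf-change to the perturbed training binary, while anything below 0.1 collapses to 0 and perturbation is skipped for that iteration. A small stand-alone sketch of the same arithmetic (the probe values are invented; the simplified perl drops the validity checks of the original one-liner):

# Hypothetical sanity check of the target-objf-change rule, not part of the script.
train_prob=-2.5; valid_prob=-2.9
min_target_objf_change=0.1; target_multiplier=1.0
perl -e '($train,$valid,$min_change,$multiplier)=@ARGV;
  $proposed = $multiplier * ($train - $valid);
  print(($proposed < $min_change) ? "0\n" : "$proposed\n");' \
  -- "$train_prob" "$valid_prob" "$min_target_objf_change" "$target_multiplier"
# prints: 0.4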
@@ -316,11 +345,19 @@ while [ $x -lt $num_iters ]; do
this_minibatch_size=$minibatch_size
do_average=true
fi

set_target_objf_change; # only has effect if target_multiplier != 0
if [ "$target_objf_change" != "0" ]; then
[ ! -f $dir/within_covar.spmat ] && \
echo "$0: expected $dir/within_covar.spmat to exist." && exit 1;
perturb_suffix="-perturbed"
perturb_opts="--target-objf-change=$target_objf_change --within-covar=$dir/within_covar.spmat"
fi

$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$train_suffix \
nnet-train$parallel_suffix$perturb_suffix $parallel_train_opts $perturb_opts \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;

@@ -12,7 +12,6 @@

# Begin configuration section.
stage=-5
fmllr_update_type=full
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
@@ -197,9 +196,9 @@ while [ $x -lt $num_iters ]; do

if echo $fmllr_iters | grep -w $x >/dev/null; then
if [ $stage -le $x ]; then
echo Estimating fMLLR transforms
# We estimate a transform that's additional to the previous transform;
# we'll compose them.
# Note: it's not really necessary to re-estimate the basis each time
# but this is the way the script does it right now.
echo Estimating basis and fMLLR transforms
$cmd JOB=1:$nj $dir/log/fmllr_est.$x.JOB.log \
ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
@@ -209,7 +208,7 @@ while [ $x -lt $num_iters ]; do

# Compute the basis matrices.
$cmd $dir/log/basis_training.log \
gmm-basis-fmllr-training $dir/$x.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;
gmm-basis-fmllr-training $dir/$x.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;

$cmd JOB=1:$nj $dir/log/fmllr_app.$x.JOB.log \
ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
@@ -28,7 +28,7 @@ for($x = 0; $x < 2; $x++) {
}
}
if ($ARGV[0] eq "-f") {
shift @ARGV;
shift @ARGV;
$field_spec = shift @ARGV;
if ($field_spec =~ m/^\d+$/) {
$field_begin = $field_spec - 1; $field_end = $field_spec - 1;
@@ -46,6 +46,14 @@ done
! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
echo "$0: $data/utt2spk has wrong format." && exit;

ns=$(wc -l < $data/spk2utt)
if [ "$ns" == 1 ]; then
echo "$0: WARNING: you have only one speaker. This probably a bad idea."
echo " Search for the word 'bold' in http://kaldi.sourceforge.net/data_prep.html"
echo " for more information."
fi


tmpdir=$(mktemp -d kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

@@ -12,27 +12,39 @@ if(@ARGV != 1) {
}

$dict = shift @ARGV;
$dict =~ s:/$::;

$exit = 0;
$success = 1; # this is re-set each time we read a file.

sub set_to_fail { $exit = 1; $success = 0; }

# Checking silence_phones.txt -------------------------------
print "Checking $dict/silence_phones.txt ...\n";
if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
$idx = 1;
%silence = ();
$success = 1;

print "--> reading $dict/silence_phones.txt\n";
while(<S>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
if (@col == 0) {
set_to_fail();
print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
}
foreach(0 .. @col-1) {
my $p = $col[$_];
if($silence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; $success = 0;}
if($silence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; }
else {$silence{$p} = 1;}
if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){
$exit = 1;
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form";
$success = 0;

}
}
$idx ++;
@@ -52,9 +64,9 @@ while(<OS>) {
chomp;
my @col = split(" ", $_);
if ($idx > 1 or @col > 1) {
$exit = 1; print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; $success = 0;
set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
} elsif (!$silence{$col[0]}) {
$exit = 1; print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; $success = 0;
set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
}
$idx ++;
}
@@ -71,22 +83,29 @@ $idx = 1;
$success = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
while(<NS>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
if (@col == 0) {
set_to_fail();
print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
}
foreach(0 .. @col-1) {
my $p = $col[$_];
if($nonsilence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; $success = 0;}
if($nonsilence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; }
else {$nonsilence{$p} = 1;}
if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){
$exit = 1;
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form";
$success = 0;

}
}
$idx ++;
}
close(NS);
$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
print "\n";

# Checking disjoint -------------------------------
@@ -106,37 +125,37 @@ sub intersect {
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if(@itset == 0) {print "--> disjoint property is OK.\n";}
else {$exit = 1; print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
print "\n";


sub check_lexicon {
my ($lexfn, $pron_probs) = @_;
print "Checking $lexfn\n";
if(-z "$lexfn") {$exit = 1; print "--> ERROR: $lexfn is empty or not exists\n";}
if(!open(L, "<$lexfn")) {$exit = 1; print "--> ERROR: fail to open $lexfn\n";}
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or not exists\n";}
if(!open(L, "<$lexfn")) {set_to_fail(); print "--> ERROR: fail to open $lexfn\n";}
$idx = 1;
$success = 1;
print "--> reading $lexfn\n";
while (<L>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $lexfn does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
$word = shift @col;
if (!defined $word) {
$exit = 1; print "--> ERROR: empty lexicon line in $lexfn\n";
$success = 0;
set_to_fail(); print "--> ERROR: empty lexicon line in $lexfn\n";
}
if ($pron_probs) {
$prob = shift @col;
if (!($prob > 0.0 && $prob <= 1.0)) {
$exit = 1; print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n";
$success = 0;
set_to_fail(); print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n";
}
}
foreach (0 .. @col-1) {
if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
$exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n";
$success = 0;
set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n";
}
}
$idx ++;
@ -150,7 +169,7 @@ if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0); }
|
|||
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1); }
|
||||
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
|
||||
print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n";
|
||||
$exit = 1;
|
||||
set_to_fail();
|
||||
}
|
||||
# If both lexicon.txt and lexiconp.txt exist, we check that they correspond to
|
||||
# each other. If not, it could be that the user overwrote one and we need to
|
||||
|
@ -161,11 +180,21 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
|
|||
die "Error opening lexicon.txt and/or lexiconp.txt"; # already checked, so would be code error.
|
||||
}
|
||||
while(<L>) {
|
||||
if (! s/\n$//) {
|
||||
print "--> ERROR: last line '$_' of $dict/lexicon.txt does not end in newline.\n";
|
||||
set_to_fail();
|
||||
last;
|
||||
}
|
||||
@A = split;
|
||||
$x = <P>;
|
||||
if ($x !~ s/\n$//) {
|
||||
print "--> ERROR: last line '$x' of $dict/lexiconp.txt does not end in newline.\n";
|
||||
set_to_fail();
|
||||
last;
|
||||
}
|
||||
if (!defined $x) {
|
||||
print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
|
||||
$exit = 1;
|
||||
set_to_fail();
|
||||
last;
|
||||
}
|
||||
@B = split(" ", $x);
|
||||
|
@ -175,13 +204,13 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
|
|||
# now @A and @B should be the same.
|
||||
if ($#A != $#B) {
|
||||
print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
|
||||
$exit = 1;
|
||||
set_to_fail();
|
||||
last;
|
||||
}
|
||||
for ($n = 0; $n < @A; $n++) {
|
||||
if ($A[$n] ne $B[$n]) {
|
||||
print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
|
||||
$exit = 1;
|
||||
set_to_fail();
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
@ -189,32 +218,40 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
|
|||
$x = <P>;
|
||||
if (defined $x && $exit == 0) {
|
||||
print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
|
||||
$exit = 1;
|
||||
set_to_fail();
|
||||
}
|
||||
}
|
||||
|
||||
# Checking extra_questions.txt -------------------------------
|
||||
print "Checking $dict/extra_questions.txt ...\n";
|
||||
if (-s "$dict/extra_questions.txt") {
|
||||
if(!open(EX, "<$dict/extra_questions.txt")) {$exit = 1; print "--> ERROR: fail to open $dict/extra_questions.txt\n";}
|
||||
if (!open(EX, "<$dict/extra_questions.txt")) {
|
||||
set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
|
||||
}
|
||||
$idx = 1;
|
||||
$success = 1;
|
||||
print "--> reading $dict/extra_questions.txt\n";
|
||||
while(<EX>) {
|
||||
chomp;
|
||||
if (! s/\n$//) {
|
||||
print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
|
||||
set_to_fail();
|
||||
}
|
||||
my @col = split(" ", $_);
|
||||
foreach(0 .. @col-1) {
|
||||
if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
|
||||
$exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n";
|
||||
$success = 0;
|
||||
}
|
||||
if (@col == 0) {
|
||||
set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
|
||||
}
|
||||
}
|
||||
foreach(0 .. @col-1) {
|
||||
if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
|
||||
set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n";
|
||||
}
|
||||
$idx ++;
|
||||
}
|
||||
}
|
||||
close(EX);
|
||||
$success == 0 || print "--> $dict/extra_questions.txt is OK\n";
|
||||
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
|
||||
|
||||
if($exit == 1) { print " [Error detected ]\n"; exit 1;}
|
||||
if ($exit == 1) { print "--> ERROR validating dictionary directory $dict (see detailed error messages above)\n"; exit 1;}
|
||||
else { print "--> SUCCESS [validating dictionary directory $dict]\n"; }
|
||||
|
||||
exit 0;
|
||||
@ -1,6 +1,8 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
# Guoguo Chen (guoguo@jhu.edu)
|
||||
# Apache 2.0.
|
||||
# Copyright 2012 Guoguo Chen
|
||||
# 2014 Neil Nelson
|
||||
#
|
||||
# Validation script for data/lang
|
||||
|
||||
|
@ -132,7 +134,7 @@ sub check_txt_int_csl {
|
|||
}
|
||||
|
||||
sub check_txt_int {
|
||||
my ($cat, $symtab) = @_;
|
||||
my ($cat, $symtab, $sym_check) = @_;
|
||||
print "Checking $cat.\{txt, int\} ...\n";
|
||||
if (-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";}
|
||||
if (-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";}
|
||||
|
@ -154,6 +156,7 @@ sub check_txt_int {
|
|||
close(TXT); $idx1 --;
|
||||
print "--> $idx1 entry/entries in $cat.txt\n";
|
||||
|
||||
my %used_syms = ();
|
||||
$idx2 = 1;
|
||||
while(<INT>) {
|
||||
chomp;
|
||||
|
@ -168,6 +171,8 @@ sub check_txt_int {
|
|||
if (@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
|
||||
foreach(0 .. @set-1) {
|
||||
if ($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";}
|
||||
if ($sym_check && defined $used_syms{@set[$_]}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int contain duplicate symbols (break at line $idx2, block " ,$_+1, ")\n";}
|
||||
$used_syms{@set[$_]} = 1;
|
||||
}
|
||||
$idx2 ++;
|
||||
}
|
||||
|
@ -175,31 +180,16 @@ sub check_txt_int {
|
|||
if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
|
||||
print "--> $cat.int corresponds to $cat.txt\n";
|
||||
|
||||
return print "--> $cat.\{txt, int\} are OK\n";
|
||||
}
|
||||
if ($sym_check) {
|
||||
while ( my ($key, $value) = each(%silence) ) {
|
||||
if (!defined $used_syms{$key}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all silence phones\n";}
|
||||
}
|
||||
while ( my ($key, $value) = each(%nonsilence) ) {
|
||||
if (!defined $used_syms{$key}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all non-silence phones\n";}
|
||||
}
|
||||
}
|
||||
|
||||
@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence");
|
||||
@list2 = ("roots", "sets");
|
||||
foreach(@list1) {
|
||||
check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n";
|
||||
}
|
||||
foreach(@list2) {
|
||||
check_txt_int("$lang/phones/$_", \%psymtab); print "\n";
|
||||
}
|
||||
if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
|
||||
check_txt_int("$lang/phones/extra_questions", \%psymtab); print "\n";
|
||||
} else {
|
||||
print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
|
||||
if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
|
||||
print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n";
|
||||
$warning = 1;
|
||||
} else {
|
||||
print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
|
||||
$exit = 1;
|
||||
}
|
||||
}
|
||||
if (-e "$lang/phones/word_boundary.txt") {
|
||||
check_txt_int("$lang/phones/word_boundary", \%psymtab); print "\n";
|
||||
return print "--> $cat.\{txt, int\} are OK\n";
|
||||
}
|
||||
|
||||
# Check disjoint and summation -------------------------------
|
||||
|
@ -217,7 +207,7 @@ sub intersect {
|
|||
}
|
||||
|
||||
sub check_disjoint {
|
||||
print "Checking disjoint: silence.txt, nosilenct.txt, disambig.txt ...\n";
|
||||
print "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n";
|
||||
if (!open(S, "<$lang/phones/silence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";}
|
||||
if (!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";}
|
||||
if (!open(D, "<$lang/phones/disambig.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";}
|
||||
|
@ -336,6 +326,30 @@ sub check_summation {
|
|||
check_disjoint; print "\n";
|
||||
check_summation; print "\n";
|
||||
|
||||
@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence");
|
||||
@list2 = ("roots", "sets");
|
||||
foreach(@list1) {
|
||||
check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n";
|
||||
}
|
||||
foreach(@list2) {
|
||||
check_txt_int("$lang/phones/$_", \%psymtab, 1); print "\n";
|
||||
}
|
||||
if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
|
||||
check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "\n";
|
||||
} else {
|
||||
print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
|
||||
if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
|
||||
print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n";
|
||||
$warning = 1;
|
||||
} else {
|
||||
print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
|
||||
$exit = 1;
|
||||
}
|
||||
}
|
||||
if (-e "$lang/phones/word_boundary.txt") {
|
||||
check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n";
|
||||
}
|
||||
|
||||
# Checking optional_silence.txt -------------------------------
|
||||
print "Checking optional_silence.txt ...\n";
|
||||
$idx = 1;
|
||||
|
@ -550,7 +564,7 @@ if (-s "$lang/phones/word_boundary.int") {
|
|||
}
|
||||
|
||||
# Check oov -------------------------------
|
||||
check_txt_int("$lang/oov", \%wsymtab); print "\n";
|
||||
check_txt_int("$lang/oov", \%wsymtab, 0); print "\n";
|
||||
|
||||
|
||||
# Check determinizability of G.fst
|
||||
|
@ -580,7 +594,6 @@ if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") {
|
|||
if ($exit == 1) { print "--> ERROR (see error messages above)\n"; exit 1;}
|
||||
else {
|
||||
if ($warning == 1) { print "--> WARNING (check output above for warnings)\n"; exit 0; }
|
||||
else { print "--> SUCCESS\n"; exit 0; }
|
||||
else { print "--> SUCCESS [validating lang directory $lang]\n"; exit 0; }
|
||||
}
|
||||
|
||||
|
||||
@ -43,8 +43,8 @@ int main(int argc, char *argv[]) {
|
|||
"of disambiguation symbols.\n"
|
||||
"Warning: you probably want to set the --transition-scale and --self-loop-scale\n"
|
||||
"options; the defaults (zero) are probably not appropriate.\n"
|
||||
"Usage: compile-train-graphs-fsts [options] tree-in model-in lexicon-fst-in "
|
||||
" graphs-rspecifier graphs-wspecifier\n"
|
||||
"Usage: compile-train-graphs-fsts [options] <tree-in> <model-in> <lexicon-fst-in> "
|
||||
" <graphs-rspecifier> <graphs-wspecifier>\n"
|
||||
"e.g.: \n"
|
||||
" compile-train-graphs-fsts --read-disambig-syms=disambig.list\\\n"
|
||||
" tree 1.mdl lex.fst ark:train.fsts ark:graphs.fsts\n";
|
||||
@ -37,7 +37,7 @@ int main(int argc, char *argv[]) {
|
|||
const char *usage =
|
||||
"Creates training graphs (without transition-probabilities, by default)\n"
|
||||
"\n"
|
||||
"Usage: compile-train-graphs [options] tree-in model-in lexicon-fst-in transcriptions-rspecifier graphs-wspecifier\n"
|
||||
"Usage: compile-train-graphs [options] <tree-in> <model-in> <lexicon-fst-in> <transcriptions-rspecifier> <graphs-wspecifier>\n"
|
||||
"e.g.: \n"
|
||||
" compile-train-graphs tree 1.mdl lex.fst ark:train.tra ark:graphs.fsts\n";
|
||||
ParseOptions po(usage);
|
||||
@ -1,6 +1,7 @@
|
|||
// bin/get-post-on-ali.cc
|
||||
|
||||
// Copyright 2013 Brno University of Technology (Author: Karel Vesely)
|
||||
// 2014 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -31,18 +32,24 @@ int main(int argc, char *argv[]) {
|
|||
typedef kaldi::int32 int32;
|
||||
try {
|
||||
const char *usage =
|
||||
"This program extracts a vector of per-frame posteriors that are selected\n"
|
||||
"by an alignment (ie. posteriors that are under the alignment path).\n"
|
||||
"This can be used as a per-frame confidence measure.\n"
|
||||
"Given input posteriors, e.g. derived from lattice-to-post, and an alignment\n"
|
||||
"typically derived from the best path of a lattice, outputs the probability in\n"
|
||||
"the posterior of the corresponding index in the alignment, or zero if it was\n"
|
||||
"not there. These are output as a vector of weights, one per utterance.\n"
|
||||
"While, by default, lattice-to-post (as a source of posteriors) and sources of\n"
|
||||
"alignments such as lattice-best-path will output transition-ids as the index,\n"
|
||||
"it will generally make sense to either convert these to pdf-ids using\n"
|
||||
"post-to-pdf-post and ali-to-pdf respectively, or to phones using post-to-phone-post\n"
|
||||
"and (ali-to-phones --per-frame=true). Since this program only sees the integer\n"
|
||||
"indexes, it does not care what they represent-- but of course they should match\n"
|
||||
"(e.g. don't input posteriors with transition-ids and alignments with pdf-ids).\n"
|
||||
"See http://kaldi.sourceforge.net/hmm.html#transition_model_identifiers for an\n"
|
||||
"explanation of these types of indexes.\n"
|
||||
"\n"
|
||||
"By intuition, it is better to use pdf-posteriors and pdf-alignments,\n"
|
||||
"because the posteriors of competing hypothesis that are in the same frame\n"
|
||||
"at same 'pdf-state' are summed up, which is in some sense similar\n"
|
||||
"to what is done by C-max which sums the posteriors of overlapping words.\n"
|
||||
"The difference here is that the granularity is per-frame.\n"
|
||||
"See also: weight-post, post-to-weights, reverse-weights\n"
|
||||
"\n"
|
||||
"Usage: get-post-on-ali [options] <posteriors-rspecifier> <ali-rspecifier> <conf-wspecifier>\n"
|
||||
"e.g.: get-post-on-ali ark:post.ark ark:ali.ark ark:conf.ark\n";
|
||||
"Usage: get-post-on-ali [options] <posteriors-rspecifier> <ali-rspecifier> <weights-wspecifier>\n"
|
||||
"e.g.: get-post-on-ali ark:post.ark ark,s,cs:ali.ark ark:weights.ark\n";
|
||||
|
||||
ParseOptions po(usage);
|
||||
|
||||
@ -32,6 +32,7 @@ int main(int argc, char *argv[]) {
|
|||
const char *usage =
|
||||
"This program turns per-frame posteriors, which have transition-ids as\n"
|
||||
"the integers, into pdf-level posteriors\n"
|
||||
"See also: post-to-phone-post, post-to-weights, get-post-on-ali\n"
|
||||
"\n"
|
||||
"Usage: post-to-pdf-post [options] <model-file> <posteriors-rspecifier> <posteriors-wspecifier>\n"
|
||||
"e.g.: post-to-pdf-post 1.mdl ark:- ark:-\n";
|
||||
@ -30,6 +30,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
const char *usage =
|
||||
"Convert posteriors to phone-level posteriors\n"
|
||||
"See also: post-to-pdf-post, post-to-weights, get-post-on-ali\n"
|
||||
"\n"
|
||||
"Usage: post-to-phone-post [options] <model> <post-rspecifier> <phone-post-wspecifier>\n"
|
||||
" e.g.: post-to-phone-post --binary=false 1.mdl \"ark:ali-to-post 1.ali|\" ark,t:-\n";
|
||||
@ -31,7 +31,9 @@ int main(int argc, char *argv[]) {
|
|||
const char *usage =
|
||||
"Turn posteriors into per-frame weights (typically most useful after\n"
|
||||
"weight-silence-post, to get silence weights)\n"
|
||||
"Usage: post-to-weights post-rspecifier weights-wspecifier\n";
|
||||
"See also: weight-silence-post, post-to-pdf-post, post-to-phone-post\n"
|
||||
"get-post-on-ali\n"
|
||||
"Usage: post-to-weights <post-rspecifier> <weights-wspecifier>\n";
|
||||
|
||||
ParseOptions po(usage);
|
||||
po.Read(argc, argv);
|
||||
@ -1128,7 +1128,6 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
|
|||
*tot_objf += weight * log(this_prob);
|
||||
*tot_weight += weight;
|
||||
(*this)(m, label) += weight / this_prob;
|
||||
|
||||
}
|
||||
}
|
||||
}
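As a quick sanity check on the accumulation above (my note, not part of the commit): for one supervised entry with weight $w$ and predicted probability $y_{m,\ell}$ (this_prob), the objective contribution is $w \log y_{m,\ell}$, so

$$\frac{\partial}{\partial y_{m,\ell}} \bigl( w \log y_{m,\ell} \bigr) = \frac{w}{y_{m,\ell}},$$

which is exactly the weight / this_prob term added to the derivative matrix.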
|
||||
@ -580,7 +580,7 @@ class CuMatrix: public CuMatrixBase<Real> {
|
|||
void CompObjfAndDeriv(const std::vector<MatrixElement<Real> > &elements,
|
||||
const CuMatrix<Real> &A,
|
||||
Real *tot_objf,
|
||||
Real* tot_weight);
|
||||
Real *tot_weight);
|
||||
|
||||
private:
|
||||
void Destroy();
|
||||
@ -28,28 +28,39 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
const char *usage =
|
||||
"Reads an archive of features and writes a corresponding archive\n"
|
||||
"that maps utterance-id to utterance length in frames.\n"
|
||||
"Usage: feat-to-len [options] in-rspecifier out-wspecifier\n"
|
||||
"e.g.: feat-to-len scp:feats.scp ark,t:feats.lengths\n";
|
||||
"that maps utterance-id to utterance length in frames, or (with\n"
|
||||
"one argument) print to stdout the total number of frames in the\n"
|
||||
"input archive.\n"
|
||||
"Usage: feat-to-len [options] <in-rspecifier> [<out-wspecifier>]\n"
|
||||
"e.g.: feat-to-len scp:feats.scp ark,t:feats.lengths\n"
|
||||
"or: feat-to-len scp:feats.scp\n";
|
||||
|
||||
ParseOptions po(usage);
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 2) {
|
||||
if (po.NumArgs() != 1 && po.NumArgs() != 2) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::string rspecifier = po.GetArg(1);
|
||||
std::string wspecifier = po.GetArg(2);
|
||||
if (po.NumArgs() == 2) {
|
||||
std::string rspecifier = po.GetArg(1);
|
||||
std::string wspecifier = po.GetArg(2);
|
||||
|
||||
Int32Writer length_writer(wspecifier);
|
||||
Int32Writer length_writer(wspecifier);
|
||||
|
||||
SequentialBaseFloatMatrixReader kaldi_reader(rspecifier);
|
||||
for (; !kaldi_reader.Done(); kaldi_reader.Next())
|
||||
length_writer.Write(kaldi_reader.Key(), kaldi_reader.Value().NumRows());
|
||||
|
||||
SequentialBaseFloatMatrixReader matrix_reader(rspecifier);
|
||||
for (; !matrix_reader.Done(); matrix_reader.Next())
|
||||
length_writer.Write(matrix_reader.Key(), matrix_reader.Value().NumRows());
|
||||
} else {
|
||||
int64 tot = 0;
|
||||
std::string rspecifier = po.GetArg(1);
|
||||
SequentialBaseFloatMatrixReader matrix_reader(rspecifier);
|
||||
for (; !matrix_reader.Done(); matrix_reader.Next())
|
||||
tot += matrix_reader.Value().NumRows();
|
||||
std::cout << tot << std::endl;
|
||||
}
|
||||
return 0;
|
||||
} catch(const std::exception &e) {
|
||||
std::cerr << e.what();
|
||||
@ -234,8 +234,8 @@ int main(int argc, char *argv[]) {
|
|||
const char *usage =
|
||||
"Finds the path having the smallest edit-distance between two lattices.\n"
|
||||
"For efficiency put the smallest lattices first (for example reference strings).\n"
|
||||
"Usage: lattice-oracle [options] test-lattice-rspecifier reference-rspecifier "
|
||||
"transcriptions-wspecifier [edit-distance-wspecifier]\n"
|
||||
"Usage: lattice-oracle [options] <test-lattice-rspecifier> <reference-rspecifier> "
|
||||
"<transcriptions-wspecifier> [<edit-distance-wspecifier>]\n"
|
||||
" e.g.: lattice-oracle ark:lat.1 'ark:sym2int.pl -f 2- data/lang/words.txt <data/test/text' ark,t:-\n";
|
||||
|
||||
ParseOptions po(usage);
|
||||
|
@ -260,20 +260,21 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 3) {
|
||||
if (po.NumArgs() != 3 && po.NumArgs() != 4) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::string lats_rspecifier = po.GetArg(1),
|
||||
reference_rspecifier = po.GetArg(2),
|
||||
transcriptions_wspecifier = po.GetArg(3);
|
||||
|
||||
transcriptions_wspecifier = po.GetArg(3),
|
||||
edit_distance_wspecifier = po.GetOptArg(4);
|
||||
|
||||
// will read input as lattices
|
||||
SequentialLatticeReader lattice_reader(lats_rspecifier);
|
||||
RandomAccessInt32VectorReader reference_reader(reference_rspecifier);
|
||||
|
||||
Int32VectorWriter transcriptions_writer(transcriptions_wspecifier);
|
||||
Int32Writer edit_distance_writer(edit_distance_wspecifier);
|
||||
|
||||
// Guoguo Chen added the implementation for option "write-lattices".
|
||||
CompactLatticeWriter lats_writer(lats_wspecifier);
|
||||
|
@ -360,8 +361,10 @@ int main(int argc, char *argv[]) {
|
|||
// count errors
|
||||
int32 correct, substitutions, insertions, deletions, num_words;
|
||||
CountErrors(best_path, &correct, &substitutions, &insertions, &deletions, &num_words);
|
||||
int32 toterrs = substitutions + insertions + deletions;
|
||||
KALDI_LOG << "%WER " << (100.*toterrs) / num_words << " [ " << toterrs
|
||||
int32 tot_errs = substitutions + insertions + deletions;
|
||||
if (edit_distance_wspecifier != "")
|
||||
edit_distance_writer.Write(key, tot_errs);
|
||||
KALDI_LOG << "%WER " << (100.*tot_errs) / num_words << " [ " << tot_errs
|
||||
<< " / " << num_words << ", " << insertions << " insertions, " << deletions
|
||||
<< " deletions, " << substitutions << " sub ]";
|
||||
tot_correct += correct;
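For reference, the figure logged above is the standard word error rate. With hypothetical per-utterance counts (my example, not from the commit) of $S = 3$ substitutions, $I = 1$ insertion and $D = 2$ deletions over $N = 50$ reference words:

$$\%\mathrm{WER} = 100 \cdot \frac{S + I + D}{N} = 100 \cdot \frac{6}{50} = 12.$$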
|
||||
|
@ -397,7 +400,7 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// Guoguo Chen added the implementation for option "write-lattices".
|
||||
// Currently it's just a naive implementation: traversal the original
|
||||
// Currently it's just a naive implementation: traverse the original
|
||||
// lattice and get the path corresponding to the oracle word sequence.
|
||||
// Note that this new lattice has the alignment information.
|
||||
if (lats_wspecifier != "") {
|
||||
@ -1002,53 +1002,52 @@ void MatrixBase<Real>::MulRowsVec(const VectorBase<Real> &scale) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::MulRowsGroupMat(const MatrixBase<Real> &src) {
|
||||
KALDI_ASSERT(src.NumCols() > 0 && src.NumCols() <= this->NumCols());
|
||||
KALDI_ASSERT(this->NumCols() % src.NumCols() == 0 ||
|
||||
this->NumCols() % (src.NumCols() - 1) < this->NumCols() / (src.NumCols() - 1));
|
||||
int group_size = 0;
|
||||
if (this->NumCols() % src.NumCols() == 0) {
|
||||
group_size = this->NumCols() / src.NumCols();
|
||||
} else {
|
||||
group_size = this->NumCols() / src.NumCols() + 1;
|
||||
}
|
||||
MatrixIndexT M = num_rows_, N = num_cols_;
|
||||
KALDI_ASSERT(src.NumRows() == this->NumRows() &&
|
||||
this->NumCols() % src.NumCols() == 0);
|
||||
int32 group_size = this->NumCols() / src.NumCols(),
|
||||
num_groups = this->NumCols() / group_size,
|
||||
num_rows = this->NumRows();
|
||||
|
||||
for (MatrixIndexT i = 0; i < M; i++)
|
||||
for (MatrixIndexT j = 0; j < N; j++)
|
||||
(*this)(i, j) *= src(i, j / group_size);
|
||||
for (MatrixIndexT i = 0; i < num_rows; i++) {
|
||||
Real *data = this->RowData(i);
|
||||
for (MatrixIndexT j = 0; j < num_groups; j++, data += group_size) {
|
||||
Real scale = src(i, j);
|
||||
cblas_Xscal(group_size, scale, data, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &src1,
|
||||
const MatrixBase<Real> &src2,
|
||||
void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &input,
|
||||
const MatrixBase<Real> &output,
|
||||
Real power) {
|
||||
KALDI_ASSERT(src2.NumCols() > 0 && src2.NumCols() <= this->NumCols());
|
||||
KALDI_ASSERT(this->NumCols() % src2.NumCols() == 0 ||
|
||||
this->NumCols() % (src2.NumCols() - 1) < this->NumCols() / (src2.NumCols() - 1));
|
||||
int group_size = 0;
|
||||
if (this->NumCols() % src2.NumCols() == 0) {
|
||||
group_size = this->NumCols() / src2.NumCols();
|
||||
} else {
|
||||
group_size = this->NumCols() / src2.NumCols() + 1;
|
||||
}
|
||||
MatrixIndexT M = this->NumRows(), N = this->NumCols();
|
||||
KALDI_ASSERT(input.NumCols() == this->NumCols() && input.NumRows() == this->NumRows());
|
||||
KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 &&
|
||||
this->NumRows() == output.NumRows());
|
||||
|
||||
int group_size = this->NumCols() / output.NumCols(),
|
||||
num_rows = this->NumRows(), num_cols = this->NumCols();
|
||||
|
||||
if (power == 1.0) {
|
||||
for (MatrixIndexT i = 0; i < M; i++)
|
||||
for (MatrixIndexT j = 0; j < N; j++)
|
||||
(*this)(i, j) = (src1(i, j) == 0 ? 0 : (src1(i, j) > 0 ? 1 : -1));
|
||||
for (MatrixIndexT i = 0; i < num_rows; i++) {
|
||||
for (MatrixIndexT j = 0; j < num_cols; j++) {
|
||||
Real input_val = input(i, j);
|
||||
(*this)(i, j) = (input_val == 0 ? 0 : (input_val > 0 ? 1 : -1));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (MatrixIndexT i = 0; i < M; i++) {
|
||||
for (MatrixIndexT j = 0; j < N; j++) {
|
||||
if (src2(i, j / group_size) == 0) {
|
||||
for (MatrixIndexT i = 0; i < num_rows; i++) {
|
||||
for (MatrixIndexT j = 0; j < num_cols; j++) {
|
||||
Real output_val = output(i, j / group_size),
|
||||
input_val = input(i, j);
|
||||
if (output_val == 0)
|
||||
(*this)(i, j) = 0;
|
||||
} else {
|
||||
(*this)(i, j) = pow(std::abs(src1(i, j)), power - 1) *
|
||||
(src2(i, j / group_size) > 0 ? pow(src2(i, j / group_size), 1 - power) : 1) *
|
||||
(src1(i, j) >= 0 ? 1 : -1) ;
|
||||
}
|
||||
else
|
||||
(*this)(i, j) = pow(std::abs(input_val), power - 1) *
|
||||
pow(output_val, 1 - power) * (input_val >= 0 ? 1 : -1) ;
|
||||
}
|
||||
}
|
||||
}
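The rewritten branch above implements the derivative of the group p-norm; deriving it once as a check (not part of the diff), for a group output $y_g = \bigl(\sum_{j \in g} |x_j|^p\bigr)^{1/p}$ with $p \neq 1$:

$$\frac{\partial y_g}{\partial x_j} = |x_j|^{\,p-1}\, y_g^{\,1-p}\, \operatorname{sign}(x_j),$$

which matches pow(std::abs(input_val), power - 1) * pow(output_val, 1 - power) times the sign factor in the code, while the $p = 1$ case reduces to $\operatorname{sign}(x_j)$ as handled in the first branch.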
|
||||
|
@ -2428,12 +2427,15 @@ void MatrixBase<Real>::SoftHinge(const MatrixBase<Real> &src) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) {
|
||||
int group_size = src.NumCols() / this->NumCols();
|
||||
KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size);
|
||||
for (MatrixIndexT i = 0; i < src.NumRows(); i++)
|
||||
for (MatrixIndexT j = 0; j < this->NumCols(); j++)
|
||||
KALDI_ASSERT(src.NumCols() % this->NumCols() == 0 &&
|
||||
src.NumRows() == this->NumRows());
|
||||
int group_size = src.NumCols() / this->NumCols(),
|
||||
num_rows = this->NumRows(), num_cols = this->NumCols();
|
||||
for (MatrixIndexT i = 0; i < num_rows; i++)
|
||||
for (MatrixIndexT j = 0; j < num_cols; j++)
|
||||
(*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power);
|
||||
}
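A small self-contained sketch of the same computation (my illustration under the diff's assumption that the row length is an exact multiple of the number of groups; GroupPnormRow is a made-up name, not a Kaldi function):

// Standalone sketch (not Kaldi code) of the group p-norm computed above,
// assuming row.size() is an exact multiple of num_groups.
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<double> GroupPnormRow(const std::vector<double> &row,
                                  int num_groups, double power) {
  int group_size = static_cast<int>(row.size()) / num_groups;
  std::vector<double> out(num_groups, 0.0);
  for (int g = 0; g < num_groups; g++) {
    double sum = 0.0;
    for (int k = 0; k < group_size; k++)
      sum += std::pow(std::abs(row[g * group_size + k]), power);
    out[g] = std::pow(sum, 1.0 / power);  // y_g = (sum_j |x_j|^p)^(1/p)
  }
  return out;
}

int main() {
  std::vector<double> row = {3.0, 4.0, 1.0, 1.0};      // two groups of two
  std::vector<double> y = GroupPnormRow(row, 2, 2.0);
  std::printf("%g %g\n", y[0], y[1]);                  // prints 5 and ~1.41421
  return 0;
}

Compiled as a plain C++11 program this prints the 2-norm of each group of two elements.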
|
||||
|
||||
@ -240,8 +240,9 @@ class MatrixBase {
|
|||
/// each row by a scalar taken from that dimension of the vector.
|
||||
void MulRowsVec(const VectorBase<Real> &scale);
|
||||
|
||||
/// divide each row into src.NumCols() groups,
|
||||
/// and then scale i'th row's jth group of elements by src[i, j].
|
||||
/// Divide each row into src.NumCols() equal groups, and then scale i'th row's
|
||||
/// j'th group of elements by src(i, j). Requires src.NumRows() ==
|
||||
/// this->NumRows() and this->NumCols() % src.NumCols() == 0.
|
||||
void MulRowsGroupMat(const MatrixBase<Real> &src);
|
||||
|
||||
/// Returns logdet of matrix.
|
||||
|
@ -418,8 +419,8 @@ class MatrixBase {
|
|||
/// Set each element to y = log(1 + exp(x))
|
||||
void SoftHinge(const MatrixBase<Real> &src);
|
||||
|
||||
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / p)
|
||||
/// where G = x.NumCols() / y.NumCols() must be an integer.
|
||||
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 / p).
|
||||
/// Requires src.NumRows() == this->NumRows() and src.NumCols() % this->NumCols() == 0.
|
||||
void GroupPnorm(const MatrixBase<Real> &src, Real power);
|
||||
|
||||
|
||||
@ -469,9 +469,9 @@ int32 LinearCgd(const LinearCgdOptions &opts,
|
|||
residual_factor = opts.recompute_residual_factor *
|
||||
opts.recompute_residual_factor;
|
||||
|
||||
// Note: although from a mathematical point of view the method should
|
||||
// converge after M iterations, in practice it does not always converge
|
||||
// to good precision after that many iterations so we let the maximum
|
||||
// Note: although from a mathematical point of view the method should converge
|
||||
// after M iterations, in practice (due to roundoff) it does not always
|
||||
// converge to good precision after that many iterations so we let the maximum
|
||||
// be 1.5 * M + 5 instead.
|
||||
int32 k = 0;
|
||||
for (; k < M + M / 2 + 5 && k != opts.max_iters; k++) {
|
||||
@ -86,8 +86,8 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
|
|||
} else {
|
||||
cblas_Xscal(dim, inv_v1, v, 1);
|
||||
}
|
||||
if (!KALDI_ISFINITE(inv_v1) || !KALDI_ISFINITE(x1)) {
|
||||
KALDI_ERR << "NaN or inf encountered in HouseBackward";
|
||||
if (KALDI_ISNAN(inv_v1)) {
|
||||
KALDI_ERR << "NaN encountered in HouseBackward";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -142,8 +142,8 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
|
|||
} else {
|
||||
cblas_Xscal(dim, inv_v1, v, 1);
|
||||
}
|
||||
if (!KALDI_ISFINITE(inv_v1) || !KALDI_ISFINITE(x1)) {
|
||||
KALDI_ERR << "NaN or inf encountered in HouseBackward";
|
||||
if (KALDI_ISNAN(inv_v1)) {
|
||||
KALDI_ERR << "NaN encountered in HouseBackward";
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -19,7 +19,8 @@ OBJFILES = nnet-component.o nnet-nnet.o train-nnet.o train-nnet-ensemble.o nnet-
|
|||
nnet-fix.o nnet-stats.o rescale-nnet.o nnet-limit-rank.o nnet-example.o \
|
||||
get-feature-transform.o widen-nnet.o nnet-precondition-online.o \
|
||||
nnet-example-functions.o nnet-compute-discriminative.o \
|
||||
nnet-compute-discriminative-parallel.o online-nnet2-decodable.o
|
||||
nnet-compute-discriminative-parallel.o online-nnet2-decodable.o \
|
||||
train-nnet-perturbed.o
|
||||
|
||||
LIBNAME = kaldi-nnet2
|
||||
|
||||
@ -1595,6 +1595,9 @@ class FixedAffineComponent: public Component {
|
|||
virtual Component* Copy() const;
|
||||
virtual void Read(std::istream &is, bool binary);
|
||||
virtual void Write(std::ostream &os, bool binary) const;
|
||||
|
||||
// Function to provide access to linear_params_.
|
||||
const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
|
||||
protected:
|
||||
friend class AffineComponent;
|
||||
CuMatrix<BaseFloat> linear_params_;
|
||||
@ -67,6 +67,40 @@ void NnetExample::Read(std::istream &is, bool binary) {
|
|||
}
|
||||
|
||||
|
||||
|
||||
void ExamplesRepository::AcceptExamples(
|
||||
std::vector<NnetExample> *examples) {
|
||||
KALDI_ASSERT(!examples->empty());
|
||||
empty_semaphore_.Wait();
|
||||
KALDI_ASSERT(examples_.empty());
|
||||
examples_.swap(*examples);
|
||||
full_semaphore_.Signal();
|
||||
}
|
||||
|
||||
void ExamplesRepository::ExamplesDone() {
|
||||
empty_semaphore_.Wait();
|
||||
KALDI_ASSERT(examples_.empty());
|
||||
done_ = true;
|
||||
full_semaphore_.Signal();
|
||||
}
|
||||
|
||||
bool ExamplesRepository::ProvideExamples(
|
||||
std::vector<NnetExample> *examples) {
|
||||
full_semaphore_.Wait();
|
||||
if (done_) {
|
||||
KALDI_ASSERT(examples_.empty());
|
||||
full_semaphore_.Signal(); // Increment the semaphore so
|
||||
// the call by the next thread will not block.
|
||||
return false; // no examples to return-- all finished.
|
||||
} else {
|
||||
KALDI_ASSERT(!examples_.empty() && examples->empty());
|
||||
examples->swap(examples_);
|
||||
empty_semaphore_.Signal();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void DiscriminativeNnetExample::Write(std::ostream &os,
|
||||
bool binary) const {
|
||||
// Note: weight, num_ali, den_lat, input_frames, left_context and spk_info are
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "nnet2/nnet-nnet.h"
|
||||
#include "util/table-types.h"
|
||||
#include "lat/kaldi-lattice.h"
|
||||
#include "thread/kaldi-semaphore.h"
|
||||
|
||||
namespace kaldi {
|
||||
namespace nnet2 {
|
||||
|
@ -64,6 +65,35 @@ typedef SequentialTableReader<KaldiObjectHolder<NnetExample > > SequentialNnetEx
|
|||
typedef RandomAccessTableReader<KaldiObjectHolder<NnetExample > > RandomAccessNnetExampleReader;
|
||||
|
||||
|
||||
/** This class stores neural net training examples to be used in
|
||||
multi-threaded training. */
|
||||
class ExamplesRepository {
|
||||
public:
|
||||
/// The following function is called by the code that reads in the examples,
|
||||
/// with a batch of examples. [It will empty the vector "examples").
|
||||
void AcceptExamples(std::vector<NnetExample> *examples);
|
||||
|
||||
/// The following function is called by the code that reads in the examples,
|
||||
/// when we're done reading examples.
|
||||
void ExamplesDone();
|
||||
|
||||
/// This function is called by the code that does the training. It gets the
|
||||
/// training examples, and if they are available, puts them in "examples" and
|
||||
/// returns true. It returns false when there are no examples left and
|
||||
/// ExamplesDone() has been called.
|
||||
bool ProvideExamples(std::vector<NnetExample> *examples);
|
||||
|
||||
ExamplesRepository(): empty_semaphore_(1), done_(false) { }
|
||||
private:
|
||||
Semaphore full_semaphore_;
|
||||
Semaphore empty_semaphore_;
|
||||
|
||||
std::vector<NnetExample> examples_;
|
||||
bool done_;
|
||||
KALDI_DISALLOW_COPY_AND_ASSIGN(ExamplesRepository);
|
||||
};
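For readers unfamiliar with the semaphore pattern used by this class, the following is a minimal standalone analogue of the single-slot handoff (an illustration built on std::mutex / std::condition_variable rather than the kaldi::Semaphore class; BatchQueue and the int payload are my placeholders):

// Standalone analogue (not Kaldi code) of the reader/trainer handoff:
// a reading thread deposits batches, a training thread consumes them,
// and an empty "done" handoff ends training.
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

class BatchQueue {
 public:
  void Accept(std::vector<int> *batch) {            // called by the reading thread
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return slot_.empty() && !done_; });
    slot_.swap(*batch);                             // hand the batch over
    cv_.notify_all();
  }
  void Done() {                                     // called when reading finishes
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return slot_.empty(); });
    done_ = true;
    cv_.notify_all();
  }
  bool Provide(std::vector<int> *batch) {           // called by the training thread
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !slot_.empty() || done_; });
    if (slot_.empty()) return false;                // done and nothing left
    batch->swap(slot_);
    cv_.notify_all();
    return true;
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::vector<int> slot_;
  bool done_ = false;
};

int main() {
  BatchQueue q;
  std::thread reader([&q] {
    for (int b = 0; b < 3; b++) {
      std::vector<int> batch = {b, b, b};
      q.Accept(&batch);                             // blocks until the slot is free
    }
    q.Done();
  });
  std::vector<int> batch;
  while (q.Provide(&batch)) {                       // blocks until a batch (or done)
    std::printf("got batch of size %zu\n", batch.size());
    batch.clear();
  }
  reader.join();
  return 0;
}

The Kaldi class does the same thing with two counting semaphores: AcceptExamples blocks until the slot is empty, ProvideExamples blocks until it is full or reading is done, and the done flag is re-signalled so every waiting trainer thread can exit.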
|
||||
|
||||
|
||||
/**
|
||||
This struct is used to store the information we need for discriminative training
|
||||
(MMI or MPE). Each example corresponds to one chunk of a file (for better randomization
|
||||
|
@ -116,7 +146,7 @@ struct DiscriminativeNnetExample {
|
|||
void Read(std::istream &is, bool binary);
|
||||
};
|
||||
|
||||
// Tes, the length of typenames is getting out of hand.
|
||||
// Yes, the length of typenames is getting out of hand.
|
||||
typedef TableWriter<KaldiObjectHolder<DiscriminativeNnetExample > >
|
||||
DiscriminativeNnetExampleWriter;
|
||||
typedef SequentialTableReader<KaldiObjectHolder<DiscriminativeNnetExample > >
|
||||
@ -432,6 +432,12 @@ void Nnet::RemovePreconditioning() {
|
|||
*(dynamic_cast<AffineComponent*>(components_[i])));
|
||||
delete components_[i];
|
||||
components_[i] = ac;
|
||||
} else if (dynamic_cast<AffineComponentPreconditionedOnline*>(
|
||||
components_[i]) != NULL) {
|
||||
AffineComponent *ac = new AffineComponent(
|
||||
*(dynamic_cast<AffineComponent*>(components_[i])));
|
||||
delete components_[i];
|
||||
components_[i] = ac;
|
||||
}
|
||||
}
|
||||
SetIndexes();
|
||||
@ -26,68 +26,6 @@
|
|||
namespace kaldi {
|
||||
namespace nnet2 {
|
||||
|
||||
/** This struct stores neural net training examples to be used in
|
||||
multi-threaded training. */
|
||||
class ExamplesRepository {
|
||||
public:
|
||||
/// The following function is called by the code that reads in the examples,
|
||||
/// with a batch of examples. [It will empty the vector "examples").
|
||||
void AcceptExamples(std::vector<NnetExample> *examples);
|
||||
|
||||
/// The following function is called by the code that reads in the examples,
|
||||
/// when we're done reading examples.
|
||||
void ExamplesDone();
|
||||
|
||||
/// This function is called by the code that does the training. It gets the
|
||||
/// training examples, and if they are available, puts them in "examples" and
|
||||
/// returns true. It returns false when there are no examples left and
|
||||
/// ExamplesDone() has been called.
|
||||
bool ProvideExamples(std::vector<NnetExample> *examples);
|
||||
|
||||
ExamplesRepository(): empty_semaphore_(1), done_(false) { }
|
||||
private:
|
||||
Semaphore full_semaphore_;
|
||||
Semaphore empty_semaphore_;
|
||||
|
||||
std::vector<NnetExample> examples_;
|
||||
bool done_;
|
||||
KALDI_DISALLOW_COPY_AND_ASSIGN(ExamplesRepository);
|
||||
};
|
||||
|
||||
|
||||
void ExamplesRepository::AcceptExamples(
|
||||
std::vector<NnetExample> *examples) {
|
||||
KALDI_ASSERT(!examples->empty());
|
||||
empty_semaphore_.Wait();
|
||||
KALDI_ASSERT(examples_.empty());
|
||||
examples_.swap(*examples);
|
||||
full_semaphore_.Signal();
|
||||
}
|
||||
|
||||
void ExamplesRepository::ExamplesDone() {
|
||||
empty_semaphore_.Wait();
|
||||
KALDI_ASSERT(examples_.empty());
|
||||
done_ = true;
|
||||
full_semaphore_.Signal();
|
||||
}
|
||||
|
||||
bool ExamplesRepository::ProvideExamples(
|
||||
std::vector<NnetExample> *examples) {
|
||||
full_semaphore_.Wait();
|
||||
if (done_) {
|
||||
KALDI_ASSERT(examples_.empty());
|
||||
full_semaphore_.Signal(); // Increment the semaphore so
|
||||
// the call by the next thread will not block.
|
||||
return false; // no examples to return-- all finished.
|
||||
} else {
|
||||
KALDI_ASSERT(!examples_.empty() && examples->empty());
|
||||
examples->swap(examples_);
|
||||
empty_semaphore_.Signal();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
class DoBackpropParallelClass: public MultiThreadable {
|
||||
public:
|
||||
@ -39,8 +39,8 @@ double NnetUpdater::ComputeForMinibatch(
|
|||
CuMatrix<BaseFloat> tmp_deriv;
|
||||
double ans = ComputeObjfAndDeriv(data, &tmp_deriv, tot_accuracy);
|
||||
if (nnet_to_update_ != NULL)
|
||||
Backprop(data, &tmp_deriv); // this is summed (after weighting), not
|
||||
// averaged.
|
||||
Backprop(&tmp_deriv); // this is summed (after weighting), not
|
||||
// averaged.
|
||||
return ans;
|
||||
}
|
||||
|
||||
|
@ -133,9 +133,7 @@ double NnetUpdater::ComputeTotAccuracy(
|
|||
}
|
||||
|
||||
|
||||
void NnetUpdater::Backprop(const std::vector<NnetExample> &data,
|
||||
CuMatrix<BaseFloat> *deriv) const {
|
||||
int32 num_chunks = data.size();
|
||||
void NnetUpdater::Backprop(CuMatrix<BaseFloat> *deriv) const {
|
||||
// We assume ComputeObjfAndDeriv has already been called.
|
||||
for (int32 c = nnet_.NumComponents() - 1; c >= 0; c--) {
|
||||
const Component &component = nnet_.GetComponent(c);
|
||||
|
@ -146,7 +144,7 @@ void NnetUpdater::Backprop(const std::vector<NnetExample> &data,
|
|||
CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
|
||||
const CuMatrix<BaseFloat> &output_deriv(*deriv);
|
||||
|
||||
component.Backprop(input, output, output_deriv, num_chunks,
|
||||
component.Backprop(input, output, output_deriv, num_chunks_,
|
||||
component_to_update, &input_deriv);
|
||||
input_deriv.Swap(deriv);
|
||||
}
|
||||
@ -29,22 +29,20 @@
|
|||
namespace kaldi {
|
||||
namespace nnet2 {
|
||||
|
||||
/* This header provides functionality for sample-by-sample stochastic
|
||||
/** @file
|
||||
This header provides functionality for sample-by-sample stochastic
|
||||
gradient descent and gradient computation with a neural net.
|
||||
See also nnet-compute.h which is the same thing but for
|
||||
See also \ref nnet-compute.h which is the same thing but for
|
||||
whole utterances.
|
||||
This is the inner part of the training code; see nnet-train.h
|
||||
which contains a wrapper for this, with functionality for
|
||||
automatically keeping the learning rates for each layer updated
|
||||
using a heuristic involving validation-set gradients.
|
||||
*/
|
||||
|
||||
class NnetEnsembleTrainer;
|
||||
|
||||
// This class NnetUpdater contains functions for updating the neural net or
|
||||
// computing its gradient, given a set of NnetExamples. We
|
||||
// define it in the header file becaused it's needed by the ensemble training.
|
||||
// But in normal cases its functionality should be used by calling DoBackprop(),
|
||||
// and by ComputeNnetObjf()
|
||||
class NnetEnsembleTrainer;
|
||||
class NnetUpdater {
|
||||
public:
|
||||
// Note: in the case of training with SGD, "nnet" and "nnet_to_update" will
|
||||
|
@ -84,8 +82,7 @@ class NnetUpdater {
|
|||
/// contain, at input, the derivative w.r.t. the output layer (as computed by
|
||||
/// ComputeObjfAndDeriv), but will be used as a temporary variable by this
|
||||
/// function.
|
||||
void Backprop(const std::vector<NnetExample> &data,
|
||||
CuMatrix<BaseFloat> *deriv) const;
|
||||
void Backprop(CuMatrix<BaseFloat> *deriv) const;
|
||||
|
||||
friend class NnetEnsembleTrainer;
|
||||
private:
|
||||
|
@ -100,10 +97,6 @@ class NnetUpdater {
|
|||
std::vector<CuMatrix<BaseFloat> > forward_data_; // The forward data
|
||||
// for the outputs of each of the components.
|
||||
|
||||
// These weights are one per parameter; they equal to the "weight"
|
||||
// member variables in the NnetExample structures. These
|
||||
// will typically be about one on average.
|
||||
CuVector<BaseFloat> chunk_weights_;
|
||||
};
|
||||
|
||||
/// This function computes the objective function and either updates the model
|
||||
@ -90,12 +90,13 @@ void NnetEnsembleTrainer::TrainOneMinibatch() {
|
|||
post_mat[i].ApplyLog();
|
||||
std::vector<BaseFloat> log_post_correct;
|
||||
post_mat[i].Lookup(sv_labels_ind, &log_post_correct);
|
||||
BaseFloat log_prob_this_net = std::accumulate(log_post_correct.begin(), log_post_correct.end(), static_cast<BaseFloat>(0));
|
||||
|
||||
BaseFloat log_prob_this_net = std::accumulate(log_post_correct.begin(),
|
||||
log_post_correct.end(),
|
||||
static_cast<BaseFloat>(0));
|
||||
avg_logprob_this_phase_ += log_prob_this_net;
|
||||
tmp_deriv.InvertElements();
|
||||
tmp_deriv.MulElements(post_avg);
|
||||
updater_ensemble_[i]->Backprop(buffer_, &tmp_deriv);
|
||||
updater_ensemble_[i]->Backprop(&tmp_deriv);
|
||||
}
|
||||
count_this_phase_ += buffer_.size();
|
||||
buffer_.clear();
|
||||
@ -0,0 +1,710 @@
|
|||
// nnet2/train-nnet-perturbed.cc
|
||||
|
||||
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "nnet2/train-nnet-perturbed.h"
|
||||
#include "nnet2/nnet-update.h"
|
||||
#include "thread/kaldi-thread.h"
|
||||
|
||||
namespace kaldi {
|
||||
namespace nnet2 {
|
||||
|
||||
|
||||
class NnetPerturbedUpdater {
|
||||
public:
|
||||
// Note: in the case of training with SGD, "nnet" and "nnet_to_update" will be
|
||||
// identical. They'd be different if we're accumulating the gradient for a
|
||||
// held-out set and don't want to update the model, but this shouldn't happen
|
||||
// for this "perturbed" update. nnet_to_update may be NULL if you don't
|
||||
// want to do backprop, but this probably doesn't make sense.
|
||||
// num_layers_before_input is the number of layers to ignore before what
|
||||
// we consider to be the input (x) for purposes of this technique. This will
|
||||
// likely equal 2: one for the feature-splicing layer (SpliceComponent) and
|
||||
// one for the preconditioning layer (FixedAffineComponent).
|
||||
//
|
||||
//
|
||||
// within_class_covar is the within-class covariance matrix
|
||||
NnetPerturbedUpdater(const Nnet &nnet,
|
||||
int32 num_layers_before_input,
|
||||
const CuMatrix<BaseFloat> &within_class_covar,
|
||||
Nnet *nnet_to_update);
|
||||
|
||||
// This function does the entire forward and backward computation for this
|
||||
// minbatch. Outputs to tot_objf_orig and tot_objf_perturbed the total
|
||||
// objective function (including any weighting factors) over this minibatch,
|
||||
// and the same after perturbing the data.
|
||||
void ComputeForMinibatch(const std::vector<NnetExample> &data,
|
||||
BaseFloat D,
|
||||
double *tot_objf_orig,
|
||||
double *tot_objf_perturbed);
|
||||
|
||||
protected:
|
||||
|
||||
/// takes the input and formats as a single matrix, in forward_data_[0].
|
||||
void FormatInput(const std::vector<NnetExample> &data);
|
||||
|
||||
/// Do the forward propagation for layers 0 ... num_layers_before_input_ - 1,
|
||||
/// typically the first two layers. This will be called once per minibatch.
|
||||
void PropagateInitial() { Propagate(0, num_layers_before_input_); }
|
||||
|
||||
|
||||
/// Do the forward propagation for layers num_layers_before_input_
|
||||
/// ... num-layers-1, typically all but the first two layers. This will be
|
||||
/// called twice per minibatch, once before and once after perturbing the
|
||||
/// inputs.
|
||||
void PropagateRemaining() { Propagate(num_layers_before_input_,
|
||||
nnet_.NumComponents()); }
|
||||
|
||||
/// Internal Propagate function, does the forward computation for
|
||||
/// layers begin_layer ... end_layer - 1.
|
||||
void Propagate(int32 begin_layer, int32 end_layer);
|
||||
|
||||
/// Computes objective function and derivative at output layer, but does not
|
||||
/// do the backprop [for that, see Backprop()]. This will be called twice per
|
||||
/// minibatch, once before and once after perturbing the inputs.
|
||||
void ComputeObjfAndDeriv(const std::vector<MatrixElement<BaseFloat> > &sv_labels,
|
||||
CuMatrix<BaseFloat> *deriv,
|
||||
BaseFloat *tot_objf,
|
||||
BaseFloat *tot_weight) const;
|
||||
|
||||
/// Computes supervision labels from data.
|
||||
void ComputeSupervisionLabels(const std::vector<NnetExample> &data,
|
||||
std::vector<MatrixElement<BaseFloat> > *sv_labels);
|
||||
|
||||
/// Backprop must be called after ComputeObjfAndDeriv (it will be called
|
||||
/// twice, the first time with a NULL nnet_to_update pointer). It does the
|
||||
/// backpropagation (not including the first num_layers_before_input_ layers).
|
||||
/// "nnet_to_update" is updated, if non-NULL. Note: "deriv" will contain, at
|
||||
/// input, the derivative w.r.t. the output layer (as computed by
|
||||
/// ComputeObjfAndDeriv), but will be used as a temporary variable by this
|
||||
/// function, and at exit will contain the derivative of the objective function
|
||||
/// w.r.t. the input of layer num_layers_before_input_.
|
||||
void Backprop(Nnet *nnet_to_update,
|
||||
CuMatrix<BaseFloat> *deriv) const;
|
||||
|
||||
/// Perturb the input features (actually, the features at the input of layer
|
||||
/// num_layers_before_input_). This modifies the value of
|
||||
/// forward_data_[num_layers_before_input_]. For the math, see \ref
|
||||
/// train-nnet-perturbed.h
|
||||
void PerturbInput(const CuMatrix<BaseFloat> &deriv_at_input,
|
||||
BaseFloat D);
|
||||
|
||||
private:
|
||||
|
||||
const Nnet &nnet_;
|
||||
|
||||
Nnet *nnet_to_update_;
|
||||
int32 num_layers_before_input_; // Number of layers before whichever layer we
|
||||
// regard as the input for purposes of this
|
||||
// method (normally 2, to include splicing
|
||||
// layer and preconditioning layer)
|
||||
|
||||
const CuMatrix<BaseFloat> &within_class_covar_;
|
||||
|
||||
int32 num_chunks_; // same as the minibatch size.
|
||||
|
||||
std::vector<CuMatrix<BaseFloat> > forward_data_; // The forward data
|
||||
// for the outputs of each of the components.
|
||||
};
|
||||
|
||||
|
||||
NnetPerturbedUpdater::NnetPerturbedUpdater(const Nnet &nnet,
|
||||
int32 num_layers_before_input,
|
||||
const CuMatrix<BaseFloat> &within_class_covar,
|
||||
Nnet *nnet_to_update):
|
||||
nnet_(nnet),
|
||||
nnet_to_update_(nnet_to_update),
|
||||
num_layers_before_input_(num_layers_before_input),
|
||||
within_class_covar_(within_class_covar) {
|
||||
KALDI_ASSERT(num_layers_before_input_ >= 0 &&
|
||||
num_layers_before_input < nnet.NumComponents());
|
||||
for (int32 c = 0; c < num_layers_before_input_; c++) {
|
||||
const Component *comp = &(nnet.GetComponent(c));
|
||||
const UpdatableComponent *uc = dynamic_cast<const UpdatableComponent*>(comp);
|
||||
if (uc != NULL) {
|
||||
KALDI_ERR << "One of the pre-input layers is updatable.";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void NnetPerturbedUpdater::PerturbInput(
|
||||
const CuMatrix<BaseFloat> &deriv_at_input,
|
||||
BaseFloat D) {
|
||||
// The code doesn't handle the case where there is further splicing after the
|
||||
// input.
|
||||
KALDI_ASSERT(num_chunks_ == deriv_at_input.NumRows());
|
||||
// For the math, see train-nnet-perturbed.h.
|
||||
// deriv_at_input is \nabla in the math.
|
||||
|
||||
// "input" is the input features, currently unmodified, but we'll
|
||||
// modify them.
|
||||
CuMatrix<BaseFloat> &input(forward_data_[num_layers_before_input_]);
|
||||
KALDI_ASSERT(SameDim(input, deriv_at_input));
|
||||
// Each row of deriv_w will equal (W nabla_t)', where ' is transpose.
|
||||
CuMatrix<BaseFloat> deriv_w(input.NumRows(), input.NumCols());
|
||||
// note: for the second transpose-ness argument below we can choose either
|
||||
// kTrans or kNoTrans because the matrix is symmetric. I'm guessing that
|
||||
// kTrans will be faster.
|
||||
deriv_w.AddMatMat(1.0, deriv_at_input, kNoTrans,
|
||||
within_class_covar_, kTrans, 0.0);
|
||||
|
||||
// k will be used to compute and store the gradient-scaling factor k_t.
|
||||
CuVector<BaseFloat> k(deriv_at_input.NumRows());
|
||||
// after the next call, each element of k will contain (\nabla_t^T W \nabla_t)
|
||||
// We want k_t = D / sqrt(\nabla_t^T W \nabla_t)
|
||||
// so we need to take this to the power -0.5.
|
||||
// We can't do this if it's zero, so we first floor to a very small value.
|
||||
k.AddDiagMatMat(1.0, deriv_w, kNoTrans, deriv_at_input, kTrans, 0.0);
|
||||
int32 num_floored = k.ApplyFloor(1.0e-20);
|
||||
if (num_floored > 0.0) {
|
||||
// Should only happen at the very start of training,
|
||||
KALDI_WARN << num_floored << " gradients floored (derivative at input was "
|
||||
<< "close to zero).. should only happen at start of training "
|
||||
<< "or when adding a new layer.";
|
||||
}
|
||||
k.ApplyPow(-0.5);
|
||||
// now we have k_t = 1.0 / sqrt(\nabla_t^T W \nabla_t).
|
||||
// in the math, k_t contains an additional factor of D, but we'll
|
||||
// add this later.
|
||||
// Below, we will do x'_t = x_t - k_t W \nabla_t
|
||||
// Here, each row of deriv_w contains the transpose of W \nabla_t.
|
||||
// The factor of D is because it was missing in k.
|
||||
input.AddDiagVecMat(-1.0 * D, k, deriv_w, kNoTrans, 1.0);
|
||||
}
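In one line, the update this function applies (restating the comments above, with $W$ the within-class covariance, $\nabla_t$ the derivative at the input for example $t$, and $D$ the configured distance) is

$$x'_t = x_t - D\,\frac{W \nabla_t}{\sqrt{\nabla_t^{\top} W \nabla_t}},$$

i.e. a fixed-size step against the gradient, scaled by the within-class covariance, with the denominator floored (ApplyFloor above) to avoid division by zero.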
|
||||
|
||||
void NnetPerturbedUpdater::ComputeForMinibatch(
|
||||
const std::vector<NnetExample> &data,
|
||||
BaseFloat D,
|
||||
double *tot_objf_orig,
|
||||
double *tot_objf_perturbed) {
|
||||
|
||||
FormatInput(data);
|
||||
PropagateInitial();
|
||||
PropagateRemaining();
|
||||
CuMatrix<BaseFloat> tmp_deriv;
|
||||
|
||||
std::vector<MatrixElement<BaseFloat> > sv_labels;
|
||||
ComputeSupervisionLabels(data, &sv_labels);
|
||||
|
||||
BaseFloat tot_objf, tot_weight;
|
||||
ComputeObjfAndDeriv(sv_labels, &tmp_deriv, &tot_objf, &tot_weight);
|
||||
|
||||
KALDI_VLOG(4) << "Objective function (original) is " << (tot_objf/tot_weight)
|
||||
<< " per sample, over " << tot_weight << " samples (weighted).";
|
||||
*tot_objf_orig = tot_objf;
|
||||
|
||||
// only backprops till layer number num_layers_before_input_,
|
||||
// and derivative at that layer is in tmp_deriv.
|
||||
Backprop(NULL, &tmp_deriv);
|
||||
|
||||
// perturb forward_data_[num_layers_before_input_].
|
||||
PerturbInput(tmp_deriv, D);
|
||||
|
||||
// Now propagate forward again from that point.
|
||||
PropagateRemaining();
|
||||
|
||||
ComputeObjfAndDeriv(sv_labels, &tmp_deriv, &tot_objf, &tot_weight);
|
||||
KALDI_VLOG(4) << "Objective function (perturbed) is " << (tot_objf/tot_weight)
|
||||
<< " per sample, over " << tot_weight << " samples (weighted).";
|
||||
*tot_objf_perturbed = tot_objf;
|
||||
|
||||
// The actual model updating would happen in the next call.
|
||||
if (nnet_to_update_ != NULL)
|
||||
Backprop(nnet_to_update_, &tmp_deriv);
|
||||
}
|
||||
|
||||
void NnetPerturbedUpdater::Propagate(int32 begin_layer, int32 end_layer) {
|
||||
static int32 num_times_printed = 0;
|
||||
|
||||
for (int32 c = begin_layer; c < end_layer; c++) {
|
||||
const Component &component = nnet_.GetComponent(c);
|
||||
const CuMatrix<BaseFloat> &input = forward_data_[c];
|
||||
CuMatrix<BaseFloat> &output = forward_data_[c+1];
|
||||
// Note: the Propagate function will automatically resize the
|
||||
// output.
|
||||
component.Propagate(input, num_chunks_, &output);
|
||||
|
||||
KALDI_VLOG(4) << "Propagating: sum at output of " << c << " is " << output.Sum();
|
||||
|
||||
// If we won't need the output of the previous layer for
|
||||
// backprop, delete it to save memory.
|
||||
bool need_last_output =
|
||||
(c>0 && nnet_.GetComponent(c-1).BackpropNeedsOutput()) ||
|
||||
component.BackpropNeedsInput();
|
||||
if (g_kaldi_verbose_level >= 3 && num_times_printed < 100) {
|
||||
KALDI_VLOG(3) << "Stddev of data for component " << c
|
||||
<< " for this minibatch is "
|
||||
<< (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
|
||||
(forward_data_[c].NumRows() * forward_data_[c].NumCols()));
|
||||
num_times_printed++;
|
||||
}
|
||||
if (!need_last_output && c != num_layers_before_input_)
|
||||
forward_data_[c].Resize(0, 0); // We won't need this data.
|
||||
}
|
||||
}
|
||||
|
||||
void NnetPerturbedUpdater::ComputeSupervisionLabels(
|
||||
const std::vector<NnetExample> &data,
|
||||
std::vector<MatrixElement<BaseFloat> > *sv_labels) {
|
||||
sv_labels->clear();
|
||||
sv_labels->reserve(num_chunks_); // We must have at least this many labels.
|
||||
for (int32 m = 0; m < num_chunks_; m++) {
|
||||
for (size_t i = 0; i < data[m].labels.size(); i++) {
|
||||
MatrixElement<BaseFloat>
|
||||
tmp = {m, data[m].labels[i].first, data[m].labels[i].second};
|
||||
sv_labels->push_back(tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void NnetPerturbedUpdater::ComputeObjfAndDeriv(
|
||||
const std::vector<MatrixElement<BaseFloat> > &sv_labels,
|
||||
CuMatrix<BaseFloat> *deriv,
|
||||
BaseFloat *tot_objf,
|
||||
BaseFloat *tot_weight) const {
|
||||
int32 num_components = nnet_.NumComponents();
|
||||
deriv->Resize(num_chunks_, nnet_.OutputDim()); // sets to zero.
|
||||
const CuMatrix<BaseFloat> &output(forward_data_[num_components]);
|
||||
KALDI_ASSERT(SameDim(output, *deriv));
|
||||
|
||||
deriv->CompObjfAndDeriv(sv_labels, output, tot_objf, tot_weight);
|
||||
}
|
||||
|
||||
|
||||
void NnetPerturbedUpdater::Backprop(Nnet *nnet_to_update,
|
||||
CuMatrix<BaseFloat> *deriv) const {
|
||||
// We assume ComputeObjfAndDeriv has already been called.
|
||||
for (int32 c = nnet_.NumComponents() - 1; c >= num_layers_before_input_; c--) {
|
||||
const Component &component = nnet_.GetComponent(c);
|
||||
Component *component_to_update = (nnet_to_update == NULL ? NULL :
|
||||
&(nnet_to_update->GetComponent(c)));
|
||||
const CuMatrix<BaseFloat> &input = forward_data_[c],
|
||||
&output = forward_data_[c+1];
|
||||
CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
|
||||
const CuMatrix<BaseFloat> &output_deriv(*deriv);
|
||||
|
||||
component.Backprop(input, output, output_deriv, num_chunks_,
|
||||
component_to_update, &input_deriv);
|
||||
input_deriv.Swap(deriv);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void NnetPerturbedUpdater::FormatInput(const std::vector<NnetExample> &data) {
|
||||
KALDI_ASSERT(data.size() > 0);
|
||||
int32 num_splice = nnet_.LeftContext() + 1 + nnet_.RightContext();
|
||||
KALDI_ASSERT(data[0].input_frames.NumRows() >= num_splice);
|
||||
|
||||
int32 feat_dim = data[0].input_frames.NumCols(),
|
||||
spk_dim = data[0].spk_info.Dim(),
|
||||
tot_dim = feat_dim + spk_dim; // we append these at the neural net
|
||||
// input... note, spk_dim might be 0.
|
||||
KALDI_ASSERT(tot_dim == nnet_.InputDim());
|
||||
KALDI_ASSERT(data[0].left_context >= nnet_.LeftContext());
|
||||
int32 ignore_frames = data[0].left_context - nnet_.LeftContext(); // If
|
||||
// the NnetExample has more left-context than we need, ignore some.
|
||||
// this may happen in settings where we increase the amount of context during
|
||||
// training, e.g. by adding layers that require more context.
|
||||
num_chunks_ = data.size();
|
||||
|
||||
forward_data_.resize(nnet_.NumComponents() + 1);
|
||||
|
||||
// First copy to a single matrix on the CPU, so we can copy to
|
||||
// GPU with a single copy command.
|
||||
Matrix<BaseFloat> temp_forward_data(num_splice * num_chunks_,
|
||||
tot_dim);
|
||||
|
||||
for (int32 chunk = 0; chunk < num_chunks_; chunk++) {
|
||||
SubMatrix<BaseFloat> dest(temp_forward_data,
|
||||
chunk * num_splice, num_splice,
|
||||
0, feat_dim);
|
||||
|
||||
Matrix<BaseFloat> full_src(data[chunk].input_frames);
|
||||
SubMatrix<BaseFloat> src(full_src, ignore_frames, num_splice, 0, feat_dim);
|
||||
|
||||
dest.CopyFromMat(src);
|
||||
if (spk_dim != 0) {
|
||||
SubMatrix<BaseFloat> spk_dest(temp_forward_data,
|
||||
chunk * num_splice, num_splice,
|
||||
feat_dim, spk_dim);
|
||||
spk_dest.CopyRowsFromVec(data[chunk].spk_info);
|
||||
}
|
||||
}
|
||||
forward_data_[0].Swap(&temp_forward_data); // Copy to GPU, if being used.
|
||||
}
|
||||
|
||||
|
||||
|
||||
void DoBackpropPerturbed(const Nnet &nnet,
|
||||
int32 num_layers_before_input,
|
||||
const CuMatrix<BaseFloat> &within_class_covar,
|
||||
BaseFloat D,
|
||||
const std::vector<NnetExample> &examples,
|
||||
Nnet *nnet_to_update,
|
||||
double *tot_objf_orig,
|
||||
double *tot_objf_perturbed) {
|
||||
|
||||
try {
|
||||
NnetPerturbedUpdater updater(nnet, num_layers_before_input,
|
||||
within_class_covar, nnet_to_update);
|
||||
|
||||
updater.ComputeForMinibatch(examples, D, tot_objf_orig, tot_objf_perturbed);
|
||||
} catch (...) {
|
||||
KALDI_LOG << "Error doing backprop, nnet info is: " << nnet.Info();
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
NnetPerturbedTrainer::NnetPerturbedTrainer(
|
||||
const NnetPerturbedTrainerConfig &config,
|
||||
const SpMatrix<BaseFloat> &within_class_covar,
|
||||
Nnet *nnet):
|
||||
config_(config), nnet_(nnet), logprob_this_phase_(0.0),
|
||||
logprob_perturbed_this_phase_(0.0), weight_this_phase_(0.0),
|
||||
logprob_total_(0.0), logprob_perturbed_total_(0.0),
|
||||
weight_total_(0.0),
|
||||
D_(config.initial_d) {
|
||||
InitWithinClassCovar(within_class_covar);
|
||||
num_phases_ = 0;
|
||||
bool first_time = true;
|
||||
BeginNewPhase(first_time);
|
||||
}
|
||||
|
||||
|
||||
// This function is used in class NnetPerturbedTrainer
|
||||
// and the function DoBackpropPerturbedParallel.
|
||||
void InitWithinClassCovar(
|
||||
const SpMatrix<BaseFloat> &within_class_covar,
|
||||
const Nnet &nnet,
|
||||
int32 *num_layers_before_input,
|
||||
CuMatrix<BaseFloat> *within_class_covar_out) {
|
||||
|
||||
CuSpMatrix<BaseFloat> orig_covar(within_class_covar);
|
||||
*num_layers_before_input = 0;
|
||||
KALDI_ASSERT(nnet.NumComponents() > *num_layers_before_input);
|
||||
const Component *comp = &(nnet.GetComponent(*num_layers_before_input));
|
||||
// Skip over any SpliceComponent that appears at the beginning of
|
||||
// the network.
|
||||
if (dynamic_cast<const SpliceComponent*>(comp) != NULL)
|
||||
(*num_layers_before_input)++;
|
||||
|
||||
KALDI_ASSERT(nnet.NumComponents() > *num_layers_before_input);
|
||||
comp = &(nnet.GetComponent(*num_layers_before_input));
|
||||
|
||||
const FixedAffineComponent *fa =
|
||||
dynamic_cast<const FixedAffineComponent*>(comp);
|
||||
if (fa != NULL) {
|
||||
(*num_layers_before_input)++;
|
||||
const CuMatrix<BaseFloat> &linear_params = fa->LinearParams();
|
||||
if (linear_params.NumCols() != orig_covar.NumCols()) {
|
||||
KALDI_ERR << "The neural network seems to expect a (spliced) feature "
|
||||
<< "dimension of " << linear_params.NumCols() << ", but your "
|
||||
<< "LDA stats have a dimension of " << orig_covar.NumCols();
|
||||
}
|
||||
CuMatrix<BaseFloat> temp(linear_params.NumRows(), orig_covar.NumRows());
|
||||
// temp = linear_params . orig_covar
|
||||
temp.AddMatSp(1.0, linear_params, kNoTrans, orig_covar, 0.0);
|
||||
within_class_covar_out->Resize(linear_params.NumRows(),
|
||||
linear_params.NumRows());
|
||||
// temp = linear_params . orig_covar . linear_params^T
|
||||
within_class_covar_out->AddMatMat(1.0, temp, kNoTrans,
|
||||
linear_params, kTrans, 0.0);
|
||||
// note: this should be symmetric, spot-test it like this:
|
||||
KALDI_ASSERT(ApproxEqual(TraceMatMat(*within_class_covar_out,
|
||||
*within_class_covar_out, kNoTrans),
|
||||
TraceMatMat(*within_class_covar_out,
|
||||
*within_class_covar_out, kTrans)));
|
||||
} else {
|
||||
if (comp->InputDim() != orig_covar.NumCols()) {
|
||||
KALDI_ERR << "The neural network seems to expect a (spliced) feature "
|
||||
<< "dimension of " << comp->InputDim() << ", but your "
|
||||
<< "LDA stats have a dimension of " << orig_covar.NumCols();
|
||||
}
|
||||
within_class_covar_out->Resize(orig_covar.NumRows(), orig_covar.NumCols());
|
||||
within_class_covar_out->CopyFromSp(orig_covar);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void NnetPerturbedTrainer::InitWithinClassCovar(
|
||||
const SpMatrix<BaseFloat> &within_class_covar) {
|
||||
kaldi::nnet2::InitWithinClassCovar(within_class_covar, *nnet_,
|
||||
&num_layers_before_input_,
|
||||
&within_class_covar_);
|
||||
}
|
||||
|
||||
void NnetPerturbedTrainer::TrainOnExample(const NnetExample &value) {
|
||||
buffer_.push_back(value);
|
||||
if (static_cast<int32>(buffer_.size()) == config_.minibatch_size)
|
||||
TrainOneMinibatch();
|
||||
}
|
||||
|
||||
void NnetPerturbedTrainer::TrainOneMinibatch() {
|
||||
KALDI_ASSERT(!buffer_.empty());
|
||||
|
||||
double tot_objf_orig, tot_objf_perturbed;
|
||||
DoBackpropPerturbed(*nnet_, num_layers_before_input_, within_class_covar_, D_,
|
||||
buffer_, nnet_, &tot_objf_orig, &tot_objf_perturbed);
|
||||
|
||||
logprob_this_phase_ += tot_objf_orig;
|
||||
logprob_perturbed_this_phase_ += tot_objf_perturbed;
|
||||
double weight = TotalNnetTrainingWeight(buffer_);
|
||||
UpdateD(tot_objf_orig / weight, tot_objf_perturbed / weight);
|
||||
weight_this_phase_ += weight;
|
||||
buffer_.clear();
|
||||
minibatches_seen_this_phase_++;
|
||||
if (minibatches_seen_this_phase_ == config_.minibatches_per_phase) {
|
||||
bool first_time = false;
|
||||
BeginNewPhase(first_time);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void NnetPerturbedTrainer::UpdateD(BaseFloat orig_objf_per_example,
|
||||
BaseFloat perturbed_objf_per_example) {
|
||||
|
||||
BaseFloat diff = orig_objf_per_example - perturbed_objf_per_example;
|
||||
// note: diff should be positive in the normal case.
|
||||
KALDI_ASSERT(config_.target_objf_change > 0.0 && config_.max_d_factor > 1.0);
|
||||
BaseFloat objf_ratio = config_.target_objf_change /
|
||||
std::max<BaseFloat>(1.0e-20, diff),
|
||||
D_ratio = pow(objf_ratio, config_.tune_d_power);
|
||||
if (D_ratio > config_.max_d_factor)
|
||||
D_ratio = config_.max_d_factor;
|
||||
else if (D_ratio < 1.0 / config_.max_d_factor)
|
||||
D_ratio = 1.0 / config_.max_d_factor;
|
||||
BaseFloat D_new = D_ * D_ratio;
|
||||
|
||||
KALDI_VLOG(3) << "Training objective function normal/perturbed is "
|
||||
<< orig_objf_per_example << '/' << perturbed_objf_per_example
|
||||
<< ", diff " << diff << " vs. target "
|
||||
<< config_.target_objf_change
|
||||
<< ", changing D by factor " << D_ratio << " to " << D_new;
|
||||
D_ = D_new;
|
||||
}
|
||||
|
||||
void NnetPerturbedTrainer::BeginNewPhase(bool first_time) {
|
||||
if (!first_time) {
|
||||
BaseFloat logprob = logprob_this_phase_/weight_this_phase_,
|
||||
logprob_perturbed = logprob_perturbed_this_phase_/weight_this_phase_,
|
||||
diff = logprob - logprob_perturbed;
|
||||
KALDI_LOG << "Training objective function normal->perturbed is "
|
||||
<< logprob << " -> " << logprob_perturbed << ", diff "
|
||||
<< diff << " vs. target " << config_.target_objf_change
|
||||
<< ", over " << weight_this_phase_ << " frames, D is "
|
||||
<< D_;
|
||||
}
|
||||
logprob_total_ += logprob_this_phase_;
|
||||
logprob_perturbed_total_ += logprob_perturbed_this_phase_;
|
||||
weight_total_ += weight_this_phase_;
|
||||
logprob_this_phase_ = 0.0;
|
||||
logprob_perturbed_this_phase_ = 0.0;
|
||||
weight_this_phase_ = 0.0;
|
||||
minibatches_seen_this_phase_ = 0;
|
||||
num_phases_++;
|
||||
}
|
||||
|
||||
|
||||
NnetPerturbedTrainer::~NnetPerturbedTrainer() {
|
||||
if (!buffer_.empty()) {
|
||||
KALDI_LOG << "Doing partial minibatch of size "
|
||||
<< buffer_.size();
|
||||
TrainOneMinibatch();
|
||||
if (minibatches_seen_this_phase_ != 0) {
|
||||
bool first_time = false;
|
||||
BeginNewPhase(first_time);
|
||||
}
|
||||
}
|
||||
if (weight_total_ == 0.0) {
|
||||
KALDI_WARN << "No data seen.";
|
||||
} else {
|
||||
KALDI_LOG << "Did backprop on " << weight_total_
|
||||
<< " examples, average log-prob normal->perturbed per frame is "
|
||||
<< (logprob_total_ / weight_total_) << " -> "
|
||||
<< (logprob_perturbed_total_ / weight_total_);
|
||||
KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame="
|
||||
<< (logprob_total_ / weight_total_);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// compare with DoBackpropParallelClass
|
||||
class TrainParallelPerturbedClass: public MultiThreadable {
|
||||
public:
|
||||
// This constructor is only called for a temporary object
|
||||
// that we pass to the RunMultiThreaded function.
|
||||
TrainParallelPerturbedClass(const NnetPerturbedTrainerConfig &config,
|
||||
const CuMatrix<BaseFloat> &within_class_covar,
|
||||
int32 num_layers_before_input,
|
||||
BaseFloat *D,
|
||||
Nnet *nnet,
|
||||
ExamplesRepository *repository,
|
||||
double *log_prob_orig_ptr,
|
||||
double *log_prob_perturbed_ptr,
|
||||
double *tot_weight_ptr):
|
||||
config_(config), within_class_covar_(within_class_covar),
|
||||
num_layers_before_input_(num_layers_before_input), D_(D),
|
||||
nnet_(nnet), repository_(repository),
|
||||
log_prob_orig_ptr_(log_prob_orig_ptr),
|
||||
log_prob_perturbed_ptr_(log_prob_perturbed_ptr),
|
||||
tot_weight_ptr_(tot_weight_ptr),
|
||||
log_prob_orig_(0.0),
|
||||
log_prob_perturbed_(0.0),
|
||||
tot_weight_(0.0) { }
|
||||
|
||||
// Use the default copy constructor.
|
||||
|
||||
// This does the main function of the class.
|
||||
void operator () () {
|
||||
std::vector<NnetExample> examples;
|
||||
while (repository_->ProvideExamples(&examples)) {
|
||||
double objf_orig, objf_perturbed,
|
||||
weight = TotalNnetTrainingWeight(examples);
|
||||
DoBackpropPerturbed(*nnet_, num_layers_before_input_,
|
||||
within_class_covar_, *D_,
|
||||
examples, nnet_,
|
||||
&objf_orig, &objf_perturbed);
|
||||
UpdateD(objf_orig / weight, objf_perturbed / weight);
|
||||
|
||||
tot_weight_ += weight;
|
||||
log_prob_orig_ += objf_orig;
|
||||
log_prob_perturbed_ += objf_perturbed;
|
||||
KALDI_VLOG(4) << "Thread " << thread_id_ << " saw "
|
||||
<< tot_weight_ << " frames so far (weighted); likelihood "
|
||||
<< "per frame (orig->perturbed) so far is "
|
||||
<< (log_prob_orig_ / tot_weight_) << " -> "
|
||||
<< (log_prob_perturbed_ / tot_weight_);
|
||||
examples.clear();
|
||||
}
|
||||
}
|
||||
|
||||
~TrainParallelPerturbedClass() {
|
||||
*log_prob_orig_ptr_ += log_prob_orig_;
|
||||
*log_prob_perturbed_ptr_ += log_prob_perturbed_;
|
||||
*tot_weight_ptr_ += tot_weight_;
|
||||
}
|
||||
private:
|
||||
void UpdateD(BaseFloat orig_logprob, BaseFloat perturbed_logprob) {
|
||||
BaseFloat diff = orig_logprob - perturbed_logprob;
|
||||
// note: diff should be positive in the normal case.
|
||||
KALDI_ASSERT(config_.target_objf_change > 0.0 && config_.max_d_factor > 1.0);
|
||||
// divide the power we raise the ratio to when tuning D, by the
|
||||
// number of threads; this should ensure stability of the update.
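// (For illustration: with g_num_threads = 8 and tune_d_power = 0.5, each
// thread-level update applies the power 0.0625, so roughly eight such updates
// amount to one single-threaded update of D.)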
|
||||
BaseFloat tune_d_power = config_.tune_d_power / g_num_threads;
|
||||
BaseFloat objf_ratio = config_.target_objf_change /
|
||||
std::max<BaseFloat>(1.0e-20, diff),
|
||||
D_ratio = pow(objf_ratio, tune_d_power);
|
||||
if (D_ratio > config_.max_d_factor)
|
||||
D_ratio = config_.max_d_factor;
|
||||
else if (D_ratio < 1.0 / config_.max_d_factor)
|
||||
D_ratio = 1.0 / config_.max_d_factor;
|
||||
BaseFloat D_new = (*D_) * D_ratio;
|
||||
*D_ = D_new;
|
||||
|
||||
// Note: we are accessing *D_ from multiple threads without
|
||||
// locking, but the negative consequences of this contention are
|
||||
// very small (at worst, a thread may briefly read a slightly stale value of D).
|
||||
KALDI_VLOG(3) << "Training objective function normal->perturbed is "
|
||||
<< orig_logprob << " -> " << perturbed_logprob
|
||||
<< ", diff " << diff << " vs. target "
|
||||
<< config_.target_objf_change
|
||||
<< ", changing D by factor " << D_ratio << " to " << D_new;
|
||||
}
|
||||
|
||||
const NnetPerturbedTrainerConfig &config_;
|
||||
const CuMatrix<BaseFloat> &within_class_covar_;
|
||||
int32 num_layers_before_input_;
|
||||
BaseFloat *D_; // Constant D that controls how much to perturb the data. We
|
||||
// update this as well as use it.
|
||||
Nnet *nnet_;
|
||||
ExamplesRepository *repository_;
|
||||
|
||||
double *log_prob_orig_ptr_;
|
||||
double *log_prob_perturbed_ptr_;
|
||||
double *tot_weight_ptr_;
|
||||
double log_prob_orig_; // log-like times num frames (before perturbing features)
|
||||
double log_prob_perturbed_; // log-like times num frames (after perturbing features)
|
||||
double tot_weight_; // normalizing factor for the above.
|
||||
};
|
||||
|
||||
void DoBackpropPerturbedParallel(const NnetPerturbedTrainerConfig &config,
|
||||
const SpMatrix<BaseFloat> &within_class_covar,
|
||||
SequentialNnetExampleReader *example_reader,
|
||||
double *tot_objf_orig,
|
||||
double *tot_objf_perturbed,
|
||||
double *tot_weight,
|
||||
Nnet *nnet) {
|
||||
|
||||
// within_class_covar_processed is the within-class covar as CuMatrix, possibly
|
||||
// projected by the preconditioning transform in any FixedAffineComponent.
|
||||
CuMatrix<BaseFloat> within_class_covar_processed;
|
||||
int32 num_layers_before_input;
|
||||
InitWithinClassCovar(within_class_covar, *nnet,
|
||||
&num_layers_before_input,
|
||||
&within_class_covar_processed);
|
||||
BaseFloat D = config.initial_d;
|
||||
|
||||
ExamplesRepository repository; // handles parallel programming issues regarding the queue of examples.
|
||||
|
||||
*tot_objf_orig = *tot_objf_perturbed = *tot_weight = 0.0;
|
||||
|
||||
TrainParallelPerturbedClass trainer_proto(config,
|
||||
within_class_covar_processed,
|
||||
num_layers_before_input, &D,
|
||||
nnet, &repository,
|
||||
tot_objf_orig,
|
||||
tot_objf_perturbed,
|
||||
tot_weight);
|
||||
|
||||
{
|
||||
// The initialization of the following class spawns the threads that
|
||||
// process the examples. They get re-joined in its destructor.
|
||||
MultiThreader<TrainParallelPerturbedClass> m(g_num_threads, trainer_proto);
|
||||
|
||||
std::vector<NnetExample> examples;
|
||||
for (; !example_reader->Done(); example_reader->Next()) {
|
||||
examples.push_back(example_reader->Value());
|
||||
if (examples.size() == config.minibatch_size)
|
||||
repository.AcceptExamples(&examples);
|
||||
}
|
||||
if (!examples.empty()) // partial minibatch.
|
||||
repository.AcceptExamples(&examples);
|
||||
// Here, the destructor of "m" re-joins the threads, and
|
||||
// does the summing of the gradients if we're doing gradient
|
||||
// computation (i.e. &nnet != nnet_to_update). This gets
|
||||
// done in the destructors of the objects of type
|
||||
// TrainParallelPerturbedClass.
|
||||
repository.ExamplesDone();
|
||||
}
|
||||
KALDI_LOG << "Did backprop on " << *tot_weight << " examples, average log-prob "
|
||||
<< "per frame (orig->perturbed) is "
|
||||
<< (*tot_objf_orig / *tot_weight) << " -> "
|
||||
<< (*tot_objf_perturbed / *tot_weight) << " over "
|
||||
<< *tot_weight << " samples (weighted).";
|
||||
|
||||
KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame="
|
||||
<< (*tot_objf_orig / *tot_weight);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
} // namespace nnet2
|
||||
} // namespace kaldi
|
|
@ -0,0 +1,327 @@
|
|||
// nnet2/train-nnet-perturbed.h
|
||||
|
||||
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_NNET2_TRAIN_NNET_PERTURBED_H_
|
||||
#define KALDI_NNET2_TRAIN_NNET_PERTURBED_H_
|
||||
|
||||
#include "nnet2/nnet-nnet.h"
|
||||
#include "nnet2/nnet-example.h"
|
||||
#include "itf/options-itf.h"
|
||||
|
||||
namespace kaldi {
|
||||
namespace nnet2 {
|
||||
|
||||
/**
|
||||
@file
|
||||
|
||||
This file was modified from train-nnet.h in order to implement an idea
|
||||
about perturbing the training examples slightly, in a direction that's
|
||||
opposite to the gradient of the objective function w.r.t. those examples.
|
||||
It's a bit like the idea in "Intriguing properties of neural networks", the
|
||||
training method they mention, except they have a more complicated formulation
|
||||
with L-BFGS. We can justify our idea by approximating the neural network
|
||||
plus objective-function evaluation as a linear function.
|
||||
|
||||
Note: before doing this, we want to make sure the input features have a
|
||||
reasonable distribution, and our choice for this is to make the within-class
|
||||
covariance matrix unit. [note: we don't have to normalize the mean to zero,
|
||||
this won't matter.] Rather than explicitly transforming the features using
|
||||
a transform T, it turns out that we have to multiply the gradients by something
|
||||
like T T'. We'll describe this later.
|
||||
|
||||
Suppose the actual input features are x. Typically we do frame splicing
|
||||
as part of the network, and it's more convenient to do the perturbation on
|
||||
the spliced features, so x may actually be the output of the network's
|
||||
first (splicing) layer. Suppose the within-class covariance matrix of
|
||||
x is W. If we do the Cholesky transform
|
||||
W = C C^T,
|
||||
then C^{-1} W C^{-T} = I, so if we define
|
||||
T =(def) C^{-1}
|
||||
and transformed features
|
||||
\hat{x} =(def) T x
|
||||
then it's easy to show that the within-class covariance matrix of the
|
||||
transformed features \hat{x} would be I.
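(Explicitly: the covariance of \hat{x} is T W T^T = C^{-1} (C C^T) C^{-T} = I.)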
|
||||
|
||||
The way we formulate the perturbed-feature thing is somewhat similar to the
|
||||
"Intriguing properties of neural networks" paper, except we're not in image
|
||||
recognition so no need to keep features in the range [0, 1]. Given a training
|
||||
example \hat{x}_t, we want to find a perturbed example
|
||||
\hat{x}'_t = \hat{x}_t + d_t
|
||||
that gives the worst possible loss-value, such that ||d_t|| <= D, where D is
|
||||
a scalar length parameter (e.g. D = 0.1), and ||.|| is the 2-norm. This means
|
||||
that we want to perturb the training example in the most damaging way possible,
|
||||
given that it should not change by more than a certain amount. Because we've
|
||||
normalized the within-class covariance we believe that using a normal 2-norm
|
||||
on d_t, rather than a more general form of inner-product, is suitable.
|
||||
|
||||
Anyway, we make a simplifying assumption that the loss function for a particular
|
||||
sample is just a linear function of the input, and when we get to the space of
|
||||
\hat{x}, it just means we go a certain distance D down the gradient. How we
|
||||
set a suitable value for D, we'll come to later.
|
||||
|
||||
Suppose by backpropagating the
|
||||
derivative to x we get a derivative \nabla_t of the objective function (e.g. a
|
||||
log-probability) w.r.t. x_t. Then we can get the derivative \hat{\nabla}_t of
|
||||
the objective function w.r.t. \hat{x}_t, by identifying
|
||||
x_t^T \nabla_t = \hat{x}_t^T \hat{\nabla}_t
|
||||
x_t^T \nabla_t = x_t^T T^T \hat{\nabla}_t
|
||||
x_t^T \nabla_t = x_t^T T^T T^{-T} \nabla_t, since T^T T^{-T} = I.
|
||||
[note, ^T is transpose and ^{-T} is inverse-of-transpose.]
|
||||
so \hat{\nabla}_t = T^{-T} \nabla_t.
|
||||
(this is not the formal way of getting these derivatives, it's just how I remember).
|
||||
Anyway, we now have
|
||||
\hat{x}'_t =(def) \hat{x}_t - k_t T^{-T} \nabla_t
|
||||
where k_t is chosen to ensure that
|
||||
k_t || T^{-T} \nabla_t ||_2 = D
|
||||
k_t sqrt( \nabla_t^T T^{-1} T^{-T} \nabla_t ) = D
|
||||
so
|
||||
k_t = D / sqrt(\nabla_t^T T^{-1} T^{-T} \nabla_t)
|
||||
= D / sqrt(\nabla_t^T C C^T \nabla_t)
|
||||
= D / sqrt(\nabla_t^T W \nabla_t)
|
||||
Now, we actually want the update in terms of the parameter x instead of \hat{x},
|
||||
so multiplying the definition of \hat{x}'_t above by T^{-1} on the left, we have:
|
||||
x'_t = x_t - k_t T^{-1} T^{-T} \nabla_t
|
||||
= x_t - k_t W \nabla_t
|
||||
(note: we can also use W \nabla_t for efficiently computing k_t).
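As a concrete illustration of the rule above (not Kaldi code; the function name,
the use of plain std::vector, and the single-example interface are only for
exposition), the perturbation of one example could be written as:

  #include <algorithm>
  #include <cmath>
  #include <vector>

  // x' = x - k * W * grad, with k = D / sqrt(grad^T W grad).
  std::vector<float> PerturbExample(const std::vector<float> &x,
                                    const std::vector<float> &grad,
                                    const std::vector<std::vector<float> > &W,
                                    float D) {
    size_t dim = x.size();
    std::vector<float> w_grad(dim, 0.0f);              // w_grad = W * grad
    for (size_t i = 0; i < dim; i++)
      for (size_t j = 0; j < dim; j++)
        w_grad[i] += W[i][j] * grad[j];
    float quad = 0.0f;                                 // quad = grad^T W grad
    for (size_t i = 0; i < dim; i++)
      quad += grad[i] * w_grad[i];
    float k = D / std::sqrt(std::max(quad, 1e-20f));   // step size, as derived above
    std::vector<float> x_perturbed(dim);
    for (size_t i = 0; i < dim; i++)                   // step against the gradient,
      x_perturbed[i] = x[i] - k * w_grad[i];           // i.e. in the damaging direction
    return x_perturbed;
  }

The real code operates on whole minibatches of CuMatrix data; the loop above just
spells out the per-example arithmetic.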
|
||||
|
||||
It will actually be more efficient to do this after the FixedAffineTransform
|
||||
layer that we used to "precondition" the features, so after the second layer
|
||||
of the input rather than the first. All we need to do is to get the
|
||||
within-class covariance matrix W in that space (after the
|
||||
FixedAffineTransform) instead. We'll use the name x for that space, and forget
|
||||
about the original input space.
|
||||
|
||||
Next, we want to discuss how we'll set the constant D. D is a proportion of
|
||||
the within-class covariance. However, it's not clear a priori how to set
|
||||
this, or that we can tune it just once and then leave it fixed for other
|
||||
setups. For one thing, if the input features contain a lot of "nuisance"
|
||||
dimensions that are not very informative about the class, it may be necessary
|
||||
for D to be smaller (because hopefully the gradients will be small in those
|
||||
nuisance directions). There is another issue that this whole method is
|
||||
intended to improve generalization, so we only want to use it strongly if
|
||||
generalization is actually a problem. For example, if we have so much
|
||||
training data and so few parameters that we have no trouble generalizing, we
|
||||
might not want to apply this method too strongly. Our method will be to set D
|
||||
in order to get, on average, a certain degradation which we'll call
|
||||
"target-objf-change" in the objective function per frame. Each time we
|
||||
apply this perturbation to a minibatch, we'll see whether the degradation in
|
||||
objective is greater or less than "target-objf-change", and we'll change
|
||||
D accordingly. We'll use a simple heuristic that D should change proportionally
|
||||
to the 0.5'th power of the ratio between the "target-objf-change" and the
|
||||
observed objective function change for this minibatch, but never by more than
|
||||
a factor of two. Note: the only significance of 0.5 here is that 0.5 <= 1; a
|
||||
smaller number means slower changes in D, so it should change over about 2
|
||||
minibatches to the right number. If this proves unstable, we'll change it.
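(A worked example of the heuristic, purely for illustration: if target-objf-change
is 0.1 but a minibatch shows a degradation of 0.4, the ratio is 0.25 and
0.25^{0.5} = 0.5, so D is halved; if the observed degradation is only 0.025, the
ratio is 4 and 4^{0.5} = 2, which is already the largest allowed increase.)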
|
||||
|
||||
Next, it's not absolutely clear how we should set target-objf-change-- the
|
||||
value which determines how much objective-function degradation we want the
|
||||
perturbation to produce on average (per sample). To put this in perspective,
|
||||
for speech tasks with small amounts of data (say, <30 hours) and a couple thousand
|
||||
classes
|
||||
we typically see objective values like: training-set -0.6 and validation-set -1.1.
|
||||
These are average log-probabilities per frame of the correct class.
|
||||
The two numbers are quite different because there is substantial overtraining. Note: for Karel's
|
||||
nnet1 setup, the difference is typically smaller, more like -0.8 vs. -1.0, as
|
||||
that setup monitors the validation-set objective and decreases the learning rate
|
||||
when it starts to degrade. Now, for much larger training sets, we might
|
||||
see smaller differences in training-set versus validation-set objective function:
|
||||
for instance: say, -1.40 versus -1.45. (For larger training sets the objectives tend
|
||||
to be more negative simply because we have more leaves). We measure these values each
|
||||
iteration: see the files compute_prob_train.*.log and compute_prob_valid.*.log produced
|
||||
by the example scripts. The reason why I discuss these values
|
||||
is that if the training-set and validation-set objective functions are very close, then
|
||||
it means that there is not much overtraining going on and we don't want to apply this
|
||||
method too strongly; on the other hand, if they are very different, it means we are
|
||||
overtraining badly and we may want to apply this method more.
|
||||
|
||||
So we plan to set target-objf-change to the following value, at the script level:
|
||||
|
||||
target-objf-change = target-multiplier * (training-objf - validation-objf)
|
||||
|
||||
(e.g. target-multiplier = 1.0).
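For example, with the small-data numbers quoted above (training-set -0.6,
validation-set -1.1) and target-multiplier = 1.0, this gives
target-objf-change = 1.0 * (-0.6 - (-1.1)) = 0.5; with the large-data numbers
(-1.40 versus -1.45) it gives only 0.05.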
|
||||
Note that if target-objf-change is less than a specified min-target-objf-change
|
||||
(e.g. 0.1) then we won't apply the perturbed training at all, which will save
|
||||
time. The method is intended to help generalization, and if we're generalizing
|
||||
well then we don't need to apply it.
|
||||
The training and validation objective functions are computed over
|
||||
different (randomly chosen) sets, each with about 3000 samples, and it can
|
||||
sometimes happen that the validation objective function can be better than the
|
||||
training set objective function. Also, the validation set is sampled from a
|
||||
held-out subset of 300 utterances by default; this is done out of a concern
|
||||
that the correlations within an utterance can be very high, so if we use the
|
||||
same utterances for training and validation, then the validation set is not
|
||||
really held-out. But the smallish number (300) of validation utterances
|
||||
increases the randomness in the training and validation objectives.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
struct NnetPerturbedTrainerConfig {
|
||||
int32 minibatch_size;
|
||||
int32 minibatches_per_phase;
|
||||
// target_objf_change will be set from the command line to a value >0.0.
|
||||
BaseFloat target_objf_change;
|
||||
BaseFloat initial_d;
|
||||
// tune_d_power is not configurable from the command line.
|
||||
BaseFloat tune_d_power;
|
||||
// max_d_factor is not configurable from the command line.
|
||||
BaseFloat max_d_factor;
|
||||
|
||||
|
||||
NnetPerturbedTrainerConfig(): minibatch_size(500),
|
||||
minibatches_per_phase(50),
|
||||
target_objf_change(0.1),
|
||||
initial_d(0.05),
|
||||
tune_d_power(0.5),
|
||||
max_d_factor(2.0){ }
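// Note: tune_d_power = 0.5 and max_d_factor = 2.0 correspond to the "0.5'th
// power of the ratio, but never by more than a factor of two" heuristic
// described in the comment at the top of this file.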
|
||||
|
||||
void Register (OptionsItf *po) {
|
||||
po->Register("minibatch-size", &minibatch_size,
|
||||
"Number of samples per minibatch of training data.");
|
||||
po->Register("minibatches-per-phase", &minibatches_per_phase,
|
||||
"Number of minibatches to wait before printing training-set "
|
||||
"objective.");
|
||||
po->Register("target-objf-change", &target_objf_change, "Target objective "
|
||||
"function change from feature perturbation, used to set "
|
||||
"feature distance parameter D");
|
||||
po->Register("initial-d", &initial_d, "Initial value of parameter D "
|
||||
"It will ultimately be set according to --target-objf-change");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Class NnetPerturbedTrainer is as NnetSimpleTrainer but implements feature
|
||||
/// perturbation; see the comment at the top of this file (\ref
|
||||
/// train-nnet-perturbed.h) for more details.
|
||||
|
||||
class NnetPerturbedTrainer {
|
||||
public:
|
||||
NnetPerturbedTrainer(const NnetPerturbedTrainerConfig &config,
|
||||
const SpMatrix<BaseFloat> &within_class_covar,
|
||||
Nnet *nnet);
|
||||
|
||||
/// TrainOnExample will take the example and add it to a buffer;
|
||||
/// if we've reached the minibatch size it will do the training.
|
||||
void TrainOnExample(const NnetExample &value);
|
||||
|
||||
~NnetPerturbedTrainer();
|
||||
private:
|
||||
KALDI_DISALLOW_COPY_AND_ASSIGN(NnetPerturbedTrainer);
|
||||
|
||||
void TrainOneMinibatch();
|
||||
|
||||
// This function initializes within_class_covar_ and num_layers_before_input_.
|
||||
// The input within_class_covar is the within-class covariance on the original
|
||||
// raw features, computed from LDA stats, but if this neural network has
|
||||
// a data-preconditioning layer of type FixedAffineComponent then we will
|
||||
// project the transform with that and treat the output of that transform
|
||||
// as the input x (this is more efficient).
|
||||
void InitWithinClassCovar(const SpMatrix<BaseFloat> &within_class_covar);
|
||||
|
||||
void UpdateD(BaseFloat orig_objf_per_example,
|
||||
BaseFloat perturbed_objf_per_example);
|
||||
|
||||
// The following function is called by TrainOneMinibatch() when we enter a new
|
||||
// phase. A phase is just a certain number of epochs, and now matters only
|
||||
// for diagnostics (originally it meant something more).
|
||||
void BeginNewPhase(bool first_time);
|
||||
|
||||
// Things we were given in the initializer:
|
||||
NnetPerturbedTrainerConfig config_;
|
||||
|
||||
Nnet *nnet_; // the nnet we're training.
|
||||
|
||||
// static information:
|
||||
// num_layers_before_input_ is the number of initial layers before what we
|
||||
// consider to be the input for this method: normally 2, for the splicing
|
||||
// layer and the (FixedAffineComponent) data-preconditioning layer.
|
||||
int32 num_layers_before_input_;
|
||||
// The within_class_covar_ variable below is the within-class covariance; if
|
||||
// we have a (FixedAffineComponent) data-preconditioning layer, we'd project
|
||||
// the within-class-covariance with that and store it as within_class_covar_.
|
||||
CuMatrix<BaseFloat> within_class_covar_;
|
||||
|
||||
// State information:
|
||||
int32 num_phases_;
|
||||
int32 minibatches_seen_this_phase_;
|
||||
std::vector<NnetExample> buffer_;
|
||||
|
||||
double logprob_this_phase_; // Needed for accumulating train log-prob on each phase.
|
||||
double logprob_perturbed_this_phase_; // same for perturbed log-prob
|
||||
double weight_this_phase_; // count corresponding to the above.
|
||||
|
||||
double logprob_total_;
|
||||
double logprob_perturbed_total_;
|
||||
double weight_total_;
|
||||
|
||||
BaseFloat D_; // The distance factor D.
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/// This function computes the objective function and either updates the model
|
||||
/// or adds to parameter gradients. It returns the cross-entropy objective
|
||||
/// function summed over all samples (normalize this by dividing by
|
||||
/// TotalNnetTrainingWeight(examples)). It is mostly a wrapper for
|
||||
/// a class NnetPerturbedUpdater that's defined in train-nnet-perturbed.cc, but we
|
||||
/// don't want to expose that complexity at this level.
|
||||
/// All these examples will be treated as one minibatch.
|
||||
///
|
||||
/// D is the distance factor that determines how much to perturb examples;
|
||||
/// this is optimized in outer-level code (see class NnetPerturbedTrainer).
|
||||
/// num_layers_before_input determines how many layers to skip before we find
|
||||
/// the activation that we regard as the input x to the network, for purposes
|
||||
/// of this method (e.g. we might skip over the splicing layer and a layer
|
||||
/// that preconditions the input).
|
||||
/// within_class_covar (actually a symmetric matrix, but represented as CuMatrix),
|
||||
/// is the within-class covariance of the features, measured at that level,
|
||||
/// which ultimately will be derived from LDA stats on the data.
|
||||
|
||||
void DoBackpropPerturbed(const Nnet &nnet,
|
||||
int32 num_layers_before_input,
|
||||
const CuMatrix<BaseFloat> &within_class_covar,
|
||||
BaseFloat D,
|
||||
const std::vector<NnetExample> &examples,
|
||||
Nnet *nnet_to_update,
|
||||
double *tot_objf_orig,
|
||||
double *tot_objf_perturbed);
|
||||
|
||||
|
||||
|
||||
/// This function is similar to "DoBackpropParallel" as declared in
|
||||
/// nnet-update-parallel.h, but supports "perturbed" training. It's intended
|
||||
/// for multi-threaded CPU-based training. The number of threads will be
|
||||
/// set to g_num_threads.
|
||||
/// within_class_covar is the within-class covariance after any splicing
|
||||
/// but before preconditioning, as needed for the LDA computation.
|
||||
/// All pointer arguments must be non-NULL.
|
||||
void DoBackpropPerturbedParallel(const NnetPerturbedTrainerConfig &config,
|
||||
const SpMatrix<BaseFloat> &within_class_covar,
|
||||
SequentialNnetExampleReader *example_reader,
|
||||
double *tot_objf_orig,
|
||||
double *tot_objf_perturbed,
|
||||
double *tot_weight,
|
||||
Nnet *nnet);
|
||||
|
||||
|
||||
} // namespace nnet2
|
||||
} // namespace kaldi
|
||||
|
||||
#endif
|
|
@ -48,7 +48,7 @@ struct NnetSimpleTrainerConfig {
|
|||
// Class NnetSimpleTrainer doesn't do much apart from batching up the
|
||||
// input into minibatches and giving it to the neural net code
|
||||
// to call Update(), which will typically do stochastic gradient
|
||||
// descent. It also reports training-set
|
||||
// descent. It also reports training-set objective-function values.
|
||||
// It takes in the training examples through the call
|
||||
// "TrainOnExample()".
|
||||
class NnetSimpleTrainer {
|
||||
|
@ -66,8 +66,9 @@ class NnetSimpleTrainer {
|
|||
|
||||
void TrainOneMinibatch();
|
||||
|
||||
// The following function is called by TrainOneMinibatch()
|
||||
// when we enter a new phase.
|
||||
// The following function is called by TrainOneMinibatch() when we enter a new
|
||||
// phase. A phase is just a certain number of epochs, and now matters only
|
||||
// for diagnostics (originally it meant something more).
|
||||
void BeginNewPhase(bool first_time);
|
||||
|
||||
// Things we were given in the initializer:
|
||||
|
|
|
@ -25,7 +25,8 @@ BINFILES = nnet-randomize-frames nnet-am-info nnet-init \
|
|||
nnet-train-discriminative-simple nnet-train-discriminative-parallel \
|
||||
nnet-modify-learning-rates nnet-normalize-stddev nnet-perturb-egs \
|
||||
nnet-perturb-egs-fmllr nnet-get-weighted-egs nnet-adjust-priors \
|
||||
cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning
|
||||
cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning \
|
||||
nnet-train-simple-perturbed nnet-train-parallel-perturbed
|
||||
|
||||
OBJFILES =
|
||||
|
||||
|
|
|
@ -36,12 +36,16 @@ int main(int argc, char *argv[]) {
|
|||
bool binary = true;
|
||||
FeatureTransformEstimateOptions opts;
|
||||
std::string write_cholesky;
|
||||
std::string write_within_covar;
|
||||
ParseOptions po(usage);
|
||||
po.Register("binary", &binary, "Write accumulators in binary mode.");
|
||||
po.Register("binary", &binary, "Write outputs in binary mode.");
|
||||
po.Register("write-cholesky", &write_cholesky, "If supplied, write to this "
|
||||
"wxfilename the Cholesky factor of the within-class covariance."
|
||||
"wxfilename the Cholesky factor of the within-class covariance. "
|
||||
"Can be used for perturbing features. E.g. "
|
||||
"--write-cholesky=exp/nnet5/cholesky.tpmat");
|
||||
po.Register("write-within-covar", &write_within_covar, "If supplied, write "
|
||||
"to this wxfilename the within-class covariance (as a symmetric "
|
||||
"matrix). E.g. --write-within-covar=exp/nnet5/within_covar.mat");
|
||||
opts.Register(&po);
|
||||
po.Read(argc, argv);
|
||||
|
||||
|
@ -61,10 +65,18 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
Matrix<BaseFloat> mat;
|
||||
TpMatrix<BaseFloat> cholesky;
|
||||
fte.Estimate(opts, &mat, write_cholesky != "" ? &cholesky : NULL);
|
||||
fte.Estimate(opts, &mat,
|
||||
(write_cholesky != "" || write_within_covar != "" ?
|
||||
&cholesky : NULL));
|
||||
WriteKaldiObject(mat, projection_wxfilename, binary);
|
||||
if (write_cholesky != "")
|
||||
if (write_cholesky != "") {
|
||||
WriteKaldiObject(cholesky, write_cholesky, binary);
|
||||
}
|
||||
if (write_within_covar != "") {
|
||||
SpMatrix<BaseFloat> within_var(cholesky.NumRows());
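// Reconstruct the within-class covariance W = C C^T from its Cholesky factor C.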
|
||||
within_var.AddTp2(1.0, cholesky, kNoTrans, 0.0);
|
||||
WriteKaldiObject(within_var, write_within_covar, binary);
|
||||
}
|
||||
return 0;
|
||||
} catch(const std::exception &e) {
|
||||
std::cerr << e.what();
|
||||
|
|
|
@ -0,0 +1,127 @@
|
|||
// nnet2bin/nnet-train-parallel-perturbed.cc
|
||||
|
||||
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "hmm/transition-model.h"
|
||||
#include "nnet2/train-nnet-perturbed.h"
|
||||
#include "nnet2/am-nnet.h"
|
||||
#include "thread/kaldi-thread.h"
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
try {
|
||||
using namespace kaldi;
|
||||
using namespace kaldi::nnet2;
|
||||
typedef kaldi::int32 int32;
|
||||
typedef kaldi::int64 int64;
|
||||
|
||||
const char *usage =
|
||||
"Train the neural network parameters with backprop and stochastic\n"
|
||||
"gradient descent using minibatches. The training frames and labels\n"
|
||||
"are read via a pipe from nnet-randomize-frames. This is like nnet-train-parallel,\n"
|
||||
"using multiple threads in a Hogwild type of update, but also adding\n"
|
||||
"perturbed training (see src/nnet2/train-nnet-perturbed.h for info)\n"
|
||||
"\n"
|
||||
"Usage: nnet-train-parallel-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
|
||||
"\n"
|
||||
"e.g.:\n"
|
||||
"nnet-randomize-frames [args] | nnet-train-parallel-pertured \\\n"
|
||||
" --within-covar=within.spmat --num-threads=8 --target-objf-change=0.2 1.nnet ark:- 2.nnet\n";
|
||||
|
||||
bool binary_write = true;
|
||||
bool zero_stats = true;
|
||||
int32 srand_seed = 0;
|
||||
std::string within_covar_rxfilename;
|
||||
NnetPerturbedTrainerConfig train_config;
|
||||
|
||||
ParseOptions po(usage);
|
||||
po.Register("binary", &binary_write, "Write output in binary mode");
|
||||
po.Register("within-covar", &within_covar_rxfilename,
|
||||
"rxfilename of within-class covariance-matrix, written as "
|
||||
"SpMatrix. Must be specified.");
|
||||
po.Register("zero-stats", &zero_stats, "If true, zero stats "
|
||||
"stored with the neural net (only affects mixing up).");
|
||||
po.Register("srand", &srand_seed,
|
||||
"Seed for random number generator (e.g., for dropout)");
|
||||
po.Register("num-threads", &g_num_threads, "Number of training threads to use "
|
||||
"in the parallel update. [Note: if you use a parallel "
|
||||
"implementation of BLAS, the actual number of threads may be larger.]");
|
||||
train_config.Register(&po);
|
||||
|
||||
po.Read(argc, argv);
|
||||
srand(srand_seed);
|
||||
|
||||
if (po.NumArgs() != 3) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::string nnet_rxfilename = po.GetArg(1),
|
||||
examples_rspecifier = po.GetArg(2),
|
||||
nnet_wxfilename = po.GetArg(3);
|
||||
|
||||
if (within_covar_rxfilename == "") {
|
||||
KALDI_ERR << "The option --within-covar is required.";
|
||||
}
|
||||
|
||||
TransitionModel trans_model;
|
||||
AmNnet am_nnet;
|
||||
{
|
||||
bool binary_read;
|
||||
Input ki(nnet_rxfilename, &binary_read);
|
||||
trans_model.Read(ki.Stream(), binary_read);
|
||||
am_nnet.Read(ki.Stream(), binary_read);
|
||||
}
|
||||
|
||||
KALDI_ASSERT(train_config.minibatch_size > 0);
|
||||
|
||||
SpMatrix<BaseFloat> within_covar;
|
||||
ReadKaldiObject(within_covar_rxfilename, &within_covar);
|
||||
|
||||
if (zero_stats) am_nnet.GetNnet().ZeroStats();
|
||||
|
||||
SequentialNnetExampleReader example_reader(examples_rspecifier);
|
||||
|
||||
|
||||
double tot_objf_orig, tot_objf_perturbed, tot_weight;
|
||||
// logging info will be printed from within the next call.
|
||||
DoBackpropPerturbedParallel(train_config,
|
||||
within_covar,
|
||||
&example_reader,
|
||||
&tot_objf_orig,
|
||||
&tot_objf_perturbed,
|
||||
&tot_weight,
|
||||
&(am_nnet.GetNnet()));
|
||||
{
|
||||
Output ko(nnet_wxfilename, binary_write);
|
||||
trans_model.Write(ko.Stream(), binary_write);
|
||||
am_nnet.Write(ko.Stream(), binary_write);
|
||||
}
|
||||
|
||||
KALDI_LOG << "Finished training, processed " << tot_weight
|
||||
<< " training examples (weighted). Wrote model to "
|
||||
<< nnet_wxfilename;
|
||||
return (tot_weight == 0 ? 1 : 0);
|
||||
} catch(const std::exception &e) {
|
||||
std::cerr << e.what() << '\n';
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
|
|||
"Usage: nnet-train-parallel [options] <model-in> <training-examples-in> <model-out>\n"
|
||||
"\n"
|
||||
"e.g.:\n"
|
||||
"nnet-randomize-frames [args] | nnet-train-simple 1.nnet ark:- 2.nnet\n";
|
||||
"nnet-randomize-frames [args] | nnet-train-parallel --num-threads=8 1.nnet ark:- 2.nnet\n";
|
||||
|
||||
bool binary_write = true;
|
||||
bool zero_stats = true;
|
||||
|
|
|
@ -0,0 +1,137 @@
|
|||
// nnet2bin/nnet-train-perturbed.cc
|
||||
|
||||
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "hmm/transition-model.h"
|
||||
#include "nnet2/nnet-randomize.h"
|
||||
#include "nnet2/train-nnet-perturbed.h"
|
||||
#include "nnet2/am-nnet.h"
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
try {
|
||||
using namespace kaldi;
|
||||
using namespace kaldi::nnet2;
|
||||
typedef kaldi::int32 int32;
|
||||
typedef kaldi::int64 int64;
|
||||
|
||||
const char *usage =
|
||||
"Train the neural network parameters with backprop and stochastic\n"
|
||||
"gradient descent using minibatches. The training frames and labels\n"
|
||||
"are read via a pipe from nnet-randomize-frames. This version of the\n"
|
||||
"training program does not update the learning rate, but uses\n"
|
||||
"the learning rates stored in the neural nets.\n"
|
||||
"\n"
|
||||
"Usage: nnet-train-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
|
||||
"note: the option --within-covar=<file> is needed\n"
|
||||
"\n"
|
||||
"e.g.:\n"
|
||||
"nnet-randomize-frames [args] | nnet-train-perturbed --within-covar=within.spmat 1.nnet ark:- 2.nnet\n";
|
||||
|
||||
bool binary_write = true;
|
||||
bool zero_stats = true;
|
||||
int32 srand_seed = 0;
|
||||
std::string use_gpu = "yes";
|
||||
std::string within_covar_rxfilename;
|
||||
NnetPerturbedTrainerConfig train_config;
|
||||
|
||||
ParseOptions po(usage);
|
||||
po.Register("binary", &binary_write, "Write output in binary mode");
|
||||
po.Register("within-covar", &within_covar_rxfilename,
|
||||
"rxfilename of within-class covariance-matrix, written as "
|
||||
"SpMatrix. Must be specified.");
|
||||
po.Register("zero-stats", &zero_stats, "If true, zero occupation "
|
||||
"counts stored with the neural net (only affects mixing up).");
|
||||
po.Register("srand", &srand_seed, "Seed for random number generator "
|
||||
"(relevant if you have layers of type AffineComponentPreconditioned "
|
||||
"with l2-penalty != 0.0");
|
||||
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
|
||||
|
||||
train_config.Register(&po);
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 3) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
srand(srand_seed);
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu);
|
||||
#endif
|
||||
if (within_covar_rxfilename == "") {
|
||||
KALDI_ERR << "The option --within-covar is required.";
|
||||
}
|
||||
|
||||
std::string nnet_rxfilename = po.GetArg(1),
|
||||
examples_rspecifier = po.GetArg(2),
|
||||
nnet_wxfilename = po.GetArg(3);
|
||||
|
||||
int64 num_examples = 0;
|
||||
|
||||
{
|
||||
TransitionModel trans_model;
|
||||
AmNnet am_nnet;
|
||||
{
|
||||
bool binary_read;
|
||||
Input ki(nnet_rxfilename, &binary_read);
|
||||
trans_model.Read(ki.Stream(), binary_read);
|
||||
am_nnet.Read(ki.Stream(), binary_read);
|
||||
}
|
||||
|
||||
SpMatrix<BaseFloat> within_covar;
|
||||
ReadKaldiObject(within_covar_rxfilename, &within_covar);
|
||||
|
||||
if (zero_stats) am_nnet.GetNnet().ZeroStats();
|
||||
|
||||
{ // want to make sure this object deinitializes before
|
||||
// we write the model, as it does something in the destructor.
|
||||
NnetPerturbedTrainer trainer(train_config,
|
||||
within_covar,
|
||||
&(am_nnet.GetNnet()));
|
||||
|
||||
SequentialNnetExampleReader example_reader(examples_rspecifier);
|
||||
|
||||
for (; !example_reader.Done(); example_reader.Next(), num_examples++)
|
||||
trainer.TrainOnExample(example_reader.Value()); // It all happens here!
|
||||
}
|
||||
|
||||
{
|
||||
Output ko(nnet_wxfilename, binary_write);
|
||||
trans_model.Write(ko.Stream(), binary_write);
|
||||
am_nnet.Write(ko.Stream(), binary_write);
|
||||
}
|
||||
}
|
||||
#if HAVE_CUDA==1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
|
||||
KALDI_LOG << "Finished training, processed " << num_examples
|
||||
<< " training examples. Wrote model to "
|
||||
<< nnet_wxfilename;
|
||||
return (num_examples == 0 ? 1 : 0);
|
||||
} catch(const std::exception &e) {
|
||||
std::cerr << e.what() << '\n';
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
// nnet2bin/nnet-train-simple-perturbed.cc
|
||||
|
||||
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "hmm/transition-model.h"
|
||||
#include "nnet2/nnet-randomize.h"
|
||||
#include "nnet2/train-nnet-perturbed.h"
|
||||
#include "nnet2/am-nnet.h"
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
try {
|
||||
using namespace kaldi;
|
||||
using namespace kaldi::nnet2;
|
||||
typedef kaldi::int32 int32;
|
||||
typedef kaldi::int64 int64;
|
||||
|
||||
const char *usage =
|
||||
"Train the neural network parameters with backprop and stochastic\n"
|
||||
"gradient descent using minibatches. The training frames and labels\n"
|
||||
"are read via a pipe from nnet-randomize-frames. This is as nnet-train-simple\n"
|
||||
"but implements perturbed training (see src/nnet2/train-nnet-perturbed.h for\n"
|
||||
"details)\n"
|
||||
"\n"
|
||||
"Usage: nnet-train-simple-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
|
||||
"note: the option --within-covar=<file> is needed\n"
|
||||
"\n"
|
||||
"e.g.:\n"
|
||||
"nnet-randomize-frames [args] | nnet-train-simple-perturbed \\\n"
|
||||
" --within-covar=within.spmat --target-objf-change=0.2 1.nnet ark:- 2.nnet\n";
|
||||
|
||||
bool binary_write = true;
|
||||
bool zero_stats = true;
|
||||
int32 srand_seed = 0;
|
||||
std::string use_gpu = "yes";
|
||||
std::string within_covar_rxfilename;
|
||||
NnetPerturbedTrainerConfig train_config;
|
||||
|
||||
ParseOptions po(usage);
|
||||
po.Register("binary", &binary_write, "Write output in binary mode");
|
||||
po.Register("within-covar", &within_covar_rxfilename,
|
||||
"rxfilename of within-class covariance-matrix, written as "
|
||||
"SpMatrix. Must be specified.");
|
||||
po.Register("zero-stats", &zero_stats, "If true, zero occupation "
|
||||
"counts stored with the neural net (only affects mixing up).");
|
||||
po.Register("srand", &srand_seed, "Seed for random number generator "
|
||||
"(relevant if you have layers of type AffineComponentPreconditioned "
|
||||
"with l2-penalty != 0.0");
|
||||
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
|
||||
|
||||
train_config.Register(&po);
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 3) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
srand(srand_seed);
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu);
|
||||
#endif
|
||||
if (within_covar_rxfilename == "") {
|
||||
KALDI_ERR << "The option --within-covar is required.";
|
||||
}
|
||||
|
||||
std::string nnet_rxfilename = po.GetArg(1),
|
||||
examples_rspecifier = po.GetArg(2),
|
||||
nnet_wxfilename = po.GetArg(3);
|
||||
|
||||
int64 num_examples = 0;
|
||||
|
||||
{
|
||||
TransitionModel trans_model;
|
||||
AmNnet am_nnet;
|
||||
{
|
||||
bool binary_read;
|
||||
Input ki(nnet_rxfilename, &binary_read);
|
||||
trans_model.Read(ki.Stream(), binary_read);
|
||||
am_nnet.Read(ki.Stream(), binary_read);
|
||||
}
|
||||
|
||||
SpMatrix<BaseFloat> within_covar;
|
||||
ReadKaldiObject(within_covar_rxfilename, &within_covar);
|
||||
|
||||
if (zero_stats) am_nnet.GetNnet().ZeroStats();
|
||||
|
||||
{ // want to make sure this object deinitializes before
|
||||
// we write the model, as it does something in the destructor.
|
||||
NnetPerturbedTrainer trainer(train_config,
|
||||
within_covar,
|
||||
&(am_nnet.GetNnet()));
|
||||
|
||||
SequentialNnetExampleReader example_reader(examples_rspecifier);
|
||||
|
||||
for (; !example_reader.Done(); example_reader.Next(), num_examples++)
|
||||
trainer.TrainOnExample(example_reader.Value()); // It all happens here!
|
||||
}
|
||||
|
||||
{
|
||||
Output ko(nnet_wxfilename, binary_write);
|
||||
trans_model.Write(ko.Stream(), binary_write);
|
||||
am_nnet.Write(ko.Stream(), binary_write);
|
||||
}
|
||||
}
|
||||
#if HAVE_CUDA==1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
|
||||
KALDI_LOG << "Finished training, processed " << num_examples
|
||||
<< " training examples. Wrote model to "
|
||||
<< nnet_wxfilename;
|
||||
return (num_examples == 0 ? 1 : 0);
|
||||
} catch(const std::exception &e) {
|
||||
std::cerr << e.what() << '\n';
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|