diff --git a/egs/callhome_egyptian/s5/local/create_splits b/egs/callhome_egyptian/s5/local/create_splits index 95365e01a..98b27b010 100755 --- a/egs/callhome_egyptian/s5/local/create_splits +++ b/egs/callhome_egyptian/s5/local/create_splits @@ -25,6 +25,8 @@ do utils/fix_data_dir.sh $data_dir/$split utils/validate_data_dir.sh $data_dir/$split - rm $data_dir/$split/*.tmp + if ls $data_dir/$split/*.tmp &> /dev/null; then + rm $data_dir/$split/*.tmp + fi done diff --git a/egs/callhome_egyptian/s5/local/get_oracle.sh b/egs/callhome_egyptian/s5/local/get_oracle.sh new file mode 100755 index 000000000..48310f577 --- /dev/null +++ b/egs/callhome_egyptian/s5/local/get_oracle.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Gets lattice oracles + +if [ $# -lt 3 ]; then + echo "Specify lattice dir, symbol table and text file for partition" + exit 1; +fi + +latticeDir=$1 +textFile=$3 +symTable=$2 +oracleDir=$latticeDir/oracle + +echo $latticeDir +echo $oracleDir + +. path.sh + +if [ ! -f $textFile -o ! -f $symTable -o ! -d $latticeDir ]; then + echo "Required files not found" + exit 1; +fi + +mkdir -p $oracleDir + +cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | \ + utils/sym2int.pl -f 2- $symTable | \ + $KALDI_ROOT/src/latbin/lattice-oracle --word-symbol-table=$symTable "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- ark,t:$oracleDir/oracle.tra 2>$oracleDir/oracle.log + +sort -k1,1 -u $oracleDir/oracle.tra -o $oracleDir/oracle.tra diff --git a/egs/callhome_egyptian/s5/local/latconvert.sh b/egs/callhome_egyptian/s5/local/latconvert.sh index d99c9e39e..e5f9bac43 100755 --- a/egs/callhome_egyptian/s5/local/latconvert.sh +++ b/egs/callhome_egyptian/s5/local/latconvert.sh @@ -10,7 +10,8 @@ if [ $# -lt 3 ]; then exit 1 fi -prunebeam=2 +prunebeam=50 +maxProcesses=10 latdir=$1 decode_dir=$2 @@ -33,6 +34,7 @@ then mkdir -p $latdir/$compiledLatDir mkdir -p $latdir/$preplfLatDir + runningProcesses=0 for l in $decode_dir/lat.*.gz do ( @@ -69,11 +71,19 @@ then continue fi # Replace laugh, unk, oov, noise with eps - echo "$line" | awk '{if ($3 == 2038 || $3 == 2039 || $3 == 2040) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat" + echo "$line" | awk '{if ($3 == 1157 || $3 == 5327 || $3 == 5328 || $3 == 5329 || $3 ==5326) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat" done < $bname.ark.fst echo "Done isolating lattices" fi ) & + runningProcesses=$((runningProcesses+1)) + echo "#### Processes running = " $runningProcesses " ####" + if [ $runningProcesses -eq $maxProcesses ]; then + echo "#### Waiting for slot ####" + wait + runningProcesses=0 + echo "#### Done waiting ####" + fi done wait rm $latdir/*.bin @@ -82,6 +92,7 @@ then if [ $stage -le 2 ]; then #Compile lattices + runningProcesses=0 for l in $latdir/$rawLatDir/*.lat do ( @@ -89,6 +100,14 @@ then bname=${l##*/} fstcompile --arc_type=log $latdir/$rawLatDir/$bname $latdir/$compiledLatDir/$bname ) & + runningProcesses=$((runningProcesses+1)) + echo "#### Processes running = " $runningProcesses " ####" + if [ $runningProcesses -eq $maxProcesses ]; then + echo "#### Waiting for slot ####" + wait + runningProcesses=0 + echo "#### Done waiting ####" + fi done wait echo "Done compiling lattices." 
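The hunks above and below add the same hand-rolled throttle to every background-job loop in latconvert.sh. A minimal sketch of that pattern on its own, with a placeholder input glob and per-file body (not part of the patch):

maxProcesses=10
runningProcesses=0
for f in input/*.lat; do        # placeholder file list
  (
    echo "processing $f"        # per-file work goes here
  ) &
  runningProcesses=$((runningProcesses+1))
  if [ $runningProcesses -eq $maxProcesses ]; then
    wait                        # block until the whole current batch finishes
    runningProcesses=0
  fi
done
wait                            # catch the final, partially filled batch

Note that this waits for the entire batch rather than refilling slots as individual jobs finish; that matches what the patch does and keeps the bookkeeping trivial, at the cost of some idle slots when per-file runtimes are uneven.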
@@ -99,6 +118,7 @@ then # Create a dummy FST with one state and no arcs first echo 0 | fstcompile --arc_type=log - $latdir/$preplfLatDir/dummy.fst # Push Lattice weights towards initial state + runningProcesses=0 for l in $latdir/$compiledLatDir/*.lat do ( @@ -112,6 +132,14 @@ then fstrmepsilon - | \ fstreverse - $latdir/$preplfLatDir/$bname ) & + runningProcesses=$((runningProcesses+1)) + echo "#### Processes running = " $runningProcesses " ####" + if [ $runningProcesses -eq $maxProcesses ]; then + echo "#### Waiting for slot ####" + wait + runningProcesses=0 + echo "#### Done waiting ####" + fi done wait # Let's take a moment to thank the dummy FST for playing its diff --git a/egs/callhome_egyptian/s5/local/lattice_main.sh b/egs/callhome_egyptian/s5/local/lattice_main.sh new file mode 100755 index 000000000..d11711ced --- /dev/null +++ b/egs/callhome_egyptian/s5/local/lattice_main.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +outDir=exp/lat +mkdir -p $outDir + +stage=1 + +if [ $stage -lt 1 ]; then + + # First convert all lattices into the pruned, minimized version + decodeDir=exp/tri5a/decode_dev + acousticScale=0.8333 + local/latconvert.sh $outDir $decodeDir $acousticScale + + decodeDir=exp/tri5a/decode_test + acousticScale=0.8333 + local/latconvert.sh $outDir $decodeDir $acousticScale + +fi + +if [ $stage -lt 2 ]; then + # Get oracles + latticeDir=exp/tri5a/decode_dev + textFile=data/dev/text + symTable=exp/tri5a/graph/words.txt + local/get_oracle.sh $latticeDir $symTable $textFile + + latticeDir=exp/tri5a/decode_test + textFile=data/test/text + symTable=exp/tri5a/graph/words.txt + local/get_oracle.sh $latticeDir $symTable $textFile +fi diff --git a/egs/callhome_egyptian/s5/run.sh b/egs/callhome_egyptian/s5/run.sh index 8f03884e8..53753e31b 100755 --- a/egs/callhome_egyptian/s5/run.sh +++ b/egs/callhome_egyptian/s5/run.sh @@ -56,14 +56,14 @@ steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir # ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random # utterances from those. -steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ data/train data/lang exp/mono0a -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train data/lang exp/mono0a exp/mono0a_ali || exit 1; - -steps/train_deltas.sh --cmd "$train_cmd" \ - 1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1; +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train data/lang exp/mono0a exp/mono0a_ali || exit 1; + +steps/train_deltas.sh --cmd "$train_cmd" \ + 1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1; (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph diff --git a/egs/fisher_english/s5/run.sh b/egs/fisher_english/s5/run.sh index 492840f07..b0c091d8d 100755 --- a/egs/fisher_english/s5/run.sh +++ b/egs/fisher_english/s5/run.sh @@ -153,9 +153,14 @@ steps/train_sat.sh --cmd "$train_cmd" \ ( utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev + exp/tri5a/graph data/dev exp/tri5a/decode_dev )& +# +# steps/cleanup/find_bad_utts.sh --nj 200 --cmd "$train_cmd" data/train data/lang \ +# exp/tri5a exp/tri5a_cleanup + + # local/run_for_spkid.sh # we don't have to results for the step below yet. 
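get_oracle.sh above stops after writing the sorted integer oracle transcripts to $latticeDir/oracle/oracle.tra. A sketch of one way to turn those into an oracle WER; the paths and the reference filtering are illustrative, not part of the patch:

latticeDir=exp/tri5a/decode_dev
symTable=exp/tri5a/graph/words.txt
utils/int2sym.pl -f 2- $symTable $latticeDir/oracle/oracle.tra \
  > $latticeDir/oracle/oracle.txt
# The reference text was stripped of [laughter]/[noise] before computing the
# oracle, so strip them here too before comparing.
sed 's:\[laughter\]::g; s:\[noise\]::g' data/dev/text > $latticeDir/oracle/ref.txt
compute-wer --text --mode=present \
  ark:$latticeDir/oracle/ref.txt ark:$latticeDir/oracle/oracle.txt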
diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index 41b8630a0..00741b16a 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -118,14 +118,17 @@ exit 0 %WER 1.80 [ 226 / 12533, 29 ins, 44 del, 153 sub ] exp/nnet4c/decode/wer_4 %WER 8.49 [ 1064 / 12533, 80 ins, 175 del, 809 sub ] exp/nnet4c/decode_ug/wer_11 -%WER 1.61 [ 202 / 12533, 25 ins, 47 del, 130 sub ] exp/nnet4d/decode/wer_5 -%WER 8.17 [ 1024 / 12533, 83 ins, 179 del, 762 sub ] exp/nnet4d/decode_ug/wer_11 +%WER 1.68 [ 211 / 12533, 29 ins, 39 del, 143 sub ] exp/nnet4d/decode/wer_4 +%WER 8.40 [ 1053 / 12533, 101 ins, 153 del, 799 sub ] exp/nnet4d/decode_ug/wer_10 -%WER 1.63 [ 204 / 12533, 29 ins, 42 del, 133 sub ] exp/nnet4d_gpu/decode/wer_4 -%WER 8.11 [ 1016 / 12533, 80 ins, 168 del, 768 sub ] exp/nnet4d_gpu/decode_ug/wer_10 +%WER 1.74 [ 218 / 12533, 25 ins, 48 del, 145 sub ] exp/nnet4d_gpu/decode/wer_6 +%WER 8.39 [ 1051 / 12533, 106 ins, 149 del, 796 sub ] exp/nnet4d_gpu/decode_ug/wer_10 -%WER 1.63 [ 204 / 12533, 29 ins, 42 del, 133 sub ] exp/nnet4d_gpu/decode/wer_4 -%WER 8.11 [ 1016 / 12533, 80 ins, 168 del, 768 sub ] exp/nnet4d_gpu/decode_ug/wer_10 +%WER 1.53 [ 192 / 12533, 22 ins, 42 del, 128 sub ] exp/nnet4d2/decode/wer_3 +%WER 8.06 [ 1010 / 12533, 79 ins, 152 del, 779 sub ] exp/nnet4d2/decode_ug/wer_8 + +%WER 1.51 [ 189 / 12533, 25 ins, 34 del, 130 sub ] exp/nnet4d2_gpu/decode/wer_3 +%WER 7.97 [ 999 / 12533, 78 ins, 152 del, 769 sub ] exp/nnet4d2_gpu/decode_ug/wer_8 %WER 1.37 [ 172 / 12533, 14 ins, 36 del, 122 sub ] exp/nnet4e_gpu/decode/wer_3 %WER 8.03 [ 1006 / 12533, 61 ins, 179 del, 766 sub ] exp/nnet4e_gpu/decode_ug/wer_8 @@ -153,8 +156,8 @@ exit 0 # Discriminatively trained system (using p-norm rather than tanh nonlinearities, using SMBR, on GPU) -%WER 1.56 [ 195 / 12533, 28 ins, 31 del, 136 sub ] exp/nnet5d_mpe_gpu/decode_epoch2/wer_2 -%WER 8.35 [ 1047 / 12533, 77 ins, 171 del, 799 sub ] exp/nnet5d_mpe_gpu/decode_ug_epoch4/wer_10 +%WER 1.74 [ 218 / 12533, 25 ins, 48 del, 145 sub ] exp/nnet5d_mpe_gpu/decode_epoch1/wer_6 +%WER 8.40 [ 1053 / 12533, 108 ins, 148 del, 797 sub ] exp/nnet5d_mpe_gpu/decode_ug_epoch1/wer_10 # Discriminatively trained system on top of ensemble trained p-norm network (using SMBR, on GPU) %WER 1.36 [ 170 / 12533, 15 ins, 34 del, 121 sub ] exp/nnet5e_mpe_gpu/decode_epoch2/wer_3 diff --git a/egs/rm/s5/local/nnet2/run_4d2.sh b/egs/rm/s5/local/nnet2/run_4d2.sh new file mode 100755 index 000000000..123b52f75 --- /dev/null +++ b/egs/rm/s5/local/nnet2/run_4d2.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# 4d2 is as 4d but adding perturbed training with multiplier=1.0 + +train_stage=-10 +use_gpu=true + +. cmd.sh +. ./path.sh +. utils/parse_options.sh + + +if $use_gpu; then + if ! cuda-compiled; then + cat < $dir/num_jobs [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.alimdl $dir 2>/dev/null cp $srcdir/final.occs $dir; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. diff --git a/egs/wsj/s5/steps/align_si.sh b/egs/wsj/s5/steps/align_si.sh index 23c9ffa8c..f58f1b7ae 100755 --- a/egs/wsj/s5/steps/align_si.sh +++ b/egs/wsj/s5/steps/align_si.sh @@ -42,6 +42,11 @@ lang=$2 srcdir=$3 dir=$4 + +for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl; do + [ ! 
-f $f ] && echo "$0: expected file $f to exist" && exit 1; +done + oov=`cat $lang/oov.int` || exit 1; mkdir -p $dir/log echo $nj > $dir/num_jobs @@ -57,6 +62,7 @@ cp $srcdir/{tree,final.mdl} $dir || exit 1; cp $srcdir/final.occs $dir; + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh new file mode 100755 index 000000000..0d9445674 --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments using a model with delta or +# LDA+MLLT features. This version, rather than just using the +# text to align, computes mini-language models (unigram) from the text +# and a few common words in the LM, and allows + +# Begin configuration section. +nj=4 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=20.0 +lattice_beam=10.0 +transform_dir= # directory to find fMLLR transforms in. +top_n_words=100 # Number of common words that we compile into each graph (most frequent + # in $lang/text. +stage=0 +cleanup=true +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: $0 " + echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \ + $lang/L_disambig.fst $lang/phones/disambig.int; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1; +done + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. + +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.occs $dir; + + +utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \ + awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \ + sort -rn > $dir/word_counts.int || exit 1; +num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1; +# print top-n words with their unigram probabilities. 
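# (Toy illustration, not part of the patch: with made-up counts "120 5" and
# "60 9" out of 300 total word tokens, the head/awk step below would print
# "0.4 5" and "0.2 9", i.e. the empirical unigram probability followed by the
# word-id; the same arithmetic can be checked in isolation with
#   printf '120 5\n60 9\n' | awk -v tot=300 '{print $1/tot, $2;}'
# )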
+ +head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int +utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $srcdir/full.mat $dir + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ -z "$transform_dir" ] && [ -f $srcdir/trans.1 ]; then + transform_dir=$srcdir +fi +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/trans.ark,$dir/trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" + fi +elif [ -f $srcdir/final.alimdl ]; then + echo "$0: **WARNING**: you seem to be using an fMLLR system as input," + echo " but you are not providing the --transform-dir option during alignment." +fi + + +echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir" + +if [ $stage -le 0 ]; then + rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null + + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \ + steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \ + compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \ + $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ + gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$beam \ + --lattice-beam=$lattice_beam --word-symbol-table=$lang/words.txt \ + $dir/final.mdl ark:- "$feats" ark:- \| \ + lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \ + ark,t:- ark,t:$dir/edits.JOB.txt \| \ + utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1; +fi + + +if [ $stage -le 1 ]; then + if [ -f $dir/edits.1.txt ]; then + for x in $(seq $nj); do cat $dir/edits.$x.txt; done > $dir/edits.txt + for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done > $dir/aligned_ref.txt + else + echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present." + fi + + # in case any utterances failed to align, get filtered copy of $data/text that's filtered. 
+ utils/filter_scp.pl $dir/edits.txt < $data/text > $dir/text + cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt + + n1=$(wc -l < $dir/edits.txt) + n2=$(wc -l < $dir/aligned_ref.txt) + n3=$(wc -l < $dir/text) + n4=$(wc -l < $dir/length.txt) + if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then + echo "$0: mismatch in lengths of files:" + wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt + exit 1; + fi + + # note: the format of all_info.txt is: + # + # with the fields separated by tabs, e.g. + # adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED + + paste $dir/edits.txt \ + <(awk '{print $2}' $dir/length.txt) \ + <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \ + <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt + + sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt + + if $cleanup; then + rm $dir/edits.*.txt $dir/aligned_ref.*.txt + fi +fi + diff --git a/egs/wsj/s5/steps/cleanup/make_utterance_fsts.pl b/egs/wsj/s5/steps/cleanup/make_utterance_fsts.pl new file mode 100755 index 000000000..0929291bc --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/make_utterance_fsts.pl @@ -0,0 +1,45 @@ +#!/usr/bin/perl -w + +# makes unigram decoding-graph FSTs specific to each utterances, where the +# supplied top-n-words list together with the supervision text of the utterance are +# combined. + +if (@ARGV != 1) { + print STDERR "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" . + "e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" . + " make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ... \n"; +} + +($top_words_file) = @ARGV; + +open(F, "<$top_words_file") || die "opening $top_words_file"; + +%top_word_probs = ( ); + +while() { + @A = split; + (@A == 2 && $A[0] > 0.0) || die "Bad line $_ in $top_words_file"; + $A[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $_\n"; + $top_word_probs{$A[1]} += $A[0]; +} + +while () { + @A = split; + $utterance_id = shift @A; + print "$utterance_id\n"; + $num_words = @A + 0; # length of array @A + %word_probs = %top_word_probs; + foreach $w (@A) { + $w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $_"; + $word_probs{$w} += 1.0 / $num_words; + } + foreach $w (keys %word_probs) { + $prob = $word_probs{$w}; + $prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n"; + $cost = -log($prob); + print "0 0 $w $w $cost\n"; + } + $final_cost = -log(1.0 / $num_words); + print "0 $final_cost\n"; + print "\n"; # Empty line terminates the FST in the text-archive format. +} diff --git a/egs/wsj/s5/steps/decode.sh b/egs/wsj/s5/steps/decode.sh index 9aad3895a..a44bf1a3d 100755 --- a/egs/wsj/s5/steps/decode.sh +++ b/egs/wsj/s5/steps/decode.sh @@ -4,7 +4,8 @@ # Apache 2.0 # Begin configuration section. -transform_dir= +transform_dir= # this option won't normally be used, but it can be used if you want to + # supply existing fMLLR transforms when decoding. iter= model= # You can specify the model to use (e.g. if you want to use the .alimdl) stage=0 diff --git a/egs/wsj/s5/steps/nnet2/align.sh b/egs/wsj/s5/steps/nnet2/align.sh index 0d90337a9..43099a57e 100755 --- a/egs/wsj/s5/steps/nnet2/align.sh +++ b/egs/wsj/s5/steps/nnet2/align.sh @@ -77,20 +77,31 @@ case $feat_type in ;; *) echo "$0: invalid feature type $feat_type" && exit 1; esac + if [ ! -z "$transform_dir" ]; then - if ! 
[ $nj -eq `cat $transform_dir/num_jobs` ]; then - echo "$0: Number of jobs mismatch with transform-dir: $nj versus `cat $transform_dir/num_jobs`"; + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" exit 1; fi - if [ $feat_type == "lda" ]; then - [ ! -f $transform_dir/trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1; - echo "$0: using transforms from $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; fi - if [ $feat_type == "raw" ]; then - [ ! -f $transform_dir/raw_trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1; - echo "$0: using raw-fMLLR transforms from $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" fi fi diff --git a/egs/wsj/s5/steps/nnet2/get_lda.sh b/egs/wsj/s5/steps/nnet2/get_lda.sh index 63938aff3..cdc0ecc09 100755 --- a/egs/wsj/s5/steps/nnet2/get_lda.sh +++ b/egs/wsj/s5/steps/nnet2/get_lda.sh @@ -145,6 +145,7 @@ fi if [ $stage -le 0 ]; then echo "$0: Accumulating LDA statistics." + rm $dir/lda.*.acc 2>/dev/null # in case any left over from before. $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \ @@ -157,11 +158,19 @@ echo $lda_dim > $dir/lda_dim echo $ivector_dim > $dir/ivector_dim if [ $stage -le 1 ]; then - nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \ - --within-class-factor=$within_class_factor --dim=$lda_dim \ - $dir/lda.mat $dir/lda.*.acc \ - 2>$dir/log/lda_est.log || exit 1; + sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1; rm $dir/lda.*.acc fi +if [ $stage -le 2 ]; then + # There are various things that we sometimes (but not always) need + # the within-class covariance and its Cholesky factor for, and we + # write these to disk just in case. 
+ nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \ + --write-within-covar=$dir/within_covar.spmat \ + --within-class-factor=$within_class_factor --dim=$lda_dim \ + $dir/lda.mat $dir/lda.acc \ + 2>$dir/log/lda_est.log || exit 1; +fi + echo "$0: Finished estimating LDA" diff --git a/egs/wsj/s5/steps/nnet2/make_denlats.sh b/egs/wsj/s5/steps/nnet2/make_denlats.sh index 1777357ba..5d4895c43 100755 --- a/egs/wsj/s5/steps/nnet2/make_denlats.sh +++ b/egs/wsj/s5/steps/nnet2/make_denlats.sh @@ -95,25 +95,39 @@ echo "align_si.sh: feature type is $feat_type" case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + ;; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" cp $srcdir/final.mat $dir ;; *) echo "Invalid feature type $feat_type" && exit 1; esac -if [ ! -z "$transform_dir" ]; then # add transforms to features... - echo "$0: using fMLLR transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." - [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \ - && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1; - [ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \ - echo "$0: LDA transforms differ between $srcdir and $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" -else - if [ -f $srcdir/final.alimdl ]; then - echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option."; +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" exit 1; fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi fi if [ $sub_split -eq 1 ]; then diff --git a/egs/wsj/s5/steps/nnet2/train_discriminative.sh b/egs/wsj/s5/steps/nnet2/train_discriminative.sh index 2e25a3158..8e1a56d18 100755 --- a/egs/wsj/s5/steps/nnet2/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet2/train_discriminative.sh @@ -22,7 +22,7 @@ num_jobs_nnet=4 # Number of neural net jobs to run in parallel. 
Note: this samples_per_iter=400000 # measured in frames, not in "examples" spk_vecs_dir= -modify_learning_rates=false +modify_learning_rates=true last_layer_factor=1.0 # relates to modify-learning-rates first_layer_factor=1.0 # relates to modify-learning-rates shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples @@ -140,17 +140,38 @@ case $feat_type in *) echo "$0: invalid feature type $feat_type" && exit 1; esac -[ -z "$transform_dir" ] && transform_dir=$alidir +if [ -z "$transform_dir" ]; then + if [ -f $transform_dir/trans.1 ] || [ -f $transform_dir/raw_trans.1 ]; then + transform_dir=$alidir + fi +fi -if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then +if [ ! -z "$transform_dir" ]; then echo "$0: using transforms from $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then + echo "$0: LDA transforms differ between $alidir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi fi -if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then - echo "$0: using raw-fMLLR transforms from $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" -fi - if [ -z "$degs_dir" ]; then if [ $stage -le -8 ]; then diff --git a/egs/wsj/s5/steps/nnet2/train_pnorm_fast.sh b/egs/wsj/s5/steps/nnet2/train_pnorm_fast.sh index 0453172ee..21677848d 100755 --- a/egs/wsj/s5/steps/nnet2/train_pnorm_fast.sh +++ b/egs/wsj/s5/steps/nnet2/train_pnorm_fast.sh @@ -64,6 +64,10 @@ max_change_per_sample=0.075 precondition_rank_in=20 # relates to online preconditioning precondition_rank_out=80 # relates to online preconditioning +# this relates to perturbed training. +min_target_objf_change=0.1 +target_multiplier=0 # Set this to e.g. 1.0 to enable perturbed training. + mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 @@ -262,24 +266,49 @@ echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, " echo "$0: (while reducing learning rate) + (with constant learning rate)." +function set_target_objf_change { + # nothing to do if $target_multiplier not set. + [ "$target_multiplier" == "0" -o "$target_multiplier" == "0.0" ] && return; + [ $x -le $finish_add_layers_iter ] && return; + wait=2 # the compute_prob_{train,valid} from 2 iterations ago should + # most likey be done even though we backgrounded them. 
+ [ $[$x-$wait] -le 0 ] && return; + while true; do + # Note: awk 'some-expression' is the same as: awk '{if(some-expression) print;}' + train_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_train.$[$x-$wait].log) + valid_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_valid.$[$x-$wait].log) + if [ -z "$train_prob" ] || [ -z "$valid_prob" ]; then + echo "$0: waiting until $dir/log/compute_prob_{train,valid}.$[$x-$wait].log are done" + sleep 60 + else + target_objf_change=$(perl -e '($train,$valid,$min_change,$multiplier)=@ARGV; if (!($train < 0.0) || !($valid < 0.0)) { print "0\n"; print STDERR "Error: invalid train or valid prob: $train_prob, $valid_prob\n"; exit(0); } else { print STDERR "train,valid=$train,$valid\n"; $proposed_target = $multiplier * ($train-$valid); if ($proposed_target < $min_change) { print "0"; } else { print $proposed_target; }}' -- "$train_prob" "$valid_prob" "$min_target_objf_change" "$target_multiplier") + echo "On iter $x, (train,valid) probs from iter $[$x-$wait] were ($train_prob,$valid_prob), and setting target-objf-change to $target_objf_change." + return; + fi + done +} + finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] # This is when we decide to mix up from: halfway between when we've finished # adding the hidden layers and the end of training. mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2] if [ $num_threads -eq 1 ]; then - train_suffix="-simple" # this enables us to use GPU code if + parallel_suffix="-simple" # this enables us to use GPU code if # we have just one thread. + parallel_train_opts= if ! cuda-compiled; then echo "$0: WARNING: you are running with one thread but you have not compiled" echo " for CUDA. You may be running a setup optimized for GPUs. If you have" echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" fi else - train_suffix="-parallel --num-threads=$num_threads" + parallel_suffix="-parallel" + parallel_train_opts="--num-threads=$num_threads" fi x=0 +target_objf_change=0 # relates to perturbed training. while [ $x -lt $num_iters ]; do if [ $x -ge 0 ] && [ $stage -le $x ]; then @@ -316,11 +345,19 @@ while [ $x -lt $num_iters ]; do this_minibatch_size=$minibatch_size do_average=true fi + + set_target_objf_change; # only has effect if target_multiplier != 0 + if [ "$target_objf_change" != "0" ]; then + [ ! -f $dir/within_covar.spmat ] && \ + echo "$0: expected $dir/within_covar.spmat to exist." && exit 1; + perturb_suffix="-perturbed" + perturb_opts="--target-objf-change=$target_objf_change --within-covar=$dir/within_covar.spmat" + fi $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \ nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \ ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \ - nnet-train$train_suffix \ + nnet-train$parallel_suffix$perturb_suffix $parallel_train_opts $perturb_opts \ --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \ ark:- $dir/$[$x+1].JOB.mdl \ || exit 1; diff --git a/egs/wsj/s5/steps/train_sat_basis.sh b/egs/wsj/s5/steps/train_sat_basis.sh index f626baa71..e21faade5 100755 --- a/egs/wsj/s5/steps/train_sat_basis.sh +++ b/egs/wsj/s5/steps/train_sat_basis.sh @@ -12,7 +12,6 @@ # Begin configuration section. 
stage=-5 -fmllr_update_type=full cmd=run.pl scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" beam=10 @@ -197,9 +196,9 @@ while [ $x -lt $num_iters ]; do if echo $fmllr_iters | grep -w $x >/dev/null; then if [ $stage -le $x ]; then - echo Estimating fMLLR transforms - # We estimate a transform that's additional to the previous transform; - # we'll compose them. + # Note: it's not really necessary to re-estimate the basis each time + # but this is the way the script does it right now. + echo Estimating basis and fMLLR transforms $cmd JOB=1:$nj $dir/log/fmllr_est.$x.JOB.log \ ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \ @@ -209,7 +208,7 @@ while [ $x -lt $num_iters ]; do # Compute the basis matrices. $cmd $dir/log/basis_training.log \ - gmm-basis-fmllr-training $dir/$x.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1; + gmm-basis-fmllr-training $dir/$x.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1; $cmd JOB=1:$nj $dir/log/fmllr_app.$x.JOB.log \ ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ diff --git a/egs/wsj/s5/utils/sym2int.pl b/egs/wsj/s5/utils/sym2int.pl index dedb993b6..be0a577cf 100755 --- a/egs/wsj/s5/utils/sym2int.pl +++ b/egs/wsj/s5/utils/sym2int.pl @@ -28,7 +28,7 @@ for($x = 0; $x < 2; $x++) { } } if ($ARGV[0] eq "-f") { - shift @ARGV; + shift @ARGV; $field_spec = shift @ARGV; if ($field_spec =~ m/^\d+$/) { $field_begin = $field_spec - 1; $field_end = $field_spec - 1; diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 0d630f601..554c9d0b6 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -46,6 +46,14 @@ done ! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ echo "$0: $data/utt2spk has wrong format." && exit; +ns=$(wc -l < $data/spk2utt) +if [ "$ns" == 1 ]; then + echo "$0: WARNING: you have only one speaker. This probably a bad idea." + echo " Search for the word 'bold' in http://kaldi.sourceforge.net/data_prep.html" + echo " for more information." +fi + + tmpdir=$(mktemp -d kaldi.XXXX); trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM diff --git a/egs/wsj/s5/utils/validate_dict_dir.pl b/egs/wsj/s5/utils/validate_dict_dir.pl index c79970b43..7f4083c7d 100755 --- a/egs/wsj/s5/utils/validate_dict_dir.pl +++ b/egs/wsj/s5/utils/validate_dict_dir.pl @@ -12,27 +12,39 @@ if(@ARGV != 1) { } $dict = shift @ARGV; +$dict =~ s:/$::; $exit = 0; +$success = 1; # this is re-set each time we read a file. + +sub set_to_fail { $exit = 1; $success = 0; } + # Checking silence_phones.txt ------------------------------- print "Checking $dict/silence_phones.txt ...\n"; if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} $idx = 1; %silence = (); -$success = 1; + print "--> reading $dict/silence_phones.txt\n"; while() { - chomp; + if (! s/\n$//) { + print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; + set_to_fail(); + } my @col = split(" ", $_); + if (@col == 0) { + set_to_fail(); + print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; + } foreach(0 .. 
@col-1) { my $p = $col[$_]; - if($silence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; $success = 0;} + if($silence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; } else {$silence{$p} = 1;} if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){ - $exit = 1; + set_to_fail(); print "--> ERROR: phone \"$p\" has disallowed written form"; - $success = 0; + } } $idx ++; @@ -52,9 +64,9 @@ while() { chomp; my @col = split(" ", $_); if ($idx > 1 or @col > 1) { - $exit = 1; print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; $success = 0; + set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; } elsif (!$silence{$col[0]}) { - $exit = 1; print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; $success = 0; + set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; } $idx ++; } @@ -71,22 +83,29 @@ $idx = 1; $success = 1; print "--> reading $dict/nonsilence_phones.txt\n"; while() { - chomp; + if (! s/\n$//) { + print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; + set_to_fail(); + } my @col = split(" ", $_); + if (@col == 0) { + set_to_fail(); + print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; + } foreach(0 .. @col-1) { my $p = $col[$_]; - if($nonsilence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; $success = 0;} + if($nonsilence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; } else {$nonsilence{$p} = 1;} if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){ - $exit = 1; + set_to_fail(); print "--> ERROR: phone \"$p\" has disallowed written form"; - $success = 0; + } } $idx ++; } close(NS); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; +$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; print "\n"; # Checking disjoint ------------------------------- @@ -106,37 +125,37 @@ sub intersect { print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; @itset = intersect(\%silence, \%nonsilence); if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {$exit = 1; print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} +else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} print "\n"; sub check_lexicon { my ($lexfn, $pron_probs) = @_; print "Checking $lexfn\n"; - if(-z "$lexfn") {$exit = 1; print "--> ERROR: $lexfn is empty or not exists\n";} - if(!open(L, "<$lexfn")) {$exit = 1; print "--> ERROR: fail to open $lexfn\n";} + if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or not exists\n";} + if(!open(L, "<$lexfn")) {set_to_fail(); print "--> ERROR: fail to open $lexfn\n";} $idx = 1; $success = 1; print "--> reading $lexfn\n"; while () { - chomp; + if (! 
s/\n$//) { + print "--> ERROR: last line '$_' of $lexfn does not end in newline.\n"; + set_to_fail(); + } my @col = split(" ", $_); $word = shift @col; if (!defined $word) { - $exit = 1; print "--> ERROR: empty lexicon line in $lexfn\n"; - $success = 0; + set_to_fail(); print "--> ERROR: empty lexicon line in $lexfn\n"; } if ($pron_probs) { $prob = shift @col; if (!($prob > 0.0 && $prob <= 1.0)) { - $exit = 1; print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n"; - $success = 0; + set_to_fail(); print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n"; } } foreach (0 .. @col-1) { if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - $exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n"; - $success = 0; + set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n"; } } $idx ++; @@ -150,7 +169,7 @@ if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0); } if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1); } if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - $exit = 1; + set_to_fail(); } # If both lexicon.txt and lexiconp.txt exist, we check that they correspond to # each other. If not, it could be that the user overwrote one and we need to @@ -161,11 +180,21 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) { die "Error opening lexicon.txt and/or lexiconp.txt"; # already checked, so would be code error. } while() { + if (! s/\n$//) { + print "--> ERROR: last line '$_' of $dict/lexicon.txt does not end in newline.\n"; + set_to_fail(); + last; + } @A = split; $x =
<P>
; + if ($x !~ s/\n$//) { + print "--> ERROR: last line '$x' of $dict/lexiconp.txt does not end in newline.\n"; + set_to_fail(); + last; + } if (!defined $x) { print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n"; - $exit = 1; + set_to_fail(); last; } @B = split(" ", $x); @@ -175,13 +204,13 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) { # now @A and @B should be the same. if ($#A != $#B) { print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n"; - $exit = 1; + set_to_fail(); last; } for ($n = 0; $n < @A; $n++) { if ($A[$n] ne $B[$n]) { print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n"; - $exit = 1; + set_to_fail(); last; } } @@ -189,32 +218,40 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) { $x =
<P>
; if (defined $x && $exit == 0) { print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n"; - $exit = 1; + set_to_fail(); } } # Checking extra_questions.txt ------------------------------- print "Checking $dict/extra_questions.txt ...\n"; if (-s "$dict/extra_questions.txt") { - if(!open(EX, "<$dict/extra_questions.txt")) {$exit = 1; print "--> ERROR: fail to open $dict/extra_questions.txt\n";} + if (!open(EX, "<$dict/extra_questions.txt")) { + set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; + } $idx = 1; $success = 1; print "--> reading $dict/extra_questions.txt\n"; while() { - chomp; + if (! s/\n$//) { + print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; + set_to_fail(); + } my @col = split(" ", $_); - foreach(0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - $exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n"; - $success = 0; - } + if (@col == 0) { + set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; + } + } + foreach(0 .. @col-1) { + if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { + set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n"; } $idx ++; - } + } close(EX); $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; } else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} -if($exit == 1) { print " [Error detected ]\n"; exit 1;} +if ($exit == 1) { print "--> ERROR validating dictionary directory $dict (see detailed error messages above)\n"; exit 1;} +else { print "--> SUCCESS [validating dictionary directory $dict]\n"; } exit 0; diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index 60b4aa68f..3dd76eeb0 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -1,6 +1,8 @@ #!/usr/bin/perl -# Guoguo Chen (guoguo@jhu.edu) +# Apache 2.0. +# Copyright 2012 Guoguo Chen +# 2014 Neil Nelson # # Validation script for data/lang @@ -132,7 +134,7 @@ sub check_txt_int_csl { } sub check_txt_int { - my ($cat, $symtab) = @_; + my ($cat, $symtab, $sym_check) = @_; print "Checking $cat.\{txt, int\} ...\n"; if (-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";} if (-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";} @@ -154,6 +156,7 @@ sub check_txt_int { close(TXT); $idx1 --; print "--> $idx1 entry/entries in $cat.txt\n"; + my %used_syms = (); $idx2 = 1; while() { chomp; @@ -168,6 +171,8 @@ sub check_txt_int { if (@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";} foreach(0 .. 
@set-1) { if ($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";} + if ($sym_check && defined $used_syms{@set[$_]}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int contain duplicate symbols (break at line $idx2, block " ,$_+1, ")\n";} + $used_syms{@set[$_]} = 1; } $idx2 ++; } @@ -175,31 +180,16 @@ sub check_txt_int { if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";} print "--> $cat.int corresponds to $cat.txt\n"; - return print "--> $cat.\{txt, int\} are OK\n"; -} + if ($sym_check) { + while ( my ($key, $value) = each(%silence) ) { + if (!defined $used_syms{$key}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all silence phones\n";} + } + while ( my ($key, $value) = each(%nonsilence) ) { + if (!defined $used_syms{$key}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all non-silence phones\n";} + } + } -@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence"); -@list2 = ("roots", "sets"); -foreach(@list1) { - check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n"; -} -foreach(@list2) { - check_txt_int("$lang/phones/$_", \%psymtab); print "\n"; -} -if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) { - check_txt_int("$lang/phones/extra_questions", \%psymtab); print "\n"; -} else { - print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n"; - if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) { - print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n"; - $warning = 1; - } else { - print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n"; - $exit = 1; - } -} -if (-e "$lang/phones/word_boundary.txt") { - check_txt_int("$lang/phones/word_boundary", \%psymtab); print "\n"; + return print "--> $cat.\{txt, int\} are OK\n"; } # Check disjoint and summation ------------------------------- @@ -217,7 +207,7 @@ sub intersect { } sub check_disjoint { - print "Checking disjoint: silence.txt, nosilenct.txt, disambig.txt ...\n"; + print "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n"; if (!open(S, "<$lang/phones/silence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";} if (!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";} if (!open(D, "<$lang/phones/disambig.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";} @@ -336,6 +326,30 @@ sub check_summation { check_disjoint; print "\n"; check_summation; print "\n"; +@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence"); +@list2 = ("roots", "sets"); +foreach(@list1) { + check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n"; +} +foreach(@list2) { + check_txt_int("$lang/phones/$_", \%psymtab, 1); print "\n"; +} +if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) { + check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "\n"; +} else { + print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n"; + if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) { + print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, 
int\} are empty!\n\n"; + $warning = 1; + } else { + print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n"; + $exit = 1; + } +} +if (-e "$lang/phones/word_boundary.txt") { + check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n"; +} + # Checking optional_silence.txt ------------------------------- print "Checking optional_silence.txt ...\n"; $idx = 1; @@ -550,7 +564,7 @@ if (-s "$lang/phones/word_boundary.int") { } # Check oov ------------------------------- -check_txt_int("$lang/oov", \%wsymtab); print "\n"; +check_txt_int("$lang/oov", \%wsymtab, 0); print "\n"; # Check determinizability of G.fst @@ -580,7 +594,6 @@ if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") { if ($exit == 1) { print "--> ERROR (see error messages above)\n"; exit 1;} else { if ($warning == 1) { print "--> WARNING (check output above for warnings)\n"; exit 0; } - else { print "--> SUCCESS\n"; exit 0; } + else { print "--> SUCCESS [validating lang directory $lang]\n"; exit 0; } } - diff --git a/src/bin/compile-train-graphs-fsts.cc b/src/bin/compile-train-graphs-fsts.cc index dc60409c6..93fe29d1a 100644 --- a/src/bin/compile-train-graphs-fsts.cc +++ b/src/bin/compile-train-graphs-fsts.cc @@ -43,8 +43,8 @@ int main(int argc, char *argv[]) { "of disambiguation symbols.\n" "Warning: you probably want to set the --transition-scale and --self-loop-scale\n" "options; the defaults (zero) are probably not appropriate.\n" - "Usage: compile-train-graphs-fsts [options] tree-in model-in lexicon-fst-in " - " graphs-rspecifier graphs-wspecifier\n" + "Usage: compile-train-graphs-fsts [options] " + " \n" "e.g.: \n" " compile-train-graphs-fsts --read-disambig-syms=disambig.list\\\n" " tree 1.mdl lex.fst ark:train.fsts ark:graphs.fsts\n"; diff --git a/src/bin/compile-train-graphs.cc b/src/bin/compile-train-graphs.cc index 019ef6e54..bcbf06a01 100644 --- a/src/bin/compile-train-graphs.cc +++ b/src/bin/compile-train-graphs.cc @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) { const char *usage = "Creates training graphs (without transition-probabilities, by default)\n" "\n" - "Usage: compile-train-graphs [options] tree-in model-in lexicon-fst-in transcriptions-rspecifier graphs-wspecifier\n" + "Usage: compile-train-graphs [options] \n" "e.g.: \n" " compile-train-graphs tree 1.mdl lex.fst ark:train.tra ark:graphs.fsts\n"; ParseOptions po(usage); diff --git a/src/bin/get-post-on-ali.cc b/src/bin/get-post-on-ali.cc index f02c4d55f..c2baf17d0 100644 --- a/src/bin/get-post-on-ali.cc +++ b/src/bin/get-post-on-ali.cc @@ -1,6 +1,7 @@ // bin/get-post-on-ali.cc // Copyright 2013 Brno University of Technology (Author: Karel Vesely) +// 2014 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -31,18 +32,24 @@ int main(int argc, char *argv[]) { typedef kaldi::int32 int32; try { const char *usage = - "This program extracts a vector of per-frame posteriors that are selected\n" - "by an alignment (ie. posteriors that are under the alignment path).\n" - "This can be used as a per-frame confidence measure.\n" + "Given input posteriors, e.g. derived from lattice-to-post, and an alignment\n" + "typically derived from the best path of a lattice, outputs the probability in\n" + "the posterior of the corresponding index in the alignment, or zero if it was\n" + "not there. 
These are output as a vector of weights, one per utterance.\n" + "While, by default, lattice-to-post (as a source of posteriors) and sources of\n" + "alignments such as lattice-best-path will output transition-ids as the index,\n" + "it will generally make sense to either convert these to pdf-ids using\n" + "post-to-pdf-post and ali-to-pdf respectively, or to phones using post-to-phone-post\n" + "and (ali-to-phones --per-frame=true). Since this program only sees the integer\n" + "indexes, it does not care what they represent-- but of course they should match\n" + "(e.g. don't input posteriors with transition-ids and alignments with pdf-ids).\n" + "See http://kaldi.sourceforge.net/hmm.html#transition_model_identifiers for an\n" + "explanation of these types of indexes.\n" "\n" - "By intuition, it is better to use pdf-posteriors and pdf-alignments,\n" - "because the posteriors of competing hypothesis that are in the same frame\n" - "at same 'pdf-state' are summed up, which is in some sense similar\n" - "to what is done by C-max which sums the posteriors of overlapping words.\n" - "The difference here is that the granularity is per-frame.\n" + "See also: weight-post, post-to-weights, reverse-weights\n" "\n" - "Usage: get-post-on-ali [options] \n" - "e.g.: get-post-on-ali ark:post.ark ark:ali.ark ark:conf.ark\n"; + "Usage: get-post-on-ali [options] \n" + "e.g.: get-post-on-ali ark:post.ark ark,s,cs:ali.ark ark:weights.ark\n"; ParseOptions po(usage); diff --git a/src/bin/post-to-pdf-post.cc b/src/bin/post-to-pdf-post.cc index d50673c90..99aa5770a 100644 --- a/src/bin/post-to-pdf-post.cc +++ b/src/bin/post-to-pdf-post.cc @@ -32,6 +32,7 @@ int main(int argc, char *argv[]) { const char *usage = "This program turns per-frame posteriors, which have transition-ids as\n" "the integers, into pdf-level posteriors\n" + "See also: post-to-phone-post, post-to-weights, get-post-on-ali\n" "\n" "Usage: post-to-pdf-post [options] \n" "e.g.: post-to-pdf-post 1.mdl ark:- ark:-\n"; diff --git a/src/bin/post-to-phone-post.cc b/src/bin/post-to-phone-post.cc index d37e0d6da..92f67514a 100644 --- a/src/bin/post-to-phone-post.cc +++ b/src/bin/post-to-phone-post.cc @@ -30,6 +30,7 @@ int main(int argc, char *argv[]) { const char *usage = "Convert posteriors to phone-level posteriors\n" + "See also: post-to-pdf-post, post-to-weights, get-post-on-ali\n" "\n" "Usage: post-to-phone-post [options] \n" " e.g.: post-to-phone-post --binary=false 1.mdl \"ark:ali-to-post 1.ali|\" ark,t:-\n"; diff --git a/src/bin/post-to-weights.cc b/src/bin/post-to-weights.cc index 720bcdeb5..9ee21c12f 100644 --- a/src/bin/post-to-weights.cc +++ b/src/bin/post-to-weights.cc @@ -31,7 +31,9 @@ int main(int argc, char *argv[]) { const char *usage = "Turn posteriors into per-frame weights (typically most useful after\n" "weight-silence-post, to get silence weights)\n" - "Usage: post-to-weights post-rspecifier weights-wspecifier\n"; + "See also: weight-silence-post, post-to-pdf-post, post-to-phone-post\n" + "get-post-on-ali\n" + "Usage: post-to-weights \n"; ParseOptions po(usage); po.Read(argc, argv); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 997bdcbcb..8e9b22d41 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1128,7 +1128,6 @@ void CuMatrix::CompObjfAndDeriv(const std::vector >& s *tot_objf += weight * log(this_prob); *tot_weight += weight; (*this)(m, label) += weight / this_prob; - } } } diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 1959905a3..7f8a3ed9a 
100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -580,7 +580,7 @@ class CuMatrix: public CuMatrixBase { void CompObjfAndDeriv(const std::vector > &elements, const CuMatrix &A, Real *tot_objf, - Real* tot_weight); + Real *tot_weight); private: void Destroy(); diff --git a/src/featbin/feat-to-len.cc b/src/featbin/feat-to-len.cc index ac0474dd5..098de13ef 100644 --- a/src/featbin/feat-to-len.cc +++ b/src/featbin/feat-to-len.cc @@ -28,28 +28,39 @@ int main(int argc, char *argv[]) { const char *usage = "Reads an archive of features and writes a corresponding archive\n" - "that maps utterance-id to utterance length in frames.\n" - "Usage: feat-to-len [options] in-rspecifier out-wspecifier\n" - "e.g.: feat-to-len scp:feats.scp ark,t:feats.lengths\n"; + "that maps utterance-id to utterance length in frames, or (with\n" + "one argument) print to stdout the total number of frames in the\n" + "input archive.\n" + "Usage: feat-to-len [options] []\n" + "e.g.: feat-to-len scp:feats.scp ark,t:feats.lengths\n" + "or: feat-to-len scp:feats.scp\n"; ParseOptions po(usage); po.Read(argc, argv); - if (po.NumArgs() != 2) { + if (po.NumArgs() != 1 && po.NumArgs() != 2) { po.PrintUsage(); exit(1); } - std::string rspecifier = po.GetArg(1); - std::string wspecifier = po.GetArg(2); + if (po.NumArgs() == 2) { + std::string rspecifier = po.GetArg(1); + std::string wspecifier = po.GetArg(2); - Int32Writer length_writer(wspecifier); + Int32Writer length_writer(wspecifier); - SequentialBaseFloatMatrixReader kaldi_reader(rspecifier); - for (; !kaldi_reader.Done(); kaldi_reader.Next()) - length_writer.Write(kaldi_reader.Key(), kaldi_reader.Value().NumRows()); - + SequentialBaseFloatMatrixReader matrix_reader(rspecifier); + for (; !matrix_reader.Done(); matrix_reader.Next()) + length_writer.Write(matrix_reader.Key(), matrix_reader.Value().NumRows()); + } else { + int64 tot = 0; + std::string rspecifier = po.GetArg(1); + SequentialBaseFloatMatrixReader matrix_reader(rspecifier); + for (; !matrix_reader.Done(); matrix_reader.Next()) + tot += matrix_reader.Value().NumRows(); + std::cout << tot << std::endl; + } return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/latbin/lattice-oracle.cc b/src/latbin/lattice-oracle.cc index bdf5b60a3..2f2a13059 100644 --- a/src/latbin/lattice-oracle.cc +++ b/src/latbin/lattice-oracle.cc @@ -234,8 +234,8 @@ int main(int argc, char *argv[]) { const char *usage = "Finds the path having the smallest edit-distance between two lattices.\n" "For efficiency put the smallest lattices first (for example reference strings).\n" - "Usage: lattice-oracle [options] test-lattice-rspecifier reference-rspecifier " - "transcriptions-wspecifier [edit-distance-wspecifier]\n" + "Usage: lattice-oracle [options] " + " []\n" " e.g.: lattice-oracle ark:lat.1 'ark:sym2int.pl -f 2- data/lang/words.txt ::MulRowsVec(const VectorBase &scale) { } } + template void MatrixBase::MulRowsGroupMat(const MatrixBase &src) { - KALDI_ASSERT(src.NumCols() > 0 && src.NumCols() <= this->NumCols()); - KALDI_ASSERT(this->NumCols() % src.NumCols() == 0 || - this->NumCols() % (src.NumCols() - 1) < this->NumCols() / (src.NumCols() - 1)); - int group_size = 0; - if (this->NumCols() % src.NumCols() == 0) { - group_size = this->NumCols() / src.NumCols(); - } else { - group_size = this->NumCols() / src.NumCols() + 1; - } - MatrixIndexT M = num_rows_, N = num_cols_; + KALDI_ASSERT(src.NumRows() == this->NumRows() && + this->NumCols() % src.NumCols() == 0); + int32 group_size = 
this->NumCols() / src.NumCols(), + num_groups = this->NumCols() / group_size, + num_rows = this->NumRows(); - for (MatrixIndexT i = 0; i < M; i++) - for (MatrixIndexT j = 0; j < N; j++) - (*this)(i, j) *= src(i, j / group_size); + for (MatrixIndexT i = 0; i < num_rows; i++) { + Real *data = this->RowData(i); + for (MatrixIndexT j = 0; j < num_groups; j++, data += group_size) { + Real scale = src(i, j); + cblas_Xscal(group_size, scale, data, 1); + } + } } template -void MatrixBase::GroupPnormDeriv(const MatrixBase &src1, - const MatrixBase &src2, +void MatrixBase::GroupPnormDeriv(const MatrixBase &input, + const MatrixBase &output, Real power) { - KALDI_ASSERT(src2.NumCols() > 0 && src2.NumCols() <= this->NumCols()); - KALDI_ASSERT(this->NumCols() % src2.NumCols() == 0 || - this->NumCols() % (src2.NumCols() - 1) < this->NumCols() / (src2.NumCols() - 1)); - int group_size = 0; - if (this->NumCols() % src2.NumCols() == 0) { - group_size = this->NumCols() / src2.NumCols(); - } else { - group_size = this->NumCols() / src2.NumCols() + 1; - } - MatrixIndexT M = this->NumRows(), N = this->NumCols(); + KALDI_ASSERT(input.NumCols() == this->NumCols() && input.NumRows() == this->NumRows()); + KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 && + this->NumRows() == output.NumRows()); + + int group_size = this->NumCols() / output.NumCols(), + num_rows = this->NumRows(), num_cols = this->NumCols(); if (power == 1.0) { - for (MatrixIndexT i = 0; i < M; i++) - for (MatrixIndexT j = 0; j < N; j++) - (*this)(i, j) = (src1(i, j) == 0 ? 0 : (src1(i, j) > 0 ? 1 : -1)); + for (MatrixIndexT i = 0; i < num_rows; i++) { + for (MatrixIndexT j = 0; j < num_cols; j++) { + Real input_val = input(i, j); + (*this)(i, j) = (input_val == 0 ? 0 : (input_val > 0 ? 1 : -1)); + } + } } else { - for (MatrixIndexT i = 0; i < M; i++) { - for (MatrixIndexT j = 0; j < N; j++) { - if (src2(i, j / group_size) == 0) { + for (MatrixIndexT i = 0; i < num_rows; i++) { + for (MatrixIndexT j = 0; j < num_cols; j++) { + Real output_val = output(i, j / group_size), + input_val = input(i, j); + if (output_val == 0) (*this)(i, j) = 0; - } else { - (*this)(i, j) = pow(std::abs(src1(i, j)), power - 1) * - (src2(i, j / group_size) > 0 ? pow(src2(i, j / group_size), 1 - power) : 1) * - (src1(i, j) >= 0 ? 1 : -1) ; - } + else + (*this)(i, j) = pow(std::abs(input_val), power - 1) * + pow(output_val, 1 - power) * (input_val >= 0 ? 1 : -1) ; } } } @@ -2428,12 +2427,15 @@ void MatrixBase::SoftHinge(const MatrixBase &src) { } } } + template void MatrixBase::GroupPnorm(const MatrixBase &src, Real power) { - int group_size = src.NumCols() / this->NumCols(); - KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size); - for (MatrixIndexT i = 0; i < src.NumRows(); i++) - for (MatrixIndexT j = 0; j < this->NumCols(); j++) + KALDI_ASSERT(src.NumCols() % this->NumCols() == 0 && + src.NumRows() == this->NumRows()); + int group_size = src.NumCols() / this->NumCols(), + num_rows = this->NumRows(), num_cols = this->NumCols(); + for (MatrixIndexT i = 0; i < num_rows; i++) + for (MatrixIndexT j = 0; j < num_cols; j++) (*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power); } diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 71db6b16f..37f0df5e8 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -240,8 +240,9 @@ class MatrixBase { /// each row by a scalar taken from that dimension of the vector. 
void MulRowsVec(const VectorBase &scale); - /// divide each row into src.NumCols() groups, - /// and then scale i'th row's jth group of elements by src[i, j]. + /// Divide each row into src.NumCols() equal groups, and then scale i'th row's + /// j'th group of elements by src(i, j). Requires src.NumRows() == + /// this->NumRows() and this->NumCols() % src.NumCols() == 0. void MulRowsGroupMat(const MatrixBase &src); /// Returns logdet of matrix. @@ -418,8 +419,8 @@ class MatrixBase { /// Set each element to y = log(1 + exp(x)) void SoftHinge(const MatrixBase &src); - /// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / p) - /// where G = x.NumCols() / y.NumCols() must be an integer. + /// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 / p). + /// Requires src.NumRows() == this->NumRows() and src.NumCols() % this->NumCols() == 0. void GroupPnorm(const MatrixBase &src, Real power); diff --git a/src/matrix/optimization.cc b/src/matrix/optimization.cc index 5d6654a1c..a415800bd 100644 --- a/src/matrix/optimization.cc +++ b/src/matrix/optimization.cc @@ -469,9 +469,9 @@ int32 LinearCgd(const LinearCgdOptions &opts, residual_factor = opts.recompute_residual_factor * opts.recompute_residual_factor; - // Note: although from a mathematical point of view the method should - // converge after M iterations, in practice it does not always converge - // to good precision after that many iterations so we let the maximum + // Note: although from a mathematical point of view the method should converge + // after M iterations, in practice (due to roundoff) it does not always + // converge to good precision after that many iterations so we let the maximum // be 1.5 * M + 5 instead. int32 k = 0; for (; k < M + M / 2 + 5 && k != opts.max_iters; k++) { diff --git a/src/matrix/qr.cc b/src/matrix/qr.cc index 48eb66781..861dead05 100644 --- a/src/matrix/qr.cc +++ b/src/matrix/qr.cc @@ -86,8 +86,8 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { } else { cblas_Xscal(dim, inv_v1, v, 1); } - if (!KALDI_ISFINITE(inv_v1) || !KALDI_ISFINITE(x1)) { - KALDI_ERR << "NaN or inf encountered in HouseBackward"; + if (KALDI_ISNAN(inv_v1)) { + KALDI_ERR << "NaN encountered in HouseBackward"; } } } @@ -142,8 +142,8 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { } else { cblas_Xscal(dim, inv_v1, v, 1); } - if (!KALDI_ISFINITE(inv_v1) || !KALDI_ISFINITE(x1)) { - KALDI_ERR << "NaN or inf encountered in HouseBackward"; + if (KALDI_ISNAN(inv_v1)) { + KALDI_ERR << "NaN encountered in HouseBackward"; } } } diff --git a/src/nnet2/Makefile b/src/nnet2/Makefile index 3f15d7224..3eff49a33 100644 --- a/src/nnet2/Makefile +++ b/src/nnet2/Makefile @@ -19,7 +19,8 @@ OBJFILES = nnet-component.o nnet-nnet.o train-nnet.o train-nnet-ensemble.o nnet- nnet-fix.o nnet-stats.o rescale-nnet.o nnet-limit-rank.o nnet-example.o \ get-feature-transform.o widen-nnet.o nnet-precondition-online.o \ nnet-example-functions.o nnet-compute-discriminative.o \ - nnet-compute-discriminative-parallel.o online-nnet2-decodable.o + nnet-compute-discriminative-parallel.o online-nnet2-decodable.o \ + train-nnet-perturbed.o LIBNAME = kaldi-nnet2 diff --git a/src/nnet2/nnet-component.h b/src/nnet2/nnet-component.h index c9b06867f..a3130022c 100644 --- a/src/nnet2/nnet-component.h +++ b/src/nnet2/nnet-component.h @@ -1595,6 +1595,9 @@ class FixedAffineComponent: public Component { virtual Component* Copy() const; virtual void Read(std::istream &is, bool binary); virtual void 
Write(std::ostream &os, bool binary) const; + + // Function to provide access to linear_params_. + const CuMatrix &LinearParams() const { return linear_params_; } protected: friend class AffineComponent; CuMatrix linear_params_; diff --git a/src/nnet2/nnet-example.cc b/src/nnet2/nnet-example.cc index b84d223a3..fe342b199 100644 --- a/src/nnet2/nnet-example.cc +++ b/src/nnet2/nnet-example.cc @@ -67,6 +67,40 @@ void NnetExample::Read(std::istream &is, bool binary) { } + +void ExamplesRepository::AcceptExamples( + std::vector *examples) { + KALDI_ASSERT(!examples->empty()); + empty_semaphore_.Wait(); + KALDI_ASSERT(examples_.empty()); + examples_.swap(*examples); + full_semaphore_.Signal(); +} + +void ExamplesRepository::ExamplesDone() { + empty_semaphore_.Wait(); + KALDI_ASSERT(examples_.empty()); + done_ = true; + full_semaphore_.Signal(); +} + +bool ExamplesRepository::ProvideExamples( + std::vector *examples) { + full_semaphore_.Wait(); + if (done_) { + KALDI_ASSERT(examples_.empty()); + full_semaphore_.Signal(); // Increment the semaphore so + // the call by the next thread will not block. + return false; // no examples to return-- all finished. + } else { + KALDI_ASSERT(!examples_.empty() && examples->empty()); + examples->swap(examples_); + empty_semaphore_.Signal(); + return true; + } +} + + void DiscriminativeNnetExample::Write(std::ostream &os, bool binary) const { // Note: weight, num_ali, den_lat, input_frames, left_context and spk_info are diff --git a/src/nnet2/nnet-example.h b/src/nnet2/nnet-example.h index f9893e817..4e156103f 100644 --- a/src/nnet2/nnet-example.h +++ b/src/nnet2/nnet-example.h @@ -23,6 +23,7 @@ #include "nnet2/nnet-nnet.h" #include "util/table-types.h" #include "lat/kaldi-lattice.h" +#include "thread/kaldi-semaphore.h" namespace kaldi { namespace nnet2 { @@ -64,6 +65,35 @@ typedef SequentialTableReader > SequentialNnetEx typedef RandomAccessTableReader > RandomAccessNnetExampleReader; +/** This class stores neural net training examples to be used in + multi-threaded training. */ +class ExamplesRepository { + public: + /// The following function is called by the code that reads in the examples, + /// with a batch of examples. [It will empty the vector "examples"). + void AcceptExamples(std::vector *examples); + + /// The following function is called by the code that reads in the examples, + /// when we're done reading examples. + void ExamplesDone(); + + /// This function is called by the code that does the training. It gets the + /// training examples, and if they are available, puts them in "examples" and + /// returns true. It returns false when there are no examples left and + /// ExamplesDone() has been called. + bool ProvideExamples(std::vector *examples); + + ExamplesRepository(): empty_semaphore_(1), done_(false) { } + private: + Semaphore full_semaphore_; + Semaphore empty_semaphore_; + + std::vector examples_; + bool done_; + KALDI_DISALLOW_COPY_AND_ASSIGN(ExamplesRepository); +}; + + /** This struct is used to store the information we need for discriminative training (MMI or MPE). Each example corresponds to one chunk of a file (for better randomization @@ -116,7 +146,7 @@ struct DiscriminativeNnetExample { void Read(std::istream &is, bool binary); }; -// Tes, the length of typenames is getting out of hand. +// Yes, the length of typenames is getting out of hand. 
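For orientation only (not part of the patch): a minimal sketch of how the producer/consumer hand-off of the ExamplesRepository declared just above is intended to be driven. It assumes only the three methods declared here plus the existing SequentialNnetExampleReader typedef, and mirrors the reading loop that appears later in DoBackpropPerturbedParallel; the actual training call and error handling are omitted.

    // Sketch: one reader thread feeding minibatches to several training threads.
    #include <vector>
    #include "nnet2/nnet-example.h"

    namespace kaldi {
    namespace nnet2 {

    // Reader side: batch up examples and hand each full minibatch to the repository.
    void ReadExamples(SequentialNnetExampleReader *reader,
                      int32 minibatch_size,
                      ExamplesRepository *repository) {
      std::vector<NnetExample> batch;
      for (; !reader->Done(); reader->Next()) {
        batch.push_back(reader->Value());
        if (static_cast<int32>(batch.size()) == minibatch_size)
          repository->AcceptExamples(&batch);  // blocks until the slot is free; empties 'batch'.
      }
      if (!batch.empty())
        repository->AcceptExamples(&batch);    // final partial minibatch.
      repository->ExamplesDone();              // after this, ProvideExamples() returns false.
    }

    // Trainer side (one call per thread): pull minibatches until the reader is done.
    void TrainFromRepository(ExamplesRepository *repository) {
      std::vector<NnetExample> batch;
      while (repository->ProvideExamples(&batch)) {
        // ... run forward/backprop on 'batch' here ...
        batch.clear();
      }
    }

    }  // namespace nnet2
    }  // namespace kaldi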
typedef TableWriter > DiscriminativeNnetExampleWriter; typedef SequentialTableReader > diff --git a/src/nnet2/nnet-nnet.cc b/src/nnet2/nnet-nnet.cc index 02477948c..56a3fab4a 100644 --- a/src/nnet2/nnet-nnet.cc +++ b/src/nnet2/nnet-nnet.cc @@ -432,6 +432,12 @@ void Nnet::RemovePreconditioning() { *(dynamic_cast(components_[i]))); delete components_[i]; components_[i] = ac; + } else if (dynamic_cast( + components_[i]) != NULL) { + AffineComponent *ac = new AffineComponent( + *(dynamic_cast(components_[i]))); + delete components_[i]; + components_[i] = ac; } } SetIndexes(); diff --git a/src/nnet2/nnet-update-parallel.cc b/src/nnet2/nnet-update-parallel.cc index 678169605..c596d8e63 100644 --- a/src/nnet2/nnet-update-parallel.cc +++ b/src/nnet2/nnet-update-parallel.cc @@ -26,68 +26,6 @@ namespace kaldi { namespace nnet2 { -/** This struct stores neural net training examples to be used in - multi-threaded training. */ -class ExamplesRepository { - public: - /// The following function is called by the code that reads in the examples, - /// with a batch of examples. [It will empty the vector "examples"). - void AcceptExamples(std::vector *examples); - - /// The following function is called by the code that reads in the examples, - /// when we're done reading examples. - void ExamplesDone(); - - /// This function is called by the code that does the training. It gets the - /// training examples, and if they are available, puts them in "examples" and - /// returns true. It returns false when there are no examples left and - /// ExamplesDone() has been called. - bool ProvideExamples(std::vector *examples); - - ExamplesRepository(): empty_semaphore_(1), done_(false) { } - private: - Semaphore full_semaphore_; - Semaphore empty_semaphore_; - - std::vector examples_; - bool done_; - KALDI_DISALLOW_COPY_AND_ASSIGN(ExamplesRepository); -}; - - -void ExamplesRepository::AcceptExamples( - std::vector *examples) { - KALDI_ASSERT(!examples->empty()); - empty_semaphore_.Wait(); - KALDI_ASSERT(examples_.empty()); - examples_.swap(*examples); - full_semaphore_.Signal(); -} - -void ExamplesRepository::ExamplesDone() { - empty_semaphore_.Wait(); - KALDI_ASSERT(examples_.empty()); - done_ = true; - full_semaphore_.Signal(); -} - -bool ExamplesRepository::ProvideExamples( - std::vector *examples) { - full_semaphore_.Wait(); - if (done_) { - KALDI_ASSERT(examples_.empty()); - full_semaphore_.Signal(); // Increment the semaphore so - // the call by the next thread will not block. - return false; // no examples to return-- all finished. - } else { - KALDI_ASSERT(!examples_.empty() && examples->empty()); - examples->swap(examples_); - empty_semaphore_.Signal(); - return true; - } -} - - class DoBackpropParallelClass: public MultiThreadable { public: diff --git a/src/nnet2/nnet-update.cc b/src/nnet2/nnet-update.cc index 2027ac9f5..250cab95f 100644 --- a/src/nnet2/nnet-update.cc +++ b/src/nnet2/nnet-update.cc @@ -39,8 +39,8 @@ double NnetUpdater::ComputeForMinibatch( CuMatrix tmp_deriv; double ans = ComputeObjfAndDeriv(data, &tmp_deriv, tot_accuracy); if (nnet_to_update_ != NULL) - Backprop(data, &tmp_deriv); // this is summed (after weighting), not - // averaged. + Backprop(&tmp_deriv); // this is summed (after weighting), not + // averaged. 
return ans; } @@ -133,9 +133,7 @@ double NnetUpdater::ComputeTotAccuracy( } -void NnetUpdater::Backprop(const std::vector &data, - CuMatrix *deriv) const { - int32 num_chunks = data.size(); +void NnetUpdater::Backprop(CuMatrix *deriv) const { // We assume ComputeObjfAndDeriv has already been called. for (int32 c = nnet_.NumComponents() - 1; c >= 0; c--) { const Component &component = nnet_.GetComponent(c); @@ -146,7 +144,7 @@ void NnetUpdater::Backprop(const std::vector &data, CuMatrix input_deriv(input.NumRows(), input.NumCols()); const CuMatrix &output_deriv(*deriv); - component.Backprop(input, output, output_deriv, num_chunks, + component.Backprop(input, output, output_deriv, num_chunks_, component_to_update, &input_deriv); input_deriv.Swap(deriv); } diff --git a/src/nnet2/nnet-update.h b/src/nnet2/nnet-update.h index 60a332b65..989940eec 100644 --- a/src/nnet2/nnet-update.h +++ b/src/nnet2/nnet-update.h @@ -29,22 +29,20 @@ namespace kaldi { namespace nnet2 { -/* This header provides functionality for sample-by-sample stochastic +/** @file + This header provides functionality for sample-by-sample stochastic gradient descent and gradient computation with a neural net. - See also nnet-compute.h which is the same thing but for + See also \ref nnet-compute.h which is the same thing but for whole utterances. - This is the inner part of the training code; see nnet-train.h - which contains a wrapper for this, with functionality for - automatically keeping the learning rates for each layer updated - using a heuristic involving validation-set gradients. */ +class NnetEnsembleTrainer; + // This class NnetUpdater contains functions for updating the neural net or // computing its gradient, given a set of NnetExamples. We // define it in the header file becaused it's needed by the ensemble training. // But in normal cases its functionality should be used by calling DoBackprop(), // and by ComputeNnetObjf() -class NnetEnsembleTrainer; class NnetUpdater { public: // Note: in the case of training with SGD, "nnet" and "nnet_to_update" will @@ -84,8 +82,7 @@ class NnetUpdater { /// contain, at input, the derivative w.r.t. the output layer (as computed by /// ComputeObjfAndDeriv), but will be used as a temporary variable by this /// function. - void Backprop(const std::vector &data, - CuMatrix *deriv) const; + void Backprop(CuMatrix *deriv) const; friend class NnetEnsembleTrainer; private: @@ -100,10 +97,6 @@ class NnetUpdater { std::vector > forward_data_; // The forward data // for the outputs of each of the components. - // These weights are one per parameter; they equal to the "weight" - // member variables in the NnetExample structures. These - // will typically be about one on average. 
- CuVector chunk_weights_; }; /// This function computes the objective function and either updates the model diff --git a/src/nnet2/train-nnet-ensemble.cc b/src/nnet2/train-nnet-ensemble.cc index 269de778d..e53f3f3f8 100644 --- a/src/nnet2/train-nnet-ensemble.cc +++ b/src/nnet2/train-nnet-ensemble.cc @@ -90,12 +90,13 @@ void NnetEnsembleTrainer::TrainOneMinibatch() { post_mat[i].ApplyLog(); std::vector log_post_correct; post_mat[i].Lookup(sv_labels_ind, &log_post_correct); - BaseFloat log_prob_this_net = std::accumulate(log_post_correct.begin(), log_post_correct.end(), static_cast(0)); - + BaseFloat log_prob_this_net = std::accumulate(log_post_correct.begin(), + log_post_correct.end(), + static_cast(0)); avg_logprob_this_phase_ += log_prob_this_net; tmp_deriv.InvertElements(); tmp_deriv.MulElements(post_avg); - updater_ensemble_[i]->Backprop(buffer_, &tmp_deriv); + updater_ensemble_[i]->Backprop(&tmp_deriv); } count_this_phase_ += buffer_.size(); buffer_.clear(); diff --git a/src/nnet2/train-nnet-perturbed.cc b/src/nnet2/train-nnet-perturbed.cc new file mode 100644 index 000000000..c60e14d92 --- /dev/null +++ b/src/nnet2/train-nnet-perturbed.cc @@ -0,0 +1,710 @@ +// nnet2/train-nnet-perturbed.cc + +// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet2/train-nnet-perturbed.h" +#include "nnet2/nnet-update.h" +#include "thread/kaldi-thread.h" + +namespace kaldi { +namespace nnet2 { + + +class NnetPerturbedUpdater { + public: + // Note: in the case of training with SGD, "nnet" and "nnet_to_update" will be + // identical. They'd be different if we're accumulating the gradient for a + // held-out set and don't want to update the model, but this shouldn't happen + // for this "perturbed" update. nnet_to_update may be NULL if you don't + // want do do backprop, but this probably doesn't make sense. + // num_layers_before_input is the number of layers to ignore before what + // we consider to be the input (x) for purposes of this technique. This will + // likely equal 2: one for the feature-splicing layer (SpliceComponent) and + // one for the preconditioning layer (FixedAffineComponent). The within_class_covar + // argument (within_class_covar) + // + // within_class_covar is the within-class covariance matrix + NnetPerturbedUpdater(const Nnet &nnet, + int32 num_layers_before_input, + const CuMatrix &within_class_covar, + Nnet *nnet_to_update); + + // This function does the entire forward and backward computation for this + // minbatch. Outputs to tot_objf_orig and tot_objf_perturbed the total + // objective function (including any weighting factors) over this minibatch, + // and the same after perturbing the data. 
+ void ComputeForMinibatch(const std::vector &data, + BaseFloat D, + double *tot_objf_orig, + double *tot_objf_perturbed); + + protected: + + /// takes the input and formats as a single matrix, in forward_data_[0]. + void FormatInput(const std::vector &data); + + /// Do the forward propagation for layers 0 ... num_layers_before_input_ - 1, + /// typically the first two layers. This will be called once per minibatch. + void PropagateInitial() { Propagate(0, num_layers_before_input_); } + + + /// Do the forward propagation for layers num_layers_before_input_ + /// ... num-layers-1, typically all but the first two layers. This will be + /// called twice per minibatch, once before and once after perturbing the + /// inputs. + void PropagateRemaining() { Propagate(num_layers_before_input_, + nnet_.NumComponents()); } + + /// Internal Propagate function, does the forward computation for + /// layers begin_layer ... end_layer - 1. + void Propagate(int32 begin_layer, int32 end_layer); + + /// Computes objective function and derivative at output layer, but does not + /// do the backprop [for that, see Backprop()]. This will be called twice per + /// minibatch, once before and once after perturbing the inputs. + void ComputeObjfAndDeriv(const std::vector > &sv_labels, + CuMatrix *deriv, + BaseFloat *tot_objf, + BaseFloat *tot_weight) const; + + /// Computes supervision labels from data. + void ComputeSupervisionLabels(const std::vector &data, + std::vector > *sv_labels); + + /// Backprop must be called after ComputeObjfAndDeriv (it will be called + /// twice, the first time with a NULL nnet_to_update pointer). It does the + /// backpropagation (not including the first num_layers_before_input_ layers). + /// "nnet_to_update" is updated, if non-NULL. Note: "deriv" will contain, at + /// input, the derivative w.r.t. the output layer (as computed by + /// ComputeObjfAndDeriv), but will be used as a temporary variable by this + /// function, and exit, will contain the derivative of the objective function + /// w.r.t. the input of layer num_layers_before_input_. + void Backprop(Nnet *nnet_to_update, + CuMatrix *deriv) const; + + /// Perturb the input features (actually, the features at the input of layer + /// num_layers_before_input_). This modifies the value of + /// forward_data_[num_layers_before_input_]. For the math, see \ref + /// train-nnet-perturbed.h + void PerturbInput(const CuMatrix &deriv_at_input, + BaseFloat D); + + private: + + const Nnet &nnet_; + + Nnet *nnet_to_update_; + int32 num_layers_before_input_; // Number of layers before whichever layer we + // regard as the input for purposes of this + // method (normally 2, to include splicing + // layer and preconditioning layer) + + const CuMatrix &within_class_covar_; + + int32 num_chunks_; // same as the minibatch size. + + std::vector > forward_data_; // The forward data + // for the outputs of each of the components. 
+}; + + +NnetPerturbedUpdater::NnetPerturbedUpdater(const Nnet &nnet, + int32 num_layers_before_input, + const CuMatrix &within_class_covar, + Nnet *nnet_to_update): + nnet_(nnet), + nnet_to_update_(nnet_to_update), + num_layers_before_input_(num_layers_before_input), + within_class_covar_(within_class_covar) { + KALDI_ASSERT(num_layers_before_input_ >= 0 && + num_layers_before_input < nnet.NumComponents()); + for (int32 c = 0; c < num_layers_before_input_; c++) { + const Component *comp = &(nnet.GetComponent(c)); + const UpdatableComponent *uc = dynamic_cast(comp); + if (uc != NULL) { + KALDI_ERR << "One of the pre-input layers is updatable."; + } + } +} + +void NnetPerturbedUpdater::PerturbInput( + const CuMatrix &deriv_at_input, + BaseFloat D) { + // The code doesn't handle the case where there is further splicing after the + // input. + KALDI_ASSERT(num_chunks_ == deriv_at_input.NumRows()); + // For the math, see train-nnet-perturbed.h. + // deriv_at_input is \nabla in the math. + + // "input" is the input features, currently unmodified, but we'll + // modify them. + CuMatrix &input(forward_data_[num_layers_before_input_]); + KALDI_ASSERT(SameDim(input, deriv_at_input)); + // Each row of deriv_w will equal (W nabla_t)', where ' is transpose. + CuMatrix deriv_w(input.NumRows(), input.NumCols()); + // note: for the second transpose-ness argument below we can choose either + // kTrans or kNoTrans because the matrix is symmetric. I'm guessing that + // kTrans will be faster. + deriv_w.AddMatMat(1.0, deriv_at_input, kNoTrans, + within_class_covar_, kTrans, 0.0); + + // k will be used to compute and store the gradient-scaling factor k_t. + CuVector k(deriv_at_input.NumRows()); + // after the next call, each element of k will contain (\nabla_t^T W \nabla_t) + // We want k_t = D / sqrt(\nabla_t^T W \nabla_t) + // so we need to take this to the power -0.5. + // We can't do this if it's zero, so we first floor to a very small value. + k.AddDiagMatMat(1.0, deriv_w, kNoTrans, deriv_at_input, kTrans, 0.0); + int32 num_floored = k.ApplyFloor(1.0e-20); + if (num_floored > 0.0) { + // Should only happen at the very start of training, + KALDI_WARN << num_floored << " gradients floored (derivative at input was " + << "close to zero).. should only happen at start of training " + << "or when adding a new layer."; + } + k.ApplyPow(-0.5); + // now we have k_t = 1.0 / sqrt(\nabla_t^T W \nabla_t). + // in the math, k_t contains an additional factor of D, but we'll + // add this later. + // Below, we will do x'_t = x_t - k_t W \nabla_t + // Here, each row of deriv_w contains the transpose of W \nabla_t. + // The factor of D is because it was missing in k. + input.AddDiagVecMat(-1.0 * D, k, deriv_w, kNoTrans, 1.0); +} + +void NnetPerturbedUpdater::ComputeForMinibatch( + const std::vector &data, + BaseFloat D, + double *tot_objf_orig, + double *tot_objf_perturbed) { + + FormatInput(data); + PropagateInitial(); + PropagateRemaining(); + CuMatrix tmp_deriv; + + std::vector > sv_labels; + ComputeSupervisionLabels(data, &sv_labels); + + BaseFloat tot_objf, tot_weight; + ComputeObjfAndDeriv(sv_labels, &tmp_deriv, &tot_objf, &tot_weight); + + KALDI_VLOG(4) << "Objective function (original) is " << (tot_objf/tot_weight) + << " per sample, over " << tot_weight << " samples (weighted)."; + *tot_objf_orig = tot_objf; + + // only backprops till layer number num_layers_before_input_, + // and derivative at that layer is in tmp_deriv. + Backprop(NULL, &tmp_deriv); + + // perturb forward_data_[num_layers_before_input_]. 
+ PerturbInput(tmp_deriv, D); + + // Now propagate forward again from that point. + PropagateRemaining(); + + ComputeObjfAndDeriv(sv_labels, &tmp_deriv, &tot_objf, &tot_weight); + KALDI_VLOG(4) << "Objective function (perturbed) is " << (tot_objf/tot_weight) + << " per sample, over " << tot_weight << " samples (weighted)."; + *tot_objf_perturbed = tot_objf; + + // The actual model updating would happen in the next call. + if (nnet_to_update_ != NULL) + Backprop(nnet_to_update_, &tmp_deriv); +} + +void NnetPerturbedUpdater::Propagate(int32 begin_layer, int32 end_layer) { + static int32 num_times_printed = 0; + + for (int32 c = begin_layer; c < end_layer; c++) { + const Component &component = nnet_.GetComponent(c); + const CuMatrix &input = forward_data_[c]; + CuMatrix &output = forward_data_[c+1]; + // Note: the Propagate function will automatically resize the + // output. + component.Propagate(input, num_chunks_, &output); + + KALDI_VLOG(4) << "Propagating: sum at output of " << c << " is " << output.Sum(); + + // If we won't need the output of the previous layer for + // backprop, delete it to save memory. + bool need_last_output = + (c>0 && nnet_.GetComponent(c-1).BackpropNeedsOutput()) || + component.BackpropNeedsInput(); + if (g_kaldi_verbose_level >= 3 && num_times_printed < 100) { + KALDI_VLOG(3) << "Stddev of data for component " << c + << " for this minibatch is " + << (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) / + (forward_data_[c].NumRows() * forward_data_[c].NumCols())); + num_times_printed++; + } + if (!need_last_output && c != num_layers_before_input_) + forward_data_[c].Resize(0, 0); // We won't need this data. + } +} + +void NnetPerturbedUpdater::ComputeSupervisionLabels( + const std::vector &data, + std::vector > *sv_labels) { + sv_labels->clear(); + sv_labels->reserve(num_chunks_); // We must have at least this many labels. + for (int32 m = 0; m < num_chunks_; m++) { + for (size_t i = 0; i < data[m].labels.size(); i++) { + MatrixElement + tmp = {m, data[m].labels[i].first, data[m].labels[i].second}; + sv_labels->push_back(tmp); + } + } +} + +void NnetPerturbedUpdater::ComputeObjfAndDeriv( + const std::vector > &sv_labels, + CuMatrix *deriv, + BaseFloat *tot_objf, + BaseFloat *tot_weight) const { + int32 num_components = nnet_.NumComponents(); + deriv->Resize(num_chunks_, nnet_.OutputDim()); // sets to zero. + const CuMatrix &output(forward_data_[num_components]); + KALDI_ASSERT(SameDim(output, *deriv)); + + deriv->CompObjfAndDeriv(sv_labels, output, tot_objf, tot_weight); +} + + +void NnetPerturbedUpdater::Backprop(Nnet *nnet_to_update, + CuMatrix *deriv) const { + // We assume ComputeObjfAndDeriv has already been called. + for (int32 c = nnet_.NumComponents() - 1; c >= num_layers_before_input_; c--) { + const Component &component = nnet_.GetComponent(c); + Component *component_to_update = (nnet_to_update == NULL ? 
NULL : + &(nnet_to_update->GetComponent(c))); + const CuMatrix &input = forward_data_[c], + &output = forward_data_[c+1]; + CuMatrix input_deriv(input.NumRows(), input.NumCols()); + const CuMatrix &output_deriv(*deriv); + + component.Backprop(input, output, output_deriv, num_chunks_, + component_to_update, &input_deriv); + input_deriv.Swap(deriv); + } +} + + +void NnetPerturbedUpdater::FormatInput(const std::vector &data) { + KALDI_ASSERT(data.size() > 0); + int32 num_splice = nnet_.LeftContext() + 1 + nnet_.RightContext(); + KALDI_ASSERT(data[0].input_frames.NumRows() >= num_splice); + + int32 feat_dim = data[0].input_frames.NumCols(), + spk_dim = data[0].spk_info.Dim(), + tot_dim = feat_dim + spk_dim; // we append these at the neural net + // input... note, spk_dim might be 0. + KALDI_ASSERT(tot_dim == nnet_.InputDim()); + KALDI_ASSERT(data[0].left_context >= nnet_.LeftContext()); + int32 ignore_frames = data[0].left_context - nnet_.LeftContext(); // If + // the NnetExample has more left-context than we need, ignore some. + // this may happen in settings where we increase the amount of context during + // training, e.g. by adding layers that require more context. + num_chunks_ = data.size(); + + forward_data_.resize(nnet_.NumComponents() + 1); + + // First copy to a single matrix on the CPU, so we can copy to + // GPU with a single copy command. + Matrix temp_forward_data(num_splice * num_chunks_, + tot_dim); + + for (int32 chunk = 0; chunk < num_chunks_; chunk++) { + SubMatrix dest(temp_forward_data, + chunk * num_splice, num_splice, + 0, feat_dim); + + Matrix full_src(data[chunk].input_frames); + SubMatrix src(full_src, ignore_frames, num_splice, 0, feat_dim); + + dest.CopyFromMat(src); + if (spk_dim != 0) { + SubMatrix spk_dest(temp_forward_data, + chunk * num_splice, num_splice, + feat_dim, spk_dim); + spk_dest.CopyRowsFromVec(data[chunk].spk_info); + } + } + forward_data_[0].Swap(&temp_forward_data); // Copy to GPU, if being used. +} + + + +void DoBackpropPerturbed(const Nnet &nnet, + int32 num_layers_before_input, + const CuMatrix &within_class_covar, + BaseFloat D, + const std::vector &examples, + Nnet *nnet_to_update, + double *tot_objf_orig, + double *tot_objf_perturbed) { + + try { + NnetPerturbedUpdater updater(nnet, num_layers_before_input, + within_class_covar, nnet_to_update); + + updater.ComputeForMinibatch(examples, D, tot_objf_orig, tot_objf_perturbed); + } catch (...) { + KALDI_LOG << "Error doing backprop, nnet info is: " << nnet.Info(); + throw; + } +} + + +NnetPerturbedTrainer::NnetPerturbedTrainer( + const NnetPerturbedTrainerConfig &config, + const SpMatrix &within_class_covar, + Nnet *nnet): + config_(config), nnet_(nnet), logprob_this_phase_(0.0), + logprob_perturbed_this_phase_(0.0), weight_this_phase_(0.0), + logprob_total_(0.0), logprob_perturbed_total_(0.0), + weight_total_(0.0), + D_(config.initial_d) { + InitWithinClassCovar(within_class_covar); + num_phases_ = 0; + bool first_time = true; + BeginNewPhase(first_time); +} + + +// This function is used in class NnetPerturbedTrainer +// and the function DoBackpropPerturbedParallel. 
+void InitWithinClassCovar( + const SpMatrix &within_class_covar, + const Nnet &nnet, + int32 *num_layers_before_input, + CuMatrix *within_class_covar_out) { + + CuSpMatrix orig_covar(within_class_covar); + *num_layers_before_input = 0; + KALDI_ASSERT(nnet.NumComponents() > *num_layers_before_input); + const Component *comp = &(nnet.GetComponent(*num_layers_before_input)); + // Skip over any SpliceComponent that appears at the beginning of + // the network. + if (dynamic_cast(comp) != NULL) + (*num_layers_before_input)++; + + KALDI_ASSERT(nnet.NumComponents() > *num_layers_before_input); + comp = &(nnet.GetComponent(*num_layers_before_input)); + + const FixedAffineComponent *fa = + dynamic_cast(comp); + if (fa != NULL) { + (*num_layers_before_input)++; + const CuMatrix &linear_params = fa->LinearParams(); + if (linear_params.NumCols() != orig_covar.NumCols()) { + KALDI_ERR << "The neural network seems to expect a (spliced) feature " + << "dimension of " << linear_params.NumCols() << ", but your " + << "LDA stats have a dimension of " << orig_covar.NumCols(); + } + CuMatrix temp(linear_params.NumRows(), orig_covar.NumRows()); + // temp = linear_params . orig_covar + temp.AddMatSp(1.0, linear_params, kNoTrans, orig_covar, 0.0); + within_class_covar_out->Resize(linear_params.NumRows(), + linear_params.NumRows()); + // temp = linear_params . orig_covar . linear_params^T + within_class_covar_out->AddMatMat(1.0, temp, kNoTrans, + linear_params, kTrans, 0.0); + // note: this should be symmetric, spot-test it like this: + KALDI_ASSERT(ApproxEqual(TraceMatMat(*within_class_covar_out, + *within_class_covar_out, kNoTrans), + TraceMatMat(*within_class_covar_out, + *within_class_covar_out, kTrans))); + } else { + if (comp->InputDim() != orig_covar.NumCols()) { + KALDI_ERR << "The neural network seems to expect a (spliced) feature " + << "dimension of " << comp->InputDim() << ", but your " + << "LDA stats have a dimension of " << orig_covar.NumCols(); + } + within_class_covar_out->Resize(orig_covar.NumRows(), orig_covar.NumCols()); + within_class_covar_out->CopyFromSp(orig_covar); + } +} + + + +void NnetPerturbedTrainer::InitWithinClassCovar( + const SpMatrix &within_class_covar) { + kaldi::nnet2::InitWithinClassCovar(within_class_covar, *nnet_, + &num_layers_before_input_, + &within_class_covar_); +} + +void NnetPerturbedTrainer::TrainOnExample(const NnetExample &value) { + buffer_.push_back(value); + if (static_cast(buffer_.size()) == config_.minibatch_size) + TrainOneMinibatch(); +} + +void NnetPerturbedTrainer::TrainOneMinibatch() { + KALDI_ASSERT(!buffer_.empty()); + + double tot_objf_orig, tot_objf_perturbed; + DoBackpropPerturbed(*nnet_, num_layers_before_input_, within_class_covar_, D_, + buffer_, nnet_, &tot_objf_orig, &tot_objf_perturbed); + + logprob_this_phase_ += tot_objf_orig; + logprob_perturbed_this_phase_ += tot_objf_perturbed; + double weight = TotalNnetTrainingWeight(buffer_); + UpdateD(tot_objf_orig / weight, tot_objf_perturbed / weight); + weight_this_phase_ += weight; + buffer_.clear(); + minibatches_seen_this_phase_++; + if (minibatches_seen_this_phase_ == config_.minibatches_per_phase) { + bool first_time = false; + BeginNewPhase(first_time); + } +} + + +void NnetPerturbedTrainer::UpdateD(BaseFloat orig_objf_per_example, + BaseFloat perturbed_objf_per_example) { + + BaseFloat diff = orig_objf_per_example - perturbed_objf_per_example; + // note: diff should be positive in the normal case. 
+ KALDI_ASSERT(config_.target_objf_change > 0.0 && config_.max_d_factor > 1.0); + BaseFloat objf_ratio = config_.target_objf_change / + std::max(1.0e-20, diff), + D_ratio = pow(objf_ratio, config_.tune_d_power); + if (D_ratio > config_.max_d_factor) + D_ratio = config_.max_d_factor; + else if (D_ratio < 1.0 / config_.max_d_factor) + D_ratio = 1.0 / config_.max_d_factor; + BaseFloat D_new = D_ * D_ratio; + + KALDI_VLOG(3) << "Training objective function normal/perturbed is " + << orig_objf_per_example << '/' << perturbed_objf_per_example + << ", diff " << diff << " vs. target " + << config_.target_objf_change + << ", changing D by factor " << D_ratio << " to " << D_new; + D_ = D_new; +} + +void NnetPerturbedTrainer::BeginNewPhase(bool first_time) { + if (!first_time) { + BaseFloat logprob = logprob_this_phase_/weight_this_phase_, + logprob_perturbed = logprob_perturbed_this_phase_/weight_this_phase_, + diff = logprob - logprob_perturbed; + KALDI_LOG << "Training objective function normal->perturbed is " + << logprob << " -> " << logprob_perturbed << ", diff " + << diff << " vs. target " << config_.target_objf_change + << ", over " << weight_this_phase_ << " frames, D is " + << D_; + } + logprob_total_ += logprob_this_phase_; + logprob_perturbed_total_ += logprob_perturbed_this_phase_; + weight_total_ += weight_this_phase_; + logprob_this_phase_ = 0.0; + logprob_perturbed_this_phase_ = 0.0; + weight_this_phase_ = 0.0; + minibatches_seen_this_phase_ = 0; + num_phases_++; +} + + +NnetPerturbedTrainer::~NnetPerturbedTrainer() { + if (!buffer_.empty()) { + KALDI_LOG << "Doing partial minibatch of size " + << buffer_.size(); + TrainOneMinibatch(); + if (minibatches_seen_this_phase_ != 0) { + bool first_time = false; + BeginNewPhase(first_time); + } + } + if (weight_total_ == 0.0) { + KALDI_WARN << "No data seen."; + } else { + KALDI_LOG << "Did backprop on " << weight_total_ + << " examples, average log-prob normal->perturbed per frame is " + << (logprob_total_ / weight_total_) << " -> " + << (logprob_perturbed_total_ / weight_total_); + KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame=" + << (logprob_total_ / weight_total_); + } +} + + +// compare with DoBackpropParallelClass +class TrainParallelPerturbedClass: public MultiThreadable { + public: + // This constructor is only called for a temporary object + // that we pass to the RunMultiThreaded function. + TrainParallelPerturbedClass(const NnetPerturbedTrainerConfig &config, + const CuMatrix &within_class_covar, + int32 num_layers_before_input, + BaseFloat *D, + Nnet *nnet, + ExamplesRepository *repository, + double *log_prob_orig_ptr, + double *log_prob_perturbed_ptr, + double *tot_weight_ptr): + config_(config), within_class_covar_(within_class_covar), + num_layers_before_input_(num_layers_before_input), D_(D), + nnet_(nnet), repository_(repository), + log_prob_orig_ptr_(log_prob_orig_ptr), + log_prob_perturbed_ptr_(log_prob_perturbed_ptr), + tot_weight_ptr_(tot_weight_ptr), + log_prob_orig_(0.0), + log_prob_perturbed_(0.0), + tot_weight_(0.0) { } + + // Use the default copy constructor. + + // This does the main function of the class. 
+ void operator () () { + std::vector examples; + while (repository_->ProvideExamples(&examples)) { + double objf_orig, objf_perturbed, + weight = TotalNnetTrainingWeight(examples); + DoBackpropPerturbed(*nnet_, num_layers_before_input_, + within_class_covar_, *D_, + examples, nnet_, + &objf_orig, &objf_perturbed); + UpdateD(objf_orig / weight, objf_perturbed / weight); + + tot_weight_ += weight; + log_prob_orig_ += objf_orig; + log_prob_perturbed_ += objf_perturbed; + KALDI_VLOG(4) << "Thread " << thread_id_ << " saw " + << tot_weight_ << " frames so far (weighted); likelihood " + << "per frame (orig->perturbed) so far is " + << (log_prob_orig_ / tot_weight_) << " -> " + << (log_prob_perturbed_ / tot_weight_); + examples.clear(); + } + } + + ~TrainParallelPerturbedClass() { + *log_prob_orig_ptr_ += log_prob_orig_; + *log_prob_perturbed_ptr_ += log_prob_perturbed_; + *tot_weight_ptr_ += tot_weight_; + } + private: + void UpdateD(BaseFloat orig_logprob, BaseFloat perturbed_logprob) { + BaseFloat diff = orig_logprob - perturbed_logprob; + // note: diff should be positive in the normal case. + KALDI_ASSERT(config_.target_objf_change > 0.0 && config_.max_d_factor > 1.0); + // divide the power we raise the ratio to when tuning D, by the + // number of threads; this should ensure stability of the update. + BaseFloat tune_d_power = config_.tune_d_power / g_num_threads; + BaseFloat objf_ratio = config_.target_objf_change / + std::max(1.0e-20, diff), + D_ratio = pow(objf_ratio, tune_d_power); + if (D_ratio > config_.max_d_factor) + D_ratio = config_.max_d_factor; + else if (D_ratio < 1.0 / config_.max_d_factor) + D_ratio = 1.0 / config_.max_d_factor; + BaseFloat D_new = (*D_) * D_ratio; + *D_ = D_new; + + // Note: we are accessing *D_ from multiple threads without + // locking, but the negative consequences of this contention are + // very small ( + KALDI_VLOG(3) << "Training objective function normal->perturbed is " + << orig_logprob << " -> " << perturbed_logprob + << ", diff " << diff << " vs. target " + << config_.target_objf_change + << ", changing D by factor " << D_ratio << " to " << D_new; + } + + const NnetPerturbedTrainerConfig &config_; + const CuMatrix &within_class_covar_; + int32 num_layers_before_input_; + BaseFloat *D_; // Constant D that controls how much to perturb the data. We + // update this as well as use it. + Nnet *nnet_; + ExamplesRepository *repository_; + + double *log_prob_orig_ptr_; + double *log_prob_perturbed_ptr_; + double *tot_weight_ptr_; + double log_prob_orig_; // log-like times num frames (before perturbing features) + double log_prob_perturbed_; // log-like times num frames (after perturbing features) + double tot_weight_; // normalizing factor for the above. +}; + +void DoBackpropPerturbedParallel(const NnetPerturbedTrainerConfig &config, + const SpMatrix &within_class_covar, + SequentialNnetExampleReader *example_reader, + double *tot_objf_orig, + double *tot_objf_perturbed, + double *tot_weight, + Nnet *nnet) { + + // within_class_covar_processed is the within-class covar as CuMatrix, possibly + // projected by the preconditioning transform in any FixedAffineComponent. 
+ CuMatrix within_class_covar_processed; + int32 num_layers_before_input; + InitWithinClassCovar(within_class_covar, *nnet, + &num_layers_before_input, + &within_class_covar_processed); + BaseFloat D = config.initial_d; + + ExamplesRepository repository; // handles parallel programming issues regarding + + *tot_objf_orig = *tot_objf_perturbed = *tot_weight = 0.0; + + TrainParallelPerturbedClass trainer_proto(config, + within_class_covar_processed, + num_layers_before_input, &D, + nnet, &repository, + tot_objf_orig, + tot_objf_perturbed, + tot_weight); + + { + // The initialization of the following class spawns the threads that + // process the examples. They get re-joined in its destructor. + MultiThreader m(g_num_threads, trainer_proto); + + std::vector examples; + for (; !example_reader->Done(); example_reader->Next()) { + examples.push_back(example_reader->Value()); + if (examples.size() == config.minibatch_size) + repository.AcceptExamples(&examples); + } + if (!examples.empty()) // partial minibatch. + repository.AcceptExamples(&examples); + // Here, the destructor of "m" re-joins the threads, and + // does the summing of the gradients if we're doing gradient + // computation (i.e. &nnet != nnet_to_update). This gets + // done in the destructors of the objects of type + // DoBackpropParallelClass. + repository.ExamplesDone(); + } + KALDI_LOG << "Did backprop on " << *tot_weight << " examples, average log-prob " + << "per frame (orig->perturbed) is " + << (*tot_objf_orig / *tot_weight) << " -> " + << (*tot_objf_perturbed / *tot_weight) << " over " + << *tot_weight << " samples (weighted)."; + + KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame=" + << (*tot_objf_orig / *tot_weight); +} + + + + +} // namespace nnet2 +} // namespace kaldi diff --git a/src/nnet2/train-nnet-perturbed.h b/src/nnet2/train-nnet-perturbed.h new file mode 100644 index 000000000..711a63e27 --- /dev/null +++ b/src/nnet2/train-nnet-perturbed.h @@ -0,0 +1,327 @@ +// nnet2/train-nnet-perturbed.h + +// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET2_TRAIN_NNET_PERTURBED_H_ +#define KALDI_NNET2_TRAIN_NNET_PERTURBED_H_ + +#include "nnet2/nnet-nnet.h" +#include "nnet2/nnet-example.h" +#include "itf/options-itf.h" + +namespace kaldi { +namespace nnet2 { + +/** + @file + + This file was modified from train-nnet.h in order to implement an idea + about perturbing the training examples slightly, in a direction that's + opposite to the gradient of the objective function w.r.t. those examples. + It's a bit like the idea in "Intriguing properties of neural networks", the + training method they mention, except they have a more complicated formulation + with L-BFGS. 
We can justify our idea by approximating the neural network + plus objective-function evaluation as a linear function. + + Note: before doing this, we want to make sure the input features have a + reasonable distribution, and our choice for this is to make the within-class + covariance matrix unit. [note: we don't have to normalize the mean to zero, + this won't matter.] Rather than explicitly transforming the features using + a transform T, it turns out that we have to multiply the gradients by something + like T T'. We'll describe this later. + + Suppose the actual input features are x. Typically we do frame splicing + as part of the network, and it's more convenient to do the perturbation on + the spliced features, so x may actually be the output of the network's + first (splicing) layer. Suppose the within-class covariance matrix of + x is W. If we do the Cholesky transform + W = C C^T, + then C^{-1} W C^{-T} = I, so if we define + T =(def) C^{-1} and + and transformed features + \hat{x} =(def) T x + then it's easy to show that the within-class covariance matrix of the + transformed features \hat{x} would be I. + + The way we formulate the perturbed-feature thing is somewhat similar to the + "Intriguing properties of neural networks" paper, except we're not in image + recognition so no need to keep features in the range [0, 1]. Given a training + example \hat{x}_t, we want to find a perturbed example + \hat{x}'_t = \hat{x}_t + d_t + that gives the worst possible loss-value, such that ||d_t|| <= D, where D is + a scalar length parameter (e.g. D = 0.1), and ||.|| is the 2-norm. This means + that we want to perturb the training example in the most damaging way possible, + given that it should not change by more than a certain amount. Because we've + normalized the within-class covariance we believe that using a normal 2-norm + on d_t, rather than a more general form of inner-product, is suitable. + + Anyway, we make a simplifying assumption that the loss function for a particular + sample is just a linear function of the input, and when we get to the space of + \hat{x}, it just means we go a certain distance D down the gradient. How we + set a suitable value for D, we'll come to later. + + Suppose by backpropagating the + derivative to x we get a derivative \nabla_t of the objective function (e.g. a + log-probability) w.r.t. x_t. Then we can get the derivative \hat{\nabla}_t of + the objective function w.r.t. \hat{x}_t, by identifying + x_t^T nabla_t = \hat{x}_t^T \hat{\nabla}_t + x_t^T nabla_t = x_t^T T^T \hat{\nabla}_t + x_t^T nabla_t = x_t^T T^T T^{-T} \nabla_t, since T^T T^{-T} = I. + [note, ^T is transpose and ^{-T} is inverse-of-transpose.] + so \hat{\nabla}_t = T^{-T} \nabla_t. + (this is not the formal way of getting these derivatives, it's just how I remember). + Anyway, we now have + \hat{x}'_t =(def) \hat{x}_t - k_t T^{-T} \nabla_t + where k_t is chosen to ensure that + k_t || T^{-T} \nabla_t ||_2 = D + k_t sqrt( \nabla_t^T T^{-1} T^{-T} \nabla_t ) = D + so + k_t = D / sqrt(\nabla_t^T T^{-1} T^{-T} \nabla_t) + = D / sqrt(\nabla_t^T C C^T \nabla_t) + = D / sqrt(\nabla_t^T W \nabla_t) + Now, we actually want the update in terms of the parameter x instead of \hat{x}, + so multiplying the definition of \hat{x}'_t above by T^{-1} on the left, we have: + x'_t = x_t - k_t T^{-1} T^{-T} \nabla_t + = x_t - k_t W \nabla_t + (note: we can also use W \nabla_t for efficiently computing k_t). 
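Collecting the derivation above into display form (same symbols as the comment; editorial, not part of the patch):

\[
W = C C^{\top}, \qquad T = C^{-1}, \qquad \hat{x}_t = T x_t, \qquad \hat{\nabla}_t = T^{-\top} \nabla_t,
\]
\[
\hat{x}'_t = \hat{x}_t - k_t\, T^{-\top} \nabla_t, \qquad
k_t = \frac{D}{\sqrt{\nabla_t^{\top} W \nabla_t}},
\]
\[
x'_t = x_t - k_t\, W \nabla_t .
\]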
+ + It will actually be more efficient to do this after the FixedAffineTransform + layer that we used to "precondition" the features, so after the second layer + of the input rather than the first. All we need to do is to get the + within-class covariance matrix W in that space (after the + FixedAffineTransform) instead. We'll use the name x for that space, and forget + about the original input space. + + Next, we want to discuss how we'll set the constant D. D is a proportion of + the within-class covariance. However, it's not clear a priori how to set + this, or that we can tune it just once and then leave it fixed for other + setups. For one thing, if the input features contain a lot of "nuisance" + dimension that are not very informative about the class, it may be necessary + for D to be smaller (because hopefully the gradients will be small in those + nuisance directions). There is another issue that this whole method is + intended to improve generalization, so we only want to use it strongly if + generalization is actually a problem. For example, if we have so much + training data and so few parameters that we have no trouble generalizing, we + might not want to apply this method too strongly. Our method will be to set D + in order to get, on average, a certain degradation which we'll call + "target-objf-change" in the objective function per frame. Each time we + apply this perturbation to a minibatch, we'll see whether the degradation in + objective is greater or less than "target-objf-change", and we'll change + D accordingly. We'll use a simple heuristic that D should change proportionally + to the 0.5'th power of the ratio between the "target-objf-change" and the + observed objective function change for this minibatch, but never by more than + a factor of two. Note: the only significance of 0.5 here is that 0.5 <= 1; a + smaller number means slower changes in D, so it should change over about 2 + minibatches to the right number. If this proves unstable, we'll change it. + + Next, it's not absolutely clear how we should set target-objf-change-- the + value which determines how much objective-function degradation we want the + perturbation to produce on average (per sample). To put this in perspective, + for speech tasks with small amounts of data (say, <30 hours) and a couple thousand + classes + we typically see objective values like: training-set -0.6 and valdiation-set -1.1. + These are avearage log-probabilities per frame, of the correct class. + The two numbers are quite different because there is substantial overtraining. Note: for Karel's + nnet1 setup, the difference is typically smaller, more like -0.8 vs. -1.0, as + that setup monitors the validation-set objective and decreases the learning rate + when it starts to degrade. Now, for much larger training sets, we might + see smaller differences in training-set versus validation-set objective function: + for instance: say, -1.40 versus -1.45. (For larger training sets the objectives tend + to be more negative simply because we have more leaves). We measure these values each + iteration: see the files compute_prob_train.*.log and compute_prob_valid.*.log produced + by the example scripts. 
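As a concrete instance of the D-tuning rule described a couple of paragraphs above (the 0.5'th-power update, never changing D by more than a factor of two), here is a self-contained toy version; the standalone function is illustrative only, the real logic lives in NnetPerturbedTrainer::UpdateD earlier in this patch and takes tune_d_power and max_d_factor from the NnetPerturbedTrainerConfig defined below.

    // Toy illustration of the D-tuning heuristic, with the default
    // tune_d_power = 0.5 and max_d_factor = 2.0.  Not part of the patch.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    double UpdateD(double D, double target_objf_change, double observed_diff,
                   double tune_d_power = 0.5, double max_d_factor = 2.0) {
      double objf_ratio = target_objf_change / std::max(1.0e-20, observed_diff);
      double D_ratio = std::pow(objf_ratio, tune_d_power);
      // Never change D by more than a factor of max_d_factor in either direction.
      D_ratio = std::min(max_d_factor, std::max(1.0 / max_d_factor, D_ratio));
      return D * D_ratio;
    }

    int main() {
      // Perturbation degraded the objf by only 0.05 while targeting 0.2:
      // ratio 4.0, 4.0^0.5 = 2.0, so D doubles (the cap allows at most x2).
      std::printf("%g\n", UpdateD(0.05, 0.2, 0.05));  // prints 0.1
      // Perturbation degraded the objf by 0.8, four times the target: D halves.
      std::printf("%g\n", UpdateD(0.1, 0.2, 0.8));    // prints 0.05
      return 0;
    }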
The reason why I discuss these values + is that if the training-set and validation-set objective functions are very close, then + it means that there is not much overtraining going on and we don't want to apply this + method too strongly; on the other hand, if they are very different, it means we are + overtraining badly and we may want to apply this method more. + + So we plan to set target-objf-change to the following value, at the script level: + + target-objf-change = target-multiplier * (training-objf - validation-objf)) + + (e.g. target-multiplier = 1.0). + Note that if target-objf-change is less than a specified min-target-objf-change + (e.g. 0.1) then we won't apply the perturbed training at all, which will save + time. The method is intended to help generalization, and if we're generalizing + well then we don't need to apply it. + The training and validation objective functions are computed over + different (randomly chosen) sets, each with about 3000 samples, and it can + sometimes happen that the validation objective function can be better than the + training set objective function. Also, the validation set is sampled from a + held-out subset of 300 utterances by default; this is done out of a concern + that the correlations within an utterance can be very high, so if we use the + same utterances for training and validation, then the validation set is not + really held-out. But the smallish number (300) of validation utterances + increases the randomness in the training and validation objectives. +*/ + + + +struct NnetPerturbedTrainerConfig { + int32 minibatch_size; + int32 minibatches_per_phase; + // target_objf_change will be set from the command line to a value >0.0. + BaseFloat target_objf_change; + BaseFloat initial_d; + // tune_d_power is not configurable from the command line. + BaseFloat tune_d_power; + // max_d_factor is not configurable from the command line. + BaseFloat max_d_factor; + + + NnetPerturbedTrainerConfig(): minibatch_size(500), + minibatches_per_phase(50), + target_objf_change(0.1), + initial_d(0.05), + tune_d_power(0.5), + max_d_factor(2.0){ } + + void Register (OptionsItf *po) { + po->Register("minibatch-size", &minibatch_size, + "Number of samples per minibatch of training data."); + po->Register("minibatches-per-phase", &minibatches_per_phase, + "Number of minibatches to wait before printing training-set " + "objective."); + po->Register("target-objf-change", &target_objf_change, "Target objective " + "function change from feature perturbation, used to set " + "feature distance parameter D"); + po->Register("initial-d", &initial_d, "Initial value of parameter D " + "It will ultimately be set according to --target-objf-change"); + } +}; + + +/// Class NnetPerturbedTrainer is as NnetSimpleTrainer but implements feature +/// perturbation; see the comment at the top of this file (\ref +/// train-nnet-perturbed.h) for more details. + +class NnetPerturbedTrainer { + public: + NnetPerturbedTrainer(const NnetPerturbedTrainerConfig &config, + const SpMatrix &within_class_covar, + Nnet *nnet); + + /// TrainOnExample will take the example and add it to a buffer; + /// if we've reached the minibatch size it will do the training. + void TrainOnExample(const NnetExample &value); + + ~NnetPerturbedTrainer(); + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(NnetPerturbedTrainer); + + void TrainOneMinibatch(); + + // This function initializes within_class_covar_ and num_layers_before_input_. 
+ // The input within_class_covar is the within-class covariance on the original + // raw features, computed from LDA stats, but if this neural network has + // a data-preconditioning layer of type FixedAffineComponent then we will + // project the transform with that and treat the output of that transform + // as the input x (this is more efficient). + void InitWithinClassCovar(const SpMatrix &within_class_covar); + + void UpdateD(BaseFloat orig_objf_per_example, + BaseFloat perturbed_objf_per_example); + + // The following function is called by TrainOneMinibatch() when we enter a new + // phase. A phase is just a certain number of epochs, and now matters only + // for diagnostics (originally it meant something more). + void BeginNewPhase(bool first_time); + + // Things we were given in the initializer: + NnetPerturbedTrainerConfig config_; + + Nnet *nnet_; // the nnet we're training. + + // static information: + // num_layers_before_input_ is the number of initial layers before what we + // consider to be the input for this method: normally 2, for the splicing + // layer and the (FixedAffineComponent) data-preconditioning layer. + int32 num_layers_before_input_; + // The within_class_covar_ variable below is the within-class covariance; if + // we have a (FixedAffineComponent) data-preconditioning layer, we'd project + // the within-class-covariance with that and store it as within_class_covar_. + CuMatrix within_class_covar_; + + // State information: + int32 num_phases_; + int32 minibatches_seen_this_phase_; + std::vector buffer_; + + double logprob_this_phase_; // Needed for accumulating train log-prob on each phase. + double logprob_perturbed_this_phase_; // same for perturbed log-prob + double weight_this_phase_; // count corresponding to the above. + + double logprob_total_; + double logprob_perturbed_total_; + double weight_total_; + + BaseFloat D_; // The distance factor D. +}; + + + + +/// This function computes the objective function and either updates the model +/// or adds to parameter gradients. It returns the cross-entropy objective +/// function summed over all samples (normalize this by dividing by +/// TotalNnetTrainingWeight(examples)). It is mostly a wrapper for +/// a class NnetPerturbedUpdater that's defined in train-nnet-perturbed.cc, but we +/// don't want to expose that complexity at this level. +/// All these examples will be treated as one minibatch. +/// +/// D is the distance factor that determines how much to perturb examples; +/// this is optimized in outer-level code (see class NnetPerturbedTrainer). +/// num_layers_before_input determines how many layers to skip before we find +/// the activation that we regard as the input x to the network, for purposes +/// of this method (e.g. we might skip over the splicing layer and a layer +/// that preconditions the input). +/// within_class_covar (actually a symmetric matrix, but represented as CuMatrix), +/// is the within-class covariance of the features, measured at that level, +/// which ultimately will be derived from LDA stats on the data. + +void DoBackpropPerturbed(const Nnet &nnet, + int32 num_layers_before_input, + const CuMatrix &within_class_covar, + BaseFloat D, + const std::vector &examples, + Nnet *nnet_to_update, + double *tot_objf_orig, + double *tot_objf_perturbed); + + + +/// This function is similar to "DoBackpropParallel" as declared in +/// nnet-update-parallel.h, but supports "perturbed" training. It's intended +/// for multi-threaded CPU-based training. 
The number of threads will be +/// set to g_num_threads. +/// within_class_covar is the within-class covariance after any splicing +/// but before preconditioning, as needed for the LDA computation. +/// All pointer arguments must be non-NULL. +void DoBackpropPerturbedParallel(const NnetPerturbedTrainerConfig &config, + const SpMatrix &within_class_covar, + SequentialNnetExampleReader *example_reader, + double *tot_objf_orig, + double *tot_objf_perturbed, + double *tot_weight, + Nnet *nnet); + + +} // namespace nnet2 +} // namespace kaldi + +#endif diff --git a/src/nnet2/train-nnet.h b/src/nnet2/train-nnet.h index 420d359a8..83fb74e56 100644 --- a/src/nnet2/train-nnet.h +++ b/src/nnet2/train-nnet.h @@ -48,7 +48,7 @@ struct NnetSimpleTrainerConfig { // Class NnetSimpleTrainer doesn't do much apart from batching up the // input into minibatches and giving it to the neural net code // to call Update(), which will typically do stochastic gradient -// descent. It also reports training-set +// descent. It also reports training-set objective-function values. // It takes in the training examples through the call // "TrainOnExample()". class NnetSimpleTrainer { @@ -66,8 +66,9 @@ class NnetSimpleTrainer { void TrainOneMinibatch(); - // The following function is called by TrainOneMinibatch() - // when we enter a new phase. + // The following function is called by TrainOneMinibatch() when we enter a new + // phase. A phase is just a certain number of epochs, and now matters only + // for diagnostics (originally it meant something more). void BeginNewPhase(bool first_time); // Things we were given in the initializer: diff --git a/src/nnet2bin/Makefile b/src/nnet2bin/Makefile index 5f03e5d5d..0e08c1d6c 100644 --- a/src/nnet2bin/Makefile +++ b/src/nnet2bin/Makefile @@ -25,7 +25,8 @@ BINFILES = nnet-randomize-frames nnet-am-info nnet-init \ nnet-train-discriminative-simple nnet-train-discriminative-parallel \ nnet-modify-learning-rates nnet-normalize-stddev nnet-perturb-egs \ nnet-perturb-egs-fmllr nnet-get-weighted-egs nnet-adjust-priors \ - cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning + cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning \ + nnet-train-simple-perturbed nnet-train-parallel-perturbed OBJFILES = diff --git a/src/nnet2bin/nnet-get-feature-transform.cc b/src/nnet2bin/nnet-get-feature-transform.cc index a91f4a048..b2e3823a3 100644 --- a/src/nnet2bin/nnet-get-feature-transform.cc +++ b/src/nnet2bin/nnet-get-feature-transform.cc @@ -36,12 +36,16 @@ int main(int argc, char *argv[]) { bool binary = true; FeatureTransformEstimateOptions opts; std::string write_cholesky; + std::string write_within_covar; ParseOptions po(usage); - po.Register("binary", &binary, "Write accumulators in binary mode."); + po.Register("binary", &binary, "Write outputs in binary mode."); po.Register("write-cholesky", &write_cholesky, "If supplied, write to this " - "wxfilename the Cholesky factor of the within-class covariance." + "wxfilename the Cholesky factor of the within-class covariance. " "Can be used for perturbing features. E.g. " "--write-cholesky=exp/nnet5/cholesky.tpmat"); + po.Register("write-within-covar", &write_within_covar, "If supplied, write " + "to this wxfilename the within-class covariance (as a symmetric " + "matrix). E.g. --write-within-covar=exp/nnet5/within_covar.mat"); opts.Register(&po); po.Read(argc, argv); @@ -61,10 +65,18 @@ int main(int argc, char *argv[]) { Matrix mat; TpMatrix cholesky; - fte.Estimate(opts, &mat, write_cholesky != "" ? 
&cholesky : NULL);
+ fte.Estimate(opts, &mat,
+ (write_cholesky != "" || write_within_covar != "" ?
+ &cholesky : NULL));
WriteKaldiObject(mat, projection_wxfilename, binary);
- if (write_cholesky != "")
+ if (write_cholesky != "") {
WriteKaldiObject(cholesky, write_cholesky, binary);
+ }
+ if (write_within_covar != "") {
+ SpMatrix<BaseFloat> within_var(cholesky.NumRows());
+ within_var.AddTp2(1.0, cholesky, kNoTrans, 0.0);
+ WriteKaldiObject(within_var, write_within_covar, binary);
+ }
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();
diff --git a/src/nnet2bin/nnet-train-parallel-perturbed.cc b/src/nnet2bin/nnet-train-parallel-perturbed.cc
new file mode 100644
index 000000000..9dbbbb3a9
--- /dev/null
+++ b/src/nnet2bin/nnet-train-parallel-perturbed.cc
@@ -0,0 +1,127 @@
+// nnet2bin/nnet-train-parallel-perturbed.cc

+// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)

+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.

+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet2/train-nnet-perturbed.h"
+#include "nnet2/am-nnet.h"
+#include "thread/kaldi-thread.h"

+int main(int argc, char *argv[]) {
+ try {
+ using namespace kaldi;
+ using namespace kaldi::nnet2;
+ typedef kaldi::int32 int32;
+ typedef kaldi::int64 int64;
+
+ const char *usage =
+ "Train the neural network parameters with backprop and stochastic\n"
+ "gradient descent using minibatches. The training frames and labels\n"
+ "are read via a pipe from nnet-randomize-frames. This is like nnet-train-parallel,\n"
+ "using multiple threads in a Hogwild type of update, but also adding\n"
+ "perturbed training (see src/nnet2/train-nnet-perturbed.h for info)\n"
+ "\n"
+ "Usage: nnet-train-parallel-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
+ "\n"
+ "e.g.:\n"
+ "nnet-randomize-frames [args] | nnet-train-parallel-perturbed \\\n"
+ " --within-covar=within.spmat --num-threads=8 --target-objf-change=0.2 1.nnet ark:- 2.nnet\n";
+
+ bool binary_write = true;
+ bool zero_stats = true;
+ int32 srand_seed = 0;
+ std::string within_covar_rxfilename;
+ NnetPerturbedTrainerConfig train_config;
+
+ ParseOptions po(usage);
+ po.Register("binary", &binary_write, "Write output in binary mode");
+ po.Register("within-covar", &within_covar_rxfilename,
+ "rxfilename of within-class covariance-matrix, written as "
+ "SpMatrix. Must be specified.");
+ po.Register("zero-stats", &zero_stats, "If true, zero stats "
+ "stored with the neural net (only affects mixing up).");
+ po.Register("srand", &srand_seed,
+ "Seed for random number generator (e.g., for dropout)");
+ po.Register("num-threads", &g_num_threads, "Number of training threads to use "
+ "in the parallel update. 
[Note: if you use a parallel " + "implementation of BLAS, the actual number of threads may be larger.]"); + train_config.Register(&po); + + po.Read(argc, argv); + srand(srand_seed); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string nnet_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2), + nnet_wxfilename = po.GetArg(3); + + if (within_covar_rxfilename == "") { + KALDI_ERR << "The option --within-covar is required."; + } + + TransitionModel trans_model; + AmNnet am_nnet; + { + bool binary_read; + Input ki(nnet_rxfilename, &binary_read); + trans_model.Read(ki.Stream(), binary_read); + am_nnet.Read(ki.Stream(), binary_read); + } + + KALDI_ASSERT(train_config.minibatch_size > 0); + + SpMatrix within_covar; + ReadKaldiObject(within_covar_rxfilename, &within_covar); + + if (zero_stats) am_nnet.GetNnet().ZeroStats(); + + SequentialNnetExampleReader example_reader(examples_rspecifier); + + + double tot_objf_orig, tot_objf_perturbed, tot_weight; + // logging info will be printed from within the next call. + DoBackpropPerturbedParallel(train_config, + within_covar, + &example_reader, + &tot_objf_orig, + &tot_objf_perturbed, + &tot_weight, + &(am_nnet.GetNnet())); + { + Output ko(nnet_wxfilename, binary_write); + trans_model.Write(ko.Stream(), binary_write); + am_nnet.Write(ko.Stream(), binary_write); + } + + KALDI_LOG << "Finished training, processed " << tot_weight + << " training examples (weighted). Wrote model to " + << nnet_wxfilename; + return (tot_weight == 0 ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/nnet2bin/nnet-train-parallel.cc b/src/nnet2bin/nnet-train-parallel.cc index 9e817c1e7..57cbd0f3c 100644 --- a/src/nnet2bin/nnet-train-parallel.cc +++ b/src/nnet2bin/nnet-train-parallel.cc @@ -41,7 +41,7 @@ int main(int argc, char *argv[]) { "Usage: nnet-train-parallel [options] \n" "\n" "e.g.:\n" - "nnet-randomize-frames [args] | nnet-train-simple 1.nnet ark:- 2.nnet\n"; + "nnet-randomize-frames [args] | nnet-train-parallel --num-threads=8 1.nnet ark:- 2.nnet\n"; bool binary_write = true; bool zero_stats = true; diff --git a/src/nnet2bin/nnet-train-perturbed.cc b/src/nnet2bin/nnet-train-perturbed.cc new file mode 100644 index 000000000..529f62643 --- /dev/null +++ b/src/nnet2bin/nnet-train-perturbed.cc @@ -0,0 +1,137 @@ +// nnet2bin/nnet-train-perturbed.cc + +// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
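The training binaries added here consume the within-class covariance written by the new --write-within-covar option of nnet-get-feature-transform, passed in via --within-covar. As a rough sketch of how the pieces might be wired together at the script level (the experiment directory, the LDA-stats argument order of nnet-get-feature-transform, and the nnet-randomize-frames arguments are all hypothetical here, not taken from this patch):

# Sketch only; paths and the nnet-get-feature-transform argument order are assumed.
dir=exp/nnet5a   # hypothetical experiment directory

# Estimate the preconditioning transform as usual, additionally writing the
# within-class covariance (new --write-within-covar option in this patch).
nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
    --write-within-covar=$dir/within_covar.spmat \
    $dir/lda.mat $dir/lda.acc || exit 1;

# Multi-threaded (Hogwild-style) perturbed training; --within-covar is required,
# and --target-objf-change controls how strongly the features are perturbed.
nnet-randomize-frames [args] | \
  nnet-train-parallel-perturbed --num-threads=8 \
      --within-covar=$dir/within_covar.spmat --target-objf-change=0.2 \
      1.nnet ark:- 2.nnet || exit 1;

The single-threaded variants below (nnet-train-perturbed, nnet-train-simple-perturbed) take the same --within-covar and --target-objf-change options, just without --num-threads.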
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "nnet2/nnet-randomize.h" +#include "nnet2/train-nnet-perturbed.h" +#include "nnet2/am-nnet.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet2; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Train the neural network parameters with backprop and stochastic\n" + "gradient descent using minibatches. The training frames and labels\n" + "are read via a pipe from nnet-randomize-frames. This version of the\n" + "training program does not update the learning rate, but uses\n" + "the learning rates stored in the neural nets.\n" + "\n" + "Usage: nnet-train-perturbed [options] \n" + "note: the option --within-covar= is needed\n" + "\n" + "e.g.:\n" + "nnet-randomize-frames [args] | nnet-train-perturbed --within-covar=within.spmat 1.nnet ark:- 2.nnet\n"; + + bool binary_write = true; + bool zero_stats = true; + int32 srand_seed = 0; + std::string use_gpu = "yes"; + std::string within_covar_rxfilename; + NnetPerturbedTrainerConfig train_config; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("within-covar", &within_covar_rxfilename, + "rxfilename of within-class covariance-matrix, written as " + "SpMatrix. Must be specified."); + po.Register("zero-stats", &zero_stats, "If true, zero occupation " + "counts stored with the neural net (only affects mixing up)."); + po.Register("srand", &srand_seed, "Seed for random number generator " + "(relevant if you have layers of type AffineComponentPreconditioned " + "with l2-penalty != 0.0"); + po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA"); + + train_config.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + srand(srand_seed); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + if (within_covar_rxfilename == "") { + KALDI_ERR << "The option --within-covar is required."; + } + + std::string nnet_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2), + nnet_wxfilename = po.GetArg(3); + + int64 num_examples = 0; + + { + TransitionModel trans_model; + AmNnet am_nnet; + { + bool binary_read; + Input ki(nnet_rxfilename, &binary_read); + trans_model.Read(ki.Stream(), binary_read); + am_nnet.Read(ki.Stream(), binary_read); + } + + SpMatrix within_covar; + ReadKaldiObject(within_covar_rxfilename, &within_covar); + + if (zero_stats) am_nnet.GetNnet().ZeroStats(); + + { // want to make sure this object deinitializes before + // we write the model, as it does something in the destructor. + NnetPerturbedTrainer trainer(train_config, + within_covar, + &(am_nnet.GetNnet())); + + SequentialNnetExampleReader example_reader(examples_rspecifier); + + for (; !example_reader.Done(); example_reader.Next(), num_examples++) + trainer.TrainOnExample(example_reader.Value()); // It all happens here! + } + + { + Output ko(nnet_wxfilename, binary_write); + trans_model.Write(ko.Stream(), binary_write); + am_nnet.Write(ko.Stream(), binary_write); + } + } +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + + KALDI_LOG << "Finished training, processed " << num_examples + << " training examples. Wrote model to " + << nnet_wxfilename; + return (num_examples == 0 ? 
1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/nnet2bin/nnet-train-simple-perturbed.cc b/src/nnet2bin/nnet-train-simple-perturbed.cc new file mode 100644 index 000000000..819394729 --- /dev/null +++ b/src/nnet2bin/nnet-train-simple-perturbed.cc @@ -0,0 +1,138 @@ +// nnet2bin/nnet-train-simple-perturbed.cc + +// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "nnet2/nnet-randomize.h" +#include "nnet2/train-nnet-perturbed.h" +#include "nnet2/am-nnet.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet2; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Train the neural network parameters with backprop and stochastic\n" + "gradient descent using minibatches. The training frames and labels\n" + "are read via a pipe from nnet-randomize-frames. This is as nnet-train-simple\n" + "but implements perturbed training (see src/nnet2/train-nnet-perturbed.h for\n" + "details)\n" + "\n" + "Usage: nnet-train-simple-perturbed [options] \n" + "note: the option --within-covar= is needed\n" + "\n" + "e.g.:\n" + "nnet-randomize-frames [args] | nnet-train-simple-perturbed \\\n" + " --within-covar=within.spmat --target-objf-change=0.2 1.nnet ark:- 2.nnet\n"; + + bool binary_write = true; + bool zero_stats = true; + int32 srand_seed = 0; + std::string use_gpu = "yes"; + std::string within_covar_rxfilename; + NnetPerturbedTrainerConfig train_config; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("within-covar", &within_covar_rxfilename, + "rxfilename of within-class covariance-matrix, written as " + "SpMatrix. 
Must be specified."); + po.Register("zero-stats", &zero_stats, "If true, zero occupation " + "counts stored with the neural net (only affects mixing up)."); + po.Register("srand", &srand_seed, "Seed for random number generator " + "(relevant if you have layers of type AffineComponentPreconditioned " + "with l2-penalty != 0.0"); + po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA"); + + train_config.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + srand(srand_seed); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + if (within_covar_rxfilename == "") { + KALDI_ERR << "The option --within-covar is required."; + } + + std::string nnet_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2), + nnet_wxfilename = po.GetArg(3); + + int64 num_examples = 0; + + { + TransitionModel trans_model; + AmNnet am_nnet; + { + bool binary_read; + Input ki(nnet_rxfilename, &binary_read); + trans_model.Read(ki.Stream(), binary_read); + am_nnet.Read(ki.Stream(), binary_read); + } + + SpMatrix within_covar; + ReadKaldiObject(within_covar_rxfilename, &within_covar); + + if (zero_stats) am_nnet.GetNnet().ZeroStats(); + + { // want to make sure this object deinitializes before + // we write the model, as it does something in the destructor. + NnetPerturbedTrainer trainer(train_config, + within_covar, + &(am_nnet.GetNnet())); + + SequentialNnetExampleReader example_reader(examples_rspecifier); + + for (; !example_reader.Done(); example_reader.Next(), num_examples++) + trainer.TrainOnExample(example_reader.Value()); // It all happens here! + } + + { + Output ko(nnet_wxfilename, binary_write); + trans_model.Write(ko.Stream(), binary_write); + am_nnet.Write(ko.Stream(), binary_write); + } + } +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + + KALDI_LOG << "Finished training, processed " << num_examples + << " training examples. Wrote model to " + << nnet_wxfilename; + return (num_examples == 0 ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + +
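Finally, the header comment in train-nnet-perturbed.h proposes choosing --target-objf-change at the script level from the gap between the training and validation objective functions. No such script is included in this patch, so the following is only a sketch with made-up objective values, illustrating the proposed formula and the min-target-objf-change cutoff:

# Sketch only: the objective values and their source are hypothetical.
target_multiplier=1.0
min_target_objf_change=0.1

# Suppose the diagnostics from the previous iteration (e.g. the compute_prob_train
# and compute_prob_valid logs) reported these objective values:
train_objf=-1.30
valid_objf=-1.65

# target-objf-change = target-multiplier * (training-objf - validation-objf)
target_objf_change=$(awk -v m=$target_multiplier -v t=$train_objf -v v=$valid_objf \
  'BEGIN{print m * (t - v)}')
# Here: 1.0 * (-1.30 - (-1.65)) = 0.35.

if awk -v c=$target_objf_change -v min=$min_target_objf_change 'BEGIN{exit !(c >= min)}'; then
  # Overtraining is significant; apply perturbed training.
  perturb_opts="--within-covar=within.spmat --target-objf-change=$target_objf_change"
  echo "Using perturbed training: $perturb_opts"
else
  # Training and validation objectives are close; skip perturbed training.
  echo "Skipping perturbed training (gap $target_objf_change < $min_target_objf_change)"
fi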