Finishing the scripts for the ASRU papers.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@119 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2011-07-04 22:23:02 +00:00
Родитель b4bd583a07
Коммит a70d3f856b
25 изменённых файлов: 679 добавлений и 582 удалений

Просмотреть файл

@ -59,13 +59,16 @@ exp/decode_sgmma_fmllrbasis_utt/wer:Average WER is 3.191574 (400 / 12533)
# sgmmb is SGMM with speaker vectors.
exp/decode_sgmmb/wer:Average WER is 2.760712 (346 / 12533)
exp/decode_sgmmb_fmllr/wer:Average WER is 2.585175 (324 / 12533)
exp/decode_sgmmb_utt/wer:Average WER is 2.808585 (352 / 12533)
exp/decode_sgmmb/wer:Average WER is 2.760712 (346 / 12533)
# sgmmc is like sgmmb but with gender dependency
exp/decode_sgmmc/wer:Average WER is 2.696880 (338 / 12533)
exp/decode_sgmmc_fmllr/wer:Average WER is 2.457512 (308 / 12533)
# "norm" is normalizing weights per gender..
exp/decode_sgmmc_norm/wer:Average WER is 2.696880 (338 / 12533)
exp/decode_sgmmc_fmllr_norm/wer:Average WER is 2.425596 (304 / 12533)
# sgmmc is like sgmmb but with gender dependency [doesn't help here]
exp/decode_sgmmc/wer:Average WER is 2.776670 (348 / 12533)
exp/decode_sgmmc_fmllr/wer:Average WER is 2.601133 (326 / 12533)

Просмотреть файл

@ -40,7 +40,7 @@ preselectmap=exp/ubmb/preselect.map
mincount=1000 # min occupancy to extimate fMLLR transform
iters=10 # number of iters of fMLLR estimation
if [ ! -f $fmllr_model ]; then
if [ ! -f $fmllr_model -o $model -nt $fmllr_model ]; then
if [ ! -f $model ]; then
echo "Cannot find $model. Maybe training didn't finish?"
exit 1;

Просмотреть файл

@ -0,0 +1,114 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation, Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# SGMM decoding with adaptation.
#
# SGMM decoding; use a different acoustic scale from normal (0.1 vs 0.08333)
# (1) decode with "alignment model"
# (2) get GMM posteriors with "alignment model" and estimate speaker
# vectors with final model
# (3) decode with final model.
# (4) get GMM posteriors from this decoded output and estimate fMLLR transforms
# with this final model
# (5) decode with the final model using both the speaker vectors and fMLLR
if [ -f path.sh ]; then . path.sh; fi
dir=exp/decode_sgmmc_fmllr_norm
tree=exp/sgmmc/tree
occs=exp/sgmmc/final.occs
modelin=exp/sgmmc/final.mdl
alimodelin=exp/sgmmc/final.alimdl
model=exp/sgmmc/final.mdl.norm
alimodel=exp/sgmmc/final.alimdl.norm
fmllr_model=exp/sgmmc/final_fmllr.mdl.norm
graphdir=exp/graph_sgmmc
silphonelist=`cat data/silphones.csl`
preselectmap=exp/ubmb/preselect.map
mincount=1000 # min occupancy to estimate fMLLR transform
iters=10 # number of iters of fMLLR estimation
mkdir -p $dir
# Create the weight-normalized models and the fMLLR pre-transform model.
sgmm-normalize $modelin ark:$preselectmap $model 2>$dir/normalize.log
sgmm-normalize $alimodelin ark:$preselectmap $alimodel 2>>$dir/normalize.log
sgmm-comp-prexform $model $occs $fmllr_model 2>$dir/prexform.log
scripts/mkgraph.sh $tree $model $graphdir
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
(
feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- |"
spk2utt_opt="--spk2utt=ark:data/test_${test}.spk2utt"
utt2spk_opt="--utt2spk=ark:data/test_${test}.utt2spk"
# Map utterance -> gender -> preselected UBM Gaussian subset.
scripts/compose_maps.pl data/test_${test}.utt2spk data/spk2gender.map | \
scripts/compose_maps.pl - $preselectmap | \
gzip -c > $dir/preselect_${test}.gz
# Note: the log must be per test set; these subshells run in parallel,
# so a single shared gselect.log would be clobbered.
sgmm-gselect "--preselect=ark:gunzip -c $dir/preselect_${test}.gz|" \
$model "$feats" ark,t:- 2>$dir/gselect_${test}.log | \
gzip -c > $dir/${test}_gselect.gz || exit 1;
gselect_opt="--gselect=ark:gunzip -c $dir/${test}_gselect.gz|"
# Use smaller beam for the first pass decoding.
sgmm-decode-faster "$gselect_opt" --beam=15.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $alimodel $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.pass1.tra ark,t:$dir/test_${test}.pass1.ali 2> $dir/pass1_${test}.log
# Estimate the speaker vectors
( ali-to-post ark:$dir/test_${test}.pass1.ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
ark:$dir/test_${test}.vecs1 ) 2>$dir/vecs1_${test}.log
( ali-to-post ark:$dir/test_${test}.pass1.ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-est-spkvecs "$gselect_opt" --spk-vecs=ark:$dir/test_${test}.vecs1 $spk2utt_opt \
$model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log
# Second-pass decoding with the speaker vectors.
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --spk-vecs=ark:$dir/test_${test}.vecs2 --beam=20.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.pass2.tra ark,t:$dir/test_${test}.pass2.ali 2> $dir/pass2_${test}.log
# Estimate the fMLLR transforms.
( ali-to-post ark:$dir/test_${test}.pass2.ali ark:- | \
weight-silence-post 0.01 $silphonelist $model ark:- ark:- | \
sgmm-post-to-gpost --spk-vecs=ark:$dir/test_${test}.vecs2 $utt2spk_opt \
"$gselect_opt" $model "$feats" ark,s,cs:- ark:- | \
sgmm-est-fmllr-gpost --fmllr-iters=$iters --fmllr-min-count=$mincount \
--spk-vecs=ark:$dir/test_${test}.vecs2 "$spk2utt_opt" $fmllr_model \
"$feats" ark,s,cs:- ark:$dir/test_${test}.fmllr ) \
2>$dir/est_fmllr_${test}.log
adapt_feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- | transform-feats $utt2spk_opt ark:$dir/test_${test}.fmllr ark:- ark:- |"
# Now decode with fMLLR-adapted features. Gaussian selection is also done
# with the adapted features. This causes a small improvement in WER on RM.
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --beam=20.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt --spk-vecs=ark:$dir/test_${test}.vecs2 $fmllr_model $graphdir/HCLG.fst "$adapt_feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
# the ,p option lets it score partial output without dying..
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
) &
done
wait
# Pool numerator/denominator error counts over all test sets.
grep WER $dir/wer_* | \
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
> $dir/wer

Просмотреть файл

@ -0,0 +1,85 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# SGMM decoding with adaptation [with gender-dependent UBM].
#
# SGMM decoding; use a different acoustic scale from normal (0.1 vs 0.08333)
# (1) decode with "alignment model"
# (2) get GMM posteriors with "alignment model" and estimate speaker
# vectors with final model
# (3) decode with final model.
if [ -f path.sh ]; then . path.sh; fi
dir=exp/decode_sgmmc_norm
tree=exp/sgmmc/tree
modelin=exp/sgmmc/final.mdl
alimodelin=exp/sgmmc/final.alimdl
model=exp/sgmmc/final.mdl.norm
alimodel=exp/sgmmc/final.alimdl.norm
graphdir=exp/graph_sgmmc
silphonelist=`cat data/silphones.csl`
preselectmap=exp/ubmb/preselect.map
mkdir -p $dir
# Create the weight-normalized versions of the model and alignment model.
sgmm-normalize $modelin ark:$preselectmap $model 2>$dir/normalize.log
sgmm-normalize $alimodelin ark:$preselectmap $alimodel 2>>$dir/normalize.log
scripts/mkgraph.sh $tree $model $graphdir
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
(
feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- |"
spk2utt_opt="--spk2utt=ark:data/test_${test}.spk2utt"
utt2spk_opt="--utt2spk=ark:data/test_${test}.utt2spk"
# Map utterance -> gender -> preselected UBM Gaussian subset.
scripts/compose_maps.pl data/test_${test}.utt2spk data/spk2gender.map | \
scripts/compose_maps.pl - $preselectmap | \
gzip -c > $dir/preselect_${test}.gz
# Note: the log must be per test set; these subshells run in parallel,
# so a single shared gselect.log would be clobbered.
sgmm-gselect "--preselect=ark:gunzip -c $dir/preselect_${test}.gz|" \
$model "$feats" ark,t:- 2>$dir/gselect_${test}.log | \
gzip -c > $dir/${test}_gselect.gz || exit 1;
gselect_opt="--gselect=ark:gunzip -c $dir/${test}_gselect.gz|"
# Use smaller beam first time.
sgmm-decode-faster "$gselect_opt" --beam=15.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $alimodel $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.pre_tra ark,t:$dir/test_${test}.pre_ali 2> $dir/predecode_${test}.log
# Estimate the speaker vectors (two passes).
( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
ark:$dir/test_${test}.vecs1 ) 2>$dir/vecs1_${test}.log
( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-est-spkvecs "$gselect_opt" --spk-vecs=ark:$dir/test_${test}.vecs1 $spk2utt_opt \
$model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log
# Final decoding pass with the speaker vectors.
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --spk-vecs=ark:$dir/test_${test}.vecs2 --beam=20.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
# the ,p option lets it score partial output without dying..
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
) &
done
wait
# Pool numerator/denominator error counts over all test sets.
grep WER $dir/wer_* | \
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
> $dir/wer

Просмотреть файл

@ -16,8 +16,9 @@
# Train gender-dependent UBM from a trained HMM/GMM system.
# Instead of 400 UBM Gaussians, use 250 UBM Gaussians per gender, for
# a total of 500.
# We're aiming for 500 UBM Gaussians total.
# Because RM is unbalanced (55 female, 109 male), we train 200
# UBM Gaussians for female and 300 for male.
if [ -f path.sh ]; then . path.sh; fi
@ -25,13 +26,19 @@ dir=exp/ubmb
mkdir -p $dir
srcdir=exp/tri1
if [ ! -f $dir/0.m.ubm ]; then
init-ubm --intermediate-numcomps=2000 --ubm-numcomps=250 --verbose=2 \
--fullcov-ubm=true $srcdir/final.mdl $srcdir/final.occs \
$dir/0.m.ubm 2> $dir/cluster.log || exit 1;
fi
rm -f $dir/.error
init-ubm --intermediate-numcomps=2000 --ubm-numcomps=300 --verbose=2 \
--fullcov-ubm=true $srcdir/final.mdl $srcdir/final.occs \
$dir/0.m.ubm 2> $dir/cluster.log || touch $dir/.error &
init-ubm --intermediate-numcomps=2000 --ubm-numcomps=200 --verbose=2 \
--fullcov-ubm=true $srcdir/final.mdl $srcdir/final.occs \
$dir/0.f.ubm 2> $dir/cluster.log || touch $dir/.error &
wait;
[ -f $dir/.error ] && echo "Error clustering UBM Gaussians" && exit 1;
cp $dir/0.m.ubm $dir/0.f.ubm
cp data/train.scp $dir/train.scp
scripts/compose_maps.pl data/train.utt2spk data/spk2gender.map | grep -w m | \

Просмотреть файл

@ -82,10 +82,12 @@ system:
[spk;+fmllr] 8.3 11.3 | [per-speaker adaptation; +fMLLR]
sgmm3b 7.8 10.4 | [ SGMM with speaker vectors, on SI-284]
[utt] 7.8 10.4 | [per-utterance adaptation]
[spk;+fmllr] 7.8 10.2 | [per-speaker adaptation, with fMLLR]
sgmm3c 7.7 9.9 | [ as sgmm3b but gender-dep. UBM]
[utt] 7.7 10.1 | [per-utterance adaptation]
[fmllr] 7.7 9.7 | [per-spk, with fMLLR]
[spk;+fmllr] 7.8 10.0 | [per-speaker adaptation, with fMLLR]
sgmm3c 7.5 9.5 | [ as sgmm3b but gender-dep. UBM]
[+norm] 7.5 9.6 | [normalizing weights per gender]
[utt] 7.7 9.6 | [per-utterance adaptation]
[fmllr] 7.6 9.2 | [per-spk, with fMLLR]
[+norm] 7.5 9.3 | [normalizing weights per gender]
# Raw results:
exp/decode_mono_tgpr_eval92/wer:%WER 31.38 [ 1770 / 5641, 108 ins, 386 del, 1276 sub ]
@ -266,14 +268,27 @@ exp/decode_sgmm2b_tgpr_utt_eval93/wer:%WER 13.72 [ 472 / 3439, 60 ins, 68 del, 3
exp/decode_sgmm2b_fmllr_tgpr_eval92/wer:%WER 9.93 [ 560 / 5641, 130 ins, 42 del, 388 sub ]
exp/decode_sgmm2b_fmllr_tgpr_eval93/wer:%WER 13.49 [ 464 / 3439, 54 ins, 72 del, 338 sub ]
exp/decode_sgmm3b_fmllr_tgpr_eval92/wer:%WER 7.36 [ 415 / 5641, 110 ins, 14 del, 291 sub ]
exp/decode_sgmm3b_fmllr_tgpr_eval93/wer:%WER 9.94 [ 342 / 3439, 56 ins, 49 del, 237 sub ]
exp/decode_sgmm3b_tgpr_eval92/wer:%WER 7.68 [ 433 / 5641, 117 ins, 15 del, 301 sub ]
exp/decode_sgmm3b_tgpr_eval93/wer:%WER 10.32 [ 355 / 3439, 58 ins, 55 del, 242 sub ]
exp/decode_sgmm3b_tgpr_utt_eval92/wer:%WER 7.59 [ 428 / 5641, 111 ins, 17 del, 300 sub ]
exp/decode_sgmm3b_tgpr_utt_eval93/wer:%WER 9.94 [ 342 / 3439, 52 ins, 52 del, 238 sub ]
exp/decode_sgmm3b_fmllr_tgpr_eval92/wer:%WER 7.73 [ 436 / 5641, 118 ins, 15 del, 303 sub ]
exp/decode_sgmm3b_fmllr_tgpr_eval93/wer:%WER 10.00 [ 344 / 3439, 57 ins, 47 del, 240 sub ]
exp/decode_sgmm3b_tgpr_eval92/wer:%WER 7.78 [ 439 / 5641, 118 ins, 15 del, 306 sub ]
exp/decode_sgmm3b_tgpr_eval93/wer:%WER 10.35 [ 356 / 3439, 58 ins, 47 del, 251 sub ]
exp/decode_sgmm3b_tgpr_utt_eval92/wer:%WER 7.80 [ 440 / 5641, 119 ins, 13 del, 308 sub ]
exp/decode_sgmm3b_tgpr_utt_eval93/wer:%WER 10.38 [ 357 / 3439, 55 ins, 50 del, 252 sub ]
exp/decode_sgmm3c_fmllr_tgpr_eval92/wer:%WER 7.55 [ 426 / 5641, 111 ins, 14 del, 301 sub ]
exp/decode_sgmm3c_fmllr_tgpr_eval93/wer:%WER 9.16 [ 315 / 3439, 54 ins, 41 del, 220 sub ]
exp/decode_sgmm3c_fmllr_tgpr_norm_eval92/wer:%WER 7.46 [ 421 / 5641, 111 ins, 13 del, 297 sub ]
exp/decode_sgmm3c_fmllr_tgpr_norm_eval93/wer:%WER 9.25 [ 318 / 3439, 54 ins, 41 del, 223 sub ]
exp/decode_sgmm3c_tgpr_eval92/wer:%WER 7.52 [ 424 / 5641, 113 ins, 13 del, 298 sub ]
exp/decode_sgmm3c_tgpr_eval93/wer:%WER 9.51 [ 327 / 3439, 55 ins, 42 del, 230 sub ]
exp/decode_sgmm3c_tgpr_norm_eval92/wer:%WER 7.48 [ 422 / 5641, 111 ins, 14 del, 297 sub ]
exp/decode_sgmm3c_tgpr_norm_eval93/wer:%WER 9.62 [ 331 / 3439, 55 ins, 43 del, 233 sub ]
exp/decode_sgmm3c_tgpr_utt_eval92/wer:%WER 7.69 [ 434 / 5641, 110 ins, 17 del, 307 sub ]
exp/decode_sgmm3c_tgpr_utt_eval93/wer:%WER 9.62 [ 331 / 3439, 55 ins, 46 del, 230 sub ]
################
# Results below this point may be out of date.
===========
# Some notes on tuning the SGMM systems on half the SI-84 data (sgmm2a and sgmm2b).
# We ended up with 400 UBM components, and acwt 1/11 (unadapted) and 1/12 (adapted..

Просмотреть файл

@ -418,6 +418,8 @@ steps/train_sgmm3c.sh || exit 1;
scripts/decode.sh --per-spk exp/decode_sgmm3c_tgpr_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov${year}.scp
scripts/decode.sh exp/decode_sgmm3c_tgpr_utt_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_sgmm3c_fmllr_tgpr_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c_fmllr.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_sgmm3c_tgpr_norm_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c_norm.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_sgmm3c_fmllr_tgpr_norm_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c_fmllr_norm.sh data/eval_nov${year}.scp
done
)&

Просмотреть файл

@ -0,0 +1,127 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script does the decoding of a single batch of test data (on one core),
# with the gender-normalized SGMM models, speaker vectors and fMLLR.
# It requires arguments. It takes the graphdir and decoding directory,
# and the job number which can actually be any string (even ""); it expects
# a file $decode_dir/${job_number}.scp to exist, and puts its output in
# $decode_dir/${job_number}.tra
if [ $# != 3 ]; then
echo "Usage: steps/decode_sgmm3c_fmllr_norm.sh <graph> <decode-dir> <job-number>"
exit 1;
fi
. path.sh || exit 1;
acwt=0.08333
prebeam=12.0
beam=13.0
max_active=7000
silphones=`cat data/silphones.csl`
model=exp/sgmm3c/final.mdl.norm
occs=exp/sgmm3c/final.occs
alimodel=exp/sgmm3c/final.alimdl.norm
preselectmap=exp/ubm3b/preselect.map
fmllr_model=exp/sgmm3c/final_fmllr.mdl.norm
graph=$1
dir=$2
job=$3
scp=$dir/$job.scp
feats="ark:add-deltas --print-args=false scp:$scp ark:- |"
# Create the fMLLR pre-transform model if it is absent, or stale
# (older than the model it is derived from).
if [ ! -f $fmllr_model -o $model -nt $fmllr_model ]; then
if [ ! -f $model ]; then
echo "Cannot find $model. Maybe training didn't finish?"
exit 1;
fi
sgmm-comp-prexform $model $occs $fmllr_model
fi
filenames="$scp $alimodel $fmllr_model $model $graph data/words.txt"
for file in $filenames; do
if [ ! -f $file ] ; then
echo "No such file $file";
exit 1;
fi
done
if [ -f $dir/$job.spk2utt ]; then
if [ ! -f $dir/$job.utt2spk ]; then
echo "spk2utt but not utt2spk file present!"
exit 1
fi
spk2utt_opt=--spk2utt=ark:$dir/$job.spk2utt
utt2spk_opt=--utt2spk=ark:$dir/$job.utt2spk
fi
# Map utterance -> gender -> preselected UBM Gaussian subset, restricted
# to the utterances in this job's scp.
cat data/eval*.utt2spk | \
scripts/compose_maps.pl - data/spk2gender.map | \
scripts/compose_maps.pl - $preselectmap | \
scripts/filter_scp.pl $scp - | \
gzip -c > $dir/preselect.$job.gz
echo running on `hostname` > $dir/decode${job}.log
sgmm-gselect "--preselect=ark:gunzip -c $dir/preselect.$job.gz|" \
$model "$feats" ark,t:- 2>$dir/gselect${job}.log | \
gzip -c > $dir/gselect${job}.gz || exit 1;
gselect_opt="--gselect=ark:gunzip -c $dir/gselect${job}.gz|"
# First pass: decode with the alignment model (smaller beam).
sgmm-decode-faster "$gselect_opt" --beam=$prebeam --max-active=$max_active \
--acoustic-scale=$acwt \
--word-symbol-table=data/words.txt $alimodel $graph "$feats" \
ark,t:$dir/$job.pre_tra ark,t:$dir/$job.pre_ali 2>$dir/predecode${job}.log || exit 1;
# Estimate the speaker vectors (two passes).
( ali-to-post ark:$dir/${job}.pre_ali ark:- | \
weight-silence-post 0.01 $silphones $alimodel ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
ark:$dir/${job}.vecs1 ) 2>$dir/vecs1${job}.log || exit 1;
( ali-to-post ark:$dir/${job}.pre_ali ark:- | \
weight-silence-post 0.01 $silphones $alimodel ark:- ark:- | \
sgmm-est-spkvecs "$gselect_opt" --spk-vecs=ark,t:$dir/${job}.vecs1 $spk2utt_opt $model \
"$feats" ark,s,cs:- ark:$dir/${job}.vecs2 ) 2>$dir/vecs2.${job}.log || exit 1;
# second pass of decoding: have spk-vecs but not fMLLR
sgmm-decode-faster "$gselect_opt" --beam=$prebeam --max-active=$max_active \
$utt2spk_opt --spk-vecs=ark:$dir/${job}.vecs2 \
--acoustic-scale=$acwt \
--word-symbol-table=data/words.txt $model $graph "$feats" \
ark,t:$dir/$job.pre2_tra ark,t:$dir/$job.pre2_ali 2>$dir/pre2decode${job}.log || exit 1;
# Estimate fMLLR transforms.
( ali-to-post ark:$dir/$job.pre2_ali ark:- | \
weight-silence-post 0.01 $silphones $model ark:- ark:- | \
sgmm-post-to-gpost --spk-vecs=ark:$dir/${job}.vecs2 $utt2spk_opt "$gselect_opt" $model "$feats" ark,s,cs:- ark:- | \
sgmm-est-fmllr-gpost --spk-vecs=ark:$dir/${job}.vecs2 $spk2utt_opt $fmllr_model "$feats" ark,s,cs:- \
ark:$dir/$job.fmllr ) 2>$dir/est_fmllr${job}.log || exit 1;
feats="ark:add-deltas --print-args=false scp:$scp ark:- | transform-feats $utt2spk_opt ark:$dir/$job.fmllr ark:- ark:- |"
# Final pass: decode with fMLLR-adapted features, speaker vectors, full beam.
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --spk-vecs=ark:$dir/${job}.vecs2 \
--beam=$beam --acoustic-scale=$acwt --word-symbol-table=data/words.txt \
$fmllr_model $graph "$feats" \
ark,t:$dir/${job}.tra ark,t:$dir/${job}.ali 2> $dir/decode${job}.log || exit 1;

Просмотреть файл

@ -0,0 +1,100 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script does the decoding of a single batch of test data (on one core),
# with the gender-normalized SGMM models and speaker vectors (no fMLLR).
# It requires arguments. It takes the graphdir and decoding directory,
# and the job number which can actually be any string (even ""); it expects
# a file $decode_dir/${job_number}.scp to exist, and puts its output in
# $decode_dir/${job_number}.tra
if [ $# != 3 ]; then
echo "Usage: steps/decode_sgmm3c_norm.sh <graph> <decode-dir> <job-number>"
exit 1;
fi
. path.sh || exit 1;
acwt=0.08333
prebeam=12.0
beam=13.0
max_active=7000
silphones=`cat data/silphones.csl`
model=exp/sgmm3c/final.mdl.norm
alimodel=exp/sgmm3c/final.alimdl.norm
preselectmap=exp/ubm3b/preselect.map
graph=$1
dir=$2
job=$3
scp=$dir/$job.scp
feats="ark:add-deltas --print-args=false scp:$scp ark:- |"
filenames="$scp $model $graph data/words.txt"
for file in $filenames; do
if [ ! -f $file ] ; then
echo "No such file $file";
exit 1;
fi
done
if [ -f $dir/$job.spk2utt ]; then
if [ ! -f $dir/$job.utt2spk ]; then
echo "spk2utt but not utt2spk file present!"
exit 1
fi
spk2utt_opt=--spk2utt=ark:$dir/$job.spk2utt
utt2spk_opt=--utt2spk=ark:$dir/$job.utt2spk
fi
# Map utterance -> gender -> preselected UBM Gaussian subset, restricted
# to the utterances in this job's scp.
cat data/eval*.utt2spk | \
scripts/compose_maps.pl - data/spk2gender.map | \
scripts/compose_maps.pl - $preselectmap | \
scripts/filter_scp.pl $scp - | \
gzip -c > $dir/preselect.$job.gz
echo running on `hostname` > $dir/decode${job}.log
sgmm-gselect "--preselect=ark:gunzip -c $dir/preselect.$job.gz|" \
$model "$feats" ark,t:- 2>$dir/gselect${job}.log | \
gzip -c > $dir/gselect${job}.gz || exit 1;
gselect_opt="--gselect=ark:gunzip -c $dir/gselect${job}.gz|"
# First pass: decode with the alignment model (smaller beam).
sgmm-decode-faster "$gselect_opt" --beam=$prebeam --max-active=$max_active \
--acoustic-scale=$acwt \
--word-symbol-table=data/words.txt $alimodel $graph "$feats" \
ark,t:$dir/$job.pre_tra ark,t:$dir/$job.pre_ali 2>$dir/predecode${job}.log || exit 1;
# Estimate the speaker vectors (two passes).
( ali-to-post ark:$dir/${job}.pre_ali ark:- | \
weight-silence-post 0.01 $silphones $alimodel ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
ark:$dir/${job}.vecs1 ) 2>$dir/vecs1.${job}.log || exit 1;
( ali-to-post ark:$dir/${job}.pre_ali ark:- | \
weight-silence-post 0.01 $silphones $alimodel ark:- ark:- | \
sgmm-est-spkvecs "$gselect_opt" --spk-vecs=ark,t:$dir/${job}.vecs1 $spk2utt_opt $model \
"$feats" ark,s,cs:- ark:$dir/${job}.vecs2 ) 2>$dir/vecs2.${job}.log || exit 1;
# Final pass: decode with the speaker vectors, full beam.
sgmm-decode-faster "$gselect_opt" --beam=$beam --max-active=$max_active \
$utt2spk_opt --spk-vecs=ark:$dir/${job}.vecs2 \
--acoustic-scale=$acwt \
--word-symbol-table=data/words.txt $model $graph "$feats" \
ark,t:$dir/$job.tra ark,t:$dir/$job.ali 2>$dir/decode${job}.log || exit 1;

Просмотреть файл

@ -256,3 +256,7 @@ rm $dir/$x.?.aliacc
( cd $dir; rm final.alimdl 2>/dev/null; ln -s $x.alimdl final.alimdl; )
# Compute normalized models
sgmm-normalize $dir/final.mdl $preselectmap $dir/final.mdl.norm 2>$dir/normalize.log
sgmm-normalize $dir/final.alimdl $preselectmap $dir/final.alimdl.norm 2>>$dir/normalize.log

Просмотреть файл

@ -14,7 +14,6 @@
documentation for acoustic modeling code.
TODO items (Dan):
Remove unused ET stuff.
Remove non-Kaldi code from decoder/
Rename to branches/kaldi-1.0

Просмотреть файл

@ -9,7 +9,7 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \
gmm-acc-stats gmm-init-lvtln gmm-est-lvtln-trans gmm-train-lvtln-special \
gmm-acc-mllt gmm-mixup gmm-init-model \
gmm-acc-hlda gmm-est-hlda gmm-transform-means gmm-init-et gmm-est-et \
gmm-et-acc-a gmm-et-est-a gmm-et-acc-b gmm-copy-et gmm-et-est-b gmm-et-get-b \
gmm-et-acc-a gmm-et-est-a gmm-copy-et gmm-et-get-b \
gmm-make-regtree gmm-decode-faster-regtree-fmllr gmm-post-to-gpost \
gmm-est-fmllr-gpost gmm-est-fmllr gmm-est-regtree-fmllr-ali \
gmm-est-regtree-mllr gmm-decode-kaldi gmm-compute-likes \

Просмотреть файл

@ -30,8 +30,8 @@ int main(int argc, char *argv[]) {
const char *usage =
"Accumulate stats for GMM training.\n"
"Usage: gmm-estimate [options] <model-in> <stats-in> <model-out>\n"
"e.g.: gmm-estimate 1.mdl 1.acc 2.mdl\n";
"Usage: gmm-est [options] <model-in> <stats-in> <model-out>\n"
"e.g.: gmm-est 1.mdl 1.acc 2.mdl\n";
bool binary_write = false;
TransitionUpdateConfig tcfg;

Просмотреть файл

@ -1,222 +0,0 @@
// gmmbin/gmm-et-acc-b.cc
// Copyright 2009-2011 Microsoft Corporation
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include <string>
using std::string;
#include <vector>
using std::vector;

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "transform/exponential-transform.h"

namespace kaldi {

// Accumulates B-matrix statistics for one utterance.
// "xform" is the speaker's (or utterance's) estimated transform, "feats" are
// the un-transformed features, "t" is the warp factor, and the stats are
// added into *accs_b.
static void ProcessUtterance(const ExponentialTransform &et,
                             const GauPost &gpost,
                             const Matrix<BaseFloat> &xform,
                             const Matrix<BaseFloat> &feats,  // un-transformed feats.
                             const TransitionModel &trans_model,
                             const AmDiagGmm &am_gmm,
                             BaseFloat t,
                             ExponentialTransformAccsB *accs_b) {
  // First work out Ds from the transform and the warp factor.
  int32 dim = et.Dim();
  Matrix<BaseFloat> Ds(dim, dim+1);
  et.ComputeDs(xform, t, &Ds);
  for (size_t i = 0; i < gpost.size(); i++) {
    SubVector<BaseFloat> feat(feats, i);
    Vector<BaseFloat> t_data(feat);  // transformed feature.
    ApplyAffineTransform(xform, &t_data);
    for (size_t j = 0; j < gpost[i].size(); j++) {
      int32 pdf_id = trans_model.TransitionIdToPdf(gpost[i][j].first);
      const DiagGmm &gmm = am_gmm.GetPdf(pdf_id);
      const Vector<BaseFloat> &posteriors (gpost[i][j].second);
      accs_b->AccumulateFromPosteriors(gmm, t_data, posteriors, Ds);
    }
  }
}

}  // end namespace kaldi

int main(int argc, char *argv[]) {
  try {
    typedef kaldi::int32 int32;
    using namespace kaldi;
    // Note: the usage previously listed an extra <align-model> argument and
    // said "A matrix"; this program reads seven arguments and accumulates
    // B-matrix stats (ExponentialTransformAccsB).
    const char *usage =
        "Accumulate statistics for estimating the B matrix of exponential transform, \n"
        " per-utterance (default) or per-speaker for \n"
        " the supplied set of speakers (spk2utt option).\n"
        "Note: the Gaussian-level posteriors are expected to have been obtained with\n"
        "an alignment model, in the unadapted space.\n"
        "Usage: gmm-et-acc-b [options] <model> <exponential-transform> <feature-rspecifier> "
        "<gpost-rspecifier> <transform-rspecifier> <warp-rspecifier> <accs-filename>\n";

    ParseOptions po(usage);
    string spk2utt_rspecifier;
    bool binary = false;
    po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
                "utterance-list map");
    po.Register("binary", &binary, "Write output in binary mode");
    po.Read(argc, argv);

    if (po.NumArgs() != 7) {
      po.PrintUsage();
      exit(1);
    }

    string model_rxfilename = po.GetArg(1),
        et_rxfilename = po.GetArg(2),
        feature_rspecifier = po.GetArg(3),
        gpost_rspecifier = po.GetArg(4),
        transform_rspecifier = po.GetArg(5),
        warps_rspecifier = po.GetArg(6),
        accs_wxfilename = po.GetArg(7);

    RandomAccessGauPostReader gpost_reader(gpost_rspecifier);
    RandomAccessBaseFloatMatrixReader transform_reader(transform_rspecifier);
    RandomAccessBaseFloatReader warps_reader(warps_rspecifier);

    AmDiagGmm am_gmm;
    TransitionModel trans_model;
    {
      bool binary;
      Input is(model_rxfilename, &binary);
      trans_model.Read(is.Stream(), binary);
      am_gmm.Read(is.Stream(), binary);
    }

    ExponentialTransform et;
    {
      bool binary;
      Input ki(et_rxfilename, &binary);
      et.Read(ki.Stream(), binary);
    }

    int32 dim = et.Dim();
    ExponentialTransformAccsB accs_b(dim);

    int32 num_done = 0, num_no_gpost = 0, num_other_error = 0;
    if (spk2utt_rspecifier != "") {  // per-speaker adaptation
      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
      RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
        string spk = spk2utt_reader.Key();
        if (!transform_reader.HasKey(spk)) {
          KALDI_WARN << "Could not read transform for speaker " << spk;
          num_other_error++;
          continue;  // must not call Value(spk) below on a missing key.
        }
        if (!warps_reader.HasKey(spk)) {
          KALDI_WARN << "Could not read warp factor for speaker " << spk;
          num_other_error++;
          continue;
        }
        const Matrix<BaseFloat> &xform(transform_reader.Value(spk));
        BaseFloat t = warps_reader.Value(spk);
        const vector<string> &uttlist = spk2utt_reader.Value();
        for (vector<string>::const_iterator utt_itr = uttlist.begin(),
            itr_end = uttlist.end(); utt_itr != itr_end; ++utt_itr) {
          if (!feature_reader.HasKey(*utt_itr)) {
            KALDI_WARN << "Did not find features for utterance " << *utt_itr;
            continue;
          }
          if (!gpost_reader.HasKey(*utt_itr)) {
            KALDI_WARN << "Did not find gpost for utterance "
                       << *utt_itr;
            num_no_gpost++;
            continue;
          }
          const Matrix<BaseFloat> &feats = feature_reader.Value(*utt_itr);
          const GauPost &gpost = gpost_reader.Value(*utt_itr);
          if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
            KALDI_WARN << "gpost has wrong size " << gpost.size()
                       << " vs. " << feats.NumRows();
            num_other_error++;
            continue;
          }
          ProcessUtterance(et, gpost, xform, feats, trans_model,
                           am_gmm, t, &accs_b);
          num_done++;
          if (num_done % 50 == 0)
            KALDI_VLOG(1) << "Done " << num_done << " utterances.";
        }  // end looping over all utterances of the current speaker
      }  // end looping over speakers
    } else {  // per-utterance adaptation
      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
      for (; !feature_reader.Done(); feature_reader.Next()) {
        string utt = feature_reader.Key();
        if (!transform_reader.HasKey(utt)) {
          KALDI_WARN << "Could not read transform for utterance " << utt;
          num_other_error++;
          continue;  // must not call Value(utt) below on a missing key.
        }
        if (!warps_reader.HasKey(utt)) {
          KALDI_WARN << "Could not read warp factor for utterance " << utt;
          num_other_error++;
          continue;
        }
        if (!gpost_reader.HasKey(utt)) {
          KALDI_WARN << "Did not find gpost for utterance "
                     << utt;
          num_no_gpost++;
          continue;
        }
        const Matrix<BaseFloat> &feats = feature_reader.Value();
        const GauPost &gpost = gpost_reader.Value(utt);
        const Matrix<BaseFloat> &xform(transform_reader.Value(utt));
        BaseFloat t = warps_reader.Value(utt);
        if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
          KALDI_WARN << "gpost has wrong size " << gpost.size()
                     << " vs. " << feats.NumRows();
          num_other_error++;
          continue;
        }
        ProcessUtterance(et, gpost, xform, feats, trans_model,
                         am_gmm, t, &accs_b);
        num_done++;
        if (num_done % 50 == 0)
          KALDI_LOG << "Done " << num_done << " utterances";
      }
    }
    KALDI_LOG << "Done " << num_done << " files, " << num_no_gpost
              << " with no gposts, " << num_other_error << " with other errors.";

    Output ko(accs_wxfilename, binary);
    accs_b.Write(ko.Stream(), binary);
    KALDI_LOG << "Written accs.";
    return 0;
  } catch(const std::exception& e) {
    std::cerr << e.what();
    return -1;
  }
}

Просмотреть файл

@ -1,89 +0,0 @@
// gmmbin/gmm-et-est-b.cc
// Copyright 2009-2011 Microsoft Corporation
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "transform/exponential-transform.h"
// Command-line tool: updates matrix B of the exponential transform from
// stats accumulated by gmm-et-acc-b, writing out the updated transform and
// the mean-transforming matrix M.
int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    using kaldi::int32;
    const char *usage =
        "Update matrix B of exponential transform (uses stats from gmm-et-acc-b)\n"
        " [Use matrix-out with gmm-transform-means to transform model means.]\n"
        "Usage: gmm-et-est-b [options] <et-in> <et-out> <matrix-out> <b-stats1> <b-stats2> ... \n"
        "e.g.: \n"
        " gmm-et-est-b 1.et 2.et 2.mat 1.et_acc_b\n";  // example fixed to match the 4-argument usage.
    bool binary = true;
    ParseOptions po(usage);
    ExponentialTransformUpdateAOptions update_a_opts;
    po.Register("binary", &binary, "Write output in binary mode");
    update_a_opts.Register(&po);
    po.Read(argc, argv);
    // Need at least <et-in>, <et-out>, <matrix-out> and one stats file.
    if (po.NumArgs() < 4) {
      po.PrintUsage();
      exit(1);
    }
    std::string et_rxfilename = po.GetArg(1);
    std::string et_wxfilename = po.GetArg(2);
    std::string mat_wxfilename = po.GetArg(3);
    // Read the exponential transform to be updated.
    ExponentialTransform et;
    {
      bool binary_in;
      Input ki(et_rxfilename, &binary_in);
      et.Read(ki.Stream(), binary_in);
    }
    // Sum the B-update stats over all supplied accumulator files.
    ExponentialTransformAccsB stats;
    for (int32 i = 4; i <= po.NumArgs(); i++) {
      std::string stats_rxfilename = po.GetArg(i);
      bool binary_in;
      Input ki(stats_rxfilename, &binary_in);
      stats.Read(ki.Stream(), binary_in, true); // true == add
    }
    int32 dim = et.Dim();
    Matrix<BaseFloat> M(dim, dim); // to transform model means.
    stats.Update(&et, NULL, NULL, &M);
    {
      Output ko(et_wxfilename, binary);
      et.Write(ko.Stream(), binary);
    }
    {
      Output ko(mat_wxfilename, binary);
      M.Write(ko.Stream(), binary);
    }
    // Log message corrected: this program writes the transform and the
    // matrix, not accumulators.
    KALDI_LOG << "Written transform and matrix.";
    return 0;
  } catch(const std::exception& e) {
    std::cerr << e.what();
    return -1;
  }
}

Просмотреть файл

@ -1137,11 +1137,11 @@ void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
std::string str;
is >> str; // get a token
if (is.fail()) { specific_error << ": Expected \"[\", got EOF"; goto bad; }
if ((str.compare("DM") == 0) || (str.compare("FM") == 0)) { // Back compatibility.
is >> str; // get #rows
is >> str; // get #cols
is >> str; // get "["
}
//if ((str.compare("DM") == 0) || (str.compare("FM") == 0)) { // Back compatibility.
// is >> str; // get #rows
// is >> str; // get #cols
// is >> str; // get "["
//}
if (str == "[]") { Resize(0, 0); return; } // Be tolerant of variants.
else if (str != "[") {
specific_error << ": Expected \"[\", got \"" << str << '"';

Просмотреть файл

@ -789,10 +789,10 @@ void Vector<Real>::Read(std::istream & is, bool binary, bool add) {
} else { // Text mode reading; format is " [ 1.1 2.0 3.4 ]\n"
std::string s;
is >> s;
if ((s.compare("DV") == 0) || (s.compare("FV") == 0)) { // Back compatibility.
is >> s; // get dimension
is >> s; // get "["
}
//if ((s.compare("DV") == 0) || (s.compare("FV") == 0)) { // Back compatibility.
// is >> s; // get dimension
// is >> s; // get "["
//}
if (is.fail()) { specific_error << "EOF while trying to read vector."; goto bad; }
if (s.compare("[]") == 0) { Resize(0); return; } // tolerate this variant.
if (s.compare("[")) { specific_error << "Expected \"[\" but got " << s; goto bad; }

Просмотреть файл

@ -618,6 +618,94 @@ void AmSgmm::ComputeNormalizers() {
KALDI_LOG << "Done computing normalizers";
}
void AmSgmm::ComputeNormalizersNormalized(const std::vector<std::vector<int32> > &normalize_sets) {
{ // Check sets in normalize_sets are disjoint and cover all Gaussians.
std::set<int32> all;
for(int32 i = 0; i < normalize_sets.size(); i++)
for(int32 j = 0; static_cast<size_t>(j) < normalize_sets[i].size(); j++) {
int32 n = normalize_sets[i][j];
KALDI_ASSERT(all.count(n) == 0 && n >= 0 && n < NumGauss());
all.insert(n);
}
KALDI_ASSERT(all.size() == NumGauss());
}
KALDI_LOG << "Computing normalizers [normalized]";
BaseFloat DLog2pi = FeatureDim() * log(2 * M_PI);
Vector<BaseFloat> mu_jmi(FeatureDim());
Vector<BaseFloat> SigmaInv_mu(FeatureDim());
Vector<BaseFloat> log_det_Sigma(NumGauss());
for (int32 i = 0; i < NumGauss(); i++) {
try {
log_det_Sigma(i) = - SigmaInv_[i].LogPosDefDet();
} catch(...) {
KALDI_WARN << "Covariance is not positive definite, setting to unit";
SigmaInv_[i].SetUnit();
log_det_Sigma(i) = 0.0;
}
}
double entropy_count = 0, entropy_sum = 0;
n_.resize(NumStates());
for (int32 j = 0; j < NumStates(); ++j) {
Vector<BaseFloat> log_w_jm(NumGauss());
n_[j].Resize(NumGauss(), NumSubstates(j));
for (int32 m = 0; m < NumSubstates(j); m++) {
BaseFloat logc = log(c_[j](m));
// (in logs): w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
log_w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j].Row(m), 0.0);
log_w_jm.Add((-1.0) * log_w_jm.LogSumExp());
for(int32 n = 0; n < normalize_sets.size(); n++) {
const std::vector<int32> &this_set(normalize_sets[n]);
double sum = 0.0;
for(int32 p = 0; p < this_set.size(); p++)
sum += exp(log_w_jm(this_set[p]));
double offset = -log(sum); // add "offset", to normalize weights.
for(int32 p = 0; p < this_set.size(); p++)
log_w_jm(this_set[p]) += offset;
}
for (int32 i = 0; i < NumGauss(); ++i) {
// mu_jmi = M_{i} * v_{jm}
mu_jmi.AddMatVec(1.0, M_[i], kNoTrans, v_[j].Row(m), 0.0);
// mu_{jmi} * \Sigma_{i}^{-1} * mu_{jmi}
SigmaInv_mu.AddSpVec(1.0, SigmaInv_[i], mu_jmi, 0.0);
BaseFloat mu_SigmaInv_mu = VecVec(mu_jmi, SigmaInv_mu);
// Suggestion: Both mu_jmi and SigmaInv_mu could
// have been computed at once for i ,
// if M[i] was concatenated to single matrix over i indeces
// eq.(31)
n_[j](i, m) = logc + log_w_jm(i) - 0.5 * (log_det_Sigma(i) + DLog2pi
+ mu_SigmaInv_mu);
{ // Mainly diagnostic code. Not necessary.
BaseFloat tmp = n_[j](i, m);
if (!KALDI_ISFINITE(tmp)) { // NaN or inf
KALDI_LOG << "Warning: normalizer for j = " << j << ", m = " << m
<< ", i = " << i << " is infinite or NaN " << tmp << "= "
<< (logc) << "+" << (log_w_jm(i)) << "+" << (-0.5 *
log_det_Sigma(i)) << "+" << (-0.5 * DLog2pi)
<< "+" << (mu_SigmaInv_mu) << ", setting to finite.";
n_[j](i, m) = -1.0e+40; // future work(arnab): get rid of magic number
}
}
}
}
}
KALDI_LOG << "Done computing normalizers (normalized over subsets)";
}
void AmSgmm::ComputeFmllrPreXform(const Vector<BaseFloat> &state_occs,
Matrix<BaseFloat> *xform, Matrix<BaseFloat> *inv_xform,
Vector<BaseFloat> *diag_mean_scatter) const {

Просмотреть файл

@ -215,6 +215,12 @@ class AmSgmm {
/// for each Gaussian component and all substates. Eq. (31)
void ComputeNormalizers();
/// Computes the normalizers, while normalizing the weights to one
/// among each of the sets in "normalize_sets": these sets should
/// be disjoint and their union should be all the indices 0 ... I-1.
void ComputeNormalizersNormalized(const std::vector<std::vector<int32> > &normalize_sets);
/// Computes the LDA-like pre-transform and its inverse as well as the
/// eigenvalues of the scatter of the means used in FMLLR estimation.
void ComputeFmllrPreXform(const Vector<BaseFloat> &state_occs,

Просмотреть файл

@ -7,7 +7,7 @@ BINFILES = init-ubm sgmm-align sgmm-align-compiled sgmm-acc-stats-ali sgmm-sum-a
sgmm-est sgmm-decode-faster sgmm-init sgmm-gselect sgmm-acc-stats \
sgmm-est-spkvecs sgmm-post-to-gpost sgmm-acc-stats-gpost sgmm-est-spkvecs-gpost \
sgmm-comp-prexform sgmm-est-fmllr-gpost sgmm-acc-fmllrbasis-ali sgmm-est-fmllrbasis \
sgmm-calc-distances
sgmm-calc-distances sgmm-normalize
OBJFILES =

Просмотреть файл

@ -31,7 +31,7 @@ int main(int argc, char *argv[]) {
typedef kaldi::int32 int32;
const char *usage =
"Estimate SGMM model parameters from accumulated stats.\n"
"Usage: sgmm-estimate [options] <model-in> <stats-in> <model-out>\n";
"Usage: sgmm-est [options] <model-in> <stats-in> <model-out>\n";
bool binary_write = false;
std::string update_flags_str = "vMNwcS";

Просмотреть файл

@ -0,0 +1,83 @@
// sgmmbin/sgmm-normalize.cc
// Copyright 2009-2011 Microsoft Corporation
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "sgmm/am-sgmm.h"
#include "hmm/transition-model.h"
// Command-line tool: reads an SGMM, recomputes its normalizers with the
// weights renormalized within the given Gaussian-index sets (typically one
// per gender), and writes the model back out including normalizers.
int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    typedef kaldi::int32 int32;
    const char *usage =
        "Renormalize SGMM so that within certain subsets of UBM Gaussians (typically \n"
        "corresponding to gender), probabilities sum to one; write it out, including\n"
        "normalizers.\n"  // fixed: missing "\n" made this line run into the next in --help output.
        "Note: gaussians-rspecifier will normally be \"ark:foo\" where foo looks like\n"
        " m 0 1 2 3 4 5\n"
        " f 6 7 8 9 10\n"
        "Usage: sgmm-normalize [options] <model-in> <gaussians-rspecifier> <model-out>\n";
    bool binary_write = false;
    ParseOptions po(usage);
    po.Register("binary", &binary_write, "Write output in binary mode");
    po.Read(argc, argv);
    if (po.NumArgs() != 3) {
      po.PrintUsage();
      exit(1);
    }
    std::string model_in_filename = po.GetArg(1),
        gaussians_rspecifier = po.GetArg(2),
        model_out_filename = po.GetArg(3);
    // Read transition model and SGMM from the single input file.
    AmSgmm am_sgmm;
    TransitionModel trans_model;
    {
      bool binary;
      Input is(model_in_filename, &binary);
      trans_model.Read(is.Stream(), binary);
      am_sgmm.Read(is.Stream(), binary);
    }
    // Each archive entry is one set of Gaussian indices; the sets should be
    // disjoint and together cover all Gaussians (checked inside the call).
    std::vector<std::vector<int32> > norm_sets;
    SequentialInt32VectorReader vec_reader(gaussians_rspecifier);
    for (; !vec_reader.Done(); vec_reader.Next())
      norm_sets.push_back(vec_reader.Value());
    am_sgmm.ComputeNormalizersNormalized(norm_sets);
    {
      Output os(model_out_filename, binary_write);
      trans_model.Write(os.Stream(), binary_write);
      am_sgmm.Write(os.Stream(), binary_write, kSgmmWriteAll);
    }
    KALDI_LOG << "Written model to " << model_out_filename;
    return 0;
  } catch(const std::exception& e) {
    std::cerr << e.what();
    return -1;
  }
}

Просмотреть файл

@ -18,6 +18,7 @@
#include "util/common-utils.h"
#include "gmm/diag-gmm.h"
#include "transform/exponential-transform.h"
#include "transform/mllt.h"
namespace kaldi {
@ -102,7 +103,7 @@ void UnitTestExponentialTransformUpdate(EtNormalizeType norm_type,
double objf_change_tot = 0.0;
like_tot = 0.0;
ExponentialTransformAccsA accs_a(dim);
ExponentialTransformAccsB accs_b(dim);
MlltAccs accs_b(dim);
for (int32 k = 0; k < nblocks; k++) {
Matrix<BaseFloat> &cur_xform(cur_xforms[k]);
FmllrOptions opts;
@ -146,8 +147,7 @@ void UnitTestExponentialTransformUpdate(EtNormalizeType norm_type,
if (update_b && j%2 == 1)
accs_b.AccumulateFromPosteriors(gmm,
xformed_row,
posteriors,
cur_Ds[k]);
posteriors);
}
}
}
@ -161,16 +161,17 @@ void UnitTestExponentialTransformUpdate(EtNormalizeType norm_type,
}
if (update_b && j%2 == 1) {
BaseFloat count, objf_impr;
Matrix<BaseFloat> M(dim, dim); // to transform GMM means.
accs_b.Update(&et, &objf_impr, &count, &M);
TestIo(accs_b);
Matrix<BaseFloat> C(dim, dim); // to transform GMM means.
C.SetUnit();
accs_b.Update(&C, &objf_impr, &count);
et.ApplyC(C);
TestIo(et);
KALDI_LOG << "Count is " << count << " and objf impr is " << objf_impr << " updating B";
// update the GMM means:
Matrix<BaseFloat> means;
gmm.GetMeans(&means);
Matrix<BaseFloat> new_means(means.NumRows(), means.NumCols());
new_means.AddMatMat(1.0, means, kNoTrans, M, kTrans, 0.0);
new_means.AddMatMat(1.0, means, kNoTrans, C, kTrans, 0.0);
gmm.SetMeans(new_means);
gmm.ComputeGconsts();
}

Просмотреть файл

@ -408,157 +408,6 @@ void ExponentialTransformAccsA::Init(int32 dim) {
Ahat_.Resize(dim, dim+1);
}
// Sets up empty accumulators for the B update: zero occupancy count and one
// dim x dim symmetric stats matrix per feature dimension.
void ExponentialTransformAccsB::Init(int32 dim) {
  beta_ = 0.0;
  G_.resize(dim);
  for (int32 row = 0; row < dim; row++) {
    G_[row].Resize(dim);
  }
}
// Accumulates MLLT-style statistics for estimating matrix B, from one frame.
// "t_data" is the feature vector AFTER the speaker transform W_s has been
// applied; "Ds" is the speaker-specific diagonal (or mean-only) transform,
// which is undone on the features before accumulating (it is treated here as
// a model-space transform).  "posteriors" holds per-Gaussian occupation
// probabilities for this frame.
void
ExponentialTransformAccsB::
AccumulateFromPosteriors(const DiagGmm &gmm,
                         const VectorBase<BaseFloat> &t_data,
                         const VectorBase<BaseFloat> &posteriors,
                         const MatrixBase<BaseFloat> &Ds) {
  int32 dim = G_.size();
  KALDI_ASSERT(dim == gmm.Dim() &&
               dim == t_data.Dim());
  KALDI_ASSERT(posteriors.Dim() == gmm.NumGauss());
  Vector<BaseFloat> tmp_data(dim);
  KALDI_ASSERT(Ds.NumRows() == dim && Ds.NumCols() == dim+1);
  KALDI_ASSERT(dim < 2 || (fabs(Ds(1, 0)) < 0.01 && Ds(0, 0) != 0.0)); // quick check it's diagonal.
  // Apply the reverse transformation of Ds to the features.
  // If d_i is the i'th diagonal element and b_i is the i'th offset element,
  // it transforms y_i = x_i d_i + b_i.
  // The reverse transformation is: x_i = (y_i - b_i) / d_i
  for (int32 i = 0; i < dim; i++)
    tmp_data(i) = (t_data(i) - Ds(i, dim)) / Ds(i, i);
  // Note on transforming means and variances from the adapted to the
  // speaker-independent space...
  // the transformation on means is as above:
  // mu_i -> (mu_i - b_i) / d_i
  // and the transformation on variances is:
  // sigma_i^2 -> sigma_i^2 / d_i^2.
  Matrix<BaseFloat> gmm_means;
  gmm.GetMeans(&gmm_means);
  const Matrix<BaseFloat> &inv_vars = gmm.inv_vars();
  double this_beta = 0.0;  // occupancy accumulated from this frame.
  Vector<double> offset(dim);   // data minus adapted mean, per Gaussian.
  SpMatrix<double> offset2(dim);  // outer product of "offset" with itself.
  for (int32 i = 0; i < gmm.NumGauss(); i++) {
    BaseFloat gamma = posteriors(i);
    if (gamma < 1.0e-05) continue;  // skip negligible posteriors for speed.
    this_beta += gamma;
    offset.CopyFromVec(tmp_data);
    for (int32 j = 0; j < dim; j++) {
      BaseFloat adapted_mean = (gmm_means(i, j) - Ds(j, dim)) / Ds(j, j);
      // adapted_mean is viewing Ds as a model-space transform.
      offset(j) -= adapted_mean;
    }
    offset2.SetZero();
    offset2.AddVec2(1.0, offset);
    for (int32 j = 0; j < dim; j++) {
      BaseFloat adapted_inv_var = inv_vars(i, j) * Ds(j, j) * Ds(j, j);
      // was: G_[j].AddVec2(gamma * adapted_inv_var, offset);
      // This should be more efficient (offset2 computed once per Gaussian).
      G_[j].AddSp(gamma * adapted_inv_var, offset2);
    }
  }
  beta_ += this_beta;
}
// Does the MLLT-style update of matrix B: estimates a square transform C by
// an iterative row-by-row update (Gales' semi-tied covariance update), then
// applies it to the exponential transform's A and B matrices.  Outputs the
// d x d part of C in "Cpart"; the caller should apply this to model means
// (e.g. via gmm-transform-means).  "objf_impr_out"/"count_out", if non-NULL,
// get the total auxiliary-function improvement and frame count.
void ExponentialTransformAccsB::Update(ExponentialTransform *et,
                                       BaseFloat *objf_impr_out,
                                       BaseFloat *count_out,
                                       MatrixBase<BaseFloat> *Cpart) {
  int32 dim = G_.size();
  KALDI_ASSERT(beta_ > 2*dim);  // insist on a reasonable amount of data.
  KALDI_ASSERT(dim > 0 && et->Dim() == dim);
  BaseFloat objf_impr = 0.0;
  Matrix<double> transform(dim, dim);
  transform.SetUnit();  // start the iterative update from the identity.
  // Pre-invert the per-row quadratic stats G_i; each row update below needs
  // G_i^{-1}, which does not change across iterations.
  std::vector<SpMatrix<double> > Ginv(dim);
  for (int32 i = 0; i < dim; i++) {
    Ginv[i].Resize(dim);
    Ginv[i].CopyFromSp(G_[i]);
    Ginv[i].Invert();
  }
  for (int32 iter = 0; iter < 100; iter++) {
    for (int32 i = 0; i < dim; i++) { // for each row...
      SubVector<double> row(transform, i);
      Vector<double> cofactor(dim); // actually cofactor times a constant.
      {
        // Row i of the inverse-transpose is proportional to the cofactor
        // vector of row i of "transform".
        Matrix<double> inv(transform);
        inv.Invert();
        inv.Transpose();
        cofactor.CopyFromVec(inv.Row(i));
      }
      // Auxf is beta * log(cofactor . row) - 0.5 * row^T G_i row
      double old_auxf = beta_ * log(std::abs(VecVec(cofactor, row)))
          -0.5 * VecSpVec(row, G_[i], row);
      // see Gales, "Semi-tied covariance matrices for hidden
      // markov models", feb. 1998 (techreport version of IEEE paper),
      // eq. 22,
      // a_i <== c_i G_i^{-1} \sqrt( beta / c_i^T G_i^{-1} c_i )
      // [notation is not exactly as in original].
      row.AddSpVec(sqrt(beta_ / VecSpVec(cofactor, Ginv[i], cofactor)),
                   Ginv[i], cofactor, 0.0);
      double new_auxf = beta_ * log(std::abs(VecVec(cofactor, row)))
          -0.5 * VecSpVec(row, G_[i], row);
      if (new_auxf < old_auxf - 0.0001 * beta_)
        KALDI_ERR << "Auxf decreased in MLLR update.";
      objf_impr += new_auxf - old_auxf;
    }
  }
  if (et->norm_type_ == kEtNormalizeMeanAndVar)
    KALDI_WARN << "Update for B is not guaranteed to improve objective "
        "when both mean and variance normalization is being done.";
  KALDI_LOG << "Updating matrix B: auxf improvement is "
            << (objf_impr/beta_) << " per frame over " << beta_
            << " frames.\n";
  if (objf_impr_out)
    *objf_impr_out = objf_impr;
  if (count_out)
    *count_out = beta_;
  KALDI_ASSERT(Cpart != NULL && Cpart->NumRows() == dim
               && Cpart->NumCols() == dim);
  Cpart->CopyFromMat(transform); // this is what we would apply to the
  // means.
  // HERE: actually apply to A and B.
  // We will do (with M the transform we just estimated):
  // B' <-- M B
  // A' <-- M A M^{-1}
  // This way we will have exp(tA') B' = (I + t M A M^{-1} + .. ) M B
  // = M ( I + t A + .. ) B.
  // To do this we need to extend M with one more row 0 0 ... 0 1.
  Matrix<BaseFloat> Cfull(dim+1, dim+1);
  SubMatrix<BaseFloat> Cfull_part(Cfull, 0, dim, 0, dim);
  Cfull_part.CopyFromMat(transform);
  Cfull(dim, dim) = 1.0;  // the extra row/column: last diagonal element is 1.
  Matrix<BaseFloat> Cfull_inv(Cfull);
  Cfull_inv.Invert();
  Matrix<BaseFloat> tmp(dim+1, dim+1), new_A(dim+1, dim+1), new_B(dim+1, dim+1);
  // tmp <- A M^{-1}
  tmp.AddMatMat(1.0, et->A_, kNoTrans, Cfull_inv, kNoTrans, 0.0);
  // new_A <-- M tmp
  new_A.AddMatMat(1.0, Cfull, kNoTrans, tmp, kNoTrans, 0.0);
  et->A_.CopyFromMat(new_A);
  // new_B <-- M B
  new_B.AddMatMat(1.0, Cfull, kNoTrans, et->B_, kNoTrans, 0.0);
  et->B_.CopyFromMat(new_B);
}
void ExponentialTransform::Write(std::ostream &os, bool binary) const {
WriteMarker(os, binary, "<ExponentialTransform>");
WriteMarker(os, binary, "<A>");
@ -585,42 +434,6 @@ void ExponentialTransform::Read(std::istream &is, bool binary) {
}
// Writes the B-update accumulators (count beta_ and the per-dimension
// quadratic stats G_) in Kaldi's marker-delimited format.  Must stay in
// sync with Read() below.
void ExponentialTransformAccsB::Write(std::ostream &os, bool binary) const {
  WriteMarker(os, binary, "<ExponentialTransformAccsB>");
  WriteMarker(os, binary, "<Beta>");
  WriteBasicType(os, binary, beta_);
  WriteMarker(os, binary, "<Dim>");
  int32 dim = G_.size();
  WriteBasicType(os, binary, dim);
  WriteMarker(os, binary, "<G>");
  for (int32 i = 0; i < dim; i++)
    G_[i].Write(os, binary);
  WriteMarker(os, binary, "</ExponentialTransformAccsB>");
}
// Reads accumulators previously written by Write().  If "add" is true, the
// stats are summed with any already present (used when combining accs from
// parallel jobs); dimensions must then match.
// NOTE(review): the stream parameter is an std::istream despite being named
// "os"; renaming would be a code change so it is left as-is here.
void ExponentialTransformAccsB::Read(std::istream &os, bool binary, bool add) {
  if (G_.empty()) add = false; // don't add to nonexistent stats...
  ExpectMarker(os, binary, "<ExponentialTransformAccsB>");
  ExpectMarker(os, binary, "<Beta>");
  double beta;
  ReadBasicType(os, binary, &beta);
  if (add) beta_ += beta;
  else beta_ = beta;
  ExpectMarker(os, binary, "<Dim>");
  int32 dim;
  ReadBasicType(os, binary, &dim);
  if (!add) G_.resize(dim);
  else {
    // When adding, the stats being read must have the same dimension as the
    // stats already accumulated.
    if (static_cast<size_t>(dim) != G_.size())
      KALDI_ERR << "Reading accs for updating B in exponential transform, "
                << "dim mismatch " << dim << " vs. " << G_.size();
  }
  ExpectMarker(os, binary, "<G>");
  for (size_t i = 0; i < G_.size(); i++)
    G_[i].Read(os, binary, add);  // per-matrix "add" handles the summation.
  ExpectMarker(os, binary, "</ExponentialTransformAccsB>");
}
void ExponentialTransformAccsA::Write(std::ostream &os, bool binary) const {
WriteMarker(os, binary, "<ExponentialTransformAccsA>");

Просмотреть файл

@ -119,45 +119,6 @@ class ExponentialTransform {
};
// This is an MLLT type of update.
// Accumulator class for the MLLT-style update of matrix B of the
// exponential transform.  Typical flow: Init/ctor, one
// AccumulateFromPosteriors() call per frame, optional Write/Read (with
// add=true to sum accs across jobs), then Update().
class ExponentialTransformAccsB {
 public:
  ExponentialTransformAccsB() { } // typically only used prior to Read().
  // NOTE(review): non-explicit single-arg constructor permits implicit
  // int32 -> ExponentialTransformAccsB conversion; consider "explicit".
  ExponentialTransformAccsB(int32 dim) { Init(dim); }
  // Allocates beta_ and the dim quadratic-stats matrices, all zeroed.
  void Init(int32 dim);
  // AccumulateFromPosteriors is as in the base class, except we
  // supply the transform D_s (expected to be a diagonal or mean-only
  // transform), which is treated as a model-space transform here.
  // Here, "t_data" is the data transformed by the transform W_s.
  // Be careful-- this is different from the accumulation for A, in which
  // the fMLLR stats are accumulated given the original data.
  void AccumulateFromPosteriors(const DiagGmm &gmm,
                                const VectorBase<BaseFloat> &t_data,
                                const VectorBase<BaseFloat> &posteriors,
                                const MatrixBase<BaseFloat> &Ds);
  // The Update function does the MLLT update for B. It sets "Cpart"
  // (the first d x d block of C) to the transform that we would have
  // to apply to the model means.
  void Update(ExponentialTransform *et,
              BaseFloat *objf_impr,
              BaseFloat *count,
              MatrixBase<BaseFloat> *Cpart);
  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary, bool add = false);
 private:
  double beta_;  // total accumulated occupancy (frame count).
  std::vector<SpMatrix<double> > G_;  // one dim x dim stats matrix per feature dimension.
};
struct ExponentialTransformUpdateAOptions {