зеркало из https://github.com/mozilla/kaldi.git
Finishing the scripts for the ASRU papers.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@119 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
b4bd583a07
Коммит
a70d3f856b
|
@ -59,13 +59,16 @@ exp/decode_sgmma_fmllrbasis_utt/wer:Average WER is 3.191574 (400 / 12533)
|
|||
|
||||
# sgmmb is SGMM with speaker vectors.
|
||||
exp/decode_sgmmb/wer:Average WER is 2.760712 (346 / 12533)
|
||||
exp/decode_sgmmb_fmllr/wer:Average WER is 2.585175 (324 / 12533)
|
||||
exp/decode_sgmmb_utt/wer:Average WER is 2.808585 (352 / 12533)
|
||||
exp/decode_sgmmb/wer:Average WER is 2.760712 (346 / 12533)
|
||||
|
||||
# sgmmc is like sgmmb but with gender dependency
|
||||
exp/decode_sgmmc/wer:Average WER is 2.696880 (338 / 12533)
|
||||
exp/decode_sgmmc_fmllr/wer:Average WER is 2.457512 (308 / 12533)
|
||||
# "norm" is normalizing weights per gender..
|
||||
exp/decode_sgmmc_norm/wer:Average WER is 2.696880 (338 / 12533)
|
||||
exp/decode_sgmmc_fmllr_norm/wer:Average WER is 2.425596 (304 / 12533)
|
||||
|
||||
# sgmmc is like sgmmb but with gender dependency [doesn't help here]
|
||||
exp/decode_sgmmc/wer:Average WER is 2.776670 (348 / 12533)
|
||||
exp/decode_sgmmc_fmllr/wer:Average WER is 2.601133 (326 / 12533)
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ preselectmap=exp/ubmb/preselect.map
|
|||
mincount=1000 # min occupancy to estimate fMLLR transform
|
||||
iters=10 # number of iters of fMLLR estimation
|
||||
|
||||
if [ ! -f $fmllr_model ]; then
|
||||
if [ ! -f $fmllr_model -o $model -nt $fmllr_model ]; then
|
||||
if [ ! -f $model ]; then
|
||||
echo "Cannot find $model. Maybe training didn't finish?"
|
||||
exit 1;
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2010-2011 Microsoft Corporation, Arnab Ghoshal
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# SGMM decoding with adaptation.
|
||||
#
|
||||
# SGMM decoding; use a different acoustic scale from normal (0.1 vs 0.08333)
|
||||
# (1) decode with "alignment model"
|
||||
# (2) get GMM posteriors with "alignment model" and estimate speaker
|
||||
# vectors with final model
|
||||
# (3) decode with final model.
|
||||
# (4) get GMM posteriors from this decoded output and estimate fMLLR transforms
|
||||
# with this final model
|
||||
# (5) decode with the final model using both the speaker vectors and fMLLR
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
dir=exp/decode_sgmmc_fmllr_norm
|
||||
tree=exp/sgmmc/tree
|
||||
occs=exp/sgmmc/final.occs
|
||||
modelin=exp/sgmmc/final.mdl
|
||||
alimodelin=exp/sgmmc/final.alimdl
|
||||
model=exp/sgmmc/final.mdl.norm
|
||||
alimodel=exp/sgmmc/final.alimdl.norm
|
||||
fmllr_model=exp/sgmmc/final_fmllr.mdl.norm
|
||||
graphdir=exp/graph_sgmmc
|
||||
silphonelist=`cat data/silphones.csl`
|
||||
preselectmap=exp/ubmb/preselect.map
|
||||
|
||||
mincount=1000 # min occupancy to estimate fMLLR transform
|
||||
iters=10 # number of iters of fMLLR estimation
|
||||
|
||||
|
||||
mkdir -p $dir
|
||||
|
||||
sgmm-normalize $modelin ark:$preselectmap $model 2>$dir/normalize.log
|
||||
sgmm-normalize $alimodelin ark:$preselectmap $alimodel 2>>$dir/normalize.log
|
||||
|
||||
sgmm-comp-prexform $model $occs $fmllr_model 2>$dir/prexform.log
|
||||
|
||||
|
||||
scripts/mkgraph.sh $tree $model $graphdir
|
||||
|
||||
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
|
||||
(
|
||||
feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- |"
|
||||
spk2utt_opt="--spk2utt=ark:data/test_${test}.spk2utt"
|
||||
utt2spk_opt="--utt2spk=ark:data/test_${test}.utt2spk"
|
||||
scripts/compose_maps.pl data/test_${test}.utt2spk data/spk2gender.map | \
|
||||
scripts/compose_maps.pl - $preselectmap | \
|
||||
gzip -c > $dir/preselect_${test}.gz
|
||||
|
||||
# Gaussian selection restricted to the per-utterance preselect sets; the result
# is stored gzipped and read back through $gselect_opt.  This runs inside one
# background subshell per test set, so stderr goes to a per-test-set log
# (a single shared gselect.log would be overwritten by the parallel jobs).
sgmm-gselect "--preselect=ark:gunzip -c $dir/preselect_${test}.gz|" \
|
||||
$model "$feats" ark,t:- 2>$dir/gselect_${test}.log | \
|
||||
gzip -c > $dir/${test}_gselect.gz || exit 1;
|
||||
gselect_opt="--gselect=ark:gunzip -c $dir/${test}_gselect.gz|"
|
||||
|
||||
# Use smaller beam for the first pass decoding.
|
||||
sgmm-decode-faster "$gselect_opt" --beam=15.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $alimodel $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.pass1.tra ark,t:$dir/test_${test}.pass1.ali 2> $dir/pass1_${test}.log
|
||||
|
||||
# Estimate the speaker vectors
|
||||
( ali-to-post ark:$dir/test_${test}.pass1.ali ark:- | \
|
||||
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
|
||||
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
|
||||
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
|
||||
ark:$dir/test_${test}.vecs1 ) 2>$dir/vecs1_${test}.log
|
||||
|
||||
( ali-to-post ark:$dir/test_${test}.pass1.ali ark:- | \
|
||||
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
|
||||
sgmm-est-spkvecs "$gselect_opt" --spk-vecs=ark:$dir/test_${test}.vecs1 $spk2utt_opt \
|
||||
$model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log
|
||||
|
||||
# Second-pass decoding with the speaker vectors.
|
||||
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --spk-vecs=ark:$dir/test_${test}.vecs2 --beam=20.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.pass2.tra ark,t:$dir/test_${test}.pass2.ali 2> $dir/pass2_${test}.log
|
||||
|
||||
# Estimate the fMLLR transforms.
|
||||
( ali-to-post ark:$dir/test_${test}.pass2.ali ark:- | \
|
||||
weight-silence-post 0.01 $silphonelist $model ark:- ark:- | \
|
||||
sgmm-post-to-gpost --spk-vecs=ark:$dir/test_${test}.vecs2 $utt2spk_opt \
|
||||
"$gselect_opt" $model "$feats" ark,s,cs:- ark:- | \
|
||||
sgmm-est-fmllr-gpost --fmllr-iters=$iters --fmllr-min-count=$mincount \
|
||||
--spk-vecs=ark:$dir/test_${test}.vecs2 "$spk2utt_opt" $fmllr_model \
|
||||
"$feats" ark,s,cs:- ark:$dir/test_${test}.fmllr ) \
|
||||
2>$dir/est_fmllr_${test}.log
|
||||
|
||||
adapt_feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- | transform-feats $utt2spk_opt ark:$dir/test_${test}.fmllr ark:- ark:- |"
|
||||
|
||||
# Now decode with fMLLR-adapted features. Gaussian selection is also done
|
||||
# with the adapted features. This causes a small improvement in WER on RM.
|
||||
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --beam=20.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt --spk-vecs=ark:$dir/test_${test}.vecs2 $fmllr_model $graphdir/HCLG.fst "$adapt_feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
|
||||
|
||||
# the ,p option lets it score partial output without dying..
|
||||
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
|
||||
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
|
||||
) &
|
||||
done
|
||||
|
||||
wait
|
||||
|
||||
grep WER $dir/wer_* | \
|
||||
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
|
||||
> $dir/wer
|
|
@ -0,0 +1,85 @@
|
|||
# Copyright 2010-2011 Microsoft Corporation
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# SGMM decoding with adaptation [with gender-dependent UBM].
|
||||
#
|
||||
# SGMM decoding; use a different acoustic scale from normal (0.1 vs 0.08333)
|
||||
# (1) decode with "alignment model"
|
||||
# (2) get GMM posteriors with "alignment model" and estimate speaker
|
||||
# vectors with final model
|
||||
# (3) decode with final model.
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
dir=exp/decode_sgmmc_norm
|
||||
tree=exp/sgmmc/tree
|
||||
modelin=exp/sgmmc/final.mdl
|
||||
alimodelin=exp/sgmmc/final.alimdl
|
||||
model=exp/sgmmc/final.mdl.norm
|
||||
alimodel=exp/sgmmc/final.alimdl.norm
|
||||
|
||||
graphdir=exp/graph_sgmmc
|
||||
silphonelist=`cat data/silphones.csl`
|
||||
preselectmap=exp/ubmb/preselect.map
|
||||
|
||||
mkdir -p $dir
|
||||
|
||||
sgmm-normalize $modelin ark:$preselectmap $model 2>$dir/normalize.log
|
||||
sgmm-normalize $alimodelin ark:$preselectmap $alimodel 2>>$dir/normalize.log
|
||||
|
||||
|
||||
scripts/mkgraph.sh $tree $model $graphdir
|
||||
|
||||
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
|
||||
(
|
||||
feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- |"
|
||||
spk2utt_opt="--spk2utt=ark:data/test_${test}.spk2utt"
|
||||
utt2spk_opt="--utt2spk=ark:data/test_${test}.utt2spk"
|
||||
scripts/compose_maps.pl data/test_${test}.utt2spk data/spk2gender.map | \
|
||||
scripts/compose_maps.pl - $preselectmap | \
|
||||
gzip -c > $dir/preselect_${test}.gz
|
||||
|
||||
|
||||
# Gaussian selection restricted to the per-utterance preselect sets; the result
# is stored gzipped and read back through $gselect_opt.  This runs inside one
# background subshell per test set, so stderr goes to a per-test-set log
# (a single shared gselect.log would be overwritten by the parallel jobs).
sgmm-gselect "--preselect=ark:gunzip -c $dir/preselect_${test}.gz|" \
|
||||
$model "$feats" ark,t:- 2>$dir/gselect_${test}.log | \
|
||||
gzip -c > $dir/${test}_gselect.gz || exit 1;
|
||||
gselect_opt="--gselect=ark:gunzip -c $dir/${test}_gselect.gz|"
|
||||
|
||||
# Use smaller beam first time.
|
||||
sgmm-decode-faster "$gselect_opt" --beam=15.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $alimodel $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.pre_tra ark,t:$dir/test_${test}.pre_ali 2> $dir/predecode_${test}.log
|
||||
|
||||
( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
|
||||
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
|
||||
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
|
||||
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
|
||||
ark:$dir/test_${test}.vecs1 ) 2>$dir/vecs1_${test}.log
|
||||
|
||||
( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
|
||||
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
|
||||
sgmm-est-spkvecs "$gselect_opt" --spk-vecs=ark:$dir/test_${test}.vecs1 $spk2utt_opt \
|
||||
$model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log
|
||||
|
||||
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --spk-vecs=ark:$dir/test_${test}.vecs2 --beam=20.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
|
||||
|
||||
# the ,p option lets it score partial output without dying..
|
||||
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
|
||||
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
|
||||
) &
|
||||
done
|
||||
|
||||
wait
|
||||
|
||||
grep WER $dir/wer_* | \
|
||||
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
|
||||
> $dir/wer
|
|
@ -16,8 +16,9 @@
|
|||
|
||||
|
||||
# Train gender-dependent UBM from a trained HMM/GMM system.
|
||||
# Instead of 400 UBM Gaussians, use 250 UBM Gaussians per gender, for
|
||||
# a total of 500.
|
||||
# We're aiming for 500 UBM Gaussians total.
|
||||
# Because RM is unbalanced (55 female, 109 male), we train 200
|
||||
# UBM Gaussians for female and 300 for male.
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
||||
|
@ -25,13 +26,19 @@ dir=exp/ubmb
|
|||
mkdir -p $dir
|
||||
srcdir=exp/tri1
|
||||
|
||||
if [ ! -f $dir/0.m.ubm ]; then
|
||||
init-ubm --intermediate-numcomps=2000 --ubm-numcomps=250 --verbose=2 \
|
||||
--fullcov-ubm=true $srcdir/final.mdl $srcdir/final.occs \
|
||||
$dir/0.m.ubm 2> $dir/cluster.log || exit 1;
|
||||
fi
|
||||
rm -f $dir/.error
|
||||
|
||||
init-ubm --intermediate-numcomps=2000 --ubm-numcomps=300 --verbose=2 \
|
||||
--fullcov-ubm=true $srcdir/final.mdl $srcdir/final.occs \
|
||||
$dir/0.m.ubm 2> $dir/cluster.log || touch $dir/.error &
|
||||
|
||||
init-ubm --intermediate-numcomps=2000 --ubm-numcomps=200 --verbose=2 \
|
||||
--fullcov-ubm=true $srcdir/final.mdl $srcdir/final.occs \
|
||||
$dir/0.f.ubm 2> $dir/cluster.log || touch $dir/.error &
|
||||
|
||||
wait;
|
||||
[ -f $dir/.error ] && echo "Error clustering UBM Gaussians" && exit 1;
|
||||
|
||||
cp $dir/0.m.ubm $dir/0.f.ubm
|
||||
cp data/train.scp $dir/train.scp
|
||||
|
||||
scripts/compose_maps.pl data/train.utt2spk data/spk2gender.map | grep -w m | \
|
||||
|
|
|
@ -82,10 +82,12 @@ system:
|
|||
[spk;+fmllr] 8.3 11.3 | [per-speaker adaptation; +fMLLR]
|
||||
sgmm3b 7.8 10.4 | [ SGMM with speaker vectors, on SI-284]
|
||||
[utt] 7.8 10.4 | [per-utterance adaptation]
|
||||
[spk;+fmllr] 7.8 10.2 | [per-speaker adaptation, with fMLLR]
|
||||
sgmm3c 7.7 9.9 | [ as sgmm3b but gender-dep. UBM]
|
||||
[utt] 7.7 10.1 | [per-utterance adaptation]
|
||||
[fmllr] 7.7 9.7 | [per-spk, with fMLLR]
|
||||
[spk;+fmllr] 7.8 10.0 | [per-speaker adaptation, with fMLLR]
|
||||
sgmm3c 7.5 9.5 | [ as sgmm3b but gender-dep. UBM]
|
||||
[+norm] 7.5 9.6 | [normalizing weights per gender]
|
||||
[utt] 7.7 9.6 | [per-utterance adaptation]
|
||||
[fmllr] 7.6 9.2 | [per-spk, with fMLLR]
|
||||
[+norm] 7.5 9.3 | [normalizing weights per gender]
|
||||
|
||||
# Raw results:
|
||||
exp/decode_mono_tgpr_eval92/wer:%WER 31.38 [ 1770 / 5641, 108 ins, 386 del, 1276 sub ]
|
||||
|
@ -266,14 +268,27 @@ exp/decode_sgmm2b_tgpr_utt_eval93/wer:%WER 13.72 [ 472 / 3439, 60 ins, 68 del, 3
|
|||
exp/decode_sgmm2b_fmllr_tgpr_eval92/wer:%WER 9.93 [ 560 / 5641, 130 ins, 42 del, 388 sub ]
|
||||
exp/decode_sgmm2b_fmllr_tgpr_eval93/wer:%WER 13.49 [ 464 / 3439, 54 ins, 72 del, 338 sub ]
|
||||
|
||||
exp/decode_sgmm3b_fmllr_tgpr_eval92/wer:%WER 7.36 [ 415 / 5641, 110 ins, 14 del, 291 sub ]
|
||||
exp/decode_sgmm3b_fmllr_tgpr_eval93/wer:%WER 9.94 [ 342 / 3439, 56 ins, 49 del, 237 sub ]
|
||||
exp/decode_sgmm3b_tgpr_eval92/wer:%WER 7.68 [ 433 / 5641, 117 ins, 15 del, 301 sub ]
|
||||
exp/decode_sgmm3b_tgpr_eval93/wer:%WER 10.32 [ 355 / 3439, 58 ins, 55 del, 242 sub ]
|
||||
exp/decode_sgmm3b_tgpr_utt_eval92/wer:%WER 7.59 [ 428 / 5641, 111 ins, 17 del, 300 sub ]
|
||||
exp/decode_sgmm3b_tgpr_utt_eval93/wer:%WER 9.94 [ 342 / 3439, 52 ins, 52 del, 238 sub ]
|
||||
exp/decode_sgmm3b_fmllr_tgpr_eval92/wer:%WER 7.73 [ 436 / 5641, 118 ins, 15 del, 303 sub ]
|
||||
exp/decode_sgmm3b_fmllr_tgpr_eval93/wer:%WER 10.00 [ 344 / 3439, 57 ins, 47 del, 240 sub ]
|
||||
exp/decode_sgmm3b_tgpr_eval92/wer:%WER 7.78 [ 439 / 5641, 118 ins, 15 del, 306 sub ]
|
||||
exp/decode_sgmm3b_tgpr_eval93/wer:%WER 10.35 [ 356 / 3439, 58 ins, 47 del, 251 sub ]
|
||||
exp/decode_sgmm3b_tgpr_utt_eval92/wer:%WER 7.80 [ 440 / 5641, 119 ins, 13 del, 308 sub ]
|
||||
exp/decode_sgmm3b_tgpr_utt_eval93/wer:%WER 10.38 [ 357 / 3439, 55 ins, 50 del, 252 sub ]
|
||||
|
||||
exp/decode_sgmm3c_fmllr_tgpr_eval92/wer:%WER 7.55 [ 426 / 5641, 111 ins, 14 del, 301 sub ]
|
||||
exp/decode_sgmm3c_fmllr_tgpr_eval93/wer:%WER 9.16 [ 315 / 3439, 54 ins, 41 del, 220 sub ]
|
||||
exp/decode_sgmm3c_fmllr_tgpr_norm_eval92/wer:%WER 7.46 [ 421 / 5641, 111 ins, 13 del, 297 sub ]
|
||||
exp/decode_sgmm3c_fmllr_tgpr_norm_eval93/wer:%WER 9.25 [ 318 / 3439, 54 ins, 41 del, 223 sub ]
|
||||
exp/decode_sgmm3c_tgpr_eval92/wer:%WER 7.52 [ 424 / 5641, 113 ins, 13 del, 298 sub ]
|
||||
exp/decode_sgmm3c_tgpr_eval93/wer:%WER 9.51 [ 327 / 3439, 55 ins, 42 del, 230 sub ]
|
||||
exp/decode_sgmm3c_tgpr_norm_eval92/wer:%WER 7.48 [ 422 / 5641, 111 ins, 14 del, 297 sub ]
|
||||
exp/decode_sgmm3c_tgpr_norm_eval93/wer:%WER 9.62 [ 331 / 3439, 55 ins, 43 del, 233 sub ]
|
||||
exp/decode_sgmm3c_tgpr_utt_eval92/wer:%WER 7.69 [ 434 / 5641, 110 ins, 17 del, 307 sub ]
|
||||
exp/decode_sgmm3c_tgpr_utt_eval93/wer:%WER 9.62 [ 331 / 3439, 55 ins, 46 del, 230 sub ]
|
||||
|
||||
|
||||
################
|
||||
# Results below this point may be out of date.
|
||||
===========
|
||||
# Some notes on tuning the SGMM systems on half the SI-84 data (sgmm2a and sgmm2b).
|
||||
# We ended up with 400 UBM components, and acwt 1/11 (unadapted) and 1/12 (adapted..
|
||||
|
|
|
@ -418,6 +418,8 @@ steps/train_sgmm3c.sh || exit 1;
|
|||
scripts/decode.sh --per-spk exp/decode_sgmm3c_tgpr_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov${year}.scp
|
||||
scripts/decode.sh exp/decode_sgmm3c_tgpr_utt_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov${year}.scp
|
||||
scripts/decode.sh --per-spk exp/decode_sgmm3c_fmllr_tgpr_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c_fmllr.sh data/eval_nov${year}.scp
|
||||
scripts/decode.sh --per-spk exp/decode_sgmm3c_tgpr_norm_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c_norm.sh data/eval_nov${year}.scp
|
||||
scripts/decode.sh --per-spk exp/decode_sgmm3c_fmllr_tgpr_norm_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c_fmllr_norm.sh data/eval_nov${year}.scp
|
||||
done
|
||||
)&
|
||||
|
||||
|
|
|
@ -0,0 +1,127 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# This script does the decoding of a single batch of test data (on one core).
|
||||
# It requires arguments. It takes the graphdir and decoding directory,
|
||||
# and the job number which can actually be any string (even ""); it expects
|
||||
# a file $decode_dir/${job_number}.scp to exist, and puts its output in
|
||||
# $decode_dir/${job_number}.tra
|
||||
|
||||
|
||||
# Exactly three positional arguments are required: the decoding graph, the
# decoding directory and the job id.  The usage line prints $0 so it always
# names the script that was actually invoked instead of a hard-coded path.
if [ $# != 3 ]; then
|
||||
echo "Usage: $0 <graph> <decode-dir> <job-number>"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
. path.sh || exit 1;
|
||||
|
||||
acwt=0.08333
|
||||
prebeam=12.0
|
||||
beam=13.0
|
||||
max_active=7000
|
||||
silphones=`cat data/silphones.csl`
|
||||
model=exp/sgmm3c/final.mdl.norm
|
||||
occs=exp/sgmm3c/final.occs
|
||||
alimodel=exp/sgmm3c/final.alimdl.norm
|
||||
preselectmap=exp/ubm3b/preselect.map
|
||||
fmllr_model=exp/sgmm3c/final_fmllr.mdl.norm
|
||||
graph=$1
|
||||
dir=$2
|
||||
job=$3
|
||||
scp=$dir/$job.scp
|
||||
feats="ark:add-deltas --print-args=false scp:$scp ark:- |"
|
||||
|
||||
if [ ! -f $fmllr_model ]; then
|
||||
if [ ! -f $model ]; then
|
||||
echo "Cannot find $model. Maybe training didn't finish?"
|
||||
exit 1;
|
||||
fi
|
||||
sgmm-comp-prexform $model $occs $fmllr_model
|
||||
fi
|
||||
|
||||
|
||||
filenames="$scp $alimodel $fmllr_model $model $graph data/words.txt"
|
||||
for file in $filenames; do
|
||||
if [ ! -f $file ] ; then
|
||||
echo "No such file $file";
|
||||
exit 1;
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -f $dir/$job.spk2utt ]; then
|
||||
if [ ! -f $dir/$job.utt2spk ]; then
|
||||
echo "spk2utt but not utt2spk file present!"
|
||||
exit 1
|
||||
fi
|
||||
spk2utt_opt=--spk2utt=ark:$dir/$job.spk2utt
|
||||
utt2spk_opt=--utt2spk=ark:$dir/$job.utt2spk
|
||||
fi
|
||||
|
||||
cat data/eval*.utt2spk | \
|
||||
scripts/compose_maps.pl - data/spk2gender.map | \
|
||||
scripts/compose_maps.pl - $preselectmap | \
|
||||
scripts/filter_scp.pl $scp - | \
|
||||
gzip -c > $dir/preselect.$job.gz
|
||||
|
||||
echo running on `hostname` > $dir/decode${job}.log
|
||||
|
||||
|
||||
sgmm-gselect "--preselect=ark:gunzip -c $dir/preselect.$job.gz|" \
|
||||
$model "$feats" ark,t:- 2>$dir/gselect${job}.log | \
|
||||
gzip -c > $dir/gselect${job}.gz || exit 1;
|
||||
gselect_opt="--gselect=ark:gunzip -c $dir/gselect${job}.gz|"
|
||||
|
||||
|
||||
sgmm-decode-faster "$gselect_opt" --beam=$prebeam --max-active=$max_active \
|
||||
--acoustic-scale=$acwt \
|
||||
--word-symbol-table=data/words.txt $alimodel $graph "$feats" \
|
||||
ark,t:$dir/$job.pre_tra ark,t:$dir/$job.pre_ali 2>$dir/predecode${job}.log || exit 1;
|
||||
|
||||
( ali-to-post ark:$dir/${job}.pre_ali ark:- | \
|
||||
weight-silence-post 0.01 $silphones $alimodel ark:- ark:- | \
|
||||
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
|
||||
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
|
||||
ark:$dir/${job}.vecs1 ) 2>$dir/vecs1${job}.log || exit 1;
|
||||
|
||||
( ali-to-post ark:$dir/${job}.pre_ali ark:- | \
|
||||
weight-silence-post 0.01 $silphones $alimodel ark:- ark:- | \
|
||||
sgmm-est-spkvecs "$gselect_opt" --spk-vecs=ark,t:$dir/${job}.vecs1 $spk2utt_opt $model \
|
||||
"$feats" ark,s,cs:- ark:$dir/${job}.vecs2 ) 2>$dir/vecs2.${job}.log || exit 1;
|
||||
|
||||
# second pass of decoding: have spk-vecs but not fMLLR
|
||||
sgmm-decode-faster "$gselect_opt" --beam=$prebeam --max-active=$max_active \
|
||||
$utt2spk_opt --spk-vecs=ark:$dir/${job}.vecs2 \
|
||||
--acoustic-scale=$acwt \
|
||||
--word-symbol-table=data/words.txt $model $graph "$feats" \
|
||||
ark,t:$dir/$job.pre2_tra ark,t:$dir/$job.pre2_ali 2>$dir/pre2decode${job}.log || exit 1;
|
||||
|
||||
|
||||
# Estimate fMLLR transforms.
|
||||
|
||||
# Convert the second-pass alignments to posteriors, down-weight silence (0.01),
# expand to Gaussian-level posteriors using the speaker vectors, and estimate
# the fMLLR transforms into $dir/$job.fmllr.  Abort on failure like the earlier
# stages do: the final decoding pass reads $dir/$job.fmllr.
( ali-to-post ark:$dir/$job.pre2_ali ark:- | \
|
||||
weight-silence-post 0.01 $silphones $model ark:- ark:- | \
|
||||
sgmm-post-to-gpost --spk-vecs=ark:$dir/${job}.vecs2 $utt2spk_opt "$gselect_opt" $model "$feats" ark,s,cs:- ark:- | \
|
||||
sgmm-est-fmllr-gpost --spk-vecs=ark:$dir/${job}.vecs2 $spk2utt_opt $fmllr_model "$feats" ark,s,cs:- \
|
||||
ark:$dir/$job.fmllr ) 2>$dir/est_fmllr${job}.log || exit 1;
|
||||
|
||||
feats="ark:add-deltas --print-args=false scp:$scp ark:- | transform-feats $utt2spk_opt ark:$dir/$job.fmllr ark:- ark:- |"
|
||||
|
||||
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --spk-vecs=ark:$dir/${job}.vecs2 \
|
||||
--beam=$beam --acoustic-scale=$acwt --word-symbol-table=data/words.txt \
|
||||
$fmllr_model $graph "$feats" \
|
||||
ark,t:$dir/${job}.tra ark,t:$dir/${job}.ali 2> $dir/decode${job}.log
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# This script does the decoding of a single batch of test data (on one core).
|
||||
# It requires arguments. It takes the graphdir and decoding directory,
|
||||
# and the job number which can actually be any string (even ""); it expects
|
||||
# a file $decode_dir/${job_number}.scp to exist, and puts its output in
|
||||
# $decode_dir/${job_number}.tra
|
||||
|
||||
|
||||
# Exactly three positional arguments are required: the decoding graph, the
# decoding directory and the job id.  The usage line prints $0 so it always
# names the script that was actually invoked instead of a hard-coded path.
if [ $# != 3 ]; then
|
||||
echo "Usage: $0 <graph> <decode-dir> <job-number>"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
. path.sh || exit 1;
|
||||
|
||||
acwt=0.08333
|
||||
prebeam=12.0
|
||||
beam=13.0
|
||||
max_active=7000
|
||||
silphones=`cat data/silphones.csl`
|
||||
model=exp/sgmm3c/final.mdl.norm
|
||||
alimodel=exp/sgmm3c/final.alimdl.norm
|
||||
preselectmap=exp/ubm3b/preselect.map
|
||||
graph=$1
|
||||
dir=$2
|
||||
job=$3
|
||||
scp=$dir/$job.scp
|
||||
feats="ark:add-deltas --print-args=false scp:$scp ark:- |"
|
||||
|
||||
# Fail early if any input is missing.  $alimodel is included because the
# first decoding pass and the speaker-vector estimation both read it.
filenames="$scp $alimodel $model $graph data/words.txt"
|
||||
for file in $filenames; do
|
||||
if [ ! -f $file ] ; then
|
||||
echo "No such file $file";
|
||||
exit 1;
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -f $dir/$job.spk2utt ]; then
|
||||
if [ ! -f $dir/$job.utt2spk ]; then
|
||||
echo "spk2utt but not utt2spk file present!"
|
||||
exit 1
|
||||
fi
|
||||
spk2utt_opt=--spk2utt=ark:$dir/$job.spk2utt
|
||||
utt2spk_opt=--utt2spk=ark:$dir/$job.utt2spk
|
||||
fi
|
||||
|
||||
cat data/eval*.utt2spk | \
|
||||
scripts/compose_maps.pl - data/spk2gender.map | \
|
||||
scripts/compose_maps.pl - $preselectmap | \
|
||||
scripts/filter_scp.pl $scp - | \
|
||||
gzip -c > $dir/preselect.$job.gz
|
||||
|
||||
|
||||
echo running on `hostname` > $dir/decode${job}.log
|
||||
|
||||
sgmm-gselect "--preselect=ark:gunzip -c $dir/preselect.$job.gz|" \
|
||||
$model "$feats" ark,t:- 2>$dir/gselect${job}.log | \
|
||||
gzip -c > $dir/gselect${job}.gz || exit 1;
|
||||
gselect_opt="--gselect=ark:gunzip -c $dir/gselect${job}.gz|"
|
||||
|
||||
|
||||
sgmm-decode-faster "$gselect_opt" --beam=$prebeam --max-active=$max_active \
|
||||
--acoustic-scale=$acwt \
|
||||
--word-symbol-table=data/words.txt $alimodel $graph "$feats" \
|
||||
ark,t:$dir/$job.pre_tra ark,t:$dir/$job.pre_ali 2>$dir/predecode${job}.log || exit 1;
|
||||
|
||||
( ali-to-post ark:$dir/${job}.pre_ali ark:- | \
|
||||
weight-silence-post 0.01 $silphones $alimodel ark:- ark:- | \
|
||||
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
|
||||
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
|
||||
ark:$dir/${job}.vecs1 ) 2>$dir/vecs1.${job}.log || exit 1;
|
||||
|
||||
( ali-to-post ark:$dir/${job}.pre_ali ark:- | \
|
||||
weight-silence-post 0.01 $silphones $alimodel ark:- ark:- | \
|
||||
sgmm-est-spkvecs "$gselect_opt" --spk-vecs=ark,t:$dir/${job}.vecs1 $spk2utt_opt $model \
|
||||
"$feats" ark,s,cs:- ark:$dir/${job}.vecs2 ) 2>$dir/vecs2.${job}.log || exit 1;
|
||||
|
||||
sgmm-decode-faster "$gselect_opt" --beam=$beam --max-active=$max_active \
|
||||
$utt2spk_opt --spk-vecs=ark:$dir/${job}.vecs2 \
|
||||
--acoustic-scale=$acwt \
|
||||
--word-symbol-table=data/words.txt $model $graph "$feats" \
|
||||
ark,t:$dir/$job.tra ark,t:$dir/$job.ali 2>$dir/decode${job}.log || exit 1;
|
||||
|
||||
|
|
@ -256,3 +256,7 @@ rm $dir/$x.?.aliacc
|
|||
|
||||
( cd $dir; rm final.alimdl 2>/dev/null; ln -s $x.alimdl final.alimdl; )
|
||||
|
||||
# Compute normalized models
|
||||
# Write normalized copies of the final and alignment models.  The preselect
# map is read as a Kaldi table, so it is passed with the "ark:" prefix
# (a bare path is not a valid rspecifier).
sgmm-normalize $dir/final.mdl ark:$preselectmap $dir/final.mdl.norm 2>$dir/normalize.log
|
||||
sgmm-normalize $dir/final.alimdl ark:$preselectmap $dir/final.alimdl.norm 2>>$dir/normalize.log
|
||||
|
||||
|
|
1
src/TODO
1
src/TODO
|
@ -14,7 +14,6 @@
|
|||
documentation for acoustic modeling code.
|
||||
|
||||
TODO items (Dan):
|
||||
Remove unused ET stuff.
|
||||
Remove non-Kaldi code from decoder/
|
||||
Rename to branches/kaldi-1.0
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \
|
|||
gmm-acc-stats gmm-init-lvtln gmm-est-lvtln-trans gmm-train-lvtln-special \
|
||||
gmm-acc-mllt gmm-mixup gmm-init-model \
|
||||
gmm-acc-hlda gmm-est-hlda gmm-transform-means gmm-init-et gmm-est-et \
|
||||
gmm-et-acc-a gmm-et-est-a gmm-et-acc-b gmm-copy-et gmm-et-est-b gmm-et-get-b \
|
||||
gmm-et-acc-a gmm-et-est-a gmm-copy-et gmm-et-get-b \
|
||||
gmm-make-regtree gmm-decode-faster-regtree-fmllr gmm-post-to-gpost \
|
||||
gmm-est-fmllr-gpost gmm-est-fmllr gmm-est-regtree-fmllr-ali \
|
||||
gmm-est-regtree-mllr gmm-decode-kaldi gmm-compute-likes \
|
||||
|
|
|
@ -30,8 +30,8 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
const char *usage =
|
||||
"Accumulate stats for GMM training.\n"
|
||||
"Usage: gmm-estimate [options] <model-in> <stats-in> <model-out>\n"
|
||||
"e.g.: gmm-estimate 1.mdl 1.acc 2.mdl\n";
|
||||
"Usage: gmm-est [options] <model-in> <stats-in> <model-out>\n"
|
||||
"e.g.: gmm-est 1.mdl 1.acc 2.mdl\n";
|
||||
|
||||
bool binary_write = false;
|
||||
TransitionUpdateConfig tcfg;
|
||||
|
|
|
@ -1,222 +0,0 @@
|
|||
// gmmbin/gmm-et-acc-b.cc
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation
|
||||
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string>
|
||||
using std::string;
|
||||
#include <vector>
|
||||
using std::vector;
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "gmm/am-diag-gmm.h"
|
||||
#include "hmm/transition-model.h"
|
||||
#include "transform/exponential-transform.h"
|
||||
|
||||
namespace kaldi {
|
||||
// Accumulates statistics for one utterance into *accs_b.
//  et           exponential transform object; supplies the feature dimension
//               and computes Ds.
//  gpost        per-frame list of (transition-id, Gaussian posteriors) pairs.
//  xform        affine transform that is applied to every frame.
//  feats        un-transformed features, indexed by frame.
//  trans_model  maps transition-ids to pdf-ids.
//  am_gmm       acoustic model from which the per-pdf GMMs are fetched.
//  t            value passed through to et.ComputeDs() together with xform.
//  accs_b       output accumulator; updated in place.
// NOTE(review): the frame loop runs over gpost.size() and indexes feats with
// the same index, so it assumes feats has at least that many frames -- confirm
// that callers check this.
static void ProcessUtterance(const ExponentialTransform &et,
|
||||
const GauPost &gpost,
|
||||
const Matrix<BaseFloat> &xform,
|
||||
const Matrix<BaseFloat> &feats, // un-transformed feats.
|
||||
const TransitionModel &trans_model,
|
||||
const AmDiagGmm &am_gmm,
|
||||
BaseFloat t,
|
||||
ExponentialTransformAccsB *accs_b) {
|
||||
// First work out Ds (a dim x (dim+1) matrix); it is computed once per call
// from xform and t, and reused for every frame below.
|
||||
int32 dim = et.Dim();
|
||||
Matrix<BaseFloat> Ds(dim, dim+1);
|
||||
|
||||
et.ComputeDs(xform, t, &Ds);
|
||||
|
||||
for (size_t i = 0; i < gpost.size(); i++) {
|
||||
SubVector<BaseFloat> feat(feats, i);
|
||||
Vector<BaseFloat> t_data(feat); // copy of the frame; becomes the transformed feature.
|
||||
ApplyAffineTransform(xform, &t_data);
|
||||
|
||||
// Each entry pairs a transition-id with the posteriors for the Gaussians
// of the corresponding pdf.
for (size_t j = 0; j < gpost[i].size(); j++) {
|
||||
int32 pdf_id = trans_model.TransitionIdToPdf(gpost[i][j].first);
|
||||
const DiagGmm &gmm = am_gmm.GetPdf(pdf_id);
|
||||
const Vector<BaseFloat> &posteriors (gpost[i][j].second);
|
||||
accs_b->AccumulateFromPosteriors(gmm, t_data, posteriors, Ds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // end namespace kaldi
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
try {
|
||||
typedef kaldi::int32 int32;
|
||||
using namespace kaldi;
|
||||
const char *usage =
|
||||
"Accumulate statistics for estimating the A matrix of exponential transform, \n"
|
||||
" per-utterance (default) or per-speaker for \n"
|
||||
" the supplied set of speakers (spk2utt option).\n"
|
||||
"Note: the align-model is needed to get GMM posteriors; it's in the unadapted space.\n"
|
||||
"Usage: gmm-et-acc-b [options] <align-model> <model> <exponential-transform> <feature-rspecifier> "
|
||||
"<posteriors-rspecifier> <transform-rspecifier> <warp-rspecifier> <accs-filename>\n";
|
||||
|
||||
ParseOptions po(usage);
|
||||
string spk2utt_rspecifier;
|
||||
bool binary = false;
|
||||
po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
|
||||
"utterance-list map");
|
||||
po.Register("binary", &binary, "Write output in binary mode");
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 7) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
string model_rxfilename = po.GetArg(1),
|
||||
et_rxfilename = po.GetArg(2),
|
||||
feature_rspecifier = po.GetArg(3),
|
||||
gpost_rspecifier = po.GetArg(4),
|
||||
transform_rspecifier = po.GetArg(5),
|
||||
warps_rspecifier = po.GetArg(6),
|
||||
accs_wxfilename = po.GetArg(7);
|
||||
|
||||
|
||||
RandomAccessGauPostReader gpost_reader(gpost_rspecifier);
|
||||
RandomAccessBaseFloatMatrixReader transform_reader(transform_rspecifier);
|
||||
RandomAccessBaseFloatReader warps_reader(warps_rspecifier);
|
||||
|
||||
AmDiagGmm am_gmm;
|
||||
TransitionModel trans_model;
|
||||
{
|
||||
bool binary;
|
||||
Input is(model_rxfilename, &binary);
|
||||
trans_model.Read(is.Stream(), binary);
|
||||
am_gmm.Read(is.Stream(), binary);
|
||||
}
|
||||
|
||||
ExponentialTransform et;
|
||||
{
|
||||
bool binary;
|
||||
Input ki(et_rxfilename, &binary);
|
||||
et.Read(ki.Stream(), binary);
|
||||
}
|
||||
|
||||
int32 dim = et.Dim();
|
||||
|
||||
ExponentialTransformAccsB accs_b(dim);
|
||||
|
||||
int32 num_done = 0, num_no_gpost = 0, num_other_error = 0;
|
||||
if (spk2utt_rspecifier != "") { // per-speaker adaptation
|
||||
SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
|
||||
RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
|
||||
for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
|
||||
string spk = spk2utt_reader.Key();
|
||||
if (!transform_reader.HasKey(spk)) {
|
||||
KALDI_WARN << "Could not read transform for speaker " << spk;
|
||||
num_other_error++;
|
||||
}
|
||||
if (!warps_reader.HasKey(spk)) {
|
||||
KALDI_WARN << "Could not read warp factor for speaker " << spk;
|
||||
num_other_error++;
|
||||
continue;
|
||||
}
|
||||
const Matrix<BaseFloat> &xform(transform_reader.Value(spk));
|
||||
BaseFloat t = warps_reader.Value(spk);
|
||||
|
||||
const vector<string> &uttlist = spk2utt_reader.Value();
|
||||
for (vector<string>::const_iterator utt_itr = uttlist.begin(),
|
||||
itr_end = uttlist.end(); utt_itr != itr_end; ++utt_itr) {
|
||||
if (!feature_reader.HasKey(*utt_itr)) {
|
||||
KALDI_WARN << "Did not find features for utterance " << *utt_itr;
|
||||
continue;
|
||||
}
|
||||
if (!gpost_reader.HasKey(*utt_itr)) {
|
||||
KALDI_WARN << "Did not find gpost for utterance "
|
||||
<< *utt_itr;
|
||||
num_no_gpost++;
|
||||
continue;
|
||||
}
|
||||
const Matrix<BaseFloat> &feats = feature_reader.Value(*utt_itr);
|
||||
|
||||
const GauPost &gpost = gpost_reader.Value(*utt_itr);
|
||||
|
||||
if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
|
||||
KALDI_WARN << "gpost has wrong size " << gpost.size()
|
||||
<< " vs. " << feats.NumRows();
|
||||
num_other_error++;
|
||||
continue;
|
||||
}
|
||||
|
||||
ProcessUtterance(et, gpost, xform, feats, trans_model,
|
||||
am_gmm, t, &accs_b);
|
||||
num_done++;
|
||||
if (num_done % 50 == 0)
|
||||
KALDI_VLOG(1) << "Done " << num_done << " utterances.";
|
||||
} // end looping over all utterances of the current speaker
|
||||
} // end looping over speakers
|
||||
} else { // per-utterance adaptation
|
||||
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
|
||||
for (; !feature_reader.Done(); feature_reader.Next()) {
|
||||
string utt = feature_reader.Key();
|
||||
FmllrDiagGmmAccs accs(dim);
|
||||
|
||||
if (!transform_reader.HasKey(utt)) {
|
||||
KALDI_WARN << "Could not read transform for speaker " << utt;
|
||||
num_other_error++;
|
||||
}
|
||||
if (!warps_reader.HasKey(utt)) {
|
||||
KALDI_WARN << "Could not read warp factor for speaker " << utt;
|
||||
num_other_error++;
|
||||
continue;
|
||||
}
|
||||
if (!gpost_reader.HasKey(utt)) {
|
||||
KALDI_WARN << "Did not find gpost for utterance "
|
||||
<< utt;
|
||||
num_no_gpost++;
|
||||
continue;
|
||||
}
|
||||
const Matrix<BaseFloat> &feats = feature_reader.Value();
|
||||
const GauPost &gpost = gpost_reader.Value(utt);
|
||||
const Matrix<BaseFloat> &xform(transform_reader.Value(utt));
|
||||
BaseFloat t = warps_reader.Value(utt);
|
||||
|
||||
if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
|
||||
KALDI_WARN << "gpost has wrong size " << gpost.size()
|
||||
<< " vs. " << feats.NumRows();
|
||||
num_other_error++;
|
||||
continue;
|
||||
}
|
||||
|
||||
ProcessUtterance(et, gpost, xform, feats, trans_model,
|
||||
am_gmm, t, &accs_b);
|
||||
num_done++;
|
||||
|
||||
if (num_done % 50 == 0)
|
||||
KALDI_LOG << "Done " << num_done << " utterances";
|
||||
}
|
||||
}
|
||||
|
||||
KALDI_LOG << "Done " << num_done << " files, " << num_no_gpost
|
||||
<< " with no gposts, " << num_other_error << " with other errors.";
|
||||
|
||||
Output ko(accs_wxfilename, binary);
|
||||
accs_b.Write(ko.Stream(), binary);
|
||||
KALDI_LOG << "Written accs.";
|
||||
return 0;
|
||||
} catch(const std::exception& e) {
|
||||
std::cerr << e.what();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,89 +0,0 @@
|
|||
// gmmbin/gmm-et-est-b.cc
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation
|
||||
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "transform/exponential-transform.h"
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
try {
|
||||
using namespace kaldi;
|
||||
using kaldi::int32;
|
||||
|
||||
const char *usage =
|
||||
"Update matrix B of exponential transform (uses stats from gmm-et-acc-b)\n"
|
||||
" [Use matrix-out with gmm-transform-means to transform model means.]\n"
|
||||
"Usage: gmm-et-est-b [options] <et-in> <et-out> <matrix-out> <b-stats1> <b-stats2> ... \n"
|
||||
"e.g.: \n"
|
||||
" gmm-et-est-b 1.et 2.et 1.et_acc_b\n";
|
||||
|
||||
bool binary = true;
|
||||
ParseOptions po(usage);
|
||||
|
||||
std::string set_normalize_type = ""; // may be "", "none", "mean", or "mean-and-var";
|
||||
ExponentialTransformUpdateAOptions update_a_opts;
|
||||
po.Register("binary", &binary, "Write output in binary mode");
|
||||
update_a_opts.Register(&po);
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() < 4) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::string et_rxfilename = po.GetArg(1);
|
||||
std::string et_wxfilename = po.GetArg(2);
|
||||
std::string mat_wxfilename = po.GetArg(3);
|
||||
|
||||
ExponentialTransform et;
|
||||
{
|
||||
bool binary_in;
|
||||
Input ki(et_rxfilename, &binary_in);
|
||||
et.Read(ki.Stream(), binary_in);
|
||||
}
|
||||
ExponentialTransformAccsB stats;
|
||||
for (int32 i = 4; i <= po.NumArgs(); i++) {
|
||||
std::string stats_rxfilename = po.GetArg(i);
|
||||
bool binary_in;
|
||||
Input ki(stats_rxfilename, &binary_in);
|
||||
stats.Read(ki.Stream(), binary_in, true); // true == add
|
||||
}
|
||||
|
||||
int32 dim = et.Dim();
|
||||
Matrix<BaseFloat> M(dim, dim); // to transform model means.
|
||||
stats.Update(&et, NULL, NULL, &M);
|
||||
|
||||
{
|
||||
Output ko(et_wxfilename, binary);
|
||||
et.Write(ko.Stream(), binary);
|
||||
}
|
||||
{
|
||||
Output ko(mat_wxfilename, binary);
|
||||
M.Write(ko.Stream(), binary);
|
||||
}
|
||||
KALDI_LOG << "Written accs and matrix.";
|
||||
return 0;
|
||||
} catch(const std::exception& e) {
|
||||
std::cerr << e.what();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
|
@ -1137,11 +1137,11 @@ void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
|
|||
std::string str;
|
||||
is >> str; // get a token
|
||||
if (is.fail()) { specific_error << ": Expected \"[\", got EOF"; goto bad; }
|
||||
if ((str.compare("DM") == 0) || (str.compare("FM") == 0)) { // Back compatibility.
|
||||
is >> str; // get #rows
|
||||
is >> str; // get #cols
|
||||
is >> str; // get "["
|
||||
}
|
||||
//if ((str.compare("DM") == 0) || (str.compare("FM") == 0)) { // Back compatibility.
|
||||
// is >> str; // get #rows
|
||||
// is >> str; // get #cols
|
||||
// is >> str; // get "["
|
||||
//}
|
||||
if (str == "[]") { Resize(0, 0); return; } // Be tolerant of variants.
|
||||
else if (str != "[") {
|
||||
specific_error << ": Expected \"[\", got \"" << str << '"';
|
||||
|
|
|
@ -789,10 +789,10 @@ void Vector<Real>::Read(std::istream & is, bool binary, bool add) {
|
|||
} else { // Text mode reading; format is " [ 1.1 2.0 3.4 ]\n"
|
||||
std::string s;
|
||||
is >> s;
|
||||
if ((s.compare("DV") == 0) || (s.compare("FV") == 0)) { // Back compatibility.
|
||||
is >> s; // get dimension
|
||||
is >> s; // get "["
|
||||
}
|
||||
//if ((s.compare("DV") == 0) || (s.compare("FV") == 0)) { // Back compatibility.
|
||||
// is >> s; // get dimension
|
||||
// is >> s; // get "["
|
||||
//}
|
||||
if (is.fail()) { specific_error << "EOF while trying to read vector."; goto bad; }
|
||||
if (s.compare("[]") == 0) { Resize(0); return; } // tolerate this variant.
|
||||
if (s.compare("[")) { specific_error << "Expected \"[\" but got " << s; goto bad; }
|
||||
|
|
|
@ -618,6 +618,94 @@ void AmSgmm::ComputeNormalizers() {
|
|||
KALDI_LOG << "Done computing normalizers";
|
||||
}
|
||||
|
||||
|
||||
void AmSgmm::ComputeNormalizersNormalized(const std::vector<std::vector<int32> > &normalize_sets) {
|
||||
{ // Check sets in normalize_sets are disjoint and cover all Gaussians.
|
||||
std::set<int32> all;
|
||||
for(int32 i = 0; i < normalize_sets.size(); i++)
|
||||
for(int32 j = 0; static_cast<size_t>(j) < normalize_sets[i].size(); j++) {
|
||||
int32 n = normalize_sets[i][j];
|
||||
KALDI_ASSERT(all.count(n) == 0 && n >= 0 && n < NumGauss());
|
||||
all.insert(n);
|
||||
}
|
||||
KALDI_ASSERT(all.size() == NumGauss());
|
||||
}
|
||||
|
||||
|
||||
KALDI_LOG << "Computing normalizers [normalized]";
|
||||
BaseFloat DLog2pi = FeatureDim() * log(2 * M_PI);
|
||||
Vector<BaseFloat> mu_jmi(FeatureDim());
|
||||
Vector<BaseFloat> SigmaInv_mu(FeatureDim());
|
||||
Vector<BaseFloat> log_det_Sigma(NumGauss());
|
||||
|
||||
for (int32 i = 0; i < NumGauss(); i++) {
|
||||
try {
|
||||
log_det_Sigma(i) = - SigmaInv_[i].LogPosDefDet();
|
||||
} catch(...) {
|
||||
KALDI_WARN << "Covariance is not positive definite, setting to unit";
|
||||
SigmaInv_[i].SetUnit();
|
||||
log_det_Sigma(i) = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
double entropy_count = 0, entropy_sum = 0;
|
||||
|
||||
n_.resize(NumStates());
|
||||
for (int32 j = 0; j < NumStates(); ++j) {
|
||||
Vector<BaseFloat> log_w_jm(NumGauss());
|
||||
|
||||
n_[j].Resize(NumGauss(), NumSubstates(j));
|
||||
for (int32 m = 0; m < NumSubstates(j); m++) {
|
||||
BaseFloat logc = log(c_[j](m));
|
||||
|
||||
// (in logs): w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
|
||||
log_w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j].Row(m), 0.0);
|
||||
log_w_jm.Add((-1.0) * log_w_jm.LogSumExp());
|
||||
|
||||
for(int32 n = 0; n < normalize_sets.size(); n++) {
|
||||
const std::vector<int32> &this_set(normalize_sets[n]);
|
||||
double sum = 0.0;
|
||||
for(int32 p = 0; p < this_set.size(); p++)
|
||||
sum += exp(log_w_jm(this_set[p]));
|
||||
double offset = -log(sum); // add "offset", to normalize weights.
|
||||
for(int32 p = 0; p < this_set.size(); p++)
|
||||
log_w_jm(this_set[p]) += offset;
|
||||
}
|
||||
|
||||
for (int32 i = 0; i < NumGauss(); ++i) {
|
||||
// mu_jmi = M_{i} * v_{jm}
|
||||
mu_jmi.AddMatVec(1.0, M_[i], kNoTrans, v_[j].Row(m), 0.0);
|
||||
|
||||
// mu_{jmi} * \Sigma_{i}^{-1} * mu_{jmi}
|
||||
SigmaInv_mu.AddSpVec(1.0, SigmaInv_[i], mu_jmi, 0.0);
|
||||
BaseFloat mu_SigmaInv_mu = VecVec(mu_jmi, SigmaInv_mu);
|
||||
|
||||
// Suggestion: Both mu_jmi and SigmaInv_mu could
|
||||
// have been computed at once for i ,
|
||||
// if M[i] was concatenated to single matrix over i indeces
|
||||
|
||||
// eq.(31)
|
||||
n_[j](i, m) = logc + log_w_jm(i) - 0.5 * (log_det_Sigma(i) + DLog2pi
|
||||
+ mu_SigmaInv_mu);
|
||||
{ // Mainly diagnostic code. Not necessary.
|
||||
BaseFloat tmp = n_[j](i, m);
|
||||
if (!KALDI_ISFINITE(tmp)) { // NaN or inf
|
||||
KALDI_LOG << "Warning: normalizer for j = " << j << ", m = " << m
|
||||
<< ", i = " << i << " is infinite or NaN " << tmp << "= "
|
||||
<< (logc) << "+" << (log_w_jm(i)) << "+" << (-0.5 *
|
||||
log_det_Sigma(i)) << "+" << (-0.5 * DLog2pi)
|
||||
<< "+" << (mu_SigmaInv_mu) << ", setting to finite.";
|
||||
n_[j](i, m) = -1.0e+40; // future work(arnab): get rid of magic number
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
KALDI_LOG << "Done computing normalizers (normalized over subsets)";
|
||||
}
|
||||
|
||||
|
||||
void AmSgmm::ComputeFmllrPreXform(const Vector<BaseFloat> &state_occs,
|
||||
Matrix<BaseFloat> *xform, Matrix<BaseFloat> *inv_xform,
|
||||
Vector<BaseFloat> *diag_mean_scatter) const {
|
||||
|
|
|
@ -215,6 +215,12 @@ class AmSgmm {
|
|||
/// for each Gaussian component and all substates. Eq. (31)
|
||||
void ComputeNormalizers();
|
||||
|
||||
|
||||
/// Computes the normalizers, while normalizing the weights to one
|
||||
/// among each of the sets in "normalize_sets": these sets should
|
||||
/// be disjoint and their union should be all the indices 0 ... I-1.
|
||||
void ComputeNormalizersNormalized(const std::vector<std::vector<int32> > &normalize_sets);
|
||||
|
||||
/// Computes the LDA-like pre-transform and its inverse as well as the
|
||||
/// eigenvalues of the scatter of the means used in FMLLR estimation.
|
||||
void ComputeFmllrPreXform(const Vector<BaseFloat> &state_occs,
|
||||
|
|
|
@ -7,7 +7,7 @@ BINFILES = init-ubm sgmm-align sgmm-align-compiled sgmm-acc-stats-ali sgmm-sum-a
|
|||
sgmm-est sgmm-decode-faster sgmm-init sgmm-gselect sgmm-acc-stats \
|
||||
sgmm-est-spkvecs sgmm-post-to-gpost sgmm-acc-stats-gpost sgmm-est-spkvecs-gpost \
|
||||
sgmm-comp-prexform sgmm-est-fmllr-gpost sgmm-acc-fmllrbasis-ali sgmm-est-fmllrbasis \
|
||||
sgmm-calc-distances
|
||||
sgmm-calc-distances sgmm-normalize
|
||||
|
||||
|
||||
OBJFILES =
|
||||
|
|
|
@ -31,7 +31,7 @@ int main(int argc, char *argv[]) {
|
|||
typedef kaldi::int32 int32;
|
||||
const char *usage =
|
||||
"Estimate SGMM model parameters from accumulated stats.\n"
|
||||
"Usage: sgmm-estimate [options] <model-in> <stats-in> <model-out>\n";
|
||||
"Usage: sgmm-est [options] <model-in> <stats-in> <model-out>\n";
|
||||
|
||||
bool binary_write = false;
|
||||
std::string update_flags_str = "vMNwcS";
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
// sgmmbin/sgmm-normalize.cc
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation
|
||||
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
|
||||
#include "sgmm/am-sgmm.h"
|
||||
#include "hmm/transition-model.h"
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
try {
|
||||
using namespace kaldi;
|
||||
typedef kaldi::int32 int32;
|
||||
const char *usage =
|
||||
"Renormalize SGMM so that within certain subsets of UBM Gaussians (typically \n"
|
||||
"corresponding to gender), probabilities sum to one; write it out, including\n"
|
||||
"normalizers."
|
||||
"Note: gaussians-rspecifier will normally be \"ark:foo\" where foo looks like\n"
|
||||
" m 0 1 2 3 4 5\n"
|
||||
" f 6 7 8 9 10\n"
|
||||
"Usage: sgmm-normalize [options] <model-in> <gaussians-rspecifier> <model-out>\n";
|
||||
|
||||
bool binary_write = false;
|
||||
|
||||
ParseOptions po(usage);
|
||||
po.Register("binary", &binary_write, "Write output in binary mode");
|
||||
|
||||
po.Read(argc, argv);
|
||||
if (po.NumArgs() != 3) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
std::string model_in_filename = po.GetArg(1),
|
||||
gaussians_rspecifier = po.GetArg(2),
|
||||
model_out_filename = po.GetArg(3);
|
||||
|
||||
AmSgmm am_sgmm;
|
||||
TransitionModel trans_model;
|
||||
{
|
||||
bool binary;
|
||||
Input is(model_in_filename, &binary);
|
||||
trans_model.Read(is.Stream(), binary);
|
||||
am_sgmm.Read(is.Stream(), binary);
|
||||
}
|
||||
|
||||
std::vector<std::vector<int32> > norm_sets;
|
||||
SequentialInt32VectorReader vec_reader(gaussians_rspecifier);
|
||||
for (;!vec_reader.Done(); vec_reader.Next())
|
||||
norm_sets.push_back(vec_reader.Value());
|
||||
|
||||
am_sgmm.ComputeNormalizersNormalized(norm_sets);
|
||||
|
||||
{
|
||||
Output os(model_out_filename, binary_write);
|
||||
trans_model.Write(os.Stream(), binary_write);
|
||||
am_sgmm.Write(os.Stream(), binary_write, kSgmmWriteAll);
|
||||
}
|
||||
|
||||
|
||||
KALDI_LOG << "Written model to " << model_out_filename;
|
||||
return 0;
|
||||
} catch(const std::exception& e) {
|
||||
std::cerr << e.what();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -18,6 +18,7 @@
|
|||
#include "util/common-utils.h"
|
||||
#include "gmm/diag-gmm.h"
|
||||
#include "transform/exponential-transform.h"
|
||||
#include "transform/mllt.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
@ -102,7 +103,7 @@ void UnitTestExponentialTransformUpdate(EtNormalizeType norm_type,
|
|||
double objf_change_tot = 0.0;
|
||||
like_tot = 0.0;
|
||||
ExponentialTransformAccsA accs_a(dim);
|
||||
ExponentialTransformAccsB accs_b(dim);
|
||||
MlltAccs accs_b(dim);
|
||||
for (int32 k = 0; k < nblocks; k++) {
|
||||
Matrix<BaseFloat> &cur_xform(cur_xforms[k]);
|
||||
FmllrOptions opts;
|
||||
|
@ -146,8 +147,7 @@ void UnitTestExponentialTransformUpdate(EtNormalizeType norm_type,
|
|||
if (update_b && j%2 == 1)
|
||||
accs_b.AccumulateFromPosteriors(gmm,
|
||||
xformed_row,
|
||||
posteriors,
|
||||
cur_Ds[k]);
|
||||
posteriors);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -161,16 +161,17 @@ void UnitTestExponentialTransformUpdate(EtNormalizeType norm_type,
|
|||
}
|
||||
if (update_b && j%2 == 1) {
|
||||
BaseFloat count, objf_impr;
|
||||
Matrix<BaseFloat> M(dim, dim); // to transform GMM means.
|
||||
accs_b.Update(&et, &objf_impr, &count, &M);
|
||||
TestIo(accs_b);
|
||||
Matrix<BaseFloat> C(dim, dim); // to transform GMM means.
|
||||
C.SetUnit();
|
||||
accs_b.Update(&C, &objf_impr, &count);
|
||||
et.ApplyC(C);
|
||||
TestIo(et);
|
||||
KALDI_LOG << "Count is " << count << " and objf impr is " << objf_impr << " updating B";
|
||||
// update the GMM means:
|
||||
Matrix<BaseFloat> means;
|
||||
gmm.GetMeans(&means);
|
||||
Matrix<BaseFloat> new_means(means.NumRows(), means.NumCols());
|
||||
new_means.AddMatMat(1.0, means, kNoTrans, M, kTrans, 0.0);
|
||||
new_means.AddMatMat(1.0, means, kNoTrans, C, kTrans, 0.0);
|
||||
gmm.SetMeans(new_means);
|
||||
gmm.ComputeGconsts();
|
||||
}
|
||||
|
|
|
@ -408,157 +408,6 @@ void ExponentialTransformAccsA::Init(int32 dim) {
|
|||
Ahat_.Resize(dim, dim+1);
|
||||
}
|
||||
|
||||
// Sets up the accumulator for feature dimension "dim": the count beta_ is set
// to zero and G_ is given "dim" symmetric matrices, each resized to
// dim x dim.
void ExponentialTransformAccsB::Init(int32 dim) {
  beta_ = 0.0;
  G_.resize(dim);
  for (std::vector<SpMatrix<double> >::iterator g = G_.begin();
       g != G_.end(); ++g)
    g->Resize(dim);
}
|
||||
|
||||
void
|
||||
ExponentialTransformAccsB::
|
||||
AccumulateFromPosteriors(const DiagGmm &gmm,
|
||||
const VectorBase<BaseFloat> &t_data,
|
||||
const VectorBase<BaseFloat> &posteriors,
|
||||
const MatrixBase<BaseFloat> &Ds) {
|
||||
int32 dim = G_.size();
|
||||
KALDI_ASSERT(dim == gmm.Dim() &&
|
||||
dim == t_data.Dim());
|
||||
KALDI_ASSERT(posteriors.Dim() == gmm.NumGauss());
|
||||
Vector<BaseFloat> tmp_data(dim);
|
||||
KALDI_ASSERT(Ds.NumRows() == dim && Ds.NumCols() == dim+1);
|
||||
KALDI_ASSERT(dim < 2 || (fabs(Ds(1, 0)) < 0.01 && Ds(0, 0) != 0.0)); // quick check it's diagonal.
|
||||
// Apply the reverse transformation of Ds to the features.
|
||||
// If d_i is the i'th diagonal element and b_i is the i'th offset element,
|
||||
// it transforms y_i = x_i d_i + b_i.
|
||||
// The reverse transformation is: x_i = (y_i - b_i) / d_i
|
||||
for (int32 i = 0; i < dim; i++)
|
||||
tmp_data(i) = (t_data(i) - Ds(i, dim)) / Ds(i, i);
|
||||
|
||||
// Note on transforming means and variances from the adapted to the
|
||||
// speaker-independendent space...
|
||||
// the transformation on means is as above:
|
||||
// mu_i -> (mu_i - b_i) / d_i
|
||||
// and the transformation on variances is:
|
||||
// sigma_i^2 -> sigma_i^2 / d_i^2.
|
||||
|
||||
Matrix<BaseFloat> gmm_means;
|
||||
gmm.GetMeans(&gmm_means);
|
||||
|
||||
const Matrix<BaseFloat> &inv_vars = gmm.inv_vars();
|
||||
|
||||
double this_beta = 0.0;
|
||||
Vector<double> offset(dim);
|
||||
SpMatrix<double> offset2(dim);
|
||||
for (int32 i = 0; i < gmm.NumGauss(); i++) {
|
||||
BaseFloat gamma = posteriors(i);
|
||||
if (gamma < 1.0e-05) continue;
|
||||
this_beta += gamma;
|
||||
offset.CopyFromVec(tmp_data);
|
||||
for (int32 j = 0; j < dim; j++) {
|
||||
BaseFloat adapted_mean = (gmm_means(i, j) - Ds(j, dim)) / Ds(j, j);
|
||||
// adapted_mean is viewing Ds as a model-space transform.
|
||||
offset(j) -= adapted_mean;
|
||||
}
|
||||
offset2.SetZero();
|
||||
offset2.AddVec2(1.0, offset);
|
||||
for (int32 j = 0; j < dim; j++) {
|
||||
BaseFloat adapted_inv_var = inv_vars(i, j) * Ds(j, j) * Ds(j, j);
|
||||
// was: G_[j].AddVec2(gamma * adapted_inv_var, offset);
|
||||
// This should be more efficient.
|
||||
G_[j].AddSp(gamma * adapted_inv_var, offset2);
|
||||
}
|
||||
}
|
||||
beta_ += this_beta;
|
||||
}
|
||||
|
||||
// Estimates a dim x dim matrix M from the accumulated statistics using an
// iterative row-by-row (MLLT-style) update, then applies it to the
// exponential transform.
//   et            - transform to update; its A_ and B_ are overwritten.
//   objf_impr_out - if non-NULL, receives the total auxiliary-function
//                   improvement.
//   count_out     - if non-NULL, receives the accumulated count beta_.
//   Cpart         - must be a non-NULL dim x dim matrix; receives M, which is
//                   what would have to be applied to the model means.
void ExponentialTransformAccsB::Update(ExponentialTransform *et,
                                       BaseFloat *objf_impr_out,
                                       BaseFloat *count_out,
                                       MatrixBase<BaseFloat> *Cpart) {
  int32 dim = G_.size();
  KALDI_ASSERT(beta_ > 2*dim);
  KALDI_ASSERT(dim > 0 && et->Dim() == dim);

  BaseFloat objf_impr = 0.0;
  Matrix<double> M(dim, dim);  // the transform being estimated.
  M.SetUnit();

  // Inverses of the per-row statistics; they stay fixed over the iterations.
  std::vector<SpMatrix<double> > G_inv(dim);
  for (int32 r = 0; r < dim; r++) {
    G_inv[r].Resize(dim);
    G_inv[r].CopyFromSp(G_[r]);
    G_inv[r].Invert();
  }

  const int32 num_iters = 100;
  for (int32 iter = 0; iter < num_iters; iter++) {
    for (int32 r = 0; r < dim; r++) {  // for each row...
      SubVector<double> cur_row(M, r);
      Vector<double> cofactor(dim);  // actually cofactor times a constant.
      {
        Matrix<double> inv_transpose(M);
        inv_transpose.Invert();
        inv_transpose.Transpose();
        cofactor.CopyFromVec(inv_transpose.Row(r));
      }
      // Auxf is beta * log(cofactor . row) - 0.5 * row^T G_r row
      double auxf_before = beta_ * log(std::abs(VecVec(cofactor, cur_row)))
          -0.5 * VecSpVec(cur_row, G_[r], cur_row);

      // see Gales, "Semi-tied covariance matrices for hidden
      // markov models", feb. 1998 (techreport version of IEEE paper),
      // eq. 22,
      //   a_i <== c_i G_i^{-1} \sqrt( beta / c_i^T G_i^{-1} c_i )
      // [notation is not exactly as in original].
      cur_row.AddSpVec(sqrt(beta_ / VecSpVec(cofactor, G_inv[r], cofactor)),
                       G_inv[r], cofactor, 0.0);

      double auxf_after = beta_ * log(std::abs(VecVec(cofactor, cur_row)))
          -0.5 * VecSpVec(cur_row, G_[r], cur_row);
      if (auxf_after < auxf_before - 0.0001 * beta_)
        KALDI_ERR << "Auxf decreased in MLLR update.";
      objf_impr += auxf_after - auxf_before;
    }
  }
  if (et->norm_type_ == kEtNormalizeMeanAndVar)
    KALDI_WARN << "Update for B is not guaranteed to improve objective "
        "when both mean and variance normalization is being done.";

  KALDI_LOG << "Updating matrix B: auxf improvement is "
            << (objf_impr/beta_) << " per frame over " << beta_
            << " frames.\n";
  if (objf_impr_out != NULL)
    *objf_impr_out = objf_impr;
  if (count_out != NULL)
    *count_out = beta_;

  KALDI_ASSERT(Cpart != NULL && Cpart->NumRows() == dim
               && Cpart->NumCols() == dim);
  Cpart->CopyFromMat(M);  // this is what we would apply to the means.

  // Now apply M to A and B:
  //   B' <-- M B
  //   A' <-- M A M^{-1}
  // so that exp(tA') B' = (I + t M A M^{-1} + .. ) M B
  //                     = M ( I + t A + .. ) B.
  // For this M is extended with one more row 0 0 ... 0 1.
  Matrix<BaseFloat> M_ext(dim+1, dim+1);
  {
    SubMatrix<BaseFloat> top_left(M_ext, 0, dim, 0, dim);
    top_left.CopyFromMat(M);
  }
  M_ext(dim, dim) = 1.0;
  Matrix<BaseFloat> M_ext_inv(M_ext);
  M_ext_inv.Invert();

  Matrix<BaseFloat> A_Minv(dim+1, dim+1),
      updated_A(dim+1, dim+1),
      updated_B(dim+1, dim+1);
  // A_Minv <- A M^{-1}
  A_Minv.AddMatMat(1.0, et->A_, kNoTrans, M_ext_inv, kNoTrans, 0.0);
  // updated_A <- M (A M^{-1})
  updated_A.AddMatMat(1.0, M_ext, kNoTrans, A_Minv, kNoTrans, 0.0);
  et->A_.CopyFromMat(updated_A);
  // updated_B <- M B
  updated_B.AddMatMat(1.0, M_ext, kNoTrans, et->B_, kNoTrans, 0.0);
  et->B_.CopyFromMat(updated_B);
}
|
||||
|
||||
void ExponentialTransform::Write(std::ostream &os, bool binary) const {
|
||||
WriteMarker(os, binary, "<ExponentialTransform>");
|
||||
WriteMarker(os, binary, "<A>");
|
||||
|
@ -585,42 +434,6 @@ void ExponentialTransform::Read(std::istream &is, bool binary) {
|
|||
}
|
||||
|
||||
|
||||
void ExponentialTransformAccsB::Write(std::ostream &os, bool binary) const {
|
||||
WriteMarker(os, binary, "<ExponentialTransformAccsB>");
|
||||
WriteMarker(os, binary, "<Beta>");
|
||||
WriteBasicType(os, binary, beta_);
|
||||
WriteMarker(os, binary, "<Dim>");
|
||||
int32 dim = G_.size();
|
||||
WriteBasicType(os, binary, dim);
|
||||
WriteMarker(os, binary, "<G>");
|
||||
for (int32 i = 0; i < dim; i++)
|
||||
G_[i].Write(os, binary);
|
||||
WriteMarker(os, binary, "</ExponentialTransformAccsB>");
|
||||
}
|
||||
|
||||
void ExponentialTransformAccsB::Read(std::istream &os, bool binary, bool add) {
|
||||
if (G_.empty()) add = false; // don't add to nonexistent stats...
|
||||
ExpectMarker(os, binary, "<ExponentialTransformAccsB>");
|
||||
ExpectMarker(os, binary, "<Beta>");
|
||||
double beta;
|
||||
ReadBasicType(os, binary, &beta);
|
||||
if (add) beta_ += beta;
|
||||
else beta_ = beta;
|
||||
ExpectMarker(os, binary, "<Dim>");
|
||||
int32 dim;
|
||||
ReadBasicType(os, binary, &dim);
|
||||
if (!add) G_.resize(dim);
|
||||
else {
|
||||
if (static_cast<size_t>(dim) != G_.size())
|
||||
KALDI_ERR << "Reading accs for updating B in exponential transform, "
|
||||
<< "dim mismatch " << dim << " vs. " << G_.size();
|
||||
}
|
||||
ExpectMarker(os, binary, "<G>");
|
||||
for (size_t i = 0; i < G_.size(); i++)
|
||||
G_[i].Read(os, binary, add);
|
||||
ExpectMarker(os, binary, "</ExponentialTransformAccsB>");
|
||||
}
|
||||
|
||||
|
||||
void ExponentialTransformAccsA::Write(std::ostream &os, bool binary) const {
|
||||
WriteMarker(os, binary, "<ExponentialTransformAccsA>");
|
||||
|
|
|
@ -119,45 +119,6 @@ class ExponentialTransform {
|
|||
|
||||
};
|
||||
|
||||
// This is an MLLT type of update.
|
||||
class ExponentialTransformAccsB {
|
||||
public:
|
||||
ExponentialTransformAccsB() { } // typically only used prior to Read().
|
||||
|
||||
ExponentialTransformAccsB(int32 dim) { Init(dim); }
|
||||
|
||||
void Init(int32 dim);
|
||||
|
||||
// AccumulateFromPosteriors is as in the base class, except we
|
||||
// supply the transform D_s (expected to be a diagonal or mean-only
|
||||
// transform), which is treated as a model-space transform here.
|
||||
// Here, "t_data" is the data transformed by the transform W_s.
|
||||
// Be careful-- this is different from the accumulation for A, in which
|
||||
// the fMLLR stats are accumulated given the original data.
|
||||
void AccumulateFromPosteriors(const DiagGmm &gmm,
|
||||
const VectorBase<BaseFloat> &t_data,
|
||||
const VectorBase<BaseFloat> &posteriors,
|
||||
const MatrixBase<BaseFloat> &Ds);
|
||||
|
||||
|
||||
// The Update function does the MLLT update for B. It sets "Cpart"
|
||||
// (the first d x d block of C) to the transform that we would have
|
||||
// to apply to the model means.
|
||||
void Update(ExponentialTransform *et,
|
||||
BaseFloat *objf_impr,
|
||||
BaseFloat *count,
|
||||
MatrixBase<BaseFloat> *Cpart);
|
||||
|
||||
void Write(std::ostream &os, bool binary) const;
|
||||
|
||||
void Read(std::istream &is, bool binary, bool add = false);
|
||||
|
||||
private:
|
||||
double beta_;
|
||||
std::vector<SpMatrix<double> > G_;
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct ExponentialTransformUpdateAOptions {
|
||||
|
|
Загрузка…
Ссылка в новой задаче