зеркало из https://github.com/mozilla/kaldi.git
Adding MLLR to RM recipe.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@96 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
295f5c39af
Коммит
8049135d15
|
@ -0,0 +1,83 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2010-2011 Microsoft Corporation, Saarland University
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# decode_tri_regtree_mllr.sh is as ../decode_tri.sh but estimating MLLR in test,
|
||||||
|
# per speaker. There is no SAT. Use a regression-tree with top-level speech/sil
|
||||||
|
# split (no silence weighting).
|
||||||
|
|
||||||
|
if [ -f path.sh ]; then . path.sh; fi
|
||||||
|
srcdir=exp/decode_tri1
|
||||||
|
dir=exp/decode_tri1_fmllr+regtree_mllr
|
||||||
|
mkdir -p $dir
|
||||||
|
model=exp/tri1/final.mdl
|
||||||
|
occs=exp/tri1/final.occs
|
||||||
|
tree=exp/tri1/tree
|
||||||
|
graphdir=exp/graph_tri1
|
||||||
|
silphones=`cat data/silphones.csl`
|
||||||
|
|
||||||
|
regtree=$dir/regtree
|
||||||
|
maxleaves=2 # max # of regression-tree leaves.
|
||||||
|
fmllr_mincount=5000 # mincount before we add new transform.
|
||||||
|
mllr_mincount=1000 # mincount before we add new transform.
|
||||||
|
gmm-make-regtree --sil-phones=$silphones --state-occs=$occs \
|
||||||
|
--max-leaves=$maxleaves $model $regtree 2>$dir/make_regtree.out
|
||||||
|
|
||||||
|
scripts/mkgraph.sh $tree $model $graphdir
|
||||||
|
|
||||||
|
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
|
||||||
|
(
|
||||||
|
# Comment the two lines below to make this per-utterance.
|
||||||
|
spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
|
||||||
|
utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
|
||||||
|
|
||||||
|
feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- |"
|
||||||
|
|
||||||
|
( ali-to-post ark:$srcdir/test_${test}.ali ark:- | \
|
||||||
|
weight-silence-post 0.01 $silphones $model ark:- ark:- | \
|
||||||
|
gmm-est-fmllr --fmllr-min-count=$fmllr_mincount $spk2utt_opt $model \
|
||||||
|
"$feats" ark,o:- ark:$dir/${test}.fmllr ) 2>$dir/fmllr_${test}.log
|
||||||
|
|
||||||
|
adapt_feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- | transform-feats $utt2spk_opt ark:$dir/${test}.fmllr ark:- ark:- |"
|
||||||
|
|
||||||
|
gmm-decode-faster --beam=20.0 --acoustic-scale=0.08333 \
|
||||||
|
--word-symbol-table=data/words.txt $model $graphdir/HCLG.fst \
|
||||||
|
"$adapt_feats" ark,t:$dir/${test}_pass2.tra ark,t:$dir/${test}_pass2.ali \
|
||||||
|
2> $dir/pass2_${test}.log
|
||||||
|
|
||||||
|
( ali-to-post ark:$dir/${test}_pass2.ali ark:- | \
|
||||||
|
gmm-est-regtree-mllr --mllr-min-count=$mllr_mincount $spk2utt_opt \
|
||||||
|
$model "$adapt_feats" ark:- $regtree ark:$dir/${test}.mllr ) \
|
||||||
|
2>$dir/mllr_${test}.log
|
||||||
|
|
||||||
|
gmm-decode-faster-regtree-mllr $utt2spk_opt --acoustic-scale=0.08333 \
|
||||||
|
--beam=20.0 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst \
|
||||||
|
$regtree "$adapt_feats" ark:$dir/${test}.mllr ark,t:$dir/test_${test}.tra \
|
||||||
|
ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
|
||||||
|
|
||||||
|
# the ,p option lets it score partial output without dying..
|
||||||
|
|
||||||
|
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
|
||||||
|
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra > $dir/wer_${test}
|
||||||
|
) &
|
||||||
|
done
|
||||||
|
|
||||||
|
wait
|
||||||
|
|
||||||
|
grep WER $dir/wer_* | \
|
||||||
|
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
|
||||||
|
> $dir/wer
|
||||||
|
|
|
@ -0,0 +1,74 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2010-2011 Microsoft Corporation, Saarland University
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# decode_tri_regtree_mllr.sh is as ../decode_tri.sh but estimating MLLR in test,
|
||||||
|
# per speaker. There is no SAT. Use a regression-tree with top-level speech/sil
|
||||||
|
# split (no silence weighting).
|
||||||
|
|
||||||
|
if [ -f path.sh ]; then . path.sh; fi
|
||||||
|
srcdir=exp/decode_tri1
|
||||||
|
dir=exp/decode_tri1_regtree_mllr
|
||||||
|
mkdir -p $dir
|
||||||
|
model=exp/tri1/final.mdl
|
||||||
|
occs=exp/tri1/final.occs
|
||||||
|
tree=exp/tri1/tree
|
||||||
|
graphdir=exp/graph_tri1
|
||||||
|
silphones=`cat data/silphones.csl`
|
||||||
|
|
||||||
|
regtree=$dir/regtree
|
||||||
|
maxleaves=2 # max # of regression-tree leaves.
|
||||||
|
mincount=5000 # mincount before we add new transform.
|
||||||
|
gmm-make-regtree --sil-phones=$silphones --state-occs=$occs \
|
||||||
|
--max-leaves=$maxleaves $model $regtree 2>$dir/make_regtree.out
|
||||||
|
|
||||||
|
scripts/mkgraph.sh $tree $model $graphdir
|
||||||
|
|
||||||
|
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
|
||||||
|
(
|
||||||
|
# Comment the two lines below to make this per-utterance.
|
||||||
|
spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
|
||||||
|
utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
|
||||||
|
|
||||||
|
# To deweight silence, would add the line
|
||||||
|
# weight-silence-post 0.0 $silphones $model ark:- ark:- | \
|
||||||
|
# after the line with ali-to-post
|
||||||
|
# This is useful if we don't treat silence specially when building regression tree.
|
||||||
|
|
||||||
|
feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- |"
|
||||||
|
( ali-to-post ark:$srcdir/test_${test}.ali ark:- | \
|
||||||
|
gmm-est-regtree-mllr --mllr-min-count=$mincount $spk2utt_opt \
|
||||||
|
$model "$feats" ark:- $regtree ark:$dir/${test}.mllr ) \
|
||||||
|
2>$dir/mllr_${test}.log
|
||||||
|
|
||||||
|
gmm-decode-faster-regtree-mllr $utt2spk_opt --acoustic-scale=0.08333 \
|
||||||
|
--beam=20.0 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst \
|
||||||
|
$regtree "$feats" ark:$dir/${test}.mllr ark,t:$dir/test_${test}.tra \
|
||||||
|
ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
|
||||||
|
|
||||||
|
# the ,p option lets it score partial output without dying..
|
||||||
|
|
||||||
|
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
|
||||||
|
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra > $dir/wer_${test}
|
||||||
|
) &
|
||||||
|
done
|
||||||
|
|
||||||
|
wait
|
||||||
|
|
||||||
|
grep WER $dir/wer_* | \
|
||||||
|
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
|
||||||
|
> $dir/wer
|
||||||
|
|
|
@ -152,6 +152,7 @@ void DecodableAmDiagGmmRegtreeMllr::InitCache() {
|
||||||
xformed_mean_invvars_.resize(num_pdfs);
|
xformed_mean_invvars_.resize(num_pdfs);
|
||||||
xformed_gconsts_.resize(num_pdfs);
|
xformed_gconsts_.resize(num_pdfs);
|
||||||
is_cached_.resize(num_pdfs, false);
|
is_cached_.resize(num_pdfs, false);
|
||||||
|
ResetLogLikeCache();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -237,7 +238,7 @@ const Vector<BaseFloat>& DecodableAmDiagGmmRegtreeMllr::GetXformedGconsts(
|
||||||
|
|
||||||
BaseFloat DecodableAmDiagGmmRegtreeMllr::LogLikelihoodZeroBased(int32 frame,
|
BaseFloat DecodableAmDiagGmmRegtreeMllr::LogLikelihoodZeroBased(int32 frame,
|
||||||
int32 state) {
|
int32 state) {
|
||||||
KALDI_ERR << "Function not completely implemented yet.";
|
// KALDI_ERR << "Function not completely implemented yet.";
|
||||||
KALDI_ASSERT(frame < NumFrames() && frame >= 0);
|
KALDI_ASSERT(frame < NumFrames() && frame >= 0);
|
||||||
KALDI_ASSERT(state < NumIndices() && state >= 0);
|
KALDI_ASSERT(state < NumIndices() && state >= 0);
|
||||||
|
|
||||||
|
|
|
@ -175,7 +175,8 @@ class DecodableAmDiagGmmRegtreeMllr : public DecodableAmDiagGmmUnmapped {
|
||||||
const RegressionTree &regtree,
|
const RegressionTree &regtree,
|
||||||
BaseFloat scale)
|
BaseFloat scale)
|
||||||
: DecodableAmDiagGmmUnmapped(am, feats), trans_model_(tm), scale_(scale),
|
: DecodableAmDiagGmmUnmapped(am, feats), trans_model_(tm), scale_(scale),
|
||||||
mllr_xform_(mllr_xform), regtree_(regtree) { InitCache(); }
|
mllr_xform_(mllr_xform), regtree_(regtree),
|
||||||
|
data_squared_(feats.NumCols()) { InitCache(); }
|
||||||
~DecodableAmDiagGmmRegtreeMllr();
|
~DecodableAmDiagGmmRegtreeMllr();
|
||||||
|
|
||||||
// Note, frames are numbered from zero but transition-ids (tid) from one.
|
// Note, frames are numbered from zero but transition-ids (tid) from one.
|
||||||
|
|
|
@ -33,7 +33,7 @@ int main(int argc, char *argv[]) {
|
||||||
const char *usage =
|
const char *usage =
|
||||||
"Compute MLLR transforms per-utterance (default) or per-speaker for "
|
"Compute MLLR transforms per-utterance (default) or per-speaker for "
|
||||||
"the supplied set of speakers (spk2utt option). Note: writes RegtreeMllrDiagGmm objects\n"
|
"the supplied set of speakers (spk2utt option). Note: writes RegtreeMllrDiagGmm objects\n"
|
||||||
"Usage: gmm-estimate-regtree-fmllr [options] <model-in> <feature-rspecifier> "
|
"Usage: gmm-estimate-regtree-mllr [options] <model-in> <feature-rspecifier> "
|
||||||
"<posteriors-rspecifier> <regression-tree> <transforms-wspecifier>\n";
|
"<posteriors-rspecifier> <regression-tree> <transforms-wspecifier>\n";
|
||||||
|
|
||||||
ParseOptions po(usage);
|
ParseOptions po(usage);
|
||||||
|
@ -135,7 +135,7 @@ int main(int argc, char *argv[]) {
|
||||||
} // end looping over all utterances of the current speaker
|
} // end looping over all utterances of the current speaker
|
||||||
BaseFloat objf_impr, t;
|
BaseFloat objf_impr, t;
|
||||||
mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t);
|
mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t);
|
||||||
KALDI_LOG << "fMLLR objf improvement for speaker " << spk << " is "
|
KALDI_LOG << "MLLR objf improvement for speaker " << spk << " is "
|
||||||
<< (objf_impr/(t+1.0e-10)) << " per frame over " << t
|
<< (objf_impr/(t+1.0e-10)) << " per frame over " << t
|
||||||
<< " frames.";
|
<< " frames.";
|
||||||
tot_objf_impr += objf_impr;
|
tot_objf_impr += objf_impr;
|
||||||
|
@ -183,7 +183,7 @@ int main(int argc, char *argv[]) {
|
||||||
<< "Avg like per frame so far is " << (tot_like / tot_t) << '\n';
|
<< "Avg like per frame so far is " << (tot_like / tot_t) << '\n';
|
||||||
BaseFloat objf_impr, t;
|
BaseFloat objf_impr, t;
|
||||||
mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t);
|
mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t);
|
||||||
KALDI_LOG << "fMLLR objf improvement for utterance " << key << " is "
|
KALDI_LOG << "MLLR objf improvement for utterance " << key << " is "
|
||||||
<< (objf_impr/(t+1.0e-10)) << " per frame over " << t
|
<< (objf_impr/(t+1.0e-10)) << " per frame over " << t
|
||||||
<< " frames.";
|
<< " frames.";
|
||||||
tot_objf_impr += objf_impr;
|
tot_objf_impr += objf_impr;
|
||||||
|
@ -192,7 +192,7 @@ int main(int argc, char *argv[]) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
KALDI_LOG << "Total objf improvement from fMLLR is " << (tot_objf_impr/tot_t_objf)
|
KALDI_LOG << "Total objf improvement from MLLR is " << (tot_objf_impr/tot_t_objf)
|
||||||
<< " per frame over " << tot_t_objf << " frames.";
|
<< " per frame over " << tot_t_objf << " frames.";
|
||||||
KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
|
KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
|
||||||
<< " with no posteriors, " << num_other_error << " with other errors.";
|
<< " with no posteriors, " << num_other_error << " with other errors.";
|
||||||
|
|
|
@ -147,7 +147,7 @@ void RegtreeMllrDiagGmm::Read(std::istream &in, bool binary) {
|
||||||
&& xform_itr->NumRows() == dim_);
|
&& xform_itr->NumRows() == dim_);
|
||||||
}
|
}
|
||||||
|
|
||||||
ExpectMarker(in, binary, "<BCLASS2XFORMS");
|
ExpectMarker(in, binary, "<BCLASS2XFORMS>");
|
||||||
ReadIntegerVector(in, binary, &bclass2xforms_);
|
ReadIntegerVector(in, binary, &bclass2xforms_);
|
||||||
ExpectMarker(in, binary, "</MLLRXFORM>");
|
ExpectMarker(in, binary, "</MLLRXFORM>");
|
||||||
}
|
}
|
||||||
|
|
Загрузка…
Ссылка в новой задаче