Adding MLLR to RM recipe.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@96 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2011-06-21 06:53:48 +00:00 · 2011-06-21 06:53:48 +00:00 · 8049135d15
--- a/egs/rm/s1/steps/decode_tri1_fmllr+regtree_mllr.sh
+++ b/egs/rm/s1/steps/decode_tri1_fmllr+regtree_mllr.sh
@ -0,0 +1,83 @@
+#!/bin/bash
+
+# Copyright 2010-2011 	Microsoft Corporation,  Saarland University
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# decode_tri1_fmllr+regtree_mllr.sh is as ../decode_tri.sh but estimating fMLLR (with silence
+# down-weighted via weight-silence-post 0.01) and then MLLR in test, per speaker.  There is no
+# SAT.  MLLR uses a regression-tree with top-level speech/sil split (no silence weighting).
+
+if [ -f path.sh ]; then . path.sh; fi
+srcdir=exp/decode_tri1
+dir=exp/decode_tri1_fmllr+regtree_mllr
+mkdir -p $dir
+model=exp/tri1/final.mdl
+occs=exp/tri1/final.occs
+tree=exp/tri1/tree
+graphdir=exp/graph_tri1
+silphones=`cat data/silphones.csl`
+
+regtree=$dir/regtree
+maxleaves=2     # max # of regression-tree leaves.
+fmllr_mincount=5000   # mincount before we add new transform.
+mllr_mincount=1000    # mincount before we add new transform.
+gmm-make-regtree --sil-phones=$silphones --state-occs=$occs \
+  --max-leaves=$maxleaves $model $regtree 2>$dir/make_regtree.out
+
+scripts/mkgraph.sh $tree $model $graphdir
+
+for test in mar87 oct87 feb89 oct89 feb91 sep92; do
+ (
+  # Comment out the two lines below to make this per-utterance.
+  spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
+  utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
+
+  feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- |"
+
+  ( ali-to-post ark:$srcdir/test_${test}.ali ark:- | \
+    weight-silence-post 0.01 $silphones $model ark:- ark:- | \
+    gmm-est-fmllr --fmllr-min-count=$fmllr_mincount $spk2utt_opt $model \
+     "$feats" ark,o:- ark:$dir/${test}.fmllr ) 2>$dir/fmllr_${test}.log
+
+  adapt_feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- | transform-feats $utt2spk_opt ark:$dir/${test}.fmllr ark:- ark:- |"
+
+  gmm-decode-faster --beam=20.0 --acoustic-scale=0.08333 \
+    --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst \
+    "$adapt_feats" ark,t:$dir/${test}_pass2.tra ark,t:$dir/${test}_pass2.ali \
+    2> $dir/pass2_${test}.log
+
+  ( ali-to-post ark:$dir/${test}_pass2.ali ark:- | \
+    gmm-est-regtree-mllr --mllr-min-count=$mllr_mincount $spk2utt_opt \
+      $model "$adapt_feats" ark:- $regtree ark:$dir/${test}.mllr ) \
+      2>$dir/mllr_${test}.log
+
+  gmm-decode-faster-regtree-mllr $utt2spk_opt --acoustic-scale=0.08333 \
+    --beam=20.0 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst \
+    $regtree "$adapt_feats" ark:$dir/${test}.mllr ark,t:$dir/test_${test}.tra \
+    ark,t:$dir/test_${test}.ali  2> $dir/decode_${test}.log
+
+  # the ,p option lets it score partial output without dying..
+
+  scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
+    compute-wer --mode=present ark:-  ark,p:$dir/test_${test}.tra > $dir/wer_${test}
+ ) &
+done
+
+wait
+
+grep WER $dir/wer_* | \
+  awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
+   > $dir/wer
+
--- a/egs/rm/s1/steps/decode_tri1_regtree_mllr.sh
+++ b/egs/rm/s1/steps/decode_tri1_regtree_mllr.sh
@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Copyright 2010-2011 	Microsoft Corporation,  Saarland University
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# decode_tri1_regtree_mllr.sh is as ../decode_tri.sh but estimating MLLR in test,
+# per speaker.  There is no SAT.  Use a regression-tree with top-level speech/sil
+# split (no silence weighting).
+
+if [ -f path.sh ]; then . path.sh; fi
+srcdir=exp/decode_tri1
+dir=exp/decode_tri1_regtree_mllr
+mkdir -p $dir
+model=exp/tri1/final.mdl
+occs=exp/tri1/final.occs
+tree=exp/tri1/tree
+graphdir=exp/graph_tri1
+silphones=`cat data/silphones.csl`
+
+regtree=$dir/regtree
+maxleaves=2     # max # of regression-tree leaves.
+mincount=5000   # mincount before we add new transform.
+gmm-make-regtree --sil-phones=$silphones --state-occs=$occs \
+  --max-leaves=$maxleaves $model $regtree 2>$dir/make_regtree.out
+
+scripts/mkgraph.sh $tree $model $graphdir
+
+for test in mar87 oct87 feb89 oct89 feb91 sep92; do
+ (
+  # Comment out the two lines below to make this per-utterance.
+  spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
+  utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
+
+  # To down-weight silence, one would add the line
+  #   weight-silence-post 0.0 $silphones $model ark:- ark:- | \
+  # after the line with ali-to-post.
+  # This is useful if we don't treat silence specially when building the regression tree.
+
+  feats="ark:add-deltas --print-args=false scp:data/test_${test}.scp ark:- |"
+  ( ali-to-post ark:$srcdir/test_${test}.ali ark:- | \
+    gmm-est-regtree-mllr --mllr-min-count=$mincount $spk2utt_opt \
+      $model "$feats" ark:- $regtree ark:$dir/${test}.mllr ) \
+      2>$dir/mllr_${test}.log
+
+  gmm-decode-faster-regtree-mllr $utt2spk_opt --acoustic-scale=0.08333 \
+    --beam=20.0 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst \
+    $regtree "$feats" ark:$dir/${test}.mllr ark,t:$dir/test_${test}.tra \
+    ark,t:$dir/test_${test}.ali  2> $dir/decode_${test}.log
+
+  # the ,p option lets it score partial output without dying..
+
+  scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
+    compute-wer --mode=present ark:-  ark,p:$dir/test_${test}.tra > $dir/wer_${test}
+ ) &
+done
+
+wait
+
+grep WER $dir/wer_* | \
+  awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
+   > $dir/wer
+
--- a/src/decoder/decodable-am-diag-gmm.cc
+++ b/src/decoder/decodable-am-diag-gmm.cc
@ -152,6 +152,7 @@ void DecodableAmDiagGmmRegtreeMllr::InitCache() {
  xformed_mean_invvars_.resize(num_pdfs);
  xformed_gconsts_.resize(num_pdfs);
  is_cached_.resize(num_pdfs, false);
+  ResetLogLikeCache();
 }


@ -237,7 +238,7 @@ const Vector<BaseFloat>& DecodableAmDiagGmmRegtreeMllr::GetXformedGconsts(

 BaseFloat DecodableAmDiagGmmRegtreeMllr::LogLikelihoodZeroBased(int32 frame,
                                                                int32 state) {
-  KALDI_ERR << "Function not completely implemented yet.";
+//  KALDI_ERR << "Function not completely implemented yet.";
  KALDI_ASSERT(frame < NumFrames() && frame >= 0);
  KALDI_ASSERT(state < NumIndices() && state >= 0);

--- a/src/decoder/decodable-am-diag-gmm.h
+++ b/src/decoder/decodable-am-diag-gmm.h
@ -175,7 +175,8 @@ class DecodableAmDiagGmmRegtreeMllr : public DecodableAmDiagGmmUnmapped {
                                const RegressionTree &regtree,
                                BaseFloat scale)
      : DecodableAmDiagGmmUnmapped(am, feats), trans_model_(tm), scale_(scale),
-        mllr_xform_(mllr_xform), regtree_(regtree) { InitCache(); }
+        mllr_xform_(mllr_xform), regtree_(regtree),
+        data_squared_(feats.NumCols()) { InitCache(); }
  ~DecodableAmDiagGmmRegtreeMllr();

  // Note, frames are numbered from zero but transition-ids (tid) from one.
--- a/src/gmmbin/gmm-est-regtree-mllr.cc
+++ b/src/gmmbin/gmm-est-regtree-mllr.cc
@ -33,7 +33,7 @@ int main(int argc, char *argv[]) {
    const char *usage =
        "Compute MLLR transforms per-utterance (default) or per-speaker for "
        "the supplied set of speakers (spk2utt option).  Note: writes RegtreeMllrDiagGmm objects\n"
-        "Usage: gmm-estimate-regtree-fmllr  [options] <model-in> <feature-rspecifier> "
+        "Usage: gmm-estimate-regtree-mllr  [options] <model-in> <feature-rspecifier> "
        "<posteriors-rspecifier> <regression-tree> <transforms-wspecifier>\n";

    ParseOptions po(usage);
@ -135,7 +135,7 @@ int main(int argc, char *argv[]) {
        }  // end looping over all utterances of the current speaker
        BaseFloat objf_impr, t;
        mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t);
-        KALDI_LOG << "fMLLR objf improvement for speaker " << spk << " is "
+        KALDI_LOG << "MLLR objf improvement for speaker " << spk << " is "
                  << (objf_impr/(t+1.0e-10)) << " per frame over " << t
                  << " frames.";
        tot_objf_impr += objf_impr;
@ -183,7 +183,7 @@ int main(int argc, char *argv[]) {
            << "Avg like per frame so far is " << (tot_like / tot_t) << '\n';
        BaseFloat objf_impr, t;
        mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t);
-        KALDI_LOG << "fMLLR objf improvement for utterance " << key << " is "
+        KALDI_LOG << "MLLR objf improvement for utterance " << key << " is "
                  << (objf_impr/(t+1.0e-10)) << " per frame over " << t
                  << " frames.";
        tot_objf_impr += objf_impr;
@ -192,7 +192,7 @@ int main(int argc, char *argv[]) {
      }
    }

-    KALDI_LOG << "Total objf improvement from fMLLR is " << (tot_objf_impr/tot_t_objf)
+    KALDI_LOG << "Total objf improvement from MLLR is " << (tot_objf_impr/tot_t_objf)
              << " per frame over " << tot_t_objf << " frames.";
    KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
        << " with no posteriors, " << num_other_error << " with other errors.";
--- a/src/transform/regtree-mllr-diag-gmm.cc
+++ b/src/transform/regtree-mllr-diag-gmm.cc
@ -147,7 +147,7 @@ void RegtreeMllrDiagGmm::Read(std::istream &in, bool binary) {
                 && xform_itr->NumRows() == dim_);
  }

-  ExpectMarker(in, binary, "<BCLASS2XFORMS");
+  ExpectMarker(in, binary, "<BCLASS2XFORMS>");
  ReadIntegerVector(in, binary, &bclass2xforms_);
  ExpectMarker(in, binary, "</MLLRXFORM>");
 }