Fixes to recipes; updating RESULTS files [nearly finished with WSJ one].

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@113 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2011-06-29 21:58:41 +00:00
Родитель 7b0f9e2169
Коммит 75883cff68
30 изменённых файлов: 1459 добавлений и 277 удалений

Просмотреть файл

@ -10,20 +10,68 @@ feb89 oct89 feb91 sep92 avg
2.77 4.02 3.30 6.29 4.10 % from my ICASSP'99 paper on Frame Discrimination (ML baseline)
3.20 4.10 2.86 6.06 4.06 % from decode_tri2c (which is triphone + CMN)
exp/decode_mono/wer:Average WER is 14.234421 (1784 / 12533)
exp/decode_tri1/wer:Average WER is 4.420330 (554 / 12533) # First triphone pass
exp/decode_tri1_fmllr/wer:Average WER is 4.707572 (590 / 12533) # + fMLLR
exp/decode_tri1_regtree_fmllr/wer:Average WER is 4.707572 (590 / 12533) # + regression-tree
exp/decode_tri2a/wer:Average WER is 4.476183 (561 / 12533) # Second triphone pass
exp/decode_tri2a_fmllr/wer:Average WER is 3.718184 (466 / 12533) # + fMLLR
exp/decode_tri2a_fmllr_utt/wer:Average WER is 4.452246 (558 / 12533) # [ fMLLR per utterance ]
exp/decode_tri2b/wer:Average WER is 2.992101 (375 / 12533) # Exponential transform
exp/decode_tri2b_utt/wer:Average WER is 3.247427 (407 / 12533) # [adapt per-utt]
exp/decode_tri2c/wer:Average WER is 3.789994 (475 / 12533) # Cepstral mean subtraction (per-spk)
exp/decode_tri2d/wer:Average WER is 4.188941 (525 / 12533) # MLLT (= global STC)
exp/decode_tri2e/wer:Average WER is 4.923003 (617 / 12533) # splice-9-frames + LDA features
exp/decode_tri2f/wer:Average WER is 3.782015 (474 / 12533) # splice-9-frames + LDA + MLLT
exp/decode_tri2g/wer:Average WER is 3.670310 (460 / 12533) # Linear VTLN (LVTLN); includes mean-only fMLLR
exp/decode_mono/wer:Average WER is 14.234421 (1784 / 12533) # Monophone system, subset
exp/decode_tri1/wer:Average WER is 4.420330 (554 / 12533) # First triphone pass
exp/decode_tri1_fmllr/wer:Average WER is 3.837868 (481 / 12533) # + fMLLR
exp/decode_tri1_regtree_fmllr/wer:Average WER is 3.789994 (475 / 12533) # + regression-tree
exp/decode_tri2a/wer:Average WER is 3.973510 (498 / 12533) # Second triphone pass
exp/decode_tri2a_fmllr/wer:Average WER is 3.590521 (450 / 12533) # + fMLLR
exp/decode_tri2a_fmllr_utt/wer:Average WER is 3.933615 (493 / 12533) # [ fMLLR per utterance ]
exp/decode_tri2b/wer:Average WER is 3.303279 (414 / 12533) # Exponential transform
exp/decode_tri2b_utt/wer:Average WER is 3.335195 (418 / 12533) # [adapt per-utt]
exp/decode_tri2c/wer:Average WER is 3.957552 (496 / 12533) # Cepstral mean subtraction (per-spk)
exp/decode_tri2d/wer:Average WER is 4.316604 (541 / 12533) # MLLT (= global STC)
exp/decode_tri2e/wer:Average WER is 4.659698 (584 / 12533) # splice-9-frames + LDA features
exp/decode_tri2f/wer:Average WER is 3.885742 (487 / 12533) # splice-9-frames + LDA + MLLT
exp/decode_tri2g/wer:Average WER is 3.303279 (414 / 12533) # Linear VTLN
exp/decode_tri2g_diag/wer:Average WER is 3.135722 (393 / 12533) # Linear VTLN; diagonal adapt in test
exp/decode_tri2g_vtln/wer:Average WER is 3.239448 (406 / 12533) # Use warp factors -> feature-level VTLN + offset estimation
exp/decode_tri2g_vtln_diag/wer:Average WER is 3.127743 (392 / 12533) # feature-level VTLN + diag fMLLR
exp/decode_tri2g_vtln_diag_utt/wer:Average WER is 3.407006 (427 / 12533) # as above, per utt.
exp/decode_tri2g_vtln_nofmllr/wer:Average WER is 3.694247 (463 / 12533) # feature-level VTLN but no fMLLR
exp/decode_tri2h/wer:Average WER is 4.252773 (533 / 12533) # Splice-9-frames + HLDA
exp/decode_tri2i/wer:Average WER is 3.981489 (499 / 12533) # Triple-deltas + HLDA
exp/decode_tri2j/wer:Average WER is 3.853826 (483 / 12533) # Triple-deltas + LDA + MLLT
exp/decode_tri2k/wer:Average WER is 2.968164 (372 / 12533) # LDA + exponential transform
exp/decode_tri2k_utt/wer:Average WER is 3.175616 (398 / 12533) # per-utterance adaptation.
exp/decode_tri2k_fmllr/wer:Average WER is 2.505386 (314 / 12533) # +fMLLR (per-spk)
exp/decode_tri2k_regtree_fmllr/wer:Average WER is 2.513365 (315 / 12533) # +regression tree
exp/decode_tri2l/wer:Average WER is 2.704859 (339 / 12533) # Splice-9-frames + LDA + MLLT + SAT (fMLLR in test)
exp/decode_tri2l_utt/wer:Average WER is 4.930982 (618 / 12533) # [ as decode_tri2l but per-utt in test. ]
# sgmma is SGMM without speaker vectors.
exp/decode_sgmma/wer:Average WER is 3.319237 (416 / 12533)
exp/decode_sgmma_fmllr/wer:Average WER is 2.934308 (289 / 9849)
exp/decode_sgmma_fmllr_utt/wer:Average WER is 3.303279 (414 / 12533)
exp/decode_sgmma_fmllrbasis_utt/wer:Average WER is 3.191574 (400 / 12533)
# sgmmb is SGMM with speaker vectors.
exp/decode_sgmmb/wer:Average WER is 2.760712 (346 / 12533)
exp/decode_sgmmb_utt/wer:Average WER is 2.808585 (352 / 12533)
exp/decode_sgmmb_fmllr/wer:Average WER is 2.553259 (320 / 12533)
# sgmmc is like sgmmb but with gender dependency [doesn't help here]
exp/decode_sgmmc/wer:Average WER is 2.776670 (348 / 12533)
exp/decode_sgmmc_fmllr/wer:Average WER is 2.601133 (326 / 12533)
exp/decode_tri2a/wer:Average WER is 4.476183 (561 / 12533)
exp/decode_tri2a_fmllr/wer:Average WER is 3.718184 (466 / 12533)
exp/decode_tri2a_fmllr_utt/wer:Average WER is 4.452246 (558 / 12533)
exp/decode_tri2b/wer:Average WER is 2.992101 (375 / 12533)
exp/decode_tri2b_utt/wer:Average WER is 3.247427 (407 / 12533)
exp/decode_tri2c/wer:Average WER is 3.789994 (475 / 12533)
exp/decode_tri2d/wer:Average WER is 4.188941 (525 / 12533)
exp/decode_tri2e/wer:Average WER is 4.923003 (617 / 12533)
exp/decode_tri2f/wer:Average WER is 3.782015 (474 / 12533)
exp/decode_tri2g/wer:Average WER is 3.670310 (460 / 12533)
exp/decode_tri2g_diag/wer:Average WER is 3.550626 (445 / 12533) # +change mean-only to diagonal fMLLR
exp/decode_tri2g_vtln/wer:Average WER is 3.534668 (443 / 12533) # More conventional VTLN (+mean-only fMLLR)
exp/decode_tri2g_vtln_diag/wer:Average WER is 3.438921 (431 / 12533) #+change mean-only to diagonal fMLLR
@ -39,6 +87,12 @@ exp/decode_tri2k_regtree_fmllr/wer:Average WER is 2.561238 (321 / 12533) # +reg
exp/decode_tri2l/wer:Average WER is 2.688901 (337 / 12533) # Splice-9-frames + LDA + MLLT + SAT (fMLLR in test)
exp/decode_tri2l_utt/wer:Average WER is 5.066624 (635 / 12533) # [ as decode_tri2l but per-utt in test. ]
exp/decode_tri2m/wer:Average WER is 3.223490 (404 / 12533) # Splice + LDA + MLLT + Linear VTLN
exp/decode_tri2m_diag/wer:Average WER is 3.119764 (391 / 12533) # diagonal not offset CMLLR component
exp/decode_tri2m_vtln/wer:Average WER is 4.747467 (595 / 12533) # feature-level VTLN computation
exp/decode_tri2m_vtln_diag/wer:Average WER is 3.087848 (387 / 12533) # diagonal, not offset, adapt
exp/decode_tri2m_vtln_diag_utt/wer:Average WER is 4.340541 (544 / 12533) # per-utterance, diag adapt.
# sgmma is SGMM without speaker vectors.
exp/decode_sgmma/wer:Average WER is 3.151680 (395 / 12533)

Просмотреть файл

@ -1,2 +1,4 @@
# Environment setup; the recipe scripts source this file (". path.sh") when it exists.
# Root of the source tree, given relative to the directory the scripts are run from.
root=../../..
# Prepend the binary directories under ${root}/src and the OpenFst binaries to PATH.
export PATH=${root}/src/bin:${root}/tools/openfst/bin:${root}/src/fstbin/:${root}/src/gmmbin/:${root}/src/featbin/:${root}/src/fgmmbin:${root}/src/sgmmbin:$PATH
# Use the C locale for all locale categories.
export LC_ALL=C
# NOTE(review): LC_LOCALE_ALL does not look like a standard locale variable;
# LC_ALL above is presumably the setting that actually takes effect -- confirm.
export LC_LOCALE_ALL=C

Просмотреть файл

@ -65,7 +65,6 @@ steps/train_tri2a.sh
(steps/decode_tri2a.sh ; steps/decode_tri2a_fmllr.sh; steps/decode_tri2a_fmllr_utt.sh )&
# Then do the same for 2b, 2c, and so on
# 2a = basic triphone (all features double-deltas unless stated).
# 2b = exponential transform
@ -79,8 +78,9 @@ steps/train_tri2a.sh
# 2j = triple-deltas + LDA + MLLT
# 2k = LDA + ET (equiv to LDA+MLLT+ET)
# 2l = splice-9-frames + LDA + MLLT + SAT (i.e. train with CMLLR)
# 2m = splice-9-frames + LDA + MLLT + LVTLN [depends on 2f]
for group in "b c d e" "f g h i" "j k l"; do
for group in "b c d e" "f g h i" "j k l m"; do
for x in $group; do
steps/train_tri2$x.sh &
done
@ -115,7 +115,7 @@ steps/train_ubma.sh
(steps/train_sgmmb.sh; steps/decode_sgmmb.sh; steps/decode_sgmmb_fmllr.sh; steps/decode_sgmmb_utt.sh )&
# + gender dependency.
(steps/train_sgmmc.sh; steps/decode_sgmmc.sh; steps/decode_sgmmc_fmllr.sh )&
(steps/train_ubmb.sh; steps/train_sgmmc.sh; steps/decode_sgmmc.sh; steps/decode_sgmmc_fmllr.sh )&

Просмотреть файл

@ -49,12 +49,12 @@ for test in mar87 oct87 feb89 oct89 feb91 sep92; do
( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
sgmm-est-spkvecs-gpost "$spk2utt_opt" $model "$feats" ark,s,cs:- \
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
ark:$dir/test_${test}.vecs1 ) 2>$dir/vecs1_${test}.log
( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 "$spk2utt_opt" \
sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 $spk2utt_opt \
$model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log

Просмотреть файл

@ -55,7 +55,7 @@ for test in mar87 oct87 feb89 oct89 feb91 sep92; do
( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 "$spk2utt_opt" \
sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 $spk2utt_opt \
$model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --spk-vecs=ark:$dir/test_${test}.vecs2 --beam=20.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log

Просмотреть файл

@ -55,12 +55,12 @@ for test in mar87 oct87 feb89 oct89 feb91 sep92; do
( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
sgmm-est-spkvecs-gpost "$spk2utt_opt" $model "$feats" ark,s,cs:- \
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
ark:$dir/test_${test}.vecs1 ) 2>$dir/vecs1_${test}.log
( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 "$spk2utt_opt" \
sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 $spk2utt_opt \
$model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log
sgmm-decode-faster "$gselect_opt" $utt2spk_opt --spk-vecs=ark:$dir/test_${test}.vecs2 --beam=20.0 --acoustic-scale=0.1 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log

Просмотреть файл

@ -73,12 +73,12 @@ for test in mar87 oct87 feb89 oct89 feb91 sep92; do
( ali-to-post ark:$dir/test_${test}.pass1.ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
sgmm-est-spkvecs-gpost "$spk2utt_opt" $model "$feats" ark,s,cs:- \
sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
ark:$dir/test_${test}.vecs1 ) 2>$dir/vecs1_${test}.log
( ali-to-post ark:$dir/test_${test}.pass1.ali ark:- | \
weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 "$spk2utt_opt" \
sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 $spk2utt_opt \
$model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log
# Second-pass decoding with the speaker vectors.

66
egs/rm/s1/steps/decode_tri2m.sh Executable file
Просмотреть файл

@ -0,0 +1,66 @@
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Decodes the six test sets with the tri2m system: a first-pass decode with the
# alignment model, estimation of a linear-VTLN (LVTLN) transform from that
# first pass, then a final decode on the transformed features.  Per-test-set
# WERs go to $dir/wer_<test>; their average is written to $dir/wer.
# to be run from ..
if [ -f path.sh ]; then . path.sh; fi
dir=exp/decode_tri2m
mkdir -p $dir
model=exp/tri2m/final.mdl
alignmodel=exp/tri2m/final.alimdl
# Note: the feature transform matrix is read from the tri2f directory, not tri2m.
mat=exp/tri2f/final.mat
lvtln=exp/tri2m/final.lvtln
tree=exp/tri2m/tree
graphdir=exp/graph_tri2m
silphones=`cat data/silphones.csl`
# NOTE(review): the original comment here said "already made the graph", yet
# mkgraph.sh is invoked unconditionally; presumably it is cheap when the graph
# already exists -- confirm.
scripts/mkgraph.sh $tree $model $graphdir
# Each test set is handled in its own background subshell; "wait" below blocks
# until all of them have finished.
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
(
# Speaker-independent features: spliced frames followed by the $mat transform.
sifeats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
# First do SI decoding with alignment model.
# Use smaller beam for this, as less critical.
gmm-decode-faster --beam=15.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $alignmodel $graphdir/HCLG.fst "$sifeats" ark,t:$dir/test_${test}_pre.tra ark,t:$dir/test_${test}_pre.ali 2> $dir/predecode_${test}.log
# Comment the two lines below to make this per-utterance.
spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
# Turn the first-pass alignments into posteriors, apply weight 0.0 to the
# phones in $silphones, convert to Gaussian-level posteriors, and estimate the
# LVTLN transforms (.trans) and warp indices (.warp).  On failure "exit 1"
# leaves only this background subshell.
( ali-to-post ark:$dir/test_${test}_pre.ali ark:- | \
weight-silence-post 0.0 $silphones $alignmodel ark:- ark:- | \
gmm-post-to-gpost $alignmodel "$sifeats" ark:- ark:- | \
gmm-est-lvtln-trans --verbose=1 $spk2utt_opt $model $lvtln \
"$sifeats" ark:- ark:$dir/lvtln_${test}.trans ark,t:$dir/lvtln_${test}.warp ) \
2>$dir/lvtln_${test}.log || exit 1;
# Final features: the SI pipeline plus the estimated LVTLN transform.
feats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/lvtln_${test}.trans ark:- ark:- |"
# Final decoding pass with the main model and a wider beam.
gmm-decode-faster --beam=20.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
# the ,p option lets it score partial output without dying..
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
) &
done
wait
# Sum fields 4 and 6 of every WER line (printed as "n / d") across the test
# sets and report 100*n/d as the average.
grep WER $dir/wer_* | \
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
> $dir/wer

Просмотреть файл

@ -0,0 +1,66 @@
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Same flow as the plain tri2m decode (first-pass decode, LVTLN estimation,
# final decode), except that gmm-est-lvtln-trans is run with --norm-type=diag.
# Per-test-set WERs go to $dir/wer_<test>; their average is written to $dir/wer.
# to be run from ..
if [ -f path.sh ]; then . path.sh; fi
dir=exp/decode_tri2m_diag
mkdir -p $dir
model=exp/tri2m/final.mdl
alignmodel=exp/tri2m/final.alimdl
# Note: the feature transform matrix is read from the tri2f directory, not tri2m.
mat=exp/tri2f/final.mat
lvtln=exp/tri2m/final.lvtln
tree=exp/tri2m/tree
graphdir=exp/graph_tri2m
silphones=`cat data/silphones.csl`
# NOTE(review): the original comment here said "already made the graph", yet
# mkgraph.sh is invoked unconditionally; presumably it is cheap when the graph
# already exists -- confirm.
scripts/mkgraph.sh $tree $model $graphdir
# Each test set is handled in its own background subshell; "wait" below blocks
# until all of them have finished.
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
(
# Speaker-independent features: spliced frames followed by the $mat transform.
sifeats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
# First do SI decoding with alignment model.
# Use smaller beam for this, as less critical.
gmm-decode-faster --beam=15.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $alignmodel $graphdir/HCLG.fst "$sifeats" ark,t:$dir/test_${test}_pre.tra ark,t:$dir/test_${test}_pre.ali 2> $dir/predecode_${test}.log
# Comment the two lines below to make this per-utterance.
spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
# Turn the first-pass alignments into posteriors, apply weight 0.0 to the
# phones in $silphones, convert to Gaussian-level posteriors, and estimate the
# LVTLN transforms with the "diag" normalization type.  On failure "exit 1"
# leaves only this background subshell.
( ali-to-post ark:$dir/test_${test}_pre.ali ark:- | \
weight-silence-post 0.0 $silphones $alignmodel ark:- ark:- | \
gmm-post-to-gpost $alignmodel "$sifeats" ark:- ark:- | \
gmm-est-lvtln-trans --norm-type=diag --verbose=1 $spk2utt_opt $model $lvtln \
"$sifeats" ark:- ark:$dir/lvtln_${test}.trans ark,t:$dir/lvtln_${test}.warp ) \
2>$dir/lvtln_${test}.log || exit 1;
# Final features: the SI pipeline plus the estimated LVTLN transform.
feats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/lvtln_${test}.trans ark:- ark:- |"
# Final decoding pass with the main model and a wider beam.
gmm-decode-faster --beam=20.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
# the ,p option lets it score partial output without dying..
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
) &
done
wait
# Sum fields 4 and 6 of every WER line (printed as "n / d") across the test
# sets and report 100*n/d as the average.
grep WER $dir/wer_* | \
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
> $dir/wer

Просмотреть файл

@ -0,0 +1,80 @@
# as decode_tri2m but using the feature-level VTLN
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# as opposed to the linear VTLN when decoding.
# Also computing a maximum-likelihood mean offset,
# for better comparability with LVTLN.
# to be run from ..
if [ -f path.sh ]; then . path.sh; fi
dir=exp/decode_tri2m_vtln
mkdir -p $dir
vtlnmodel=exp/tri2m/final.vtlnmdl
lvtlnmodel=exp/tri2m/final.mdl
alignmodel=exp/tri2m/final.alimdl
mat=exp/tri2f/final.mat
lvtln=exp/tri2m/final.lvtln
tree=exp/tri2m/tree
graphdir=exp/graph_tri2m
silphones=`cat data/silphones.csl`
# Doesn't matter which model we use when making the graph
# (only the transitions and structure are used).
scripts/mkgraph.sh $tree $vtlnmodel $graphdir
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
(
sifeats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
# First do SI decoding with alignment model.
# Use smaller beam for this, as less critical.
gmm-decode-faster --beam=15.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $alignmodel $graphdir/HCLG.fst "$sifeats" ark,t:$dir/test_${test}_pre.tra ark,t:$dir/test_${test}_pre.ali 2> $dir/predecode_${test}.log
# Comment the two lines below to make this per-utterance.
spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
( ali-to-post ark:$dir/test_${test}_pre.ali ark:- | \
weight-silence-post 0.0 $silphones $alignmodel ark:- ark:- | \
gmm-post-to-gpost $alignmodel "$sifeats" ark:- ark:- | \
gmm-est-lvtln-trans --verbose=1 $spk2utt_opt $lvtlnmodel $lvtln \
"$sifeats" ark:- ark:/dev/null ark,t:$dir/lvtln_${test}.warp ) \
2>$dir/lvtln_${test}.log || exit 1;
cat $dir/lvtln_${test}.warp | awk '{print $1, (0.85+0.01*$2);}' > $dir/${test}.factor
feats="ark:compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/${test}.factor --config=conf/mfcc.conf scp:data_prep/test_${test}_wav.scp ark:- | splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
( ali-to-post ark:$dir/test_${test}_pre.ali ark:- | \
weight-silence-post 0.0 $silphones $alignmodel ark:- ark:- | \
gmm-est-fmllr --fmllr-update-type=offset $spk2utt_opt $vtlnmodel "$feats" ark,o:- ark:$dir/${test}.trans ) 2>$dir/fmllr_${test}.log || exit 1;
feats="ark:compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/${test}.factor --config=conf/mfcc.conf scp:data_prep/test_${test}_wav.scp ark:- | splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/${test}.trans ark:- ark:- |"
gmm-decode-faster --beam=20.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $vtlnmodel $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
# the ,p option lets it score partial output without dying..
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
) &
done
wait
grep WER $dir/wer_* | \
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
> $dir/wer

Просмотреть файл

@ -0,0 +1,80 @@
# as decode_tri2m but using the feature-level VTLN
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Decodes with feature-level VTLN (warped MFCC extraction),
# as opposed to the linear VTLN when decoding.
# Also computing a diagonal fMLLR transform for
# comparison with ET.
# Per-test-set WERs go to $dir/wer_<test>; their average is written to $dir/wer.
# to be run from ..
if [ -f path.sh ]; then . path.sh; fi
dir=exp/decode_tri2m_vtln_diag
mkdir -p $dir
vtlnmodel=exp/tri2m/final.vtlnmdl
lvtlnmodel=exp/tri2m/final.mdl
alignmodel=exp/tri2m/final.alimdl
# Note: the feature transform matrix is read from the tri2f directory, not tri2m.
mat=exp/tri2f/final.mat
lvtln=exp/tri2m/final.lvtln
tree=exp/tri2m/tree
graphdir=exp/graph_tri2m
silphones=`cat data/silphones.csl`
# Doesn't matter which model we use when making the graph
# (only the transitions and structure are used).
scripts/mkgraph.sh $tree $vtlnmodel $graphdir
# Each test set is handled in its own background subshell; "wait" below blocks
# until all of them have finished.
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
(
# Speaker-independent (unwarped) features for the first pass.
sifeats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
# First do SI decoding with alignment model.
# Use smaller beam for this, as less critical.
gmm-decode-faster --beam=15.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $alignmodel $graphdir/HCLG.fst "$sifeats" ark,t:$dir/test_${test}_pre.tra ark,t:$dir/test_${test}_pre.ali 2> $dir/predecode_${test}.log
# Comment the two lines below to make this per-utterance.
spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
# Run LVTLN estimation only to obtain the warp indices (.warp); the linear
# transforms themselves are discarded (written to /dev/null).
( ali-to-post ark:$dir/test_${test}_pre.ali ark:- | \
weight-silence-post 0.0 $silphones $alignmodel ark:- ark:- | \
gmm-post-to-gpost $alignmodel "$sifeats" ark:- ark:- | \
gmm-est-lvtln-trans --verbose=1 $spk2utt_opt $lvtlnmodel $lvtln \
"$sifeats" ark:- ark:/dev/null ark,t:$dir/lvtln_${test}.warp ) \
2>$dir/lvtln_${test}.log || exit 1;
# Map each warp index k to the warp factor 0.85 + 0.01*k.
cat $dir/lvtln_${test}.warp | awk '{print $1, (0.85+0.01*$2);}' > $dir/${test}.factor
# VTLN-warped features: MFCCs are recomputed from the wav files with the warp
# factors, then spliced from the pipe and transformed with $mat.
feats="ark:compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/${test}.factor --config=conf/mfcc.conf scp:data_prep/test_${test}_wav.scp ark:- | splice-feats ark:- ark:- | transform-feats $mat ark:- ark:- |"
# Estimate a diagonal fMLLR transform on the warped features.
( ali-to-post ark:$dir/test_${test}_pre.ali ark:- | \
weight-silence-post 0.0 $silphones $alignmodel ark:- ark:- | \
gmm-est-fmllr --fmllr-update-type=diag $spk2utt_opt $vtlnmodel "$feats" ark,o:- ark:$dir/${test}.trans ) 2>$dir/fmllr_${test}.log || exit 1;
# Final features: the warped pipeline plus the estimated fMLLR transform.
feats="ark:compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/${test}.factor --config=conf/mfcc.conf scp:data_prep/test_${test}_wav.scp ark:- | splice-feats ark:- ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/${test}.trans ark:- ark:- |"
gmm-decode-faster --beam=20.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $vtlnmodel $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
# the ,p option lets it score partial output without dying..
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
) &
done
wait
# Sum fields 4 and 6 of every WER line (printed as "n / d") across the test
# sets and report 100*n/d as the average.
grep WER $dir/wer_* | \
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
> $dir/wer

Просмотреть файл

@ -0,0 +1,80 @@
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# as opposed to the linear VTLN when decoding.
# Also computing a diagonal fMLLR transform for
# comparison with ET.
# to be run from ..
if [ -f path.sh ]; then . path.sh; fi
dir=exp/decode_tri2m_vtln_diag_utt
mkdir -p $dir
vtlnmodel=exp/tri2m/final.vtlnmdl
lvtlnmodel=exp/tri2m/final.mdl
alignmodel=exp/tri2m/final.alimdl
mat=exp/tri2f/final.mat
lvtln=exp/tri2m/final.lvtln
tree=exp/tri2m/tree
graphdir=exp/graph_tri2m
silphones=`cat data/silphones.csl`
mincount=100 # for diagonal fMLLR
# Doesn't matter which model we use when making the graph
# (only the transitions and structure are used).
scripts/mkgraph.sh $tree $vtlnmodel $graphdir
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
(
sifeats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
# First do SI decoding with alignment model.
# Use smaller beam for this, as less critical.
gmm-decode-faster --beam=15.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $alignmodel $graphdir/HCLG.fst "$sifeats" ark,t:$dir/test_${test}_pre.tra ark,t:$dir/test_${test}_pre.ali 2> $dir/predecode_${test}.log
## Comment the two lines below to make this per-utterance.
#spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
#utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
( ali-to-post ark:$dir/test_${test}_pre.ali ark:- | \
weight-silence-post 0.0 $silphones $alignmodel ark:- ark:- | \
gmm-post-to-gpost $alignmodel "$sifeats" ark:- ark:- | \
gmm-est-lvtln-trans --verbose=1 $spk2utt_opt $lvtlnmodel $lvtln \
"$sifeats" ark:- ark:/dev/null ark,t:$dir/lvtln_${test}.warp ) \
2>$dir/lvtln_${test}.log || exit 1;
cat $dir/lvtln_${test}.warp | awk '{print $1, (0.85+0.01*$2);}' > $dir/${test}.factor
feats="ark:compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/${test}.factor --config=conf/mfcc.conf scp:data_prep/test_${test}_wav.scp ark:- | splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
( ali-to-post ark:$dir/test_${test}_pre.ali ark:- | \
weight-silence-post 0.0 $silphones $alignmodel ark:- ark:- | \
gmm-est-fmllr --fmllr-update-type=diag --fmllr-min-count=$mincount $spk2utt_opt $vtlnmodel "$feats" ark,o:- ark:$dir/${test}.trans ) 2>$dir/fmllr_${test}.log || exit 1;
feats="ark:compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/${test}.factor --config=conf/mfcc.conf scp:data_prep/test_${test}_wav.scp ark:- | splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/${test}.trans ark:- ark:- |"
gmm-decode-faster --beam=20.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $vtlnmodel $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
# the ,p option lets it score partial output without dying..
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
) &
done
wait
grep WER $dir/wer_* | \
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
> $dir/wer

Просмотреть файл

@ -0,0 +1,71 @@
# as decode_tri2m but using the feature-level VTLN
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# as opposed to the linear VTLN when decoding.
# to be run from ..
if [ -f path.sh ]; then . path.sh; fi
dir=exp/decode_tri2m_vtln_nofmllr
mkdir -p $dir
vtlnmodel=exp/tri2m/final.vtlnmdl
lvtlnmodel=exp/tri2m/final.mdl
alignmodel=exp/tri2m/final.alimdl
lvtln=exp/tri2m/final.lvtln
tree=exp/tri2m/tree
graphdir=exp/graph_tri2m
silphones=`cat data/silphones.csl`
# Doesn't matter which model we use when making the graph
# (only the transitions and structure are used).
scripts/mkgraph.sh $tree $vtlnmodel $graphdir
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
(
sifeats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
# First do SI decoding with alignment model.
# Use smaller beam for this, as less critical.
gmm-decode-faster --beam=15.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $alignmodel $graphdir/HCLG.fst "$sifeats" ark,t:$dir/test_${test}_pre.tra ark,t:$dir/test_${test}_pre.ali 2> $dir/predecode_${test}.log
# Comment the two lines below to make this per-utterance.
spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk
( ali-to-post ark:$dir/test_${test}_pre.ali ark:- | \
weight-silence-post 0.0 $silphones $alignmodel ark:- ark:- | \
gmm-post-to-gpost $alignmodel "$sifeats" ark:- ark:- | \
gmm-est-lvtln-trans --verbose=1 $spk2utt_opt $lvtlnmodel $lvtln \
"$sifeats" ark:- ark:/dev/null ark,t:$dir/lvtln_${test}.warp ) \
2>$dir/lvtln_${test}.log || exit 1;
cat $dir/lvtln_${test}.warp | awk '{print $1, (0.85+0.01*$2);}' > $dir/${test}.factor
feats="ark:compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/${test}.factor --config=conf/mfcc.conf scp:data_prep/test_${test}_wav.scp ark:- | splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
gmm-decode-faster --beam=20.0 --acoustic-scale=0.083333 --word-symbol-table=data/words.txt $vtlnmodel $graphdir/HCLG.fst "$feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali 2> $dir/decode_${test}.log
# the ,p option lets it score partial output without dying..
scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
) &
done
wait
grep WER $dir/wer_* | \
awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
> $dir/wer

Просмотреть файл

@ -1,87 +0,0 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
if [ -f path.sh ]; then
  . path.sh
fi
# To be run from ..

# Output directory, and the triphone system that supplies the model and
# training graphs used for the initial alignment.
dir=exp/sgmm
srcdir=exp/tri1
srcmodel=$srcdir/final.mdl
srcgraphs="ark:gunzip -c $srcdir/graphs.fsts.gz|"
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"

# Iteration schedule.
numiters=25                # Total number of iterations
realign_iters="5 10 15"    # Passes on which the data is re-aligned.
silphonelist=$(cat data/silphones.csl)

# Sub-state schedule: grow linearly from numsubstates to totsubstates over
# the first maxiterinc passes.
numsubstates=1500          # Initial #-substates.
totsubstates=5000          # Target #-substates.
maxiterinc=15              # Last iter to increase #substates on.
incsubstates=$(( (totsubstates - numsubstates) / maxiterinc ))  # per-iter increment

# Gaussian pre-selection is read back from a gzipped archive in $dir.
gselect_opt="--gselect=ark:gunzip -c $dir/gselect.gz|"
randprune=0.1
feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |"

mkdir -p $dir
cp $srcdir/tree $dir

# Initial alignment with the source GMM; skipped when 0.ali is already there.
echo "aligning all training data"
[ -f $dir/0.ali ] || {
  gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $srcmodel "$srcgraphs" \
    "$feats" ark,t:$dir/0.ali 2> $dir/align.0.log || exit 1;
}

# The initial SGMM has to be created beforehand.
[ -f $dir/0.mdl ] || {
  echo "you must run init_sgmm.sh before train_sgmm1.sh"
  exit 1
}

# Compute the Gaussian selection once and cache it compressed.
[ -f $dir/gselect.gz ] || {
  sgmm-gselect $dir/0.mdl "$feats" ark,t:- 2>$dir/gselect.log | gzip -c > $dir/gselect.gz || exit 1;
}

# cur.ali is the working alignment that later realignment passes overwrite.
cp $dir/0.ali $dir/cur.ali || exit 1;
# Main training loop: optional realignment, statistics accumulation, and a
# model update that writes pass N+1 from pass N.
iter=0
while [ $iter -lt $numiters ]; do
  next=$(( iter + 1 ))
  echo "Pass $iter ... "

  # Realign on the passes listed in $realign_iters.
  if echo $realign_iters | grep -w $iter >/dev/null; then
    echo "Aligning data"
    sgmm-align-compiled $scale_opts "$gselect_opt" --beam=8 --retry-beam=40 $dir/$iter.mdl \
      "$srcgraphs" "$feats" \
      ark:$dir/cur.ali 2> $dir/align.$iter.log || exit 1;
  fi

  # Pass 0 leaves "M" out of the update flags; every later pass includes it.
  flags=vwcS
  if [ $iter -gt 0 ]; then
    flags=vMwcS
  fi

  # Skip accumulation/update when the next model already exists (resumed run).
  if [ ! -f $dir/$next.mdl ]; then
    sgmm-acc-stats-ali --update-flags=$flags "$gselect_opt" --rand-prune=$randprune --binary=false $dir/$iter.mdl "$feats" ark:$dir/cur.ali $dir/$iter.acc 2> $dir/acc.$iter.log || exit 1;
    sgmm-est --update-flags=$flags --split-substates=$numsubstates --write-occs=$dir/$next.occs $dir/$iter.mdl $dir/$iter.acc $dir/$next.mdl 2> $dir/update.$iter.log || exit 1;
  fi
  # rm $dir/$iter.mdl $dir/$iter.acc
  # rm $dir/$iter.occs

  # Raise the sub-state target for the following pass while still ramping up.
  if [ $iter -lt $maxiterinc ]; then
    numsubstates=$(( numsubstates + incsubstates ))
  fi
  iter=$next
done

# Point final.mdl / final.occs at the last model and occupancies written.
(
  cd $dir
  rm final.mdl final.occs 2>/dev/null
  ln -s $iter.mdl final.mdl
  ln -s $iter.occs final.occs
)

Просмотреть файл

@ -1,103 +0,0 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This is SGMM training with speaker vectors.
if [ -f path.sh ]; then
  . path.sh
fi
# To be run from ..

# Output directory, the SGMM system whose initial files are reused, and the
# GMM triphone system that provides the tree, model and training graphs.
dir=exp/sgmm2
srcdir=exp/sgmm
gmmtridir=exp/tri1
trimodel=$gmmtridir/final.mdl
srcgraphs="ark:gunzip -c $gmmtridir/graphs.fsts.gz|"
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"

# Iteration schedule.
numiters=25                # Total number of iterations
realign_iters="5 10 15"    # Passes on which the data is re-aligned.
silphonelist=$(cat data/silphones.csl)

# Sub-state schedule: grow linearly from numsubstates to totsubstates over
# the first maxiterinc passes.
numsubstates=1500          # Initial #-substates.
totsubstates=5000          # Target #-substates.
maxiterinc=15              # Last iter to increase #substates on.
incsubstates=$(( (totsubstates - numsubstates) / maxiterinc ))  # per-iter increment

gselect_opt="--gselect=ark:gunzip -c $dir/gselect.gz|"
randprune=0.1
spkdim=39                  # Value later passed as --increase-spk-dim.
feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |"

mkdir -p $dir
# Reuse the tree from the GMM system and the initial alignment, model and
# Gaussian selection from the earlier SGMM directory.
cp $gmmtridir/tree $srcdir/0.ali $srcdir/0.mdl $srcdir/gselect.gz $dir

# Fall back to computing anything the copy above did not provide.
[ -f $dir/0.ali ] || {
  echo "aligning all training data"
  gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $trimodel "$srcgraphs" \
    "$feats" ark,t:$dir/0.ali 2> $dir/align.0.log || exit 1;
}

[ -f $dir/0.mdl ] || {
  echo "you must run init_sgmm.sh before train_sgmm2.sh"
  exit 1
}

[ -f $dir/gselect.gz ] || {
  sgmm-gselect $dir/0.mdl "$feats" ark,t:- 2>$dir/gselect.log | gzip -c > $dir/gselect.gz || exit 1;
}

# cur.ali is the working alignment that later realignment passes overwrite.
cp $dir/0.ali $dir/cur.ali || exit 1;
# Main training loop.  Each pass picks the set of parameters to update,
# optionally realigns, accumulates statistics and writes model N+1.
iter=0
while [ $iter -lt $numiters ]; do
  echo "Pass $iter ... "

  # Update-flag schedule:
  #   pass 0            : vwcS
  #   passes 1-5        : vMwcS   (only train phonetic subspace)
  #   later odd passes  : vMwcS
  #   later even passes : vwcSN   (update N and not M)
  if [ $iter -gt 0 ]; then
    if [ $iter -le 5 ]; then # only train phonetic subspace
      flags=vMwcS
    elif [ $(( $iter % 2 )) -eq 1 ]; then # odd iterations
      flags=vMwcS
    else # even iterations, update N and not M
      flags=vwcSN
    fi
  else
    flags=vwcS
  fi

  # Skip the whole pass when the next model already exists (resumed run).
  if [ ! -f $dir/$[$iter+1].mdl ]; then
    if echo $realign_iters | grep -w $iter >/dev/null; then
      echo "Aligning data"
      sgmm-align-compiled $scale_opts "$gselect_opt" --beam=8 --retry-beam=40 $dir/$iter.mdl \
        "$srcgraphs" "$feats" \
        ark:$dir/cur.ali 2> $dir/align.$iter.log || exit 1;
    fi
    sgmm-acc-stats-ali --update-flags=$flags "$gselect_opt" --rand-prune=$randprune --binary=false $dir/$iter.mdl "$feats" ark:$dir/cur.ali $dir/$iter.acc 2> $dir/acc.$iter.log || exit 1;

    # On pass 5 the speaker subspace is grown to $spkdim dimensions; every
    # other pass runs the same update without that option.
    spkdim_opt=
    if [ $iter -eq 5 ]; then
      spkdim_opt=--increase-spk-dim=$spkdim
    fi
    sgmm-estimate --update-flags=$flags $spkdim_opt --split-substates=$numsubstates --write-occs=$dir/$[$iter+1].occs $dir/$iter.mdl $dir/$iter.acc $dir/$[$iter+1].mdl 2> $dir/update.$iter.log || exit 1;
  fi

  # -f: on a resumed pass the block above was skipped, so there is no
  # accumulator file to remove and plain rm would print an error.
  rm -f $dir/$iter.acc # $dir/$iter.mdl
  # rm $dir/$iter.occs

  # Raise the sub-state target for the following pass while still ramping up.
  if [ $iter -lt $maxiterinc ]; then
    numsubstates=$[$numsubstates+$incsubstates]
  fi
  iter=$[$iter+1];
done

# Point final.mdl / final.occs at the last model and occupancies written.
( cd $dir; rm final.mdl final.occs 2>/dev/null; ln -s $iter.mdl final.mdl; ln -s $iter.occs final.occs )

Просмотреть файл

@ -14,8 +14,6 @@
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# To be run from ..
if [ -f path.sh ]; then . path.sh; fi

209
egs/rm/s1/steps/train_tri2m.sh Executable file
Просмотреть файл

@ -0,0 +1,209 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# To be run from ..
# This (tri2m) is as tri2g except based on LDA+MLLT
# features from tri2f.
# We also start from tri2f for initial alignments.
if [ -f path.sh ]; then
  . path.sh
fi

# Output directory, and the LDA+MLLT system (tri2f) that supplies the
# initial model, training graphs and feature transform.
dir=exp/tri2m
srcdir=exp/tri2f
srcmodel=$srcdir/final.mdl
srcgraphs="ark:gunzip -c $srcdir/graphs.fsts.gz|"
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"

# Iteration schedule.
numiters=30                  # Number of iterations of training
maxiterinc=20                # Last iter to increase #Gauss on.
realign_iters="10 15 20 25"
lvtln_iters="2 4 6 8 12"     # Recompute LVTLN transforms on these iters.

# Model size: start with one Gaussian per leaf and grow linearly.
numleaves=1800
numgauss=$numleaves
totgauss=9000                # Target #Gaussians
incgauss=$(( (totgauss - numgauss) / maxiterinc ))  # per-iter increment for #Gauss

silphonelist=$(cat data/silphones.csl)

# The LDA+MLLT matrix estimated by the source system is mandatory.
mat=$srcdir/final.mat
if [ ! -f $mat ]; then
  echo "No input transformation $mat"
  exit 1
fi

per_spk=true
compute_vtlnmdl=true # If true, at the end compute a model with actual feature-space
                     # VTLN features. You can decode with this as an alternative to
                     # final.mdl which takes the LVTLN features.
numfiles=40          # Number of feature files for computing LVTLN transforms.
numclass=31          # Can't really change this without changing the script below
defaultclass=15      # Corresponds to no warping.

# RE "vtln_warp"
# Per-speaker estimation passes the speaker maps to the tools; otherwise both
# options stay empty and everything is done per utterance.
case "$per_spk" in
  true)
    spk2utt_opt=--spk2utt=ark:data/train.spk2utt
    utt2spk_opt=--utt2spk=ark:data/train.utt2spk
    ;;
  *)
    spk2utt_opt=
    utt2spk_opt=
    ;;
esac
mkdir -p $dir
cp $srcdir/topo $dir

# Baseline features: spliced frames projected with the LDA+MLLT matrix.
srcfeats="ark:splice-feats --print-args=false scp:data/train.scp ark:- | transform-feats $mat ark:- ark:- |"
# Will create lvtln.trans below...
# Adapted features: the baseline features followed by the current LVTLN
# transform read from $dir/cur.trans.
feats="ark:splice-feats --print-args=false scp:data/train.scp ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/cur.trans ark:- ark:- |"

gmm-init-lvtln --dim=40 --num-classes=$numclass --default-class=$defaultclass \
  $dir/0.lvtln 2>$dir/init_lvtln.log || exit 1

# Unwarped features for a subset of $numfiles files.
featsub="ark:scripts/subset_scp.pl $numfiles data/train.scp | splice-feats scp:- ark:- | transform-feats $mat ark:- ark:- |"

echo "Initializing lvtln transforms."
# One pass per warp class: class c corresponds to warp factor 0.85 + 0.01*c.
# The class transform is trained from the unwarped subset and the same
# subset recomputed from the waveforms with that warp.
for (( c = 0; c < numclass; c++ )); do
  warp=$(perl -e 'print 0.85 + 0.01*$ARGV[0];' $c)
  featsub_warp="ark:scripts/subset_scp.pl $numfiles data_prep/train_wav.scp | compute-mfcc-feats --vtln-low=100 --vtln-high=-600 --vtln-warp=$warp --config=conf/mfcc.conf scp:- ark:- | splice-feats ark:- ark:- | transform-feats $mat ark:- ark:- |"
  gmm-train-lvtln-special --normalize-var=true $c $dir/0.lvtln $dir/0.lvtln \
    "$featsub" "$featsub_warp" 2> $dir/train_special.$c.log || exit 1;
done
# Initial alignment, iteration-0 LVTLN transforms, tree building, model
# initialisation and training-graph compilation.

# Align all training data with the source (tri2f) model and its graphs,
# using the baseline (un-adapted) features.
echo "aligning all training data"
gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $srcmodel \
  "$srcgraphs" "$srcfeats" ark,t:$dir/0.ali 2> $dir/align.0.log || exit 1;

# Estimate the first set of LVTLN transforms (written to cur.trans, which the
# "$feats" pipeline reads) and warp indices, with silence weighted to zero.
echo "Computing LVTLN transforms (iter 0)"
( ali-to-post ark:$dir/0.ali ark:- | \
  weight-silence-post 0.0 $silphonelist $srcmodel ark:- ark:- | \
  gmm-post-to-gpost $srcmodel "$srcfeats" ark:- ark:- | \
  gmm-est-lvtln-trans --verbose=1 $spk2utt_opt $srcmodel $dir/0.lvtln \
    "$srcfeats" ark:- ark:$dir/cur.trans ark,t:$dir/0.warp ) 2>$dir/lvtln.0.log || exit 1

# Accumulate tree statistics on the adapted features.  The silence phones are
# passed as --ci-phones, i.e. they are treated as context-independent here.
acc-tree-stats --ci-phones=$silphonelist $srcmodel "$feats" ark:$dir/0.ali $dir/treeacc 2> $dir/acc.tree.log || exit 1;

# Phone ids are the last field of phones.txt; id 0 is dropped.
cat data/phones.txt | awk '{print $NF}' | grep -v -w 0 > $dir/phones.list

# Derive the question set by clustering phones, keep a symbolic copy for
# inspection, and compile it for build-tree.
cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/questions.log || exit 1;
scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

# Tree roots file.  Generated once, immediately before its only consumer
# (build-tree); an earlier identical invocation was redundant.
scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

build-tree --verbose=1 --max-leaves=$numleaves \
  $dir/treeacc $dir/roots.txt \
  $dir/questions.qst $dir/topo $dir/tree 2> $dir/train_tree.log || exit 1;

gmm-init-model --write-occs=$dir/1.occs \
  $dir/tree $dir/treeacc $dir/topo $dir/1.mdl 2> $dir/init_model.log || exit 1;

rm $dir/treeacc

# Convert alignments generated from the source (tri2f) model to the new tree
# and model, to use as initial alignments.
convert-ali $srcmodel $dir/1.mdl $dir/tree ark:$dir/0.ali ark:$dir/cur.ali 2>$dir/convert.log || exit 1;

# Debug step only: convert back and check they're the same.
convert-ali $dir/1.mdl $srcmodel $srcdir/tree ark:$dir/cur.ali ark,t:- \
  2>/dev/null | cmp - $dir/0.ali || exit 1;

rm $dir/0.ali

# Make training graphs
echo "Compiling training graphs"
compile-train-graphs $dir/tree $dir/1.mdl data/L.fst ark:data/train.tra \
  "ark:|gzip -c > $dir/graphs.fsts.gz" 2>$dir/compile_graphs.log || exit 1
cur_lvtln=$dir/0.lvtln

# Main training loop over passes 1 .. numiters-1.  Each pass may refresh the
# LVTLN transforms and/or the alignments before accumulating statistics and
# writing model x+1.
x=1
while [ $x -lt $numiters ]; do
  nx=$(( x + 1 ))
  echo "pass $x"

  # Refresh the LVTLN transforms on the scheduled passes.  Posteriors come
  # from the adapted features; the transform is estimated on the baseline
  # features.  The result is written to a temporary file first because the
  # "$feats" pipeline is reading cur.trans while this runs.
  if echo $lvtln_iters | grep -w $x >/dev/null; then
    ( ali-to-post ark:$dir/cur.ali ark:- | \
      weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- | \
      gmm-post-to-gpost $dir/$x.mdl "$feats" ark:- ark:- | \
      gmm-est-lvtln-trans --verbose=1 $spk2utt_opt $dir/$x.mdl $dir/0.lvtln \
        "$srcfeats" ark:- ark:$dir/tmp.trans ark,t:$dir/$x.warp ) 2>$dir/lvtln.$x.log || exit 1
    cp $dir/$x.warp $dir/cur.warp
    mv $dir/tmp.trans $dir/cur.trans
  fi

  # Realign on the scheduled passes.
  if echo $realign_iters | grep -w $x >/dev/null; then
    echo "Aligning data"
    gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $dir/$x.mdl \
      "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" \
      ark:$dir/cur.ali 2> $dir/align.$x.log || exit 1;
  fi

  # Accumulate and update, mixing up to the current Gaussian target.
  gmm-acc-stats-ali --binary=false $dir/$x.mdl "$feats" ark:$dir/cur.ali $dir/$x.acc 2> $dir/acc.$x.log || exit 1;
  gmm-est --write-occs=$dir/$nx.occs --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$nx.mdl 2> $dir/update.$x.log || exit 1;
  rm $dir/$x.mdl $dir/$x.acc

  # Raise the Gaussian target for the following pass while still ramping up.
  if [ $x -le $maxiterinc ]; then
    numgauss=$(( numgauss + incgauss ))
  fi
  x=$nx
done
# Post-training: build the alignment model, summarise the warp factors,
# optionally build the feature-space VTLN model, and create the final.* links.

# Accumulate stats for "alignment model" which is as the model but with
# the baseline features (shares Gaussian-level alignments).
( ali-to-post ark:$dir/cur.ali ark:- | \
  gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$srcfeats" ark:- $dir/$x.acc2 ) 2>$dir/acc_alimdl.log || exit 1;
# Update model.
gmm-est --remove-low-count-gaussians=false $dir/$x.mdl $dir/$x.acc2 $dir/$x.alimdl \
  2>$dir/est_alimdl.log || exit 1;
rm $dir/$x.acc2

# The following files contains information that may be useful for display purposes
for n in 0 $lvtln_iters; do
  cat $dir/$n.warp | scripts/process_warps.pl data/spk2gender.map > $dir/warps.$n
done

if [ $compute_vtlnmdl == "true" ]; then
  # cur.warp is only written on the passes in $lvtln_iters; if that list is
  # empty, fall back to the iteration-0 warps, which match the iteration-0
  # cur.trans.
  if [ ! -f $dir/cur.warp ]; then
    cp $dir/0.warp $dir/cur.warp || exit 1;
  fi
  # Map warp class indices to warp factors: 0.85 + 0.01 * class.
  cat $dir/cur.warp | awk '{print $1, (0.85+0.01*$2);}' > $dir/cur.factor
  # Recompute MFCCs from the waveforms with those warp factors.  A failure
  # here must stop the script; otherwise the steps below would run on a
  # missing or truncated tmp.ark.
  compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/cur.factor --config=conf/mfcc.conf scp:data_prep/train_wav.scp ark:$dir/tmp.ark 2>$dir/mfcc.log || exit 1;
  vtlnfeats="ark:splice-feats ark:$dir/tmp.ark ark:- | transform-feats $mat ark:- ark:- |"
  # Compute diagonal fMLLR transform to normalize VTLN feats.
  ( ali-to-post ark:$dir/cur.ali ark:- | \
    weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- | \
    gmm-est-fmllr --fmllr-update-type=diag $spk2utt_opt $dir/$x.mdl "$vtlnfeats" ark,o:- ark:$dir/vtln.trans ) 2>$dir/vtln_fmllr.log || exit 1;
  vtlnfeats="ark:splice-feats ark:$dir/tmp.ark ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/vtln.trans ark:- ark:- |"
  # Re-estimate the model on the VTLN features, using the Gaussian-level
  # alignments obtained with the LVTLN features.
  ( ali-to-post ark:$dir/cur.ali ark:- | \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$vtlnfeats" ark:- $dir/$x.acc3 ) 2>$dir/acc_vtlnmdl.log || exit 1;
  # Update model.
  gmm-est $dir/$x.mdl $dir/$x.acc3 $dir/$x.vtlnmdl \
    2>$dir/est_vtlnmdl.log || exit 1;
  rm $dir/$x.acc3
  # -f so that re-running the script replaces a link left by an earlier run.
  ln -sf $x.vtlnmdl $dir/final.vtlnmdl
  rm $dir/tmp.ark
fi

# Create the final.* links inside $dir.  -f replaces links from an earlier
# run (previously only final.mdl was cleared, so the other three "ln -s"
# calls failed on a re-run); the cd is checked so links never end up in the
# caller's directory.
( cd $dir || exit 1;
  ln -sf $x.mdl final.mdl;
  ln -sf $x.alimdl final.alimdl;
  ln -sf 0.lvtln final.lvtln;
  ln -sf cur.trans final.trans ) || exit 1;

Просмотреть файл

@ -28,11 +28,11 @@ The "tri3*" systems are trained on all the SI-284 data.
LM: | Pruned trigram | Bigram |
Test set: | Eval92 | Eval93 | Eval92 | Eval93 |
system:
mono 31.4
tri1 13.3
mono 31.4 37.5
tri1 13.3 18.1
tri2a 12.5 18.3 14.3 21.0 | tri2a is delta+delta-deltas.
+fmllr[spk] 11.4
[utt] 12.5
+fmllr[spk] 11.4 15.5
[utt] 12.5 18.4
tri3a 10.7 13.8 11.9 15.0 | tri3a is as tri2a but all SI-284 data.
+fmllr[spk] 9.5 12.1
[diagonal] 10.5 12.7
@ -42,7 +42,8 @@ system:
+fmllr[spk] 10.5 13.9
[utt] 11.3 15.1 | [estimating ET per utt.]
+fmllr[utt] 11.2 15.3 | [estimating ET and fMLLR per utt]
tri2c 12.7 17.0 | as tri2a plus cepstral mean subtraction.
tri2c 12.7 16.6 | as tri2a plus cepstral mean subtraction.
[utt] 13.0 17.0 | [per utterance CMS in test]
tri2d 13.0 19.4 | as tri2a plus STC/MLLT (worse).
tri2e 14.3 19.1 | as tri2a but splice+LDA features (worse).
tri2f 12.2 17.7 | as tri2e plus STC/MLLT (better than tri2a).
@ -51,15 +52,21 @@ system:
+diag[spk] 10.7 16.5 | + diagonal, not just mean-only fMLLR
+diag[utt] 11.1 16.1 | [all per-utt]
+vtln,diag 10.9 15.9 | actual VTLN, plus diag-FMLLR
[utt] 10.9 16.2 | [all per-utt]
[utt] 10.9 16.1 | [all per-utt]
tri2h 13.4 20.2 | [ splice-9-frames + HLDA... worse than tri2a]
tri2i 12.4 18.4 | [ triple-deltas + HLDA... same as tri2a]
tri2j 12.8 18.3 | [ triple-deltas+LDA+MLLT... slightly worse than tri2a]
tri2k 10.6 15.0 | [ splice-9-frames + LDA + ET ]
[utt] 10.8 15.1 | [adaptation per utterance]
[spk,+fmllr] 9.9 14.2 | [per speaker, plus fMLLR]
tri2k 10.3 15.0 | [ splice-9-frames + LDA + ET ]
[utt] 10.3 15.2 | [adaptation per utterance]
[spk,+fmllr] 9.9 14.4 | [per speaker, plus fMLLR]
tri2l 9.6 13.7 | train with SAT; test with fMLLR
[utt] 12.0 16.8 | [adaptation per utterance]
tri2m 10.8 15.0 | [LDA + MLLT + Linear VTLN]
[utt] 11.0 14.6 | [per-utt, not per-spk]
[diag] 10.7 14.6 | [diagonal, not just offset, CMLLR component]
[diag;utt] 10.8 14.5 | [per-utterance]
[vtln;diag] 10.7 14.9 | [feature-level VTLN; diagonal CMLLR]
[utt] 10.6 14.4 | [per-utterance]
sgmm2a 10.4 16.4 | [sgmm, unadapted, on delta features]
sgmm2b 10.1 14.1 | [sgmm, spk-vector adaptation only]
[utt] 10.2 13.7 | [adapt per utt]
@ -75,6 +82,83 @@ system:
[fmllr] 7.7 9.7 | [per-spk, with fMLLR]
# Raw results:
exp/decode_mono_tgpr_eval92/wer:%WER 31.38 [ 1770 / 5641, 108 ins, 386 del, 1276 sub ]
exp/decode_mono_tgpr_eval93/wer:%WER 37.54 [ 1291 / 3439, 52 ins, 385 del, 854 sub ]
exp/decode_tri1_tgpr_eval92/wer:%WER 13.30 [ 750 / 5641, 133 ins, 74 del, 543 sub ]
exp/decode_tri1_tgpr_eval93/wer:%WER 18.14 [ 624 / 3439, 54 ins, 94 del, 476 sub ]
exp/decode_tri2a_tgpr_eval92/wer:%WER 12.52 [ 706 / 5641, 127 ins, 60 del, 519 sub ]
exp/decode_tri2a_tgpr_eval93/wer:%WER 18.29 [ 629 / 3439, 47 ins, 104 del, 478 sub ]
exp/decode_tri2a_tgpr_fmllr_eval92/wer:%WER 11.42 [ 644 / 5641, 116 ins, 60 del, 468 sub ]
exp/decode_tri2a_tgpr_fmllr_utt_eval92/wer:%WER 12.48 [ 704 / 5641, 128 ins, 56 del, 520 sub ]
exp/decode_tri2c_tgpr_eval92/wer:%WER 12.71 [ 717 / 5641, 137 ins, 72 del, 508 sub ]
exp/decode_tri2c_tgpr_eval93/wer:%WER 16.57 [ 570 / 3439, 62 ins, 87 del, 421 sub ]
exp/decode_tri2c_tgpr_utt_eval92/wer:%WER 12.96 [ 731 / 5641, 148 ins, 67 del, 516 sub ]
exp/decode_tri2c_tgpr_utt_eval93/wer:%WER 17.01 [ 585 / 3439, 61 ins, 85 del, 439 sub ]
exp/decode_tri2d_tgpr_eval92/wer:%WER 13.03 [ 735 / 5641, 138 ins, 74 del, 523 sub ]
exp/decode_tri2d_tgpr_eval93/wer:%WER 19.40 [ 667 / 3439, 48 ins, 130 del, 489 sub ]
exp/decode_tri2e_tgpr_eval92/wer:%WER 14.29 [ 806 / 5641, 155 ins, 79 del, 572 sub ]
exp/decode_tri2e_tgpr_eval93/wer:%WER 19.08 [ 656 / 3439, 71 ins, 120 del, 465 sub ]
exp/decode_tri2f_tgpr_eval92/wer:%WER 12.23 [ 690 / 5641, 138 ins, 57 del, 495 sub ]
exp/decode_tri2f_tgpr_eval93/wer:%WER 17.74 [ 610 / 3439, 68 ins, 85 del, 457 sub ]
exp/decode_tri2g_tgpr_diag_eval92/wer:%WER 10.65 [ 601 / 5641, 111 ins, 55 del, 435 sub ]
exp/decode_tri2g_tgpr_diag_eval93/wer:%WER 16.49 [ 567 / 3439, 77 ins, 72 del, 418 sub ]
exp/decode_tri2g_tgpr_eval92/wer:%WER 11.08 [ 625 / 5641, 119 ins, 57 del, 449 sub ]
exp/decode_tri2g_tgpr_eval93/wer:%WER 16.40 [ 564 / 3439, 72 ins, 68 del, 424 sub ]
exp/decode_tri2g_tgpr_utt_diag_eval92/wer:%WER 11.10 [ 626 / 5641, 119 ins, 60 del, 447 sub ]
exp/decode_tri2g_tgpr_utt_diag_eval93/wer:%WER 16.08 [ 553 / 3439, 75 ins, 68 del, 410 sub ]
exp/decode_tri2g_tgpr_utt_eval92/wer:%WER 11.19 [ 631 / 5641, 117 ins, 59 del, 455 sub ]
exp/decode_tri2g_tgpr_utt_eval93/wer:%WER 16.17 [ 556 / 3439, 76 ins, 67 del, 413 sub ]
exp/decode_tri2g_tgpr_utt_vtln_diag_eval92/wer:%WER 10.87 [ 613 / 5641, 114 ins, 59 del, 440 sub ]
exp/decode_tri2g_tgpr_utt_vtln_diag_eval93/wer:%WER 16.14 [ 555 / 3439, 77 ins, 67 del, 411 sub ]
exp/decode_tri2g_tgpr_vtln_diag_eval92/wer:%WER 10.88 [ 614 / 5641, 117 ins, 59 del, 438 sub ]
exp/decode_tri2g_tgpr_vtln_diag_eval93/wer:%WER 15.91 [ 547 / 3439, 73 ins, 68 del, 406 sub ]
exp/decode_tri2h_tgpr_eval92/wer:%WER 13.40 [ 756 / 5641, 163 ins, 54 del, 539 sub ]
exp/decode_tri2h_tgpr_eval93/wer:%WER 20.24 [ 696 / 3439, 69 ins, 109 del, 518 sub ]
exp/decode_tri2i_tgpr_eval92/wer:%WER 12.39 [ 699 / 5641, 130 ins, 72 del, 497 sub ]
exp/decode_tri2i_tgpr_eval93/wer:%WER 18.35 [ 631 / 3439, 58 ins, 102 del, 471 sub ]
exp/decode_tri2j_tgpr_eval92/wer:%WER 12.82 [ 723 / 5641, 127 ins, 70 del, 526 sub ]
exp/decode_tri2j_tgpr_eval93/wer:%WER 18.26 [ 628 / 3439, 59 ins, 99 del, 470 sub ]
exp/decode_tri2k_tgpr_eval92/wer:%WER 10.26 [ 579 / 5641, 117 ins, 45 del, 417 sub ]
exp/decode_tri2k_tgpr_eval93/wer:%WER 15.03 [ 517 / 3439, 73 ins, 71 del, 373 sub ]
exp/decode_tri2k_tgpr_fmllr_eval92/wer:%WER 9.86 [ 556 / 5641, 119 ins, 49 del, 388 sub ]
exp/decode_tri2k_tgpr_fmllr_eval93/wer:%WER 14.39 [ 495 / 3439, 72 ins, 67 del, 356 sub ]
exp/decode_tri2k_tgpr_utt_eval92/wer:%WER 10.30 [ 581 / 5641, 117 ins, 47 del, 417 sub ]
exp/decode_tri2k_tgpr_utt_eval93/wer:%WER 15.18 [ 522 / 3439, 76 ins, 69 del, 377 sub ]
exp/decode_tri2l_tgpr_eval92/wer:%WER 9.64 [ 544 / 5641, 121 ins, 44 del, 379 sub ]
exp/decode_tri2l_tgpr_eval93/wer:%WER 13.72 [ 472 / 3439, 68 ins, 66 del, 338 sub ]
exp/decode_tri2l_tgpr_utt_eval92/wer:%WER 12.00 [ 677 / 5641, 141 ins, 60 del, 476 sub ]
exp/decode_tri2l_tgpr_utt_eval93/wer:%WER 16.75 [ 576 / 3439, 59 ins, 93 del, 424 sub ]
exp/decode_tri2m_tgpr_diag_eval92/wer:%WER 10.67 [ 602 / 5641, 125 ins, 52 del, 425 sub ]
exp/decode_tri2m_tgpr_diag_eval93/wer:%WER 14.57 [ 501 / 3439, 67 ins, 64 del, 370 sub ]
exp/decode_tri2m_tgpr_eval92/wer:%WER 10.81 [ 610 / 5641, 126 ins, 51 del, 433 sub ]
exp/decode_tri2m_tgpr_eval93/wer:%WER 15.00 [ 516 / 3439, 66 ins, 66 del, 384 sub ]
exp/decode_tri2m_tgpr_utt_diag_eval92/wer:%WER 10.83 [ 611 / 5641, 118 ins, 55 del, 438 sub ]
exp/decode_tri2m_tgpr_utt_diag_eval93/wer:%WER 14.45 [ 497 / 3439, 62 ins, 69 del, 366 sub ]
exp/decode_tri2m_tgpr_utt_eval92/wer:%WER 11.01 [ 621 / 5641, 125 ins, 53 del, 443 sub ]
exp/decode_tri2m_tgpr_utt_eval93/wer:%WER 14.63 [ 503 / 3439, 65 ins, 67 del, 371 sub ]
exp/decode_tri2m_tgpr_utt_vtln_diag_eval92/wer:%WER 10.64 [ 600 / 5641, 123 ins, 51 del, 426 sub ]
exp/decode_tri2m_tgpr_utt_vtln_diag_eval93/wer:%WER 14.39 [ 495 / 3439, 60 ins, 70 del, 365 sub ]
exp/decode_tri2m_tgpr_vtln_diag_eval92/wer:%WER 10.74 [ 606 / 5641, 125 ins, 52 del, 429 sub ]
exp/decode_tri2m_tgpr_vtln_diag_eval93/wer:%WER 14.89 [ 512 / 3439, 68 ins, 67 del, 377 sub ]
exp/decode_tri3a_tgpr_dfmllr_eval92/wer:%WER 10.51 [ 593 / 5641, 111 ins, 51 del, 431 sub ]
exp/decode_tri3a_tgpr_dfmllr_eval93/wer:%WER 12.68 [ 436 / 3439, 52 ins, 52 del, 332 sub ]
exp/decode_tri3a_tgpr_eval92/wer:%WER 10.67 [ 602 / 5641, 131 ins, 43 del, 428 sub ]
exp/decode_tri3a_tgpr_eval93/wer:%WER 13.84 [ 476 / 3439, 55 ins, 68 del, 353 sub ]
exp/decode_tri3a_tgpr_fmllr_eval92/wer:%WER 9.54 [ 538 / 5641, 114 ins, 47 del, 377 sub ]
exp/decode_tri3a_tgpr_fmllr_eval93/wer:%WER 12.13 [ 417 / 3439, 52 ins, 59 del, 306 sub ]
exp/decode_tri3a_tgpr_uttdfmllr_eval92/wer:%WER 10.58 [ 597 / 5641, 118 ins, 49 del, 430 sub ]
exp/decode_tri3a_tgpr_uttdfmllr_eval93/wer:%WER 13.29 [ 457 / 3439, 49 ins, 57 del, 351 sub ]
exp/decode_tri3a_tgpr_uttfmllr_eval92/wer:%WER 10.44 [ 589 / 5641, 122 ins, 47 del, 420 sub ]
exp/decode_tri3a_tgpr_uttfmllr_eval93/wer:%WER 13.93 [ 479 / 3439, 56 ins, 69 del, 354 sub ]
exp/decode_sgmm2a_tgpr_eval92/wer:%WER 10.44 [ 589 / 5641, 129 ins, 38 del, 422 sub ]
exp/decode_sgmm2a_tgpr_eval93/wer:%WER 16.40 [ 564 / 3439, 68 ins, 92 del, 404 sub ]
# [old:]
exp/decode_mono_tgpr_eval92/wer:%WER 31.38 [ 1770 / 5641, 108 ins, 386 del, 1276 sub ]
exp/decode_tri1_tgpr_eval92/wer:%WER 13.30 [ 750 / 5641, 133 ins, 74 del, 543 sub ]
exp/decode_tri2a_bg_eval92/wer:%WER 14.25 [ 804 / 5641, 146 ins, 87 del, 571 sub ]

Просмотреть файл

@ -172,7 +172,6 @@ dir=[some directory to put MFCCs]
steps/make_mfcc_train.sh $dir
steps/make_mfcc_test.sh $dir
# (5) running the training and testing steps..
steps/train_mono.sh || exit 1;
@ -185,12 +184,16 @@ steps/train_mono.sh || exit 1;
# you'd have to modify the script to use that.
(scripts/mkgraph.sh --mono data/G_tg_pruned.fst exp/mono/tree exp/mono/final.mdl exp/graph_mono_tg_pruned || exit 1;
scripts/decode.sh exp/decode_mono_tgpr_eval92 exp/graph_mono_tg_pruned/HCLG.fst steps/decode_mono.sh data/eval_nov92.scp ) &
scripts/decode.sh exp/decode_mono_tgpr_eval92 exp/graph_mono_tg_pruned/HCLG.fst steps/decode_mono.sh data/eval_nov92.scp
scripts/decode.sh exp/decode_mono_tgpr_eval93 exp/graph_mono_tg_pruned/HCLG.fst steps/decode_mono.sh data/eval_nov93.scp
) &
steps/train_tri1.sh || exit 1;
(scripts/mkgraph.sh data/G_tg_pruned.fst exp/tri1/tree exp/tri1/final.mdl exp/graph_tri1_tg_pruned || exit 1;
scripts/decode.sh exp/decode_tri1_tgpr_eval92 exp/graph_tri1_tg_pruned/HCLG.fst steps/decode_tri1.sh data/eval_nov92.scp ) &
scripts/decode.sh exp/decode_tri1_tgpr_eval92 exp/graph_tri1_tg_pruned/HCLG.fst steps/decode_tri1.sh data/eval_nov92.scp
scripts/decode.sh exp/decode_tri1_tgpr_eval93 exp/graph_tri1_tg_pruned/HCLG.fst steps/decode_tri1.sh data/eval_nov93.scp
) &
steps/train_tri2a.sh || exit 1;
@ -201,13 +204,12 @@ steps/train_tri2a.sh || exit 1;
# also doing tri2a with bigram
(
scripts/mkgraph.sh data/G_bg.fst exp/tri2a/tree exp/tri2a/final.mdl exp/graph_tri2a_bg || exit 1;
scripts/decode.sh exp/decode_tri2a_bg_eval92 exp/graph_tri2a_bg/HCLG.fst steps/decode_tri2a.sh data/eval_nov92.scp
scripts/decode.sh exp/decode_tri2a_bg_eval93 exp/graph_tri2a_bg/HCLG.fst steps/decode_tri2a.sh data/eval_nov93.scp
)&
( scripts/decode.sh exp/decode_tri2a_tgpr_fmllr_utt_eval92 exp/graph_tri2a_tg_pruned/HCLG.fst steps/decode_tri2a_fmllr.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_tri2a_tgpr_fmllr_eval92 exp/graph_tri2a_tg_pruned/HCLG.fst steps/decode_tri2a_fmllr.sh data/eval_nov92.scp )&
for year in 92 93; do
scripts/decode.sh exp/decode_tri2a_bg_eval${year} exp/graph_tri2a_bg/HCLG.fst steps/decode_tri2a.sh data/eval_nov${year}.scp
scripts/decode.sh exp/decode_tri2a_tgpr_fmllr_utt_eval${year} exp/graph_tri2a_tg_pruned/HCLG.fst steps/decode_tri2a_fmllr.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_tri2a_tgpr_fmllr_eval${year} exp/graph_tri2a_tg_pruned/HCLG.fst steps/decode_tri2a_fmllr.sh data/eval_nov${year}.scp
done
)&
steps/train_tri3a.sh || exit 1;
@ -233,32 +235,28 @@ done
)&
# will delete:
## scripts/decode_queue_fmllr.sh exp/graph_tri3a_tg_pruned exp/tri3a/final.mdl exp/decode_tri3a_tg_pruned_fmllr &
#### Now alternative experiments... ###
# Exponential Transform (ET)
steps/train_tri2b.sh
(scripts/mkgraph.sh data/G_tg_pruned.fst exp/tri2b/tree exp/tri2b/final.mdl exp/graph_tri2b_tg_pruned || exit 1;
scripts/decode.sh exp/decode_tri2b_tgpr_utt_eval92 exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_tri2b_tgpr_eval92 exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b.sh data/eval_nov92.scp
scripts/decode.sh exp/decode_tri2b_tgpr_utt_fmllr_eval92 exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b_fmllr.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_tri2b_tgpr_fmllr_eval92 exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b_fmllr.sh data/eval_nov92.scp
scripts/decode.sh exp/decode_tri2b_tgpr_utt_eval93 exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b.sh data/eval_nov93.scp
scripts/decode.sh --per-spk exp/decode_tri2b_tgpr_eval93 exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b.sh data/eval_nov93.scp
scripts/decode.sh exp/decode_tri2b_tgpr_utt_fmllr_eval93 exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b_fmllr.sh data/eval_nov93.scp
scripts/decode.sh --per-spk exp/decode_tri2b_tgpr_fmllr_eval93 exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b_fmllr.sh data/eval_nov93.scp
for year in 92 93; do
scripts/decode.sh exp/decode_tri2b_tgpr_utt_eval${year} exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_tri2b_tgpr_eval${year} exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b.sh data/eval_nov${year}.scp
scripts/decode.sh exp/decode_tri2b_tgpr_utt_fmllr_eval${year} exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b_fmllr.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_tri2b_tgpr_fmllr_eval${year} exp/graph_tri2b_tg_pruned/HCLG.fst steps/decode_tri2b_fmllr.sh data/eval_nov${year}.scp
done
) &
# Cepstral Mean Normalization (CMN)
steps/train_tri2c.sh
(scripts/mkgraph.sh data/G_tg_pruned.fst exp/tri2c/tree exp/tri2c/final.mdl exp/graph_tri2c_tg_pruned || exit 1;
scripts/decode.sh exp/decode_tri2c_tgpr_utt_eval92 exp/graph_tri2c_tg_pruned/HCLG.fst steps/decode_tri2c.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_tri2c_tgpr_eval92 exp/graph_tri2c_tg_pruned/HCLG.fst steps/decode_tri2c.sh data/eval_nov92.scp
scripts/decode.sh exp/decode_tri2c_tgpr_eval93 exp/graph_tri2c_tg_pruned/HCLG.fst steps/decode_tri2c.sh data/eval_nov93.scp )&
scripts/decode.sh exp/decode_tri2c_tgpr_utt_eval93 exp/graph_tri2c_tg_pruned/HCLG.fst steps/decode_tri2c.sh data/eval_nov93.scp
scripts/decode.sh --per-spk exp/decode_tri2c_tgpr_eval93 exp/graph_tri2c_tg_pruned/HCLG.fst steps/decode_tri2c.sh data/eval_nov93.scp
)&
# MLLT/STC
@ -351,6 +349,28 @@ steps/train_tri2l.sh
)&
# LDA + MLLT + Linear VTLN (+ regular VTLN)
# Note: this depends on tri2f.
# Training runs in the foreground; graph building and all decodes run in a
# background subshell so the script can move on to the next system.
steps/train_tri2m.sh
(
# Build the decoding graph for the pruned trigram LM; abandon this subshell
# if that fails.
scripts/mkgraph.sh data/G_tg_pruned.fst exp/tri2m/tree exp/tri2m/final.mdl exp/graph_tri2m_tg_pruned || exit 1;
# For both eval sets: three decode scripts (tri2m, _diag, _vtln_diag), each
# run without --per-spk (output dirs tagged "utt") and with --per-spk.
# The _vtln_diag decodes are additionally given --wav.
for year in 92 93; do
scripts/decode.sh exp/decode_tri2m_tgpr_utt_eval${year} exp/graph_tri2m_tg_pruned/HCLG.fst steps/decode_tri2m.sh data/eval_nov${year}.scp
scripts/decode.sh exp/decode_tri2m_tgpr_utt_diag_eval${year} exp/graph_tri2m_tg_pruned/HCLG.fst steps/decode_tri2m_diag.sh data/eval_nov${year}.scp
scripts/decode.sh --wav exp/decode_tri2m_tgpr_utt_vtln_diag_eval${year} exp/graph_tri2m_tg_pruned/HCLG.fst steps/decode_tri2m_vtln_diag.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_tri2m_tgpr_eval${year} exp/graph_tri2m_tg_pruned/HCLG.fst steps/decode_tri2m.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_tri2m_tgpr_diag_eval${year} exp/graph_tri2m_tg_pruned/HCLG.fst steps/decode_tri2m_diag.sh data/eval_nov${year}.scp
scripts/decode.sh --wav --per-spk exp/decode_tri2m_tgpr_vtln_diag_eval${year} exp/graph_tri2m_tg_pruned/HCLG.fst steps/decode_tri2m_vtln_diag.sh data/eval_nov${year}.scp
done
)&
# Recipe scripts are invoked by their path under steps/ (as with
# steps/train_ubm3a.sh further down); a bare "train_ubm2a.sh" is only found
# if steps/ happens to be on PATH.
# NOTE(review): presumably this trains the UBM used by the sgmm2* systems - confirm.
steps/train_ubm2a.sh || exit 1;
# Deltas + SGMM
steps/train_sgmm2a.sh || exit 1;
@ -362,36 +382,37 @@ steps/train_sgmm2a.sh || exit 1;
steps/train_sgmm2b.sh || exit 1;
(scripts/mkgraph.sh data/G_tg_pruned.fst exp/sgmm2b/tree exp/sgmm2b/final.mdl exp/graph_sgmm2b_tg_pruned || exit 1;
scripts/decode.sh --per-spk exp/decode_sgmm2b_tgpr_eval92 exp/graph_sgmm2b_tg_pruned/HCLG.fst steps/decode_sgmm2b.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_sgmm2b_tgpr_eval93 exp/graph_sgmm2b_tg_pruned/HCLG.fst steps/decode_sgmm2b.sh data/eval_nov93.scp
scripts/decode.sh exp/decode_sgmm2b_tgpr_utt_eval92 exp/graph_sgmm2b_tg_pruned/HCLG.fst steps/decode_sgmm2b.sh data/eval_nov92.scp
scripts/decode.sh exp/decode_sgmm2b_tgpr_utt_eval93 exp/graph_sgmm2b_tg_pruned/HCLG.fst steps/decode_sgmm2b.sh data/eval_nov93.scp
scripts/decode.sh --per-spk exp/decode_sgmm2b_fmllr_tgpr_eval92 exp/graph_sgmm2b_tg_pruned/HCLG.fst steps/decode_sgmm2b_fmllr.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_sgmm2b_fmllr_tgpr_eval93 exp/graph_sgmm2b_tg_pruned/HCLG.fst steps/decode_sgmm2b_fmllr.sh data/eval_nov93.scp )&
for year in 92 93; do
scripts/decode.sh --per-spk exp/decode_sgmm2b_tgpr_eval${year} exp/graph_sgmm2b_tg_pruned/HCLG.fst steps/decode_sgmm2b.sh data/eval_nov${year}.scp
scripts/decode.sh exp/decode_sgmm2b_tgpr_utt_eval${year} exp/graph_sgmm2b_tg_pruned/HCLG.fst steps/decode_sgmm2b.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_sgmm2b_fmllr_tgpr_eval${year} exp/graph_sgmm2b_tg_pruned/HCLG.fst steps/decode_sgmm2b_fmllr.sh data/eval_nov${year}.scp
done
)&
# [on all the data]
steps/train_ubm3a.sh || exit 1;
steps/train_sgmm3b.sh || exit 1;
(scripts/mkgraph.sh data/G_tg_pruned.fst exp/sgmm3b/tree exp/sgmm3b/final.mdl exp/graph_sgmm3b_tg_pruned || exit 1;
scripts/decode.sh --per-spk exp/decode_sgmm3b_tgpr_eval92 exp/graph_sgmm3b_tg_pruned/HCLG.fst steps/decode_sgmm3b.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_sgmm3b_tgpr_eval93 exp/graph_sgmm3b_tg_pruned/HCLG.fst steps/decode_sgmm3b.sh data/eval_nov93.scp
scripts/decode.sh exp/decode_sgmm3b_tgpr_utt_eval92 exp/graph_sgmm3b_tg_pruned/HCLG.fst steps/decode_sgmm3b.sh data/eval_nov92.scp
scripts/decode.sh exp/decode_sgmm3b_tgpr_utt_eval93 exp/graph_sgmm3b_tg_pruned/HCLG.fst steps/decode_sgmm3b.sh data/eval_nov93.scp
scripts/decode.sh --per-spk exp/decode_sgmm3b_fmllr_tgpr_eval92 exp/graph_sgmm3b_tg_pruned/HCLG.fst steps/decode_sgmm3b_fmllr.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_sgmm3b_fmllr_tgpr_eval93 exp/graph_sgmm3b_tg_pruned/HCLG.fst steps/decode_sgmm3b_fmllr.sh data/eval_nov93.scp )&
for year in 92 93; do
scripts/decode.sh --per-spk exp/decode_sgmm3b_tgpr_eval${year} exp/graph_sgmm3b_tg_pruned/HCLG.fst steps/decode_sgmm3b.sh data/eval_nov${year}.scp
scripts/decode.sh exp/decode_sgmm3b_tgpr_utt_eval${year} exp/graph_sgmm3b_tg_pruned/HCLG.fst steps/decode_sgmm3b.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_sgmm3b_fmllr_tgpr_eval${year} exp/graph_sgmm3b_tg_pruned/HCLG.fst steps/decode_sgmm3b_fmllr.sh data/eval_nov${year}.scp
done
)&
# [ gender dependent ]
steps/train_ubm3b.sh || exit 1;
steps/train_sgmm3c.sh || exit 1;
(scripts/mkgraph.sh data/G_tg_pruned.fst exp/sgmm3c/tree exp/sgmm3c/final.mdl exp/graph_sgmm3c_tg_pruned || exit 1;
scripts/decode.sh --per-spk exp/decode_sgmm3c_tgpr_eval92 exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_sgmm3c_tgpr_eval93 exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov93.scp
scripts/decode.sh exp/decode_sgmm3c_tgpr_utt_eval92 exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov92.scp
scripts/decode.sh exp/decode_sgmm3c_tgpr_utt_eval93 exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov93.scp
scripts/decode.sh --per-spk exp/decode_sgmm3c_fmllr_tgpr_eval92 exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c_fmllr.sh data/eval_nov92.scp
scripts/decode.sh --per-spk exp/decode_sgmm3c_fmllr_tgpr_eval93 exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c_fmllr.sh data/eval_nov93.scp )&
for year in 92 93; do
scripts/decode.sh --per-spk exp/decode_sgmm3c_tgpr_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov${year}.scp
scripts/decode.sh exp/decode_sgmm3c_tgpr_utt_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c.sh data/eval_nov${year}.scp
scripts/decode.sh --per-spk exp/decode_sgmm3c_fmllr_tgpr_eval${year} exp/graph_sgmm3c_tg_pruned/HCLG.fst steps/decode_sgmm3c_fmllr.sh data/eval_nov${year}.scp
done
)&

Просмотреть файл

@ -107,7 +107,7 @@ sgmm-decode-faster "$gselect_opt" --beam=$prebeam --max-active=$max_active \
( ali-to-post ark:$dir/$job.pre2_ali ark:- | \
weight-silence-post 0.01 $silphones $model ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $model "$feats" ark,s,cs:- ark:- | \
sgmm-est-fmllr-gpost --spk-vecs=ark:$dir/${job}.vecs2 "$spk2utt_opt" $fmllr_model "$feats" ark,s,cs:- \
sgmm-est-fmllr-gpost --spk-vecs=ark:$dir/${job}.vecs2 $spk2utt_opt $fmllr_model "$feats" ark,s,cs:- \
ark:$dir/$job.fmllr ) 2>$dir/est_fmllr${job}.log
feats="ark:add-deltas --print-args=false scp:$scp ark:- | transform-feats $utt2spk_opt ark:$dir/$job.fmllr ark:- ark:- |"

Просмотреть файл

@ -108,7 +108,7 @@ sgmm-decode-faster "$gselect_opt" --beam=$prebeam --max-active=$max_active \
( ali-to-post ark:$dir/$job.pre2_ali ark:- | \
weight-silence-post 0.01 $silphones $model ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $model "$feats" ark,s,cs:- ark:- | \
sgmm-est-fmllr-gpost --spk-vecs=ark:$dir/${job}.vecs2 "$spk2utt_opt" $fmllr_model "$feats" ark,s,cs:- \
sgmm-est-fmllr-gpost --spk-vecs=ark:$dir/${job}.vecs2 $spk2utt_opt $fmllr_model "$feats" ark,s,cs:- \
ark:$dir/$job.fmllr ) 2>$dir/est_fmllr${job}.log
feats="ark:add-deltas --print-args=false scp:$scp ark:- | transform-feats $utt2spk_opt ark:$dir/$job.fmllr ark:- ark:- |"

Просмотреть файл

@ -115,7 +115,7 @@ sgmm-decode-faster "$gselect_opt" --beam=$prebeam --max-active=$max_active \
( ali-to-post ark:$dir/$job.pre2_ali ark:- | \
weight-silence-post 0.01 $silphones $model ark:- ark:- | \
sgmm-post-to-gpost "$gselect_opt" $model "$feats" ark,s,cs:- ark:- | \
sgmm-est-fmllr-gpost --spk-vecs=ark:$dir/${job}.vecs2 "$spk2utt_opt" $fmllr_model "$feats" ark,s,cs:- \
sgmm-est-fmllr-gpost --spk-vecs=ark:$dir/${job}.vecs2 $spk2utt_opt $fmllr_model "$feats" ark,s,cs:- \
ark:$dir/$job.fmllr ) 2>$dir/est_fmllr${job}.log
feats="ark:add-deltas --print-args=false scp:$scp ark:- | transform-feats $utt2spk_opt ark:$dir/$job.fmllr ark:- ark:- |"

Просмотреть файл

@ -0,0 +1,84 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script does the decoding of a single batch of test data (on one core)
# for the tri2m system (LDA+MLLT features with linear VTLN).
# It requires three arguments: the decoding graph (an FST file), the decoding
# directory, and the job number.  It expects a file
# $decode_dir/${job_number}.scp to exist, and puts its output in
# $decode_dir/${job_number}.tra
#
# If the files
# $decode_dir/${job_number}.utt2spk and $decode_dir/${job_number}.spk2utt exist,
# this script will assume you want to do per-speaker (not per-utterance) adaptation.
if [ $# != 3 ]; then
  echo "Usage: steps/decode_tri2m.sh <graph> <decode-dir> <job-number>"
  exit 1;
fi
. path.sh || exit 1;
acwt=0.0625
beam=13.0
prebeam=12.0 # first-pass decoding beam...
max_active=7000
alimodel=exp/tri2m/final.alimdl # first-pass model...
model=exp/tri2m/final.mdl
lvtln=exp/tri2m/0.lvtln
mat=exp/tri2f/final.mat
#####################
silphones=`cat data/silphones.csl`
graph=$1
dir=$2
job=$3
scp=$dir/$job.scp
# Unadapted features: spliced frames projected with the tri2f matrix.
sifeats="ark:splice-feats --print-args=false scp:$scp ark:- | transform-feats $mat ark:- ark:- |"
# Start from empty options so that nothing inherited from the environment can
# switch on per-speaker mode by accident.
spk2utt_opt=
utt2spk_opt=
if [ -f $dir/$job.spk2utt ]; then
  if [ ! -f $dir/$job.utt2spk ]; then
    echo "spk2utt but not utt2spk file present!"
    exit 1
  fi
  spk2utt_opt=--spk2utt=ark:$dir/$job.spk2utt
  utt2spk_opt=--utt2spk=ark:$dir/$job.utt2spk
fi
# Check every input up front.  (This list used to contain an unset variable
# left over from another recipe, so the LVTLN object and the feature matrix
# were never checked.)
filenames="$scp $model $alimodel $lvtln $mat $graph data/words.txt"
for file in $filenames; do
  if [ ! -f $file ] ; then
    echo "No such file $file";
    exit 1;
  fi
done
echo running on `hostname` > $dir/predecode${job}.log
# First-pass decoding
gmm-decode-faster --beam=$prebeam --max-active=$max_active --acoustic-scale=$acwt --word-symbol-table=data/words.txt $alimodel $graph "$sifeats" ark,t:$dir/$job.pre_tra ark,t:$dir/$job.pre_ali 2>>$dir/predecode${job}.log
# Estimate transforms: the first-pass alignments are turned into posteriors
# with silence weighted to zero, and from those the LVTLN transform is
# estimated per speaker (or per utterance if no spk2utt file was given).
# The chosen warp class is written in text form to $job.warp.
ali-to-post ark:$dir/$job.pre_ali ark:- | \
  weight-silence-post 0.0 $silphones $alimodel ark:- ark:- | \
  gmm-post-to-gpost $alimodel "$sifeats" ark,o:- ark:- | \
  gmm-est-lvtln-trans $spk2utt_opt $model $lvtln "$sifeats" ark,o:- \
    ark:$dir/$job.trans ark,t:$dir/$job.warp 2>$dir/lvtln${job}.log
# Adapted features: the unadapted pipeline followed by the estimated transform.
feats="ark:splice-feats --print-args=false scp:$scp ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/$job.trans ark:- ark:- |"
# Final decoding
echo running on `hostname` > $dir/decode$job.log
gmm-decode-faster --beam=$beam --max-active=$max_active --acoustic-scale=$acwt --word-symbol-table=data/words.txt $model $graph "$feats" ark,t:$dir/$job.tra ark,t:$dir/$job.ali 2>>$dir/decode$job.log

Просмотреть файл

@ -0,0 +1,84 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script does the decoding of a single batch of test data (on one core)
# for the tri2m system.  It differs from steps/decode_tri2m.sh only in passing
# --norm-type=diag to gmm-est-lvtln-trans.
# It requires three arguments: the decoding graph (an FST file), the decoding
# directory, and the job number.  It expects a file
# $decode_dir/${job_number}.scp to exist, and puts its output in
# $decode_dir/${job_number}.tra
#
# If the files
# $decode_dir/${job_number}.utt2spk and $decode_dir/${job_number}.spk2utt exist,
# this script will assume you want to do per-speaker (not per-utterance) adaptation.
if [ $# != 3 ]; then
  echo "Usage: steps/decode_tri2m_diag.sh <graph> <decode-dir> <job-number>"
  exit 1;
fi
. path.sh || exit 1;
acwt=0.0625
beam=13.0
prebeam=12.0 # first-pass decoding beam...
max_active=7000
alimodel=exp/tri2m/final.alimdl # first-pass model...
model=exp/tri2m/final.mdl
lvtln=exp/tri2m/0.lvtln
mat=exp/tri2f/final.mat
#####################
silphones=`cat data/silphones.csl`
graph=$1
dir=$2
job=$3
scp=$dir/$job.scp
# Unadapted features: spliced frames projected with the tri2f matrix.
sifeats="ark:splice-feats --print-args=false scp:$scp ark:- | transform-feats $mat ark:- ark:- |"
# Start from empty options so that nothing inherited from the environment can
# switch on per-speaker mode by accident.
spk2utt_opt=
utt2spk_opt=
if [ -f $dir/$job.spk2utt ]; then
  if [ ! -f $dir/$job.utt2spk ]; then
    echo "spk2utt but not utt2spk file present!"
    exit 1
  fi
  spk2utt_opt=--spk2utt=ark:$dir/$job.spk2utt
  utt2spk_opt=--utt2spk=ark:$dir/$job.utt2spk
fi
# Check every input up front.  (This list used to contain an unset variable
# left over from another recipe, so the LVTLN object and the feature matrix
# were never checked.)
filenames="$scp $model $alimodel $lvtln $mat $graph data/words.txt"
for file in $filenames; do
  if [ ! -f $file ] ; then
    echo "No such file $file";
    exit 1;
  fi
done
echo running on `hostname` > $dir/predecode${job}.log
# First-pass decoding
gmm-decode-faster --beam=$prebeam --max-active=$max_active --acoustic-scale=$acwt --word-symbol-table=data/words.txt $alimodel $graph "$sifeats" ark,t:$dir/$job.pre_tra ark,t:$dir/$job.pre_ali 2>>$dir/predecode${job}.log
# Estimate transforms from the first-pass alignments (silence weighted to
# zero).  NOTE(review): --norm-type=diag presumably selects a diagonal rather
# than offset-only normalizing transform -- confirm against the usage message
# of gmm-est-lvtln-trans.
ali-to-post ark:$dir/$job.pre_ali ark:- | \
  weight-silence-post 0.0 $silphones $alimodel ark:- ark:- | \
  gmm-post-to-gpost $alimodel "$sifeats" ark,o:- ark:- | \
  gmm-est-lvtln-trans --norm-type=diag $spk2utt_opt $model $lvtln "$sifeats" ark,o:- \
    ark:$dir/$job.trans ark,t:$dir/$job.warp 2>$dir/lvtln${job}.log
# Adapted features: the unadapted pipeline followed by the estimated transform.
feats="ark:splice-feats --print-args=false scp:$scp ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/$job.trans ark:- ark:- |"
# Final decoding
echo running on `hostname` > $dir/decode$job.log
gmm-decode-faster --beam=$beam --max-active=$max_active --acoustic-scale=$acwt --word-symbol-table=data/words.txt $model $graph "$feats" ark,t:$dir/$job.tra ark,t:$dir/$job.ali 2>>$dir/decode$job.log

Просмотреть файл

@ -0,0 +1,100 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# tri2m_vtln_diag is doing normal, feature-level VTLN (with diagonal
# fMLLR).  The warp factor is chosen with the linear-VTLN machinery of tri2m;
# the MFCCs are then recomputed from the waveforms with that warp factor.
# This script does the decoding of a single batch of test data (on one core).
# It requires three arguments: the decoding graph (an FST file), the decoding
# directory, and the job number.  It expects the files
# $decode_dir/${job_number}.scp and $decode_dir/${job_number}_wav.scp to exist
# (the latter is what the --wav option of scripts/decode.sh is for), and
# puts its output in $decode_dir/${job_number}.tra
#
# If the files
# $decode_dir/${job_number}.utt2spk and $decode_dir/${job_number}.spk2utt exist,
# this script will assume you want to do per-speaker (not per-utterance) adaptation.
if [ $# != 3 ]; then
  echo "Usage: steps/decode_tri2m_vtln_diag.sh <graph> <decode-dir> <job-number>"
  exit 1;
fi
. path.sh || exit 1;
acwt=0.0625
beam=13.0
mincount=100 # for fMLLR
prebeam=12.0 # first-pass decoding beam...
max_active=7000
alimodel=exp/tri2m/final.alimdl # first-pass model...
model=exp/tri2m/final.mdl # Model to decide which transform to use.
vtlnmodel=exp/tri2m/final.vtlnmdl # Model built on the VTLN-warped features; used for the final pass.
lvtln=exp/tri2m/0.lvtln
mat=exp/tri2f/final.mat
silphones=`cat data/silphones.csl`
graph=$1
dir=$2
job=$3
scp=$dir/$job.scp
# Unadapted features: spliced frames projected with the tri2f matrix.
sifeats="ark:splice-feats --print-args=false scp:$scp ark:- | transform-feats $mat ark:- ark:- |"
# Start from empty options so that nothing inherited from the environment can
# switch on per-speaker mode by accident.
spk2utt_opt=
utt2spk_opt=
if [ -f $dir/$job.spk2utt ]; then
  if [ ! -f $dir/$job.utt2spk ]; then
    echo "spk2utt but not utt2spk file present!"
    exit 1
  fi
  spk2utt_opt=--spk2utt=ark:$dir/$job.spk2utt
  utt2spk_opt=--utt2spk=ark:$dir/$job.utt2spk
fi
# if we can't find $dir/${job}_wav.scp, then user didn't give --wav option
# to scripts/decode.sh
# (This list used to contain an unset variable left over from another recipe;
# it now checks the VTLN model, the LVTLN object and the feature matrix too.)
filenames="$scp $model $alimodel $vtlnmodel $lvtln $mat $graph data/words.txt $dir/${job}_wav.scp"
for file in $filenames; do
  if [ ! -f $file ] ; then
    echo "No such file $file";
    exit 1;
  fi
done
echo running on `hostname` > $dir/predecode${job}.log
# First-pass decoding
gmm-decode-faster --beam=$prebeam --max-active=$max_active --acoustic-scale=$acwt --word-symbol-table=data/words.txt $alimodel $graph "$sifeats" ark,t:$dir/$job.pre_tra ark,t:$dir/$job.pre_ali 2>>$dir/predecode${job}.log
# Estimate transforms.  Only the warp class written to $job.warp is needed
# from this step; the $job.trans written here is overwritten by the fMLLR
# estimate further down.
ali-to-post ark:$dir/$job.pre_ali ark:- | \
  weight-silence-post 0.0 $silphones $alimodel ark:- ark:- | \
  gmm-post-to-gpost $alimodel "$sifeats" ark,o:- ark:- | \
  gmm-est-lvtln-trans $spk2utt_opt $model $lvtln "$sifeats" ark,o:- \
    ark:$dir/$job.trans ark,t:$dir/$job.warp 2>$dir/lvtln${job}.log
# Compute warping factor: class index c maps to the factor 0.85 + 0.01*c.
cat $dir/$job.warp | awk '{print $1, (0.85+0.01*$2);}' > $dir/$job.factor
# VTLN-warped features computed from the waveforms, not yet fMLLR-normalized.
feats="ark:compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/$job.factor --config=conf/mfcc.conf scp:$dir/${job}_wav.scp ark:- | splice-feats ark:- ark:- | transform-feats $mat ark:- ark:- |"
# Diagonal fMLLR on top of the warped features.  This is estimated with $model
# because steps/train_tri2m.sh estimates its VTLN fMLLR transforms with the
# LVTLN model as well, before building final.vtlnmdl.
( ali-to-post ark:$dir/$job.pre_ali ark:- | \
  weight-silence-post 0.0 $silphones $alimodel ark:- ark:- | \
  gmm-est-fmllr --fmllr-min-count=$mincount --fmllr-update-type=diag $spk2utt_opt $model "$feats" ark,o:- ark:$dir/$job.trans ) 2>$dir/fmllr${job}.log || exit 1;
feats="ark:compute-mfcc-feats $utt2spk_opt --vtln-low=100 --vtln-high=-600 --vtln-map=ark:$dir/$job.factor --config=conf/mfcc.conf scp:$dir/${job}_wav.scp ark:- | splice-feats ark:- ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/$job.trans ark:- ark:- |"
# Final decoding.  The features are now VTLN-warped MFCCs with diagonal fMLLR,
# which is what final.vtlnmdl was built on, so decode with $vtlnmodel rather
# than the LVTLN-feature model $model.
echo running on `hostname` > $dir/decode$job.log
gmm-decode-faster --beam=$beam --max-active=$max_active --acoustic-scale=$acwt --word-symbol-table=data/words.txt $vtlnmodel $graph "$feats" ark,t:$dir/$job.tra ark,t:$dir/$job.ali 2>>$dir/decode$job.log

Просмотреть файл

@ -41,3 +41,4 @@ wait;
cat $root_out/train_raw_mfcc{1,2,3,4}.scp > data/train.scp
echo Succeeded "(probably)"

Просмотреть файл

@ -54,8 +54,8 @@ compute-cmvn-stats $spk2utt_opt "$srcfeats" ark:$dir/cmvn.ark 2>$dir/cmvn.log
feats="ark:add-deltas --print-args=false scp:$dir/train.scp ark:- | apply-cmvn $utt2spk_opt ark:$dir/cmvn.ark ark:- ark:- |"
for n in 1 2 3; do
srcfeatspart[$n]="ark:add-deltas --print-args=false scp:$dir/train${n}.scp ark:- |"
featspart[$n]="ark:add-deltas --print-args=false scp:$dir/train${n}.scp ark:- | apply-cmvn $utt2spk_opt ark:$dir/cmvn.ark ark:- ark:- |"
srcfeatspart[$n]="ark,s,cs:add-deltas --print-args=false scp:$dir/train${n}.scp ark:- |"
featspart[$n]="ark,s,cs:add-deltas --print-args=false scp:$dir/train${n}.scp ark:- | apply-cmvn $utt2spk_opt ark:$dir/cmvn.ark ark:- ark:- |"
done
cp $srcdir/topo $dir

Просмотреть файл

@ -283,7 +283,7 @@ fi
# The following files may be be useful for display purposes.
for y in lvtln_iters; do
for y in $lvtln_iters; do
cat $dir/$y.?.warp | scripts/process_warps.pl data/spk2gender.map > $dir/warps.$y
y=$[$y+1]
done

292
egs/wsj/s1/steps/train_tri2m.sh Executable file
Просмотреть файл

@ -0,0 +1,292 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# tri2m is as tri2g ("linear VTLN", and training normal VTLN at the end),
# except basing it on LDA+MLLT features, not deltas.
# It starts from the tri2f system (model, alignment graphs, feature matrix).
if [ -f path.sh ]; then . path.sh; fi
dir=exp/tri2m
srcdir=exp/tri2f
srcmodel=$srcdir/final.mdl
mat=$srcdir/final.mat
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
# This block of parameters relates to LVTLN.
compute_vtlnmdl=true # If true, at the end compute a model with actual feature-space
# VTLN features. You can decode with this as an alternative to
# final.mdl which takes the LVTLN features.
dim=40 # the dim of our features.
lvtln_iters="2 4 6 8 12"; # Recompute LVTLN transforms on these iters.
numfiles=40 # Number of feature files for computing LVTLN transforms.
# Class c corresponds to the warp factor 0.85 + 0.01*c (see the initialization
# loop further down), so 31 classes cover 0.85 ... 1.15 and class 15 is 1.00.
numclass=31; # Can't really change this without changing the script below
defaultclass=15; # Corresponds to no warping.
numiters=35
maxiterinc=20 # By this iter, we have all the Gaussians.
realign_iters="10 20 30";
numleaves=2000
numgauss=2000 # initial num-gauss smallish so that transform-training
# code (when we modify this script) is a bit faster.
totgauss=10000 # Total num-gauss
incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
silphonelist=`cat data/silphones.csl`
mkdir -p $dir
# Reuse the training subset and transcripts of the source system, and pick out
# the matching waveform and utt2spk entries.
cp $srcdir/train.scp $dir
cp $srcdir/train.tra $dir
scripts/filter_scp.pl $dir/train.scp data/train_wav.scp > $dir/train_wav.scp
scripts/filter_scp.pl $dir/train.scp data/train.utt2spk > $dir/train.utt2spk
# Split everything three ways for the parallel jobs below; --utt2spk is given
# so that the split is done per speaker (see the alignment note further down).
scripts/split_scp.pl --utt2spk=$dir/train.utt2spk $dir/train{,1,2,3}.scp
scripts/split_scp.pl --utt2spk=$dir/train.utt2spk $dir/train_wav{,1,2,3}.scp
scripts/split_scp.pl --utt2spk=$dir/train.utt2spk $dir/train{,1,2,3}.tra
scripts/split_scp.pl --utt2spk=$dir/train.utt2spk $dir/train{,1,2,3}.utt2spk
for n in 1 2 3 ""; do # The "" handles the un-split one. Creating spk2utt files..
scripts/utt2spk_to_spk2utt.pl $dir/train$n.utt2spk > $dir/train$n.spk2utt
done
# also see featspart below, used for sub-parts of the features;
# try to keep them in sync.
# "feats" are the adapted features (current per-speaker transforms in
# cur?.trans applied); "srcfeats" are the same features without that transform.
feats="ark,s,cs:splice-feats --print-args=false scp:$dir/train.scp ark:- | transform-feats $mat ark:- ark:- | transform-feats --utt2spk=ark:$dir/train.utt2spk \"ark:cat $dir/cur?.trans|\" ark:- ark:- |"
srcfeats="ark,s,cs:splice-feats --print-args=false scp:$dir/train.scp ark:- | transform-feats $mat ark:- ark:- |"
for n in 1 2 3; do
featspart[$n]="ark,s,cs:splice-feats --print-args=false scp:$dir/train${n}.scp ark:- | transform-feats $mat ark:- ark:- | transform-feats --utt2spk=ark:$dir/train.utt2spk ark:$dir/cur$n.trans ark:- ark:- |"
srcfeatspart[$n]="ark,s,cs:splice-feats --print-args=false scp:$dir/train${n}.scp ark:- | transform-feats $mat ark:- ark:- |"
done
cp $srcdir/topo $dir
# Create the initial LVTLN object with $numclass classes; each class's
# transform is then trained in the loop below.
gmm-init-lvtln --dim=$dim --num-classes=$numclass --default-class=$defaultclass \
$dir/0.lvtln 2>$dir/init_lvtln.log || exit 1
# Small subset of features for initializing the LVTLN.
featsub="ark:scripts/subset_scp.pl $numfiles $dir/train.scp | splice-feats scp:- ark:- | transform-feats $mat ark:- ark:- |"
echo "Initializing lvtln transforms."
# For each class c, recompute the same subset of data from the waveforms with
# the warp factor 0.85 + 0.01*c, and train class c of 0.lvtln (in place) from
# the unwarped features ($featsub) and the warped ones ($featsub_warp).
c=0
while [ $c -lt $numclass ]; do
warp=`perl -e 'print 0.85 + 0.01*$ARGV[0];' $c`
featsub_warp="ark:scripts/subset_scp.pl $numfiles $dir/train_wav.scp | compute-mfcc-feats --vtln-low=100 --vtln-high=-600 --vtln-warp=$warp --config=conf/mfcc.conf scp:- ark:- | splice-feats ark:- ark:- | transform-feats $mat ark:- ark:- |"
gmm-train-lvtln-special --normalize-var=true $c $dir/0.lvtln $dir/0.lvtln \
"$featsub" "$featsub_warp" 2> $dir/train_special.$c.log || exit 1;
c=$[$c+1]
done
# Align all training data using old model (and old graphs, since we
# use the same data-subset as last time).
# Note: a few fail to get aligned here due to the difference between
# per-speaker and per-utterance splitting, but this doesn't really matter.
echo "Aligning all training data"
# The three jobs run in the background; a failing job leaves $dir/.error
# behind, which is checked after the "wait".
rm -f $dir/.error
for n in 1 2 3; do
gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $srcmodel \
"ark:gunzip -c $srcdir/graphs${n}.fsts.gz|" "${srcfeatspart[$n]}" \
"ark:|gzip -c >$dir/0.${n}.ali.gz" \
2> $dir/align.0.${n}.log || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo alignment error RE old system && exit 1
echo "Computing LVTLN transforms (iter 0)"
# Per-speaker LVTLN transforms from the source-model alignments (silence
# weighted to zero).  Writes the transforms to cur$n.trans and the chosen warp
# classes, in text form, to 0.$n.warp.
rm -f $dir/.error
for n in 1 2 3; do
( ali-to-post "ark:gunzip -c $dir/0.$n.ali.gz|" ark:- | \
weight-silence-post 0.0 $silphonelist $srcmodel ark:- ark:- | \
gmm-post-to-gpost $srcmodel "${srcfeatspart[$n]}" ark:- ark:- | \
gmm-est-lvtln-trans --verbose=1 --spk2utt=ark:$dir/train$n.spk2utt $srcmodel $dir/0.lvtln \
"${srcfeatspart[$n]}" ark:- ark:$dir/cur$n.trans ark,t:$dir/0.$n.warp ) \
2>$dir/lvtln.0.$n.log || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo error computing LVTLN transforms on iter 0 && exit 1
# Accumulate tree-building statistics on the adapted features ($feats, which
# reads the cur?.trans files written above) with the source-model alignments.
acc-tree-stats --ci-phones=$silphonelist $srcmodel "$feats" "ark:gunzip -c $dir/0.?.ali.gz|" $dir/treeacc 2> $dir/acc.tree.log || exit 1;
# The next few commands are involved with making the questions
# for tree clustering. The extra complexity vs. the RM recipe has
# to do with the desire to ask questions about the "real" phones
# ignoring things like stress and position-in-word, and ask questions
# separately about stress and position-in-word.
# Don't include silences as things to be clustered -> --nosil option.
scripts/make_shared_phones.sh --nosil | scripts/sym2int.pl data/phones.txt > $dir/phone_sets.list
cluster-phones $dir/treeacc $dir/phone_sets.list $dir/questions.txt 2> $dir/cluster_phones.log || exit 1;
scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
scripts/make_extra_questions.sh | cat $dir/questions_syms.txt - > $dir/questions_syms_all.txt
scripts/sym2int.pl data/phones.txt < $dir/questions_syms_all.txt > $dir/questions_all.txt
compile-questions $dir/topo $dir/questions_all.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;
scripts/make_roots.sh > $dir/roots_syms.txt
scripts/sym2int.pl --ignore-oov data/phones.txt < $dir/roots_syms.txt > $dir/roots.txt
build-tree --verbose=1 --max-leaves=$numleaves \
$dir/treeacc $dir/roots.txt \
$dir/questions.qst $dir/topo $dir/tree 2> $dir/train_tree.log || exit 1;
# Initialize the model from the tree statistics, then mix up to the initial
# number of Gaussians (1.mdl is overwritten in place).
gmm-init-model --write-occs=$dir/1.occs \
$dir/tree $dir/treeacc $dir/topo $dir/1.mdl 2> $dir/init_model.log || exit 1;
gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl \
2>$dir/mixup.log || exit 1;
rm $dir/treeacc $dir/1.occs
# Convert alignments generated from previous model, to use as initial alignments.
for n in 1 2 3; do
convert-ali $srcmodel $dir/1.mdl $dir/tree \
"ark:gunzip -c $dir/0.$n.ali.gz|" \
"ark:|gzip -c > $dir/cur$n.ali.gz" 2>$dir/convert.$n.log || exit 1;
done
rm $dir/0.?.ali.gz
# Make training graphs
echo "Compiling training graphs"
# Runs as three background jobs; a failure is signalled through $dir/.error.
rm -f $dir/.error
for n in 1 2 3; do
compile-train-graphs $dir/tree $dir/1.mdl data/L.fst ark:$dir/train${n}.tra \
"ark:|gzip -c > $dir/graphs${n}.fsts.gz" \
2>$dir/compile_graphs.${n}.log || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo compile-graphs error && exit 1
# Main training loop.  Iteration x reads $x.mdl and writes $[x+1].mdl; on the
# iterations listed in $realign_iters the data is re-aligned first, and on
# those in $lvtln_iters the per-speaker LVTLN transforms are re-estimated.
x=1
while [ $x -lt $numiters ]; do
  echo "Pass $x"
  if echo $realign_iters | grep -w $x >/dev/null; then
    echo "Aligning data"
    rm -f $dir/.error
    for n in 1 2 3; do
      gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $dir/$x.mdl \
        "ark:gunzip -c $dir/graphs${n}.fsts.gz|" "${featspart[$n]}" \
        "ark:|gzip -c >$dir/cur${n}.ali.gz" 2> $dir/align.$x.$n.log \
        || touch $dir/.error &
    done
    wait
    [ -f $dir/.error ] && echo error aligning data && exit 1
  fi
  if echo $lvtln_iters | grep -w $x >/dev/null; then
    # Work out current transforms (in parallel).
    # Gaussian-level posteriors come from the currently adapted features; the
    # new transform is estimated on the unadapted ones.  It is written to a
    # temporary file and only moved over cur$n.trans on success, because
    # ${featspart[$n]} reads cur$n.trans while the pipeline is running.
    echo "Computing LVTLN transforms"
    rm -f $dir/.error
    for n in 1 2 3; do
      ( ali-to-post "ark:gunzip -c $dir/cur${n}.ali.gz|" ark:- | \
        weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- | \
        gmm-post-to-gpost $dir/$x.mdl "${featspart[$n]}" ark,o:- ark:- | \
        gmm-est-lvtln-trans --spk2utt=ark:$dir/train$n.spk2utt --verbose=1 $dir/$x.mdl $dir/0.lvtln \
          "${srcfeatspart[$n]}" ark,s,cs:- ark:$dir/tmp$n.trans ark,t:$dir/$x.$n.warp ) \
        2> $dir/trans.$x.$n.log && mv $dir/tmp$n.trans $dir/cur$n.trans \
        || touch $dir/.error &
    done
    wait
    # (This used to print the alignment step's message, which pointed at the
    # wrong log files.)
    [ -f $dir/.error ] && echo error computing LVTLN transforms && exit 1
  fi
  gmm-acc-stats-ali --binary=false $dir/$x.mdl "$feats" \
    "ark,s,cs:gunzip -c $dir/cur?.ali.gz|" $dir/$x.acc 2> $dir/acc.$x.log || exit 1;
  gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$[$x+1].mdl 2> $dir/update.$x.log || exit 1;
  rm $dir/$x.mdl $dir/$x.acc $dir/$x.occs 2>/dev/null
  # Grow the Gaussian budget until iteration $maxiterinc.
  if [ $x -le $maxiterinc ]; then
    numgauss=$[$numgauss+$incgauss];
  fi
  x=$[$x+1];
done
# Accumulate stats for "alignment model" which is as the model but with
# the baseline features (shares Gaussian-level alignments).
# Here $x is the index of the final model left by the training loop; "$feats"
# are the adapted features and "$srcfeats" the unadapted ones.
( ali-to-post "ark:gunzip -c $dir/cur?.ali.gz|" ark:- | \
gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$srcfeats" ark:- $dir/$x.acc2 ) 2>$dir/acc_alimdl.log || exit 1;
# Update model.
gmm-est --remove-low-count-gaussians=false $dir/$x.mdl $dir/$x.acc2 $dir/$x.alimdl \
2>$dir/est_alimdl.log || exit 1;
rm $dir/$x.acc2
# Optionally build a model on real, feature-level VTLN features: recompute the
# MFCCs from the waveforms using the warp factors picked by LVTLN, normalize
# them with a diagonal fMLLR transform, and re-estimate the final model on
# those features (sharing Gaussian-level alignments with the LVTLN features).
if [ $compute_vtlnmdl == "true" ]; then
  iter=`echo 0 $lvtln_iters | awk '{print $NF}'` # last iter we re-estimated LVTLN
  rm -f $dir/.error
  for n in 1 2 3; do
    # Map warp class c to the warp factor 0.85 + 0.01*c.
    cat $dir/$iter.$n.warp | awk '{print $1, (0.85+0.01*$2);}' > $dir/cur$n.factor
    compute-mfcc-feats --utt2spk=ark:$dir/train$n.utt2spk --vtln-low=100 --vtln-high=-600 \
      --vtln-map=ark:$dir/cur$n.factor --config=conf/mfcc.conf \
      scp:$dir/train_wav$n.scp ark:$dir/tmp$n.ark 2>$dir/mfcc.$n.log \
      || touch $dir/.error &
  done
  wait
  [ -f $dir/.error ] && echo error computing VTLN-warped MFCC features && exit 1
  # Compute diagonal fMLLR transform to normalize VTLN feats.
  # (note, this is a bit stronger than the mean-only transform we used for the LVTLN stuff,
  # LVTLN also globally normalized the variance of each warp factor, so this seems
  # appropriate).
  for n in 1 2 3; do
    vtlnfeats="ark:splice-feats ark:$dir/tmp$n.ark ark:- | transform-feats $mat ark:- ark:- |"
    ( ali-to-post "ark:gunzip -c $dir/cur$n.ali.gz|" ark:- | \
      weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- | \
      gmm-est-fmllr --fmllr-update-type=diag --spk2utt=ark:$dir/train$n.spk2utt \
        $dir/$x.mdl "$vtlnfeats" ark,o:- ark:$dir/vtln$n.trans ) \
      2>$dir/vtln_fmllr.$n.log || touch $dir/.error &
  done
  wait
  [ -f $dir/.error ] && echo error computing fMLLR transforms after VTLN && exit 1
  # all the features, with diagonal fMLLR
  vtlnfeats="ark:cat $dir/tmp?.ark | splice-feats ark:- ark:- | transform-feats $mat ark:- ark:- | transform-feats --utt2spk=ark:$dir/train.utt2spk \"ark:cat $dir/vtln?.trans|\" ark:- ark:- |"
  ( ali-to-post "ark:gunzip -c $dir/cur?.ali.gz|" ark:- | \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$vtlnfeats" ark,s,cs:- $dir/$x.acc3 ) 2>$dir/acc_vtlnmdl.log || exit 1;
  # Update model.
  gmm-est $dir/$x.mdl $dir/$x.acc3 $dir/$x.vtlnmdl \
    2>$dir/est_vtlnmdl.log || exit 1;
  rm $dir/$x.acc3
  # Remove a stale link from an earlier run before re-creating it.  (This
  # used to remove final.alimdl instead, so "ln -s" failed on a re-run.)
  rm -f $dir/final.vtlnmdl
  ln -s $x.vtlnmdl $dir/final.vtlnmdl
  rm $dir/tmp?.ark
fi
# The following files may be useful for display purposes: for every iteration
# on which LVTLN transforms were re-estimated, pool the per-job warp files and
# pass them through scripts/process_warps.pl together with the speaker-gender map.
for y in $lvtln_iters; do
  cat $dir/$y.?.warp | scripts/process_warps.pl data/spk2gender.map > $dir/warps.$y
done
# Point final.mdl / final.alimdl at the last iteration's models.  This recipe
# produces no ".et" file: the former "ln -s $numiters_et.et final.et" expanded
# an undefined variable and left a dangling "final.et -> .et" link.  final.et
# stays in the rm list only to clean up such a link from an earlier run.
( cd $dir; rm -f final.{mdl,alimdl,et} 2>/dev/null;
  ln -s $x.mdl final.mdl; ln -s $x.alimdl final.alimdl )

Просмотреть файл

@ -43,7 +43,7 @@ BaseFloat DecodableAmDiagGmmUnmapped::LogLikelihoodZeroBased(
// check if everything is in order
if (pdf.Dim() != data.Dim()) {
KALDI_ERR << "Dim mismatch: data dim = " << data.Dim()
<< "vs. model dim = " << pdf.Dim();
<< " vs. model dim = " << pdf.Dim();
}
if (!pdf.valid_gconsts()) {
KALDI_ERR << "State " << (state) << ": Must call ComputeGconsts() "
@ -96,7 +96,7 @@ BaseFloat DecodableAmDiagGmmRegtreeFmllr::LogLikelihoodZeroBased(int32 frame,
// check if everything is in order
if (pdf.Dim() != data.Dim()) {
KALDI_ERR << "Dim mismatch: data dim = " << data.Dim()
<< "vs. model dim = " << pdf.Dim();
<< " vs. model dim = " << pdf.Dim();
}
if (!pdf.valid_gconsts()) {
KALDI_ERR << "State " << (state) << ": Must call ComputeGconsts() "
@ -252,7 +252,7 @@ BaseFloat DecodableAmDiagGmmRegtreeMllr::LogLikelihoodZeroBased(int32 frame,
// check if everything is in order
if (pdf.Dim() != data.Dim()) {
KALDI_ERR << "Dim mismatch: data dim = " << data.Dim()
<< "vs. model dim = " << pdf.Dim();
<< " vs. model dim = " << pdf.Dim();
}
if (frame != previous_frame_) { // cache the squared stats.