Committing some scripts.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@103 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2011-06-26 05:04:49 +00:00 · 2011-06-26 05:04:49 +00:00 · d7a6d499aa
--- a/egs/wsj/s1/steps/train_sgmm3b.sh
+++ b/egs/wsj/s1/steps/train_sgmm3b.sh
@ -77,6 +77,7 @@ cp $srcdir/topo $dir
 # Note: a small number of utterances don't have graphs at this stage because of differences
 # in how the data splitting is done when we switch to using speaker information.
 echo "Aligning all training data"
 rm -f $dir/.error
@ -89,7 +90,6 @@ done
 wait;
 [ -f $dir/.error ] &&  echo align error RE old system && exit 1
 acc-tree-stats  --ci-phones=$silphonelist $srcmodel "$feats" \
  "ark:gunzip -c $dir/0.?.ali.gz|" $dir/treeacc 2> $dir/acc.tree.log  || exit 1;
@ -166,7 +166,7 @@ while [ $x -lt $numiters ]; do
     rm -f $dir/.error
     for n in 1 2 3; do
       sgmm-align-compiled ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
-            "--gselect=ark:gunzip -c $dir/gselect$n.gz|" \
+            "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
           $scale_opts --beam=8 --retry-beam=40 $dir/$x.mdl \
           "ark:gunzip -c $dir/graphs${n}.fsts.gz|" "${featspart[$n]}" \
           "ark:|gzip -c >$dir/cur${n}.ali.gz" 2> $dir/align.$x.$n.log \
@ -181,9 +181,9 @@ while [ $x -lt $numiters ]; do
      ( ali-to-post "ark:gunzip -c $dir/cur${n}.ali.gz|" ark:- | \
        weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- | \
        sgmm-est-spkvecs --spk2utt=ark:$dir/train$n.spk2utt ${spkvecs_opt[$n]} \
-         "--gselect=ark:gunzip -c $dir/gselect$n.gz|" \
+         "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
          --rand-prune=$randprune $dir/$x.mdl \
-         "${featspart[$n]}" ark:- ark:$dir/tmp$n.vecs && mv $dir/tmp$n.vecs $cur/cur$n.vecs ) \
+         "${featspart[$n]}" ark,s,cs:- ark:$dir/tmp$n.vecs && mv $dir/tmp$n.vecs $dir/cur$n.vecs ) \
                   2>$dir/spkvecs.$x.$n.log \
           || touch $dir/.error &
        spkvecs_opt[$n]="--spk-vecs=ark:$dir/cur$n.vecs"
@ -202,7 +202,7 @@ while [ $x -lt $numiters ]; do
   for n in 1 2 3; do
     sgmm-acc-stats-ali ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
-       --update-flags=$flags "--gselect=ark:gunzip -c $dir/gselect$n.gz|" \
+       --update-flags=$flags "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
       --rand-prune=$randprune --binary=true $dir/$x.mdl "${featspart[$n]}" \
      "ark:gunzip -c $dir/cur$n.ali.gz|" $dir/$x.$n.acc 2> $dir/acc.$x.$n.log \
        || touch $dir/.error &
@ -225,7 +225,7 @@ flags=MwcS
 for n in 1 2 3; do
 ( ali-to-post "ark:gunzip -c $dir/cur$n.ali.gz|" ark:- | \
   sgmm-post-to-gpost ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
-                "--gselect=ark:gunzip -c $dir/gselect$n.gz|" \
+                "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
                 $dir/$x.mdl "${featspart[$n]}" ark,s,cs:- ark:- | \
  sgmm-acc-stats-gpost --update-flags=$flags  $dir/$x.mdl "${featspart[$n]}" \
            ark,s,cs:- $dir/$x.$n.aliacc ) 2> $dir/acc_ali.$x.$n.log || touch $dir/.error &
--- a/egs/wsj/s1/steps/train_sgmm3b2.sh
+++ b/egs/wsj/s1/steps/train_sgmm3b2.sh
@ -0,0 +1,248 @@
 #!/bin/bash
 # Copyright 2010-2011 Microsoft Corporation  Arnab Ghoshal
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #  http://www.apache.org/licenses/LICENSE-2.0
 #
 # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 # MERCHANTABLITY OR NON-INFRINGEMENT.
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 # 3b2 is with dim=50
 # sgmm3b is as sgmm2b (SGMM with speaker vectors), but using all
 # the training data. 
 # Instead of starting from sgmm2b we start from tri3a.  This means we can
 # essentially reuse the train_sgmm2b.sh script, and don't have to do
 # alignment of the model with speaker vectors (which requires multiple
 # passes to do properly and is a bit of a hassle).
 # Source path.sh from the current directory when it exists
 # (presumably it puts the tools used below on PATH -- confirm against path.sh).
 if [ -f path.sh ]; then . path.sh; fi

 # ---- Configuration: output/source locations and the training schedule ----
 dir=exp/sgmm3b2
 srcdir=exp/tri3a # more convenient as has graphs and alignments for this data already.
 ubm=exp/ubm3a/final.ubm # 600 UBM comps
 srcmodel=$srcdir/final.mdl
 scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
 numiters=35 # Total number of iterations.
 realign_iters="5 15 25"; # realign a bit earlier than we did in tri2a,
    # since SGMM system quite different
    # from normal triphone system.
 spkvec_iters="5 8 12 17 22 32" # passes on which speaker vectors are (re-)estimated
 maxiterinc=20 # By this iter, we have all the substates.
 numleaves=6000 # was 4.2k for GMM system: increasing it for SGMM system.
 numsubstates=6000 # initial #-substates
 totsubstates=35000 # a little less than #Gauss for baseline GMM system (40k)
 # Integer division: (35000 - 6000) / 20 = 1450 substates added per pass.
 incsubstates=$(( (totsubstates - numsubstates) / maxiterinc )) # per-iter increment for #substates
 phn_dim=50
 phn_dim_iter=3 # iter to increase phn dim.
 # Contents of data/silphones.csl, passed verbatim to the tools below
 # (e.g. as --ci-phones and as the silence list of weight-silence-post).
 silphonelist=$(cat data/silphones.csl)
 randprune=0.1
 # ---- Data preparation ----
 # Copy the utterance list and transcriptions from the source system, derive
 # the utt2spk/spk2utt maps for exactly those utterances, and split everything
 # into three parts (the split is driven by the utt2spk map).
 mkdir -p $dir
 for f in train.scp train.tra; do
   cp $srcdir/$f $dir
 done

 scripts/filter_scp.pl $dir/train.scp data/train.utt2spk > $dir/train.utt2spk
 scripts/utt2spk_to_spk2utt.pl $dir/train.utt2spk > $dir/train.spk2utt

 # For each kind of list: <full list> is split into parts 1, 2 and 3.
 for ext in scp tra utt2spk; do
   scripts/split_scp.pl --utt2spk=$dir/train.utt2spk \
     $dir/train.$ext $dir/train1.$ext $dir/train2.$ext $dir/train3.$ext
 done

 # Create the spk2utt files; the empty suffix handles the un-split list.
 for part in 1 2 3 ""; do
   scripts/utt2spk_to_spk2utt.pl $dir/train$part.utt2spk > $dir/train$part.spk2utt
 done

 # Feature pipelines (features plus deltas, read through a pipe).
 # "feats" covers all the data, featspart[n] covers part n; the two
 # definitions must be kept in sync.
 feats="ark:add-deltas --print-args=false scp:$dir/train.scp ark:- |"
 for n in 1 2 3; do
   featspart[n]="ark:add-deltas --print-args=false scp:$dir/train${n}.scp ark:- |"
 done

 # The UBM is a prerequisite: give up right away when it is missing.
 [ ! -f $ubm ] && { echo "No UBM in $ubm"; exit 1; }

 cp $srcdir/topo $dir
 # ---- Initial alignment with the source model ----
 # The graphs of the source system are reused, since the data subset is the
 # same as last time.
 # Note: a small number of utterances don't have graphs at this stage because
 # of differences in how the data splitting is done when we switch to using
 # speaker information.
 echo "Aligning all training data"
 rm -f $dir/.error
 # One background job per data part; a failing job leaves the marker file
 # $dir/.error behind, which is checked once all jobs have finished.
 for n in 1 2 3; do
   {
     gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $srcmodel \
       "ark:gunzip -c $srcdir/graphs${n}.fsts.gz|" "${featspart[$n]}" \
       "ark:|gzip -c >$dir/0.${n}.ali.gz" \
       2> $dir/align.0.${n}.log \
       || touch $dir/.error
   } &
 done
 wait
 if [ -f $dir/.error ]; then
   echo align error RE old system && exit 1
 fi

 # Accumulate tree-building statistics over the alignments of all three parts
 # (the "?" is a glob expanded by the shell that runs the gunzip pipe).
 if ! acc-tree-stats  --ci-phones=$silphonelist $srcmodel "$feats" \
        "ark:gunzip -c $dir/0.?.ali.gz|" $dir/treeacc 2> $dir/acc.tree.log; then
   exit 1
 fi
 # ---- Questions and decision tree ----
 # The next few commands are involved with making the questions
 # for tree clustering.  The extra complexity vs. the RM recipe has
 # to do with the desire to ask questions about the "real" phones
 # ignoring things like stress and position-in-word, and ask questions
 # separately about stress and position-in-word.
 # Don't include silences as things to be clustered -> --nosil option.
 # Phone sets, converted from symbols to integers via data/phones.txt.
 scripts/make_shared_phones.sh --nosil | scripts/sym2int.pl data/phones.txt > $dir/phone_sets.list
 # Cluster the phone sets using the tree statistics to obtain questions.
 cluster-phones $dir/treeacc $dir/phone_sets.list $dir/questions.txt 2> $dir/cluster_phones.log || exit 1;
 # Back to symbolic form so the extra questions can be appended as text,
 # then convert the combined list to integers again.
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 scripts/make_extra_questions.sh | cat $dir/questions_syms.txt - > $dir/questions_syms_all.txt
 scripts/sym2int.pl data/phones.txt < $dir/questions_syms_all.txt > $dir/questions_all.txt
 # Compile the questions against the topology into the file build-tree reads.
 compile-questions $dir/topo $dir/questions_all.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;
 # Tree roots, also converted to integer form (--ignore-oov is passed to
 # sym2int.pl here, unlike for the questions above).
 scripts/make_roots.sh > $dir/roots_syms.txt
 scripts/sym2int.pl --ignore-oov data/phones.txt  < $dir/roots_syms.txt > $dir/roots.txt
 # Build the tree with at most $numleaves leaves; diagnostics go to train_tree.log.
 build-tree --verbose=1 --max-leaves=$numleaves \
  $dir/treeacc $dir/roots.txt \
  $dir/questions.qst $dir/topo $dir/tree  2> $dir/train_tree.log || exit 1;
 # ---- Initial SGMM and Gaussian selection ----
 # the sgmm-init program accepts a GMM, so we just create a temporary GMM "0.gmm"
 gmm-init-model  --write-occs=$dir/0.occs  \
    $dir/tree $dir/treeacc $dir/topo $dir/0.gmm 2> $dir/init_gmm.log || exit 1;
 sgmm-init --spk-space-dim=39 $dir/0.gmm $ubm $dir/0.mdl 2> $dir/init_sgmm.log || exit 1;
 # The temporary GMM and the tree statistics are not needed any more.
 rm $dir/0.gmm
 rm $dir/treeacc

 # Gaussian selection for each data part, in parallel, stored gzipped in
 # $dir/gselect<n>.gz.  A failing job leaves the marker file $dir/.error.
 rm -f $dir/.error
 for n in 1 2 3; do
   # pipefail (set only inside this subshell) makes the pipeline fail when
   # sgmm-gselect fails; without it only the exit status of gzip would be
   # seen and a truncated or empty archive would go unnoticed.
   ( set -o pipefail
     sgmm-gselect $dir/0.mdl "${featspart[$n]}" ark,t:- 2>$dir/gselect$n.log \
       | gzip -c > $dir/gselect${n}.gz ) || touch $dir/.error &
 done
 wait
 [ -f $dir/.error ] && echo "Error in gselect phase" && exit 1;
 # ---- Initial alignments for the new model ----
 # Convert the alignments made with the previous model so that they match the
 # new model and tree; they become the current alignments (cur<n>.ali.gz).
 # Done one part after the other on purpose: the work is mostly I/O.
 for n in 1 2 3; do
   if ! convert-ali $srcmodel $dir/0.mdl $dir/tree \
          "ark:gunzip -c $dir/0.$n.ali.gz|" \
          "ark:|gzip -c > $dir/cur$n.ali.gz" \
          2>$dir/convert.$n.log; then
     exit 1
   fi
 done
 # The unconverted alignments are no longer needed.
 rm $dir/0.?.ali.gz

 # ---- Training graphs ----
 # One background job per data part; a failing job leaves the marker file
 # $dir/.error behind, which is checked once all jobs have finished.
 echo "Compiling training graphs"
 rm -f $dir/.error
 for n in 1 2 3; do
   {
     compile-train-graphs $dir/tree $dir/0.mdl  data/L.fst ark:$dir/train${n}.tra \
       "ark:|gzip -c > $dir/graphs${n}.fsts.gz" \
       2>$dir/compile_graphs.${n}.log \
       || touch $dir/.error
   } &
 done
 wait
 if [ -f $dir/.error ]; then
   echo compile-graphs error && exit 1
 fi
 # ---- Main training loop ----
 # x is the zero-based pass number.  Each pass optionally re-aligns the data
 # and/or re-estimates the speaker vectors (according to realign_iters and
 # spkvec_iters), accumulates statistics on the three data parts in parallel
 # and then writes model $((x+1)).mdl from model $x.mdl.
 # Parallel jobs report failure by creating the marker file $dir/.error.
 x=0
 while [ $x -lt $numiters ]; do
   echo "Pass $x"
   if echo $realign_iters | grep -w $x >/dev/null; then
     echo "Aligning data"
     rm -f $dir/.error
     for n in 1 2 3; do
       sgmm-align-compiled ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
            "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
           $scale_opts --beam=8 --retry-beam=40 $dir/$x.mdl \
           "ark:gunzip -c $dir/graphs${n}.fsts.gz|" "${featspart[$n]}" \
           "ark:|gzip -c >$dir/cur${n}.ali.gz" 2> $dir/align.$x.$n.log \
             || touch $dir/.error &
     done
     wait
     [ -f $dir/.error ] && echo error aligning data && exit 1
   fi
   if echo $spkvec_iters | grep -w $x >/dev/null; then
     echo "Computing speaker vectors"
     for n in 1 2 3; do
      # The new vectors are written to tmp<n>.vecs and only renamed to
      # cur<n>.vecs on success, because the previous cur<n>.vecs (if any) is
      # being read through spkvecs_opt while the job runs.
      # The posteriors arrive on stdin as "ark,s,cs:-", the same options the
      # gpost pipeline after the loop uses for its piped input.
      ( ali-to-post "ark:gunzip -c $dir/cur${n}.ali.gz|" ark:- | \
        weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- | \
        sgmm-est-spkvecs --spk2utt=ark:$dir/train$n.spk2utt ${spkvecs_opt[$n]} \
         "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
          --rand-prune=$randprune $dir/$x.mdl \
         "${featspart[$n]}" ark,s,cs:- ark:$dir/tmp$n.vecs && mv $dir/tmp$n.vecs $dir/cur$n.vecs ) \
                   2>$dir/spkvecs.$x.$n.log \
           || touch $dir/.error &
        # Set only after the job above has been launched, so that job still
        # sees the old value; every later command uses the new vectors.
        spkvecs_opt[$n]="--spk-vecs=ark:$dir/cur$n.vecs"
     done
     wait;
     [ -f $dir/.error ] && echo error computing speaker vectors && exit 1
   fi
   # Choose the update flags for this pass: pass 0 has neither M nor N;
   # afterwards N replaces M on the odd values of x above 4.
   if [ $x -eq 0 ]; then
     flags=vwcS
   elif [ $((x % 2)) -eq 1 ] && [ $x -gt 4 ]; then # odd x above 4 (x = 5, 7, ...), i.e. the 6th, 8th, ... pass when counting from one
     flags=vNwcS
   else
     flags=vMwcS
   fi
   # Accumulate statistics on each data part in parallel.
   for n in 1 2 3; do
     sgmm-acc-stats-ali ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
       --update-flags=$flags "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
       --rand-prune=$randprune --binary=true $dir/$x.mdl "${featspart[$n]}" \
      "ark:gunzip -c $dir/cur$n.ali.gz|" $dir/$x.$n.acc 2> $dir/acc.$x.$n.log \
        || touch $dir/.error &
   done
   wait;
   [ -f $dir/.error ] && echo error accumulating stats on iter $x && exit 1
   # On pass phn_dim_iter only, pass --increase-phn-dim=$phn_dim to sgmm-est.
   if [ $x -eq $phn_dim_iter ]; then
     phn_dim_opt=--increase-phn-dim=$phn_dim
   else
     phn_dim_opt=
   fi
   # Sum the per-part accumulators and write the next model and its occupancies.
   sgmm-est $phn_dim_opt --update-flags=$flags --split-substates=$numsubstates \
      --write-occs=$dir/$((x + 1)).occs $dir/$x.mdl "sgmm-sum-accs - $dir/$x.?.acc|" \
      $dir/$((x + 1)).mdl  2> $dir/update.$x.log || exit 1;
   # Best-effort removal of the files of the pass just finished.
   rm $dir/$x.mdl $dir/$x.?.acc $dir/$x.occs 2>/dev/null
   # Raise the substate target by a fixed increment until pass maxiterinc.
   if [ $x -lt $maxiterinc ]; then
     numsubstates=$((numsubstates + incsubstates))
   fi
   x=$((x + 1))
 done
 # ---- Final model ----
 # Point final.mdl / final.occs at the files of the last pass (x now equals
 # the number of passes done).  Any earlier links are removed first.
 (
   cd $dir
   rm final.mdl final.occs 2>/dev/null
   ln -s $x.mdl final.mdl
   ln -s $x.occs final.occs
 )

 # ---- "Alignment model" ----
 # Accumulate statistics from Gaussian-level posteriors on each data part in
 # parallel, then estimate $x.alimdl with --remove-speaker-space=true.
 # A failing job leaves the marker file $dir/.error behind.
 flags=MwcS
 for n in 1 2 3; do
   {
     ( ali-to-post "ark:gunzip -c $dir/cur$n.ali.gz|" ark:- \
         | sgmm-post-to-gpost ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
             "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
             $dir/$x.mdl "${featspart[$n]}" ark,s,cs:- ark:- \
         | sgmm-acc-stats-gpost --update-flags=$flags  $dir/$x.mdl "${featspart[$n]}" \
             ark,s,cs:- $dir/$x.$n.aliacc ) 2> $dir/acc_ali.$x.$n.log \
       || touch $dir/.error
   } &
 done
 wait
 if [ -f $dir/.error ]; then
   echo error accumulating stats for alignment model && exit 1
 fi

 if ! sgmm-est --update-flags=$flags --remove-speaker-space=true $dir/$x.mdl \
        "sgmm-sum-accs - $dir/$x.?.aliacc|" $dir/$x.alimdl 2>$dir/update_ali.$x.log; then
   exit 1
 fi
 rm $dir/$x.?.aliacc

 # Point final.alimdl at the alignment model just written.
 (
   cd $dir
   rm final.alimdl 2>/dev/null
   ln -s $x.alimdl final.alimdl
 )