Mirror of https://github.com/mozilla/kaldi.git
trunk: copying from sandbox/oplatek2, the vystadial_en and vystadial_cz recipes
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3809 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Parent
01c4041da7
Commit
528345f30c
@ -0,0 +1,168 @@
Summary
-------
The data comprise over 15 hours of speech in Czech.

The Czech recordings were collected in three ways:

1. using a free Call Friend phone service
2. using the Repeat After Me speech data collecting process
3. from telephone interactions with the PublicTransportInfo Spoken Dialog System (SDS)
   Alex: http://ufal.ms.mff.cuni.cz/alex-dialogue-systems-framework/.

The data collection process is described in detail
in the article "Free English and Czech telephone speech corpus shared under the CC-BY-SA 3.0 license",
published for LREC 2014 (To Appear).

WE USE COMMON KALDI DECODERS IN THE SCRIPTS (gmm-latgen-faster through steps/decode.sh).
However, the main purpose of providing the data and scripts
is training acoustic models for the real-time speech recognition unit
of the dialogue system ALEX, which uses the modified real-time Kaldi OnlineLatgenRecogniser.
The modified Kaldi decoders are NOT required for running the scripts!

The modified OnlineLatgenRecogniser is actively developed at
https://github.com/UFAL-DSG/pykaldi/tree/master/src/onl-rec
and has a Python wrapper:
https://github.com/UFAL-DSG/pykaldi/tree/master/src/pykaldi
Note that I am currently moving the online recogniser to:
http://sourceforge.net/p/kaldi/code/HEAD/tree/sandbox/oplatek2/

Credits and license
-------------------
The scripts are partially based on the Voxforge KALDI recipe.
The original scripts as well as these scripts are licensed under the Apache 2.0 license.
The data are distributed under the Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0) license.
Czech data: https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-4670-6
English data: https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-4671-4

The data collecting process and the development of these training scripts
were partly funded by the Ministry of Education, Youth and Sports
of the Czech Republic under the grant agreement LK11221
and by the core research funding of Charles University in Prague.
For citing, please use the following BibTeX citation:
@inproceedings{korvas_2014,
  title={{Free English and Czech telephone speech corpus shared under the CC-BY-SA 3.0 license}},
  author={Korvas, Mat\v{e}j and Pl\'{a}tek, Ond\v{r}ej and Du\v{s}ek, Ond\v{r}ej and \v{Z}ilka, Luk\'{a}\v{s} and Jur\v{c}\'{i}\v{c}ek, Filip},
  booktitle={Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2014)},
  pages={To Appear},
  year={2014},
}

Expected results
----------------
The expected results were obtained simply by running
bash train_voip_cs.sh OR bash train_voip_en.sh.
Note that you need SRILM installed on your PATH or in the kaldi/tools/ directory!

build2 - bigram LM built from the training data, estimated by the scripts using SRILM
build0 - zerogram LM built from the test data, estimated by the scripts using Python code
LMW - language model weight; we picked the best value from (min_lmw, max_lmw) based on decoding results on the DEV set

Full Czech data:

===============  ====  ======  ===  =====  =====
exp              set   LM      LMW  WER    SER
===============  ====  ======  ===  =====  =====
mono             test  build0    6  86.10  89.66
tri1             test  build0    8  70.84  82.90
tri2a            test  build0    8  70.86  83.01
tri2b            test  build0    9  68.13  80.89
tri2b_mmi        test  build0    9  67.61  79.53
tri2b_mmi_b0.05  test  build0    8  66.18  78.72
tri2b_mpe        test  build0    9  64.93  77.66
mono             test  build2    8  72.30  79.02
tri1             test  build2   11  55.57  72.11
tri2a            test  build2   11  55.12  70.90
tri2b            test  build2   12  52.95  70.70
tri2b_mmi        test  build2   10  50.42  68.38
tri2b_mmi_b0.05  test  build2   10  49.96  68.58
tri2b_mpe        test  build2   12  49.87  66.97
===============  ====  ======  ===  =====  =====

Note that the zerogram LMs give the discriminative training
a significant advantage, because they are estimated on the test set!

Details
-------
* Requires a Kaldi installation and a Linux environment. (Tested on Ubuntu 10.04, 12.04 and 12.10.)
* The config file s5/env_voip_cs.sh sets the data directory,
  the mfcc directory and the experiments directory.
* Our scripts prepare the data to the expected format in s5/data.
* Experiment files are stored to the $exp directory, e.g. s5/exp.
* The local directory contains scripts for data preparation and for preparing
  the lang directory.
* path.sh, cmd.sh and common/* contain configurations for the
  recipe.
* The language model (LM) is either built from the training data using
  [SRILM](http://www.speech.sri.com/projects/srilm/), or we supply one in
  the ARPA format.


Running experiments
-------------------
Before running the experiments, check that:

* you have the Kaldi toolkit compiled:
  http://sourceforge.net/projects/kaldi/.
* you have SRILM compiled. (This is needed for building a language model
  unless you supply your own LM in the ARPA format.)
  See http://www.speech.sri.com/projects/srilm/.
* the number of jobs njobs is set correctly in path.sh.
* in cmd.sh, you switched to running the training on an SGE[*] grid if
  required (disabled by default).

Start the recipe from the s5 directory by running
bash run.sh.
It will create the s5/mfcc, s5/data and s5/exp directories.
If any of them exists, it will ask you whether you want them to be overwritten.

.. [*] Sun Grid Engine

Extracting the results and trained models
-----------------------------------------
The main script, s5/run.sh,
performs not only the training of the acoustic models, but also the decoding.
The acoustic models are evaluated after running the training, and
reports are printed to the standard output.

The s5/local/results.py exp command extracts the results from the $exp directory
and stores them to exp/results.log.

If you want to use the trained acoustic model with your language model
outside the prepared scripts, you need to build the HCLG decoding graph yourself.
See http://kaldi.sourceforge.net/graph.html for a general introduction to the FST
framework in Kaldi.

The simplest way to start decoding is to use the same LM which
was used by the s5/run.sh script.
Let's say you want to decode with
the acoustic model stored in exp/tri2b_bmmi;
then you need the files listed below:

================================= =====================================================================================
mfcc.conf                         Speech parametrisation (MFCC) settings. Training and decoding setup must match.
exp/tri2b_bmmi/graph/HCLG.fst     Decoding graph. Graph part of the AM plus lexicon, phone->triphone & LM representation.
exp/tri2b_bmmi/graph/words.txt    Word symbol table, a mapping between words and the integers which are decoded.
exp/tri2b_bmmi/graph/silence.csl  List of integer phone ids which represent silent phones.
exp/tri2b_bmmi/final.mdl          Trained acoustic model (AM).
exp/tri2b_bmmi/final.mat          Trained matrix of feature/space transformations (e.g. LDA and bMMI).
================================= =====================================================================================

We recommend studying the standard Kaldi script steps/decode.sh
for standalone decoding with the gmm-latgen-faster Kaldi decoder.
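
For illustration, a standalone decoding command might look as follows (a sketch
distilled from run_gmm-latgen-faster.sh in the online_demo directory; the beam
values and the $feats feature pipeline are placeholders you must adapt)::

    gmm-latgen-faster --beam=16.0 --lattice-beam=10.0 --max-active=14000 \
        --allow-partial=true --word-symbol-table=exp/tri2b_bmmi/graph/words.txt \
        exp/tri2b_bmmi/final.mdl exp/tri2b_bmmi/graph/HCLG.fst \
        "$feats" "ark:|gzip -c > lat.gz"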

In order to build your own decoding graph HCLG,
you need an LM in the ARPA format and the files in the table below.

* Note 1: Building the HCLG decoding graph is out of the scope of this README.
* Note 2: Each acoustic model needs a corresponding HCLG graph.
* Note 3: The phonetic dictionary applied to the vocabulary
  should always generate only a subset of the phones seen in the training data!

=============================== =========================================================================
LM.arpa                         Language model in ARPA format [You should supply it]
vocabulary.txt                  List of words you want to decode [You should supply it]
OOV_SYMBOL                      String representing an out-of-vocabulary word. [You should supply it]
dictionary.txt                  Phonetic dictionary. [You should supply it]
exp/tri2b_bmmi/final.mdl        Trained acoustic model (AM).
exp/tri2b_bmmi/final.tree       Phonetic decision tree.
=============================== =========================================================================
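
For reference, Kaldi recipes typically compile such a graph with the standard
helper utils/mkgraph.sh (a hedged sketch; the lang directory name below is an
assumption based on this recipe's naming scheme)::

    utils/mkgraph.sh data/lang_test_build2 exp/tri2b_bmmi exp/tri2b_bmmi/graph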
@ -0,0 +1,37 @@
###############################
#   Example commands for gdb  #
###############################
# layout src                                    # open TUI mode viewing the source code
# file ./online-python-gmm-decode-faster-test   # we want to debug this program
# directory ../feat                             # add a directory to the search path for source files
# b online-python-gmm-decode-faster-test.cc:80  # set a breakpoint
# set args --rt-min=0.5 --rt-max=0.7            # pass arguments to the program
# shell ls                                      # execute the shell command ls

#############################
#  Examples how to run gdb  #
#############################
# gdb -q -iex "set auto-load safe-path ." .gdbinit
# gdb -q -x .gdbinit

#######################
#  Useful shortcuts   #
#######################
# Ctrl+x Ctrl+a ... toggles the TUI on and off from the gdb prompt
# In TUI mode: Ctrl+n resp. Ctrl+p ... next resp. previous line in history

###########
#  Links  #
###########
# Martin Jiricka's slides in Czech:
# http://www.ms.mff.cuni.cz/~jirim7am/data/gdb/gdb.pdf (or in my Calibre library)

directory ../../../src/onl-rec
directory ../../../src/decoder
directory ../../../src/feat

# extractor->Compute
b onl-rec-audio-source.cc:49
b onl-rec-audio-source.cc:31

run
@ -0,0 +1,64 @@
BEST_LINE=18
MODEL_PREFIX_URL=http://vystadial.ms.mff.cuni.cz/download/kaldi/src/pykaldi/pykaldi/binutils/
DATA_PREFIX_URL=http://vystadial.ms.mff.cuni.cz/download/kaldi/src/pykaldi/pykaldi/binutils/

# Czech language models
LANG=cs
HCLG=models/HCLG_tri2b_bmmi.fst
AM=models/tri2b_bmmi.mdl
MAT=models/tri2b_bmmi.mat  # matrix trained in the tri2b models
WST=models/words.txt
MFCC=models/mfcc.conf
SILENCE=models/silence.csl

$(MFCC):
	wget $(MODEL_PREFIX_URL)/$@ -O $@

$(SILENCE):
	wget $(MODEL_PREFIX_URL)/$@ -O $@

$(HCLG):
	wget $(MODEL_PREFIX_URL)/$@ -O $@

$(AM):
	wget $(MODEL_PREFIX_URL)/$@ -O $@

$(MAT):
	wget $(MODEL_PREFIX_URL)/$@ -O $@

$(WST):
	wget $(MODEL_PREFIX_URL)/$@ -O $@

data/vystadial-sample-$(LANG).tar.gz:
	wget $(DATA_PREFIX_URL)/$@ -O $@

data/vystadial-sample-$(LANG)/README.rst: data/vystadial-sample-$(LANG).tar.gz
	tar xf $< -C data
	touch $@

data/vystadial-sample-$(LANG)/test/input.scp: data/vystadial-sample-$(LANG)/README.rst
	for f in `dirname $@`/*.wav ; do echo "`basename $$f`" "`pwd`/$$f" ; done > $@

data/vystadial-sample-$(LANG)/test/input_best.scp: data/vystadial-sample-$(LANG)/test/input.scp
	sed -n '$(BEST_LINE)p' < $< > $@

build_scp: data/vystadial-sample-$(LANG)/test/input_best.scp data/vystadial-sample-$(LANG)/test/input.scp

download_models: $(SILENCE) $(WST) $(HCLG) $(AM) $(MAT) $(MFCC)

pyonline-recogniser: build_scp download_models
	bash run_pyonline-latgen-recogniser.sh

online-recogniser: build_scp download_models
	bash run_online-latgen-recogniser.sh

gmm-latgen-faster: build_scp download_models
	bash run_gmm-latgen-faster.sh

live: download_models
	bash run_live-demo.sh

clean:
	rm -rf *.pyc data/* models/* decode/*

.PHONY: live pyonline-recogniser online-recogniser gmm-latgen-faster clean
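
# Typical usage (a short sketch; all targets are defined above):
#   make download_models    # fetch the pretrained models from our server
#   make build_scp          # download the sample data and prepare the scp lists
#   make gmm-latgen-faster  # decode with the reference Kaldi decoder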
@ -0,0 +1,80 @@
Running the example Pykaldi scripts
===================================

Summary
-------
The demo presents three new Kaldi features on pretrained Czech AMs:

* Online Lattice Recogniser. The best results were obtained using MFCC, LDA+MLLT and bMMI.
* A Python wrapper which interfaces the OnlineLatticeRecogniser to Python.
* Training scripts which can be used with the standard Kaldi tools or with the new OnlineLatticeRecogniser.

The pykaldi-latgen-faster-decoder.py script
demonstrates how to use the class PyOnlineLatgenRecogniser,
which takes audio on its input and outputs the decoded lattice.
There are also demos of the OnlineLatgenRecogniser C++ class and of the standard Kaldi gmm-latgen-faster.
All three demos produce the same results.

TODO: Publish the English AM and add an English demo.

In March 2014, the PyOnlineLatgenRecogniser was evaluated on the domain of the SDS Alex.
See the graphs evaluating OnlineLatticeRecogniser performance at
http://nbviewer.ipython.org/github/oplatek/pykaldi-eval/blob/master/Pykaldi-evaluation.ipynb.

An example posterior word lattice output for one Czech utterance can be seen at
http://oplatek.blogspot.it/2014/02/ipython-demo-pykaldi-decoders-on-short.html

Dependencies
------------
* Build (make) and test (make test) the code under kaldi/src, kaldi/src/pykaldi and kaldi/src/onl-rec.
* For inspecting the saved lattices you need the dot binary
  from the Graphviz library <http://www.graphviz.org/>.
* For running the live demo you need the pyaudio package.

Running the example scripts
---------------------------

make online-recogniser

* Runs the test src/onl-rec/onl-rec-latgen-recogniser-test for OnlineLatgenRecogniser,
  which shows a C++ example of how to use the recogniser.
  The same data, AM and LM are used as for make pyonline-recogniser.
  The pretrained language (LM) and acoustic (AM) models are used.
  The data as well as the models are downloaded from our server.

make pyonline-recogniser

* Runs the decoding with PyOnlineLatgenRecogniser.
  The example Python script pykaldi-online-latgen-recogniser.py shows
  PyOnlineLatgenRecogniser decoding on a small test set.
  The same pretrained language (LM) and acoustic (AM) models are used.

make gmm-latgen-faster

* Runs the decoding with the Kaldi gmm-latgen-faster executable wrapped in `<run_gmm-latgen-faster.sh>`_.
  The same data, AM and LM are used as for make pyonline-recogniser.
  We use this script as the reference.

make live

* The simple live demo should decode speech from your microphone.
  It uses the pretrained AM and LM and wraps `<live-demo.py>`_.
  The pyaudio package is used for capturing the sound from your microphone.
  We were able to use it under `Ubuntu 12.10` and Python 2.7, but we guarantee nothing on your system.

Notes
-----
The scripts for Czech and English support acoustic models obtained using MFCC, LDA+MLLT/delta+delta-delta feature transformations and acoustic models trained generatively or by MPE or bMMI training.

The new functionality is separated into different directories:

* kaldi/src/onl-rec stores the C++ code for OnlineLatticeRecogniser.
* kaldi/src/pykaldi stores the Python wrapper PyOnlineLatticeRecogniser.
* kaldi/egs/vystadial/s5 stores the training scripts.
* kaldi/egs/vystadial/online_demo shows the Kaldi standard decoder, OnlineLatticeRecogniser and PyOnlineLatticeRecogniser, which produce the exact same lattices using the same setup.

The OnlineLatticeRecogniser is used in the Alex dialogue system (https://github.com/UFAL-DSG/alex).
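
For orientation, the core decoding loop shared by the demos boils down to the
following condensed sketch of pykaldi-online-latgen-recogniser.py (not a
standalone program: argv and audio_chunk are assumed to be prepared as in that
script)::

    from kaldi.decoders import PyOnlineLatgenRecogniser
    from kaldi.utils import lattice_to_nbest

    d = PyOnlineLatgenRecogniser()
    d.setup(argv)                       # Kaldi-style options: AM, HCLG, beams, ...
    d.frame_in(audio_chunk)             # feed raw 16-bit PCM as it arrives
    while d.decode(max_frames=10) > 0:  # interleave decoding with audio capture
        pass
    d.prune_final()
    lik, lat = d.get_lattice()          # word posterior lattice
    nbest = lattice_to_nbest(lat, n=10) # (weight, word-id path) pairs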
@ -0,0 +1 @@
from __future__ import unicode_literals
@ -0,0 +1,54 @@
#!/usr/bin/env python
# encoding: utf-8
from __future__ import unicode_literals

import glob
import sys
import os
import codecs


def build_reference(wav_scp, ref_path):
    print wav_scp, ref_path
    with codecs.open(ref_path, 'w', 'utf-8') as w:
        with codecs.open(wav_scp, 'r', 'utf-8') as scp:
            for line in scp:
                name, wavpath = line.strip().split(' ', 1)
                with codecs.open(wavpath + '.trn', 'r', 'utf-8') as trn:
                    trans = trn.read().strip()
                w.write(u'%s %s\n' % (name, trans))


if __name__ == '__main__':
    usage = '''
    Usage: python %(exec)s (audio_directory|in.scp) decode_directory

    The audio directory contains "*.scp" files,
    audio files "*.wav" and their transcriptions "*.wav.trn".
    Each "*.scp" file contains a list of wav names and their paths.

    %(exec)s looks for the "*.scp" files and builds a reference from "*.wav.trn".
    '''
    usage_args = {'exec': sys.argv[0]}

    if len(sys.argv) != 3:
        print >> sys.stderr, "Wrong number of arguments"
        print >> sys.stderr, usage % usage_args
        sys.exit(1)

    if sys.argv[1].endswith('scp'):
        scps = [sys.argv[1]]
    else:
        scps = glob.glob(os.path.join(sys.argv[1], '*.scp'))
    target_dir = sys.argv[2]
    if not len(scps):
        print >> sys.stderr, "No '*.scp' files found"
        print >> sys.stderr, usage % usage_args
        sys.exit(1)
    if not os.path.isdir(target_dir):
        print >> sys.stderr, "Decode directory '%s' does not exist" % target_dir
        print >> sys.stderr, usage % usage_args
        sys.exit(1)

    refers = [os.path.join(target_dir, os.path.basename(scp) + '.tra') for scp in scps]
    for scp, refer in zip(scps, refers):
        build_reference(scp, refer)
@ -0,0 +1 @@
*

@ -0,0 +1 @@
*
@ -0,0 +1,8 @@
#!/bin/bash

# source the settings
. path.sh

for n in `cut -d' ' -f1 $wav_scp` ; do
    utils/show_lattice.sh --mode save --format svg $n $lattice $wst
done
@ -0,0 +1,135 @@
#!/usr/bin/env python
# encoding: utf-8
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
from __future__ import unicode_literals

import pyaudio
from kaldi.decoders import PyOnlineLatgenRecogniser
from kaldi.utils import wst2dict, lattice_to_nbest
import sys
import time
import select
import tty
import termios
import wave

CHANNELS, RATE, FORMAT = 1, 16000, pyaudio.paInt16


class LiveDemo:

    def __init__(self, audio_batch_size, wst, dec_args):
        self.batch_size = audio_batch_size
        self.wst = wst
        self.args = dec_args
        self.d = PyOnlineLatgenRecogniser()
        self.pin, self.stream = None, None
        self.frames = []
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False

    def setup(self):
        self.d.reset()
        self.d.setup(self.args)
        self.pin = pyaudio.PyAudio()
        self.stream = self.pin.open(format=FORMAT, channels=CHANNELS,
                                    rate=RATE, input=True,
                                    frames_per_buffer=self.batch_size,
                                    stream_callback=self.get_audio_callback())
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False
        self.frames = []

    def tear_down(self):
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
        if self.pin is not None:
            self.pin.terminate()
        self.pin, self.stream = None, None
        self.frames = []

    def get_audio_callback(self):
        def frame_in(in_data, frame_count, time_info, status):
            self.d.frame_in(in_data)
            self.frames.append(in_data)
            return in_data, pyaudio.paContinue
        return frame_in

    def _user_control(self):
        '''Simple (and admittedly crude) way to control the state of the recogniser.'''
        self.utt_end, self.dialog_end = False, False
        old_settings = termios.tcgetattr(sys.stdin)
        try:
            tty.setcbreak(sys.stdin.fileno())
            # while there is data on the input
            while (select.select([sys.stdin], [], [], 1) == ([sys.stdin], [], [])):
                c = sys.stdin.read(1)
                if c == 'u':
                    print('\nMarked end of utterance\n')
                    self.utt_end = True
                elif c == 'c':
                    self.dialog_end = True
                    print('\nMarked end of dialogue\n')
        finally:
            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        print("""Chunks: %d; utterance frames: %d; utterance end: %d. Press 'u' to mark the end of an utterance, 'c' to terminate.\n""" % (
            len(self.frames), self.utt_frames, self.utt_end))

    def run(self):
        while True:
            time.sleep(0.1)
            self._user_control()
            new_frames = self.d.decode(max_frames=10)
            while new_frames > 0:
                self.utt_frames += new_frames
                new_frames = self.d.decode(max_frames=10)
            if self.utt_end or self.dialog_end:
                start = time.time()
                self.d.prune_final()
                prob, lat = self.d.get_lattice()
                # lat.write('live-demo-recorded.fst')
                nbest = lattice_to_nbest(lat, n=10)
                if nbest:
                    best_prob, best_path = nbest[0]
                    decoded = ' '.join([self.wst[w] for w in best_path])
                else:
                    decoded = 'Empty hypothesis'
                print("%s secs, frames: %d, prob: %f, %s " % (
                    str(time.time() - start), self.utt_frames, prob, decoded))
                self.utt_frames = 0
                self.d.reset(keep_buffer_data=False)
            if self.dialog_end:
                self.save_wav()
                break

    def save_wav(self):
        wf = wave.open('live-demo-record.wav', 'wb')
        wf.setnchannels(CHANNELS)
        wf.setframerate(RATE)
        wf.setsampwidth(self.pin.get_sample_size(FORMAT))
        wf.writeframes(b''.join(self.frames))
        wf.close()


if __name__ == '__main__':
    audio_batch_size, wst_path = int(sys.argv[1]), sys.argv[2]
    argv = sys.argv[3:]
    print >> sys.stderr, 'Python args: %s' % str(sys.argv)

    wst = wst2dict(wst_path)
    demo = LiveDemo(audio_batch_size, wst, argv)
    demo.setup()
    demo.run()
@ -0,0 +1 @@
*
@ -0,0 +1,41 @@
#!/bin/bash

# data location
PWD=`pwd`
exp_dir=$PWD
data_dir=$PWD/data/vystadial-sample-cs/test
decode_dir=$exp_dir/decode

# IO parameters
wav_scp=$data_dir/input_best.scp
# wav_scp=$data_dir/input.scp

gmm_latgen_faster_tra=$decode_dir/gmm-latgen-faster.tra
gmm_latgen_faster_tra_txt=${gmm_latgen_faster_tra}.txt

pykaldi_latgen_tra=$decode_dir/pykaldi-latgen.tra
pykaldi_latgen_tra_txt=${pykaldi_latgen_tra}.txt
lattice=$decode_dir/lat.gz

# Czech language models
LANG=cs
HCLG=models/HCLG_tri2b_bmmi.fst
AM=models/tri2b_bmmi.mdl
MAT=models/tri2b_bmmi.mat  # transformation matrix trained with the tri2b models
WST=models/words.txt
MFCC=models/mfcc.conf
SILENCE=models/silence.csl

kaldisrc=`pwd`/../../../src
openfst=`pwd`/../../../tools/openfst/

export PATH=$kaldisrc/bin:$kaldisrc/fgmmbin:$kaldisrc/gmmbin:$kaldisrc/nnetbin:$kaldisrc/sgmm2bin:$kaldisrc/tiedbin:$kaldisrc/featbin:$kaldisrc/fstbin:$kaldisrc/latbin:$kaldisrc/onlinebin:$kaldisrc/sgmmbin:$kaldisrc/onl-rec:$openfst/bin:"$PATH"
export LD_LIBRARY_PATH=$kaldisrc/onl-rec:$kaldisrc/pykaldi/kaldi:$openfst/lib:$openfst/lib/fst:$LD_LIBRARY_PATH
export PYTHONPATH=$kaldisrc/pykaldi:$kaldisrc/pykaldi/pyfst:$PYTHONPATH

beam=16.0
latbeam=10.0
max_active=14000

# Size of the audio chunks queued in the "online" interface
batch_size=4560
@ -0,0 +1,100 @@
#!/usr/bin/env python
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
from __future__ import unicode_literals

from kaldi.utils import load_wav, wst2dict, lattice_to_nbest
from kaldi.decoders import PyOnlineLatgenRecogniser
import sys
import fst
import time

# DEBUG = True
DEBUG = False


def write_decoded(f, wav_name, word_ids, wst):
    assert(len(word_ids) > 0)
    best_weight, best_path = word_ids[0]
    if wst is not None:
        decoded = [wst[w] for w in best_path]
    else:
        decoded = [unicode(w) for w in best_path]
    line = u' '.join([wav_name] + decoded + ['\n'])
    if DEBUG:
        print '%s best path %s' % (wav_name, ' '.join(decoded).encode('UTF-8'))
        for i, s in enumerate(word_ids):
            if i > 0:
                break
            print 'best path %d: %s' % (i, str(s))
    f.write(line.encode('UTF-8'))


# @profile
def decode(d, pcm):
    frame_len = (2 * audio_batch_size)  # 16-bit audio, so 1 sample = 2 bytes
    i, decoded_frames, max_end = 0, 0, len(pcm)
    start = time.time()
    while i * frame_len < len(pcm):
        i, begin, end = i + 1, i * frame_len, min(max_end, (i + 1) * frame_len)
        audio_chunk = pcm[begin:end]
        d.frame_in(audio_chunk)
        dec_t = d.decode(max_frames=10)
        while dec_t > 0:
            decoded_frames += dec_t
            dec_t = d.decode(max_frames=10)
    print "forward decode: %s secs" % str(time.time() - start)
    start = time.time()
    d.prune_final()
    lik, lat = d.get_lattice()
    print "backward decode: %s secs" % str(time.time() - start)
    d.reset(keep_buffer_data=False)
    return (lat, lik, decoded_frames)


def decode_wrap(argv, audio_batch_size, wav_paths,
                file_output, wst_path=None):
    wst = wst2dict(wst_path)
    d = PyOnlineLatgenRecogniser()
    d.setup(argv)
    for wav_name, wav_path in wav_paths:
        sw, sr = 2, 16000  # 16-bit audio, so 1 sample width = 2 bytes
        pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr)
        print '%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr)
        lat, lik, decoded_frames = decode(d, pcm)
        lat.isyms = lat.osyms = fst.read_symbols_text(wst_path)
        if DEBUG:
            with open('pykaldi_%s.svg' % wav_name, 'w') as f:
                f.write(lat._repr_svg_())
            lat.write('%s_pykaldi.fst' % wav_name)

        print "Log-likelihood per frame for utterance %s is %f over %d frames" % (
            wav_name, (lik / decoded_frames), decoded_frames)
        word_ids = lattice_to_nbest(lat, n=10)
        write_decoded(file_output, wav_name, word_ids, wst)


if __name__ == '__main__':
    audio_scp, audio_batch_size = sys.argv[1], int(sys.argv[2])
    dec_hypo, wst_path = sys.argv[3], sys.argv[4]
    argv = sys.argv[5:]
    print >> sys.stderr, 'Python args: %s' % str(sys.argv)

    # open audio_scp, decode, and write hypotheses to the dec_hypo file
    with open(audio_scp, 'rb') as r:
        with open(dec_hypo, 'wb') as w:
            lines = r.readlines()
            scp = [tuple(line.strip().split(' ', 1)) for line in lines]
            decode_wrap(argv, audio_batch_size, scp, w, wst_path)
@ -0,0 +1,55 @@
#!/bin/bash

# source the settings
. path.sh

. utils/parse_options.sh || exit 1

# temporary files
mfccdir=$decode_dir/mfcc
feat_scp=$mfccdir/raw_mfcc.scp

mkdir -p $mfccdir

compute-mfcc-feats --verbose=0 --config=$MFCC scp:$wav_scp \
    ark,scp:$mfccdir/raw_mfcc.ark,$feat_scp || exit 1;

# # For debugging
# # cgdb -q -x .gdbinit --args \
# compute-mfcc-feats --verbose=0 --config=$MFCC scp:$wav_scp \
#     ark,t,scp:$mfccdir/raw_mfcc.ark.txt,${feat_scp}.txt || exit 1;
# # For debugging
# add-deltas "scp,s,cs:$feat_scp" "ark,t:$mfccdir/dd_mfcc.ark.txt"

# Note: splice-feats uses the default left=right=4 context
if [ -z $MAT ] ; then
    # no LDA matrix -> use delta-delta features
    feats="ark,s,cs:copy-feats scp:$feat_scp ark:- | add-deltas ark:- ark:- |"
else
    # LDA matrix specified -> use it
    feats="ark,s,cs:copy-feats scp:$feat_scp ark:- | splice-feats ark:- ark:- | transform-feats $MAT ark:- ark:- |"
fi

gmm-latgen-faster --verbose=0 --max-mem=500000000 \
    --beam=$beam --lattice-beam=$latbeam --max-active=$max_active \
    --allow-partial=true --word-symbol-table=$WST \
    $AM $HCLG "$feats" \
    "ark:|gzip -c > $lattice"

lattice-best-path --verbose=0 --lm-scale=15 --word-symbol-table=$WST \
    "ark:gunzip -c $lattice|" ark,t:$gmm_latgen_faster_tra || exit 1;

cat $gmm_latgen_faster_tra | utils/int2sym.pl -f 2- $WST \
    > $gmm_latgen_faster_tra_txt || exit 1

# the reference is named based on wav_scp
./build_reference.py $wav_scp $decode_dir
reference=$decode_dir/`basename $wav_scp`.tra

echo; echo "Reference"; echo
cat $reference
echo; echo "Decoded"; echo
cat $gmm_latgen_faster_tra_txt

compute-wer --text --mode=present ark:$reference ark,p:$gmm_latgen_faster_tra_txt
@ -0,0 +1,16 @@
#!/bin/bash

# source the settings
. path.sh

batch_size=4560
beam=12.0
latbeam=6.0
max_active=2000

# cgdb -q -x .gdbinit_faster --args python \
python \
    live-demo.py $batch_size $WST \
    --verbose=0 --lat-lm-scale=15 --config=$MFCC \
    --beam=$beam --lattice-beam=$latbeam --max-active=$max_active \
    $AM $HCLG `cat $SILENCE` $MAT
@ -0,0 +1,18 @@
#!/bin/bash

# source the settings
. path.sh

. utils/parse_options.sh || exit 1


# cgdb -q -x .gdbinit_latgen --args \

wav_name=./data/vystadial-sample-cs/test/vad-2013-06-08-22-50-01.897179.wav
onl-rec-latgen-recogniser-test $wav_name \
    --verbose=0 --max-mem=500000000 --lat-lm-scale=15 --config=$MFCC \
    --beam=$beam --lattice-beam=$latbeam --max-active=$max_active \
    $AM $HCLG `cat $SILENCE` $MAT

echo; echo "Converting the lattice to the svg picture ${wav_name}.svg" ; echo
fstdraw --portrait=true --osymbols=$WST ${wav_name}.fst | dot -Tsvg > ${wav_name}.svg
@ -0,0 +1,37 @@
#!/bin/bash

# source the settings
. path.sh

. utils/parse_options.sh || exit 1

logname=b${beam}_lb${latbeam}_ma${max_active}_bs${batch_size}

# Below, there are various command prefixes for debugging and profiling.
# Uncomment the prefix convenient for you and put it just before the arguments.
#
# cgdb -q -x .gdbinit_latgen --args python \
# valgrind --tool=callgrind -v --dump-instr=yes --trace-jump=yes --callgrind-out-file=callgrind_${logname}.log python \
# kernprof.py -o kernprof_${logname}.log -l -v \
python \
    pykaldi-online-latgen-recogniser.py $wav_scp $batch_size $pykaldi_latgen_tra $WST \
    --verbose=0 --max-mem=500000000 --lat-lm-scale=15 --config=$MFCC \
    --beam=$beam --lattice-beam=$latbeam --max-active=$max_active \
    $AM $HCLG `cat $SILENCE` $MAT

# If using callgrind, display the results by running kcachegrind:
# kcachegrind callgrind_${logname}.log
# If using kernprof.py, add @profile decorators
# to the functions which should be profiled.

# the reference is named based on wav_scp
./build_reference.py $wav_scp $decode_dir
reference=$decode_dir/`basename $wav_scp`.tra

echo; echo "Reference"; echo
cat $reference
echo; echo "Decoded"; echo
cat $pykaldi_latgen_tra
echo

compute-wer --text --mode=present ark:$reference ark,p:$pykaldi_latgen_tra
@ -0,0 +1 @@
../../wsj/s5/utils
@ -0,0 +1,22 @@
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of CPUs on your machine).

#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
# export train_cmd="queue.pl -l mf=5g"
# export decode_cmd="queue.pl -l mf=5g"
export train_cmd="queue.pl -l arch=*64*"
export decode_cmd="queue.pl -l arch=*64*"

# The number of parallel jobs to be started for some parts of the recipe.
# Make sure you have enough resources (CPUs and RAM) to accommodate this number of jobs.
njobs=20

# If you have no GridEngine you can do:
#export train_cmd=run.pl
#export decode_cmd=run.pl
#njobs=2
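
# For orientation, the recipe scripts consume these variables in the usual
# Kaldi convention, e.g. (a sketch; the actual steps/* calls live in run.sh):
#   steps/train_mono.sh --cmd "$train_cmd" --nj $njobs data/train data/lang exp/mono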
@ -0,0 +1,3 @@
beam=12.0
latbeam=6.0
max_active=14000
@ -0,0 +1,7 @@
# --use-energy=false  # non-default option. false -> use C0 instead of energy
# NUMCEPS in HTK is without C0: there 12, here 13 - default
--low-freq=125
--high-freq=3800
# --htk-compat
# --remove-dc-offset  # equivalent to ZMEANSOURCE in HTK
# --subtract-mean  # not recommended to do it this way
@ -0,0 +1,42 @@
#!/bin/bash

# Every EVERY_N-th utterance is used for training.
# EVERY_N=3 -> we use one third of the training data
export EVERY_N=1
export TEST_SETS="dev test"

# Directory setup
export DATA_ROOT=`pwd`/data  # expects subdirectories train + $TEST_SETS
export WORK=`pwd`/lang_prep
export EXP=`pwd`/exp
export TGT_MODELS=trained_models

# Specify paths to ARPA models. The paths may not contain spaces.
# Specify build0, build1, build2, ... for building a (zero|uni|bi)-gram LM.
# Note: The LM file name should not contain an underscore "_"!
# Otherwise the results will be reported without the part of the LM name after the underscore.
export LMs="build0 build2"

# Use a path to a prebuilt dictionary, or the 'build' command in order to build the dictionary.
# export DICTIONARY="../../resources/lm/caminfo/dict"
export DICTIONARY="build"


# Bounds for estimating the LM model weight.
# The LMW is tuned on the development set and applied on the test set.
export min_lmw=4
export max_lmw=15

# Number of states for phone training
export pdf=1200

# Maximum number of Gaussians used for training
export gauss=19200

export train_mmi_boost=0.05

export mmi_beam=16.0
export mmi_lat_beam=10.0

# --fake -> NO CMVN; empty -> CMVN (the pykaldi decoders cannot handle CMVN -> fake)
export fake="--fake"
@ -0,0 +1,41 @@
#!/bin/bash

# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

expdir=$1; shift
tgtdir=$1; shift  # fixed: after the first shift the next argument is again $1

name=`basename $expdir`  # NOTE: an assumed fix; $name was used below but never set
tgtdir="$tgtdir/$name"
date="`date +%F_%T.%N`"

if [[ -d $tgtdir || -f $tgtdir ]] ; then
    tgtdir="$tgtdir/backup_$date"
fi


# This is an EXAMPLE SCRIPT; you are ENCOURAGED to CHANGE IT!

mkdir -p "$tgtdir"
cp -rf $expdir "$tgtdir"

# Collect the results

local/results.py $EXP > "$tgtdir"/results.log
echo "Date: $date" >> "$tgtdir"/results.log
size=`du -hs "$tgtdir"`
echo "Size of backup: $size" >> "$tgtdir"/results.log

echo; echo "DATA successfully copied to $tgtdir"; echo
@ -0,0 +1,68 @@
#!/bin/bash

# Copyright 2012 Vassil Panayotov
#           2013 Ondrej Platek
# Apache 2.0

echo "=== Formatting data ..."
langdir=$1; shift
LMs=$1; shift
lmdir=$1; shift
lexicon=$1; shift

# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.
for lm in $LMs ; do
    tgt=${langdir}_`basename "$lm"`
    lmp=$lmdir/`basename $lm`

    tmpdir=$tgt/tmp
    mkdir -p $tgt
    mkdir -p $tmpdir

    echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..."

    for f in phones.txt words.txt L.fst L_disambig.fst phones ; do
        ln -s $langdir/$f $tgt/$f 2> /dev/null
    done

    cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt

    # grep -v '<s> <s>' because the LM seems to have some strange and useless
    # stuff in it with multiple <s>'s in the history. Encountered some other similar
    # things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
    # which are supposed to occur only at the begin/end of an utterance. These can cause
    # determinization failures of CLG [ends up being epsilon cycles].

    cat $lmp | \
        grep -v '<s> <s>\|</s> <s>\|</s> </s>' | \
        arpa2fst - | fstprint | \
        utils/remove_oovs.pl $tmpdir/oovs.txt | \
        utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \
        --osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \
        fstrmepsilon > $tgt/G.fst
    fstisstochastic $tgt/G.fst
    # The output is like:
    # 9.14233e-05 -0.259833
    # we do expect the first of these 2 numbers to be close to zero (the second is
    # nonzero because the backoff weights make the states sum to >1).
    # Because of the <s> fiasco for these particular LMs, the first number is not
    # as close to zero as it could be.

    # Everything below is only diagnostic.
    # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
    # this might cause determinization failure of CLG.
    # #0 is treated as an empty word.
    mkdir -p $tmpdir/g
    awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
        < "$lexicon" > $tmpdir/g/select_empty.fst.txt
    fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \
        $tmpdir/g/select_empty.fst.txt | \
        fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst
    fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
        echo "Language model has cycles with empty words" && exit 1

    # rm -rf $tmpdir # TODO debugging
    echo "*** Succeeded in creating G.fst for $tgt"

done # for lm in $LMs ; do
@ -0,0 +1,89 @@
#!/bin/bash
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
locdata=$1; shift
train_text=$1; shift
test_text=$1; shift
local_lm=$1; shift
lms=$1; shift


mkdir -p $local_lm

echo "=== Preparing the LM ..."

function build_0gram {
    transcr=$1; lm=$2
    echo "=== Building zerogram $lm from ${transcr} ..."
    cut -d' ' -f2- $transcr | tr ' ' '\n' | sort -u > $lm
    echo "<s>" >> $lm
    echo "</s>" >> $lm
    python -c """
import math
with open('$lm', 'r+') as f:
    lines = f.readlines()
    p = math.log10(1/float(len(lines)));
    lines = ['%f\\t%s'%(p,l) for l in lines]
    f.seek(0); f.write('\\n\\\\data\\\\\\nngram 1= %d\\n\\n\\\\1-grams:\\n' % len(lines))
    f.write(''.join(lines) + '\\\\end\\\\')
"""
}

for lm in $lms ; do
    lm_base=`basename $lm`
    if [ ${lm_base%[0-6]} != 'build' ] ; then
        cp $lm $local_lm
    else
        # We will build the LM build[0-9].arpa
        lm_order=${lm_base#build}

        echo "=== Building LM of order ${lm_order}..."
        if [ $lm_order -eq 0 ] ; then
            echo "Zerogram $lm_base LM is built from text: $test_text"
            cut -d' ' -f2- $test_text | sed -e 's:^:<s> :' -e 's:$: </s>:' | \
                sort -u > $locdata/lm_test.txt
            build_0gram $locdata/lm_test.txt $local_lm/${lm_base}
        else
            echo "LM $lm_base is built from text: $train_text"
            cut -d' ' -f2- $train_text | sed -e 's:^:<s> :' -e 's:$: </s>:' | \
                sort -u > $locdata/lm_train.txt
            ngram-count -text $locdata/lm_train.txt -order ${lm_order} \
                -wbdiscount -interpolate -lm $local_lm/${lm_base}
        fi
    fi
done
echo "*** LMs preparation finished!"

echo "=== Preparing the vocabulary ..."

if [ "$DICTIONARY" == "build" ]; then
    echo; echo "Building dictionary from train data"; echo
    cut -d' ' -f2- $train_text | tr ' ' '\n' > $locdata/vocab-full-raw.txt
else
    echo; echo "Using predefined dictionary: ${DICTIONARY}"
    echo "Throwing away the first 2 rows."; echo
    tail -n +3 $DICTIONARY | cut -f 1 > $locdata/vocab-full-raw.txt
fi

echo '</s>' >> $locdata/vocab-full-raw.txt
echo "Removing _NOISE_ and all other '_' words from vocab-full.txt"
cat $locdata/vocab-full-raw.txt | grep -v '_' | \
    sort -u > $locdata/vocab-full.txt
echo "*** Vocabulary preparation finished!"


echo "Removing _NOISE_ and all other '_' words from vocab-test.txt"
cut -d' ' -f2 $test_text | tr ' ' '\n' | grep -v '_' | sort -u > $locdata/vocab-test.txt
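
# For reference, a sketch of the zerogram ARPA file that build_0gram aims to
# produce for a 4-word vocabulary (assumed shape, not verbatim output;
# log10(1/4) = -0.60206):
#   \data\
#   ngram 1= 4
#
#   \1-grams:
#   -0.602060	</s>
#   -0.602060	<s>
#   -0.602060	DEN
#   -0.602060	DOBRY
#   \end\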
@ -0,0 +1,54 @@
#!/bin/bash
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# The vystadial data are specific in having the following marks in transcriptions:
# _INHALE_
# _LAUGH_
# _EHM_HMM_
# _NOISE_
# _SIL_

locdict=$1; shift

echo "--- Preparing nonsilence phone lists ..."
# We suppose there are only nonsilence phones in the lexicon right now.
awk '{for(n=2;n<=NF;n++) { p[$n]=1; }} END{for(x in p) {print x}}' \
    $locdict/lexicon.txt | sort > $locdict/nonsilence_phones.txt

echo "--- Adding silence phones to lexicon ..."
echo "_SIL_ SIL" >> $locdict/lexicon.txt
echo "_EHM_HMM_ EHM" >> $locdict/lexicon.txt
echo "_INHALE_ INH" >> $locdict/lexicon.txt
echo "_LAUGH_ LAU" >> $locdict/lexicon.txt
echo "_NOISE_ NOI" >> $locdict/lexicon.txt

echo "--- Sorting lexicon in place ..."
sort $locdict/lexicon.txt -o $locdict/lexicon.txt

echo "--- Preparing silence phone lists ..."
echo SIL > $locdict/silence_phones.txt
echo EHM >> $locdict/silence_phones.txt
echo INH >> $locdict/silence_phones.txt
echo LAU >> $locdict/silence_phones.txt
echo NOI >> $locdict/silence_phones.txt

echo SIL > $locdict/optional_silence.txt

# Some downstream scripts expect this file to exist, even if empty.
touch $locdict/extra_questions.txt

echo "*** Creating phone lists finished!"
@ -0,0 +1,18 @@
#!/bin/bash
# example usage:
# ./local/create_sample.sh /ha/projects/vystadial/data/asr/en/voip/ Results/vystadial-sample/ test 100
# Note that it supposes there are only *.wav and *.wav.trn files in the directory,
# and that n is the number of files to copy.

src=$1
tgt=$2
typ=$3  # dev/test/train
n=$4

src_dir=$src/$typ
tgt_dir=$tgt/$typ
mkdir -p $tgt_dir
ls $src_dir | head -n $n \
    | while read f ; do
    cp $src_dir/$f $tgt_dir
done
@ -0,0 +1,108 @@
#!/bin/bash
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
#
# Makes the train/test splits
# local/voxforge_data_prep.sh --nspk_test ${nspk_test} ${SELECTED} || exit 1
# creates the files: (TYPE=train|test)
# a) ${TYPE}_trans.txt: ID transcription (capitalized! no punctuation)
# b) ${TYPE}_wav.scp: ID path2ID.wav
# c) $TYPE.utt2spk: ID-recording ID-speaker
# d) $TYPE.spk2utt
# e) $TYPE.spk2gender: all speakers are labelled male
# We have ID-recording = ID-speaker.

# The vystadial data are specific in having the following marks in transcriptions:
# _INHALE_
# _LAUGH_
# _EHM_HMM_
# _NOISE_
# _SIL_

# renice 20 $$

every_n=1

[ -f path.sh ] && . ./path.sh # source the path.
. utils/parse_options.sh || exit 1;


if [ $# -ne 5 ] ; then
    echo "Usage: local/data_split.sh [--every-n 30] <data-directory> <local-directory> <LMs> <Test-Sets> <tgt-dir>";
    exit 1;
fi

DATA=$1; shift
locdata=$1; shift
LMs=$1; shift
test_sets=$1; shift
tgt_dir=$1; shift

echo "LMs $LMs test_sets $test_sets"


echo "=== Starting initial Vystadial data preparation ..."
echo "--- Making the test/train data split from $DATA, taking every $every_n-th recording ..."

mkdir -p $locdata

i=0
for s in $test_sets train ; do
    mkdir -p $locdata/$s
    ls $DATA/$s/ | sed -n /.*wav$/p |\
    while read wav ; do
        ((i++)) # bash specific
        if [[ $i -ge $every_n ]] ; then
            i=0
            pwav=$DATA/$s/$wav
            trn=`cat $DATA/$s/$wav.trn`
            echo "$wav $pwav" >> $locdata/$s/wav.scp
            echo "$wav $wav" >> $locdata/$s/utt2spk
            echo "$wav $wav" >> $locdata/$s/spk2utt
            echo "$wav $trn" >> $locdata/$s/trans.txt
            # Ignoring gender -> label all recordings as male
            echo "$wav M" >> $locdata/spk2gender
        fi
    done # while read wav

    for f in wav.scp utt2spk spk2utt trans.txt ; do
        sort "$locdata/$s/$f" -k1 -u -o "$locdata/$s/$f" # sort in place
    done # for f

done # for s in $test_sets train

echo "Set 1:1 relation for spk2utt: spk in $test_sets AND train; sort in place"
sort "$locdata/spk2gender" -k1 -o "$locdata/spk2gender"

echo "--- Distributing the file lists to the train and (${test_sets} x ${LMs}) directories ..."
mkdir -p $WORK/train
cp $locdata/train/wav.scp $WORK/train/wav.scp || exit 1;
cp $locdata/train/trans.txt $WORK/train/text || exit 1;
cp $locdata/train/spk2utt $WORK/train/spk2utt || exit 1;
cp $locdata/train/utt2spk $WORK/train/utt2spk || exit 1;
utils/filter_scp.pl $WORK/train/spk2utt $locdata/spk2gender > $WORK/train/spk2gender || exit 1;

for s in $test_sets ; do
    for lm in $LMs; do
        tgt_dir=$WORK/${s}_`basename ${lm}`
        mkdir -p $tgt_dir
        cp $locdata/${s}/wav.scp $tgt_dir/wav.scp || exit 1;
        cp $locdata/${s}/trans.txt $tgt_dir/text || exit 1;
        cp $locdata/${s}/spk2utt $tgt_dir/spk2utt || exit 1;
        cp $locdata/${s}/utt2spk $tgt_dir/utt2spk || exit 1;
        utils/filter_scp.pl $tgt_dir/spk2utt $locdata/spk2gender > $tgt_dir/spk2gender || exit 1;
    done
done
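
# For reference, the per-utterance files produced above have one line per wav,
# e.g. (a sketch; the recording name is taken from the sample data and the
# transcription is hypothetical):
#   wav.scp:   vad-2013-06-08-22-50-01.897179.wav /path/to/test/vad-2013-06-08-22-50-01.897179.wav
#   utt2spk:   vad-2013-06-08-22-50-01.897179.wav vad-2013-06-08-22-50-01.897179.wav
#   trans.txt: vad-2013-06-08-22-50-01.897179.wav DOBRY DEN _NOISE_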
@ -0,0 +1,46 @@
#!/bin/bash
# Copyright Ondrej Platek Apache 2.0

DATA_ROOT=$1

url=https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0023-4670-6/data_voip_cs.tgz
name=data_voip_cs
extract_file=106277

mkdir -p $DATA_ROOT

if [ ! -f $DATA_ROOT/${name}.tgz ] ; then
    wget $url -O $DATA_ROOT/${name}.tgz || exit 1
    echo "Data successfully downloaded"
fi

if [[ ! -d $DATA_ROOT/$name && -e $DATA_ROOT/$name ]] ; then
    echo "$DATA_ROOT/$name is not a directory and we cannot extract the data!"
    exit 1;
fi

if [ ! -d $DATA_ROOT/$name ] ; then
    mkdir $DATA_ROOT/$name
    tar xfv $DATA_ROOT/${name}.tgz -C $DATA_ROOT | \
    while read line; do
        x=$((x+1))
        echo -en "$x extracted from $extract_file files.\r"
    done
fi

if [ -d $DATA_ROOT/$name ] ; then
    echo "Checking whether the data extracted correctly"
    num_files=`find $DATA_ROOT/$name -name '*' | wc -l`
    if [ ! $num_files -eq $extract_file ] ; then
        echo "Data extraction failed! Extracted $num_files files instead of $extract_file"
        exit 1;
    fi
    echo "It seems that the data are extracted correctly"
fi

pushd $DATA_ROOT
for t in test train dev ; do
    ln -s $name/$t
done
ln -s $name/arpa_bigram arpa-bigram
popd
@ -0,0 +1,50 @@
#!/bin/bash

# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

tgt=$1; shift
exp=$1; shift
lang=$1; shift

mkdir -p $tgt

echo "--- Exporting models to $tgt ..."

# See local/save_check.sh, which saves the settings at the beginning, for details.
# cp -f $exp/alex_gitlog.log $exp/alex_gitdiff.log $exp/experiment_bash_vars.log $tgt
cp -f $exp/experiment_bash_vars.log $tgt

# Store also the results
cp -f $exp/results.log $tgt/results.log


cp -f common/mfcc.conf $tgt

cp -f $exp/tri2a/final.mdl $tgt/tri2a.mdl
cp -f $exp/tri2a/tree $tgt/tri2a.tree

cp -f $exp/tri2b/final.mdl $tgt/tri2b.mdl
cp -f $exp/tri2b/tree $tgt/tri2b.tree
cp -f $exp/tri2b/final.mat $tgt/tri2b.mat

cp -f $exp/tri2b_mmi_b*/final.mdl $tgt/tri2b_bmmi.mdl
cp -f $exp/tri2b/tree $tgt/tri2b_bmmi.tree
cp -f $exp/tri2b_mmi_b*/final.mat $tgt/tri2b_bmmi.mat

cp -f $lang/phones.txt $lang/phones/silence.csl $tgt


# FIXME do I need splice_opts for something?
@ -0,0 +1,172 @@
|
|||
#!perl -w
|
||||
|
||||
#
|
||||
# ====================================================================
|
||||
# Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
|
||||
# Rudnicky. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
#
|
||||
# This work was supported in part by funding from the Defense Advanced
|
||||
# Research Projects Agency, the Office of Naval Research and the National
|
||||
# Science Foundation of the United States of America, and by member
|
||||
# companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
|
||||
# the contributions of many volunteers to the expansion and improvement of
|
||||
# this dictionary.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
||||
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
||||
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# ====================================================================
|
||||
#
|
||||
|
||||
# [20050309] (air) Created.
|
||||
# strip out stress marks from a cmudict, producing a "SphinxPhones_40" dictionary
|
||||
# [20080420] (air) Changed to pass comments.
|
||||
# Fixed output collation sequence; DOS eol's
|
||||
# [20090309] (air) fixed duplicate pron and collation bugs
|
||||
# [20090331] (air) restored standard collation order (since other stuff depends on it)
|
||||
# [20090629] (air) do not put comments into SPHINX_40 version; not all software deals with them
|
||||
# [20100118] (air) added $VERBOSE; this should really be a cmdline flag...
|
||||
#
|
||||
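# Example invocation (file and script names are illustrative):
#   perl make_baseform.pl cmudict.0.7a cmudict.nostress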
|
||||
|
||||
$VERBOSE = 0;
|
||||
|
||||
my $basecount = 0;
|
||||
my $dupl = 0;
|
||||
my $base = 0;
|
||||
my $varia = 0;
|
||||
|
||||
if ( scalar @ARGV ne 2 ) { die "usage: make_baseform <input> <output>\n"; }
|
||||
|
||||
open(IN, $ARGV[0]) || die "can't open $ARGV[0] for reading!\n";
|
||||
open(OUT,">$ARGV[1]") || die "can't open $ARGV[1] for writing!\n";
|
||||
|
||||
@header = (); # header comment lines (passed through)
|
||||
%dict = (); # words end up in here
|
||||
%histo = (); # some statistics on variants
|
||||
|
||||
get_dict(\%dict,\@header,IN); # process the entries
|
||||
|
||||
# what have we got?
|
||||
print STDERR "$basecount forms processed\n";
|
||||
print STDERR "$base baseforms, $varia variants and $dupl duplicates found.\n";
|
||||
print STDERR "variant distribution:\n";
|
||||
foreach $var ( sort keys %histo ) {
|
||||
print STDERR "$var\t$histo{$var}\n";
|
||||
}
|
||||
|
||||
# print special comments (copyright, etc.)
|
||||
# removed since it messes some things up...
|
||||
# foreach $h (@header) { print OUT "$h\n"; }
|
||||
|
||||
# print out each entry
|
||||
%dict_out = ();
|
||||
foreach $w (sort keys %dict) {
|
||||
$var=1; # variants will number starting with 2
|
||||
foreach $p ( @{$dict{$w}} ) {
|
||||
if ($var eq 1) {
|
||||
$dict_out{$w} = $p;
|
||||
$var++;
|
||||
} else {
|
||||
$dict_out{"$w($var)"} = $p;
|
||||
$var++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach $entry ( sort keys %dict_out ) {
|
||||
print OUT "$entry\t$dict_out{$entry}\n";
|
||||
}
|
||||
|
||||
close(IN);
|
||||
close(OUT);
|
||||
|
||||
#
|
||||
#
|
||||
# read in a dictionary
|
||||
sub get_dict {
|
||||
my $dict = shift; # data structure with dictionary entries
|
||||
my $header = shift;
|
||||
my $target = shift; # input file handle
|
||||
|
||||
while (<$target>) {
|
||||
s/[\r\n]+$//g; # DOS-robust chomp;
|
||||
|
||||
# process comments; blank lines ignored
|
||||
# presume that ";;; #" will be collected and emitted at the top
|
||||
if ($_ =~ /^;;; \#/) { push @$header, $_; next; } # save header info
|
||||
elsif ( $_ =~ /^;;;/ ) { next; } # ignore plain comments
|
||||
elsif ( $_ =~ /^\s*$/ ) { next; } # ignore blank lines
|
||||
|
||||
# extract the word,pron pair and prepare for processing
|
||||
($word,$pron) = /(.+?)\s+(.+?)$/;
|
||||
if (! defined $word) { print STDERR "bad entry (no head word): $_\n"; next; }
|
||||
|
||||
$basecount++;
|
||||
|
||||
if ($word =~ /\)$/) { # variant
|
||||
($root,$variant) = ($word =~ m/(.+?)\((.+?)\)/);
|
||||
} else {
|
||||
$root = $word;
|
||||
$variant = 0;
|
||||
}
|
||||
$pron = &strip_stress($pron);
|
||||
|
||||
# found a new baseform; set it up
|
||||
if ( ! defined $dict->{$root} ) {
|
||||
$dict->{$root}[0] = $pron;
|
||||
$base++;
|
||||
next;
|
||||
}
|
||||
|
||||
# old baseform; see if, after removing stress, the pron is a duplicate
|
||||
foreach $var ( @{$dict->{$root}} ) {
|
||||
if ( $var eq $pron ) {
|
||||
if ($VERBOSE) {print STDERR "duplicate entry: $root ($variant) $pron\n";}
|
||||
$dupl++;
|
||||
$pron = "";
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
||||
# it's a new variant on an existing baseform, keep it
|
||||
if ( $pron ne "" ) {
|
||||
push @{$dict->{$root}}, $pron;
|
||||
$varia++;
|
||||
$histo{scalar @{$dict->{$root}}}++; # track variant stats
|
||||
if ( scalar @{$dict->{$root}} > 4 ) { print STDERR "$root -- ",scalar @{$dict->{$root}},"\n"; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# strip stress marks from phonetic symbols
|
||||
sub strip_stress {
|
||||
@pron = split " ", $_[0];
|
||||
my $p;
|
||||
foreach $p (@pron) { if ( $p =~ /\d$/) { $p =~ s/(\d+)$//; } }
|
||||
return ( join(" ",@pron));
|
||||
}
|
||||
|
||||
#
|
|
@ -0,0 +1,374 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use utf8;
|
||||
use Encode;
|
||||
|
||||
# $ PhoneticTranscriptionCS.pl [inputFile inputFile2 ...] outputFile
|
||||
#
|
||||
# Converts Czech text in CAPITALS in utf8 to Czech phonetic alphabet in
|
||||
# utf8. All input files will be concatenated into the output file. If no
|
||||
# input files are specified, reads from STDIN.
|
||||
#
|
||||
# If you want the script to operate in another encoding, set the EV_encoding
|
||||
# environment variable to the desired encoding.
|
||||
#
|
||||
# This is a rewrite of the "vyslov" shell script by Nino Peterek and Jan Oldrich Kruza,
# which used tools written by Pavel Ircing. These are copy-pasted, including comments,
# into this script.
|
||||
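# Example invocation (mirrors the call in local/prepare_cs_transcription.sh):
#   perl local/phonetic_transcription_cs.pl vocab-full.txt cs_transcription.txt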
|
||||
my $enc = 'utf8';
|
||||
|
||||
my $out_fn = pop @ARGV;
|
||||
if ($out_fn) {
|
||||
close STDOUT;
|
||||
open STDOUT, '>', $out_fn or die "Couldn't open '$out_fn': $!";
|
||||
}
|
||||
|
||||
my %seen = ();
|
||||
while (<>) {
|
||||
for (decode($enc, $_)) {
|
||||
# if (/[^\w\s]/) {
|
||||
# chomp;
|
||||
# print encode($enc, $_), (' ' x 7), "sp\n";
|
||||
# next
|
||||
# }
|
||||
chomp;
|
||||
$_ = uc($_);
|
||||
|
||||
print encode($enc, $_);
|
||||
print(' ' x 7);
|
||||
exceptions();
|
||||
transcription();
|
||||
tr/[A-Z]/[a-z]/;
|
||||
prague2pilsen();
|
||||
infreq();
|
||||
|
||||
# while ($_ =~ /(.)/g) {
|
||||
# $seen{$1}++;
|
||||
# }
|
||||
|
||||
print encode($enc, $_);
|
||||
print "\n";
|
||||
}
|
||||
}
|
||||
|
||||
#print "unique chars are: ";
|
||||
#foreach (sort(keys %seen)) {
|
||||
# print encode($enc, $_);
|
||||
#}
|
||||
|
||||
sub exceptions {
|
||||
s/AA/A/g;
|
||||
|
||||
s/AKTI/AKTY/g;
|
||||
s/ANTI/ANTY/g;
|
||||
s/ARKTI/ARKTY/g;
|
||||
s/ATIK/ATYK/g;
|
||||
s/ATRAKTI/ATRAKTY/g;
|
||||
s/AUDI/AUDY/g;
|
||||
s/AUTOMATI/AUTOMATY/g;
|
||||
s/^BARRANDOV/BARANDOV/g;
|
||||
s/CAUSA/KAUZA/g;
|
||||
s/CELSIA/CELZIA/g;
|
||||
s/^CHAPLIN/ČAPLIN/g;
|
||||
s/CHIL/ČIL/g;
|
||||
s/DANIH/DANYH/g;
|
||||
s/DEALER/D ii LER/g;
|
||||
s/DIAG/DYAG/g;
|
||||
s/DIET/DYET/g;
|
||||
s/DIF/DYF/g;
|
||||
s/DIG/DYG/g;
|
||||
s/DIKT/DYKT/g;
|
||||
s/DILET/DYLET/g;
|
||||
s/DIPL/DYPL/g;
|
||||
s/DIRIG/DYRYG/g;
|
||||
s/DISK/DYSK/g;
|
||||
s/DISP/DYSP/g;
|
||||
s/DISPLAY/DYSPLEJ/g;
|
||||
s/DIST/DYST/g;
|
||||
s/DIVIDE/DYVIDE/g;
|
||||
s/DUKTI/DUKTY/g;
|
||||
s/EDIC/EDYC/g;
|
||||
s/EFEKTIV/EFEKTYV/g;
|
||||
s/ELEKTRONI/ELEKTRONY/g;
|
||||
s/ENERGETIK/ENERGETYK/g;
|
||||
s/ERROR/EROR/g;
|
||||
s/ETIK/ETYK/g;
|
||||
s/^EX([AEIOUÁÉÍÓÚŮ])/EGZ$1/g;
|
||||
s/FEMINI/FEMINY/g;
|
||||
s/FINIŠ/FINYŠ/g;
|
||||
s/FINITI/FINYTY/g;
|
||||
s/GATIV/GATYV/g;
|
||||
s/GENETI/GENETY/g;
|
||||
s/GIENI/GIENY/g;
|
||||
s/GITI/GITY/g;
|
||||
s/^GOETH/GÉT/g;
|
||||
s/IMUNI/IMUNY/g;
|
||||
s/INDIV/INDYV/g;
|
||||
s/ING/YNG/g;
|
||||
s/INICI/INYCI/g;
|
||||
s/INVESTI/INVESTY/g;
|
||||
s/KANDI/KANDY/g;
|
||||
s/KARATI/KARATY/g;
|
||||
s/KARDI/KARDY/g;
|
||||
s/KLAUS/KLAUZ/g;
|
||||
s/KOMODIT/KOMODYT/g;
|
||||
s/KOMUNI/KOMUNY/g;
|
||||
s/KONDI/KONDY/g;
|
||||
s/KONSOR/KONZOR/g;
|
||||
s/KREDIT/KREDYT/g;
|
||||
s/KRITI/KRITY/g;
|
||||
s/LEASING/L ii z ING/g;
|
||||
s/MANAG/MENEDŽ/g;
|
||||
s/MANIP/MANYP/g;
|
||||
s/MATI/MATY/g;
|
||||
s/MEDI/MEDY/g;
|
||||
s/MINI/MINY/g;
|
||||
s/MINUS/MÝNUS/g;
|
||||
s/MODERNI/MODERNY/g;
|
||||
s/MONIE/MONYE/g;
|
||||
s/MOTIV/MOTYV/g;
|
||||
s/^MOZART/MÓCART/g;
|
||||
s/^NE/NE!/g;
|
||||
s/^NEWTON/ŇŮTN/g;
|
||||
s/NIE/NYE/g;
|
||||
s/NII/NYY/g;
|
||||
s/NJ/Ň/g;
|
||||
s/NSTI/NSTY/g;
|
||||
s/^ODD/OD!D/g;
|
||||
s/^ODI(?=[^V])/ODY/g;
|
||||
s/^ODT/OT!T/g;
|
||||
s/OPTIM/OPTYM/g;
|
||||
s/ORGANI/ORGANY/g;
|
||||
s/^PANASONIC/PANASONYK/g;
|
||||
s/PANICK/PANYCK/g;
|
||||
s/^Patton/PETN/g;
|
||||
s/PEDIATR/PEDYATR/g;
|
||||
s/PERVITI/PERVITY/g;
|
||||
s/^PODD/POD!D/g;
|
||||
s/^PODT/POT!T/g;
|
||||
s/POLITI/POLITY/g;
|
||||
s/^POULI/PO!ULI/g;
|
||||
s/POZIT/POZYT/g;
|
||||
s/^PŘED(?=[^Ě])/PŘED!/g;
|
||||
s/PRIVATI/PRIVATY/g;
|
||||
s/PROSTITU/PROSTYTU/g;
|
||||
s/RADIK/RADYK/g;
|
||||
s/^RADIO/RADYO/g;
|
||||
s/^RÁDI(.)/RÁDY$1/g;
|
||||
s/RELATIV/RELATYV/g;
|
||||
s/RESTITU/RESTYTU/g;
|
||||
s/^ROCK/ROK/g;
|
||||
s/^ROZ/ROZ!/g;
|
||||
s/RUTIN/RUTYN/g;
|
||||
s/^SCHENGEN/ŠENGEN/g;
|
||||
s/^SEBE/SEBE!/g;
|
||||
s/SHOP/sz O P/g;
|
||||
s/^SHO/SCHO/g;
|
||||
s/SOFTWAR/SOFTVER/g;
|
||||
s/SORTIM/SORTYM/g;
|
||||
s/SPEKTIV/SPEKTYV/g;
|
||||
s/STATISTI/STATYSTY/g;
|
||||
s/STIK/STYK/g;
|
||||
s/STIMUL/STYMUL/g;
|
||||
s/^STROSSMAYER/ŠTROSMAJER/g;
|
||||
s/STUDI/STUDY/g;
|
||||
s/SUPERLATIV/SUPERLATYV/g;
|
||||
s/TECHNI/TECHNY/g;
|
||||
s/TELECOM/TELEKOM/g;
|
||||
s/TELEFONI/TELEFONY/g;
|
||||
s/TEMATI/TEMATY/g;
|
||||
s/^TESCO/TESKO/g;
|
||||
s/TETIK/TETYK/g;
|
||||
s/TEXTIL/TEXTYL/g;
|
||||
s/TIBET/TYBET/g;
|
||||
s/TIBOR/TYBOR/g;
|
||||
s/TICK/TYCK/g;
|
||||
s/TIRANY/TYRANY/g;
|
||||
s/TITUL/TYTUL/g;
|
||||
s/TRADI/TRADY/g;
|
||||
s/UNIVER/UNYVER/g;
|
||||
s/VENTI/VENTY/g;
|
||||
s/VERTIK/VERTYK/g;
|
||||
s/^WAGNER/WÁGNER/g;
|
||||
s/^WATT/VAT/g;
|
||||
s/^WEBBER/VEBER/g;
|
||||
s/^WEBER/VEBER/g;
|
||||
s/^WILSON/VILSON/g;
|
||||
|
||||
}
|
||||
|
||||
sub transcription {
|
||||
# map unwanted characters to the silence model
|
||||
s/^.*[0-9].*$/sil/g;
|
||||
|
||||
# replace multi-character phonemes with a special symbol, or expand one character into several phonemes
|
||||
s/CH/#/g;
|
||||
s/W/V/g;
|
||||
s/Q/KV/g;
|
||||
s/DŽ/&/g; # v původním vyslov nefungovalo
|
||||
s/DZ/@/g;
|
||||
s/X/KS/g;
|
||||
|
||||
# handling of Ě
|
||||
s/([BPFV])Ě/$1JE/g;
|
||||
s/DĚ/ĎE/g;
|
||||
s/TĚ/ŤE/g;
|
||||
s/NĚ/ŇE/g;
|
||||
s/MĚ/MŇE/g;
|
||||
s/Ě/E/g;
|
||||
|
||||
# softening i
|
||||
s/DI/ĎI/g;
|
||||
s/TI/ŤI/g;
|
||||
s/NI/ŇI/g;
|
||||
s/DÍ/ĎÍ/g;
|
||||
s/TÍ/ŤÍ/g;
|
||||
s/NÍ/ŇÍ/g;
|
||||
|
||||
# voicing assimilation
|
||||
s/B$/P/g;
|
||||
s/B([PTŤKSŠCČ#F])/P$1/g;
|
||||
s/B([BDĎGZŽ@&H])$/P$1/g;
|
||||
s/P([BDĎGZŽ@&H])/B$1/g;
|
||||
s/D$/T/g;
|
||||
s/D([PTŤKSŠCČ#F])/T$1/g;
|
||||
s/D([BDĎGZŽ@&H])$/T$1/g;
|
||||
s/T([BDĎGZŽ@&H])/D$1/g;
|
||||
s/Ď$/Ť/g;
|
||||
s/Ď([PTŤKSŠCČ#F])/Ť$1/g;
|
||||
s/Ď([BDĎGZŽ@&H])$/Ť$1/g;
|
||||
s/Ť([BDĎGZŽ@&H])/Ď$1/g;
|
||||
s/V$/F/g;
|
||||
s/V([PTŤKSŠCČ#F])/F$1/g;
|
||||
s/V([BDĎGZŽ@&H])$/F$1/g;
|
||||
s/F([BDĎGZŽ@&H])/V$1/g;
|
||||
s/G$/K/g;
|
||||
s/G([PTŤKSŠCČ#F])/K$1/g;
|
||||
s/G([BDĎGZŽ@&H])$/K$1/g;
|
||||
s/K([BDĎGZŽ@&H])/G$1/g;
|
||||
s/Z$/S/g;
|
||||
s/Z([PTŤKSŠCČ#F])/S$1/g;
|
||||
s/Z([BDĎGZŽ@&H])$/S$1/g;
|
||||
s/S([BDĎGZŽ@&H])/Z$1/g;
|
||||
s/Ž$/Š/g;
|
||||
s/Ž([PTŤKSŠCČ#F])/Š$1/g;
|
||||
s/Ž([BDĎGZŽ@&H])$/Š$1/g;
|
||||
s/Š([BDĎGZŽ@&H])/Ž$1/g;
|
||||
s/H$/#/g;
|
||||
s/H([PTŤKSŠCČ#F])/#$1/g;
|
||||
s/H([BDĎGZŽ@&H])$/#$1/g;
|
||||
s/#([BDĎGZŽ@&H])/H$1/g;
|
||||
s/\@$/C/g;
|
||||
s/\@([PTŤKSŠCČ#F])/C$1/g;
|
||||
s/\@([BDĎGZŽ@&H])$/C$1/g;
|
||||
s/C([BDĎGZŽ@&H])/\@$1/g;
|
||||
s/&$/Č/g;
|
||||
s/&([PTŤKSŠCČ#F])/Č$1/g;
|
||||
s/&([BDĎGZŽ@&H])$/Č$1/g;
|
||||
s/Č([BDĎGZŽ@&H])/&$1/g;
|
||||
s/Ř$/>/g;
|
||||
s/Ř([PTŤKSŠCČ#F])/>$1/g;
|
||||
s/Ř([BDĎGZŽ@&H])$/>$1/g;
|
||||
s/([PTŤKSŠCČ#F])Ř/$1>/g;
|
||||
|
||||
|
||||
# the rest
|
||||
s/NK/ng K/g;
|
||||
s/NG/ng G/g;
|
||||
s/MV/mg V/g;
|
||||
s/MF/mg F/g;
|
||||
s/NŤ/ŇŤ/g;
|
||||
s/NĎ/ŇĎ/g;
|
||||
s/NŇ/Ň/g;
|
||||
s/CC/C/g;
|
||||
s/DD/D/g;
|
||||
s/JJ/J/g;
|
||||
s/KK/K/g;
|
||||
s/LL/L/g;
|
||||
s/NN/N/g;
|
||||
s/MM/M/g;
|
||||
s/SS/S/g;
|
||||
s/TT/T/g;
|
||||
s/ZZ/Z/g;
|
||||
s/ČČ/Č/g;
|
||||
s/ŠŠ/Š/g;
|
||||
s/-//g;
|
||||
|
||||
# final rewrite into the HTK alphabet
|
||||
s/>/rsz /g;
|
||||
s/EU/eu /g;
|
||||
s/AU/au /g;
|
||||
s/OU/ou /g;
|
||||
s/Á/aa /g;
|
||||
s/Č/cz /g;
|
||||
s/Ď/dj /g;
|
||||
s/É/ee /g;
|
||||
s/Í/ii /g;
|
||||
s/Ň/nj /g;
|
||||
s/Ó/oo /g;
|
||||
s/Ř/rzs /g;
|
||||
s/Š/sz /g;
|
||||
s/Ť/tj /g;
|
||||
s/Ú/uu /g;
|
||||
s/Ů/uu /g;
|
||||
s/Ý/ii /g;
|
||||
s/Ž/zs /g;
|
||||
s/Y/i /g;
|
||||
s/&/dzs /g;
|
||||
s/\@/ts /g;
|
||||
s/#/ch /g;
|
||||
s/!//g;
|
||||
s/([A-Z])/$1 /g;
|
||||
|
||||
# crazy characters mapped to closest phones
|
||||
s/Ü/uu /g;
|
||||
s/Ö/o /g;
|
||||
s/Ć/ch /g;
|
||||
s/Ľ/l /g;
|
||||
s/Ś/sz /g;
|
||||
s/Ű/uu /g;
|
||||
s/Ź/zs /g;
|
||||
s/Ń/nj /g;
|
||||
s/Ę/e /g;
|
||||
s/Ě/e /g;
|
||||
s/Ĺ/l /g;
|
||||
s/Ľ/l /g;
|
||||
s/Ł/l /g;
|
||||
s/Â/a /g;
|
||||
s/Ä/a /g;
|
||||
s/Ç/c /g;
|
||||
s/Ë/e /g;
|
||||
s/Î/i /g;
|
||||
s/Ô/o /g;
|
||||
s/Ő/o /g;
|
||||
|
||||
# s/$/ sp/g;
|
||||
}
|
||||
|
||||
sub prague2pilsen {
|
||||
s/au/aw/g;
|
||||
s/ch/x/g;
|
||||
s/cz/ch/g;
|
||||
s/dzs/dzh/g;
|
||||
s/es/e s/g;
|
||||
s/eu/ew/g;
|
||||
s/ou/ow/g;
|
||||
s/rsz/rsh/g;
|
||||
s/rzs/rzh/g;
|
||||
s/sz/sh/g;
|
||||
s/ts/dz/g;
|
||||
s/zs/zh/g;
|
||||
}
|
||||
|
||||
sub infreq {
|
||||
s/dz/c/g;
|
||||
s/dzh/ch/g;
|
||||
s/ew/e u/g;
|
||||
s/mg/m/g;
|
||||
s/oo/o/g;
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
#!/bin/bash
|
||||
|
||||
locdata=$1; shift
|
||||
locdict=$1; shift
|
||||
|
||||
|
||||
mkdir -p $locdict
|
||||
|
||||
perl local/phonetic_transcription_cs.pl $locdata/vocab-full.txt $locdict/cs_transcription.txt
|
||||
|
||||
echo "--- Searching for OOV words ..."
|
||||
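# gawk idiom below: NR==FNR holds only while the first file is being read, so
# its first column is collected into `words`; lines of the second file are then
# kept or dropped depending on whether their first field was seen there.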
gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \
|
||||
$locdict/cs_transcription.txt $locdata/vocab-full.txt |\
|
||||
egrep -v '<.?s>' > $locdict/vocab-oov.txt
|
||||
|
||||
gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \
|
||||
$locdata/vocab-full.txt $locdict/cs_transcription.txt |\
|
||||
egrep -v '<.?s>' > $locdict/lexicon.txt
|
||||
|
||||
wc -l $locdict/vocab-oov.txt
|
||||
wc -l $locdict/lexicon.txt
|
|
@ -0,0 +1,187 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License. #
|
||||
import argparse
|
||||
from numpy import mean
|
||||
import glob
|
||||
import sys
|
||||
import sqlite3
|
||||
|
||||
|
||||
def extract_stat(wer_file):
|
||||
wer, ser = None, None
|
||||
try:
|
||||
with open(wer_file, 'r') as f:
|
||||
s = f.readlines()
|
||||
wer = float(s[1].split()[1])
|
||||
ser = float(s[2].split()[1])
|
||||
|
||||
except Exception as e:
|
||||
        print >> sys.stderr, 'Error parsing file %s' % wer_file
        print >> sys.stderr, str(e)
|
||||
return wer, ser
|
||||
|
||||
|
||||
def extractResults(path):
|
||||
wer_files = glob.glob('%s/*/decode_*/*wer_*' % path)
|
||||
table = []
|
||||
for wf in wer_files:
|
||||
try:
|
||||
exp, decode_dir, wer_f = wf.split('/')[-3:]
|
||||
# last split: decode_it3_dev_build0 -> (dev, build0)
|
||||
lm = decode_dir.split('_')[-1]
|
||||
dataset = decode_dir.split('_')[-2]
|
||||
lm_w = int(wer_f[4:]) # strip wer_ from wer_19
|
||||
wer, ser = extract_stat(wf)
|
||||
table.append((exp, dataset, lm, lm_w, wer, ser))
|
||||
except Exception as e:
|
||||
print >> sys.stderr, 'failed to parse %s' % wf
|
||||
print >> sys.stderr, str(e)
|
||||
return table
|
||||
|
||||
|
||||
class Table(object):
|
||||
|
||||
def __init__(self, data=[], colnames=[]):
|
||||
self.data = data
|
||||
self.colnames = colnames
|
||||
self.colSep = '\t'
|
||||
self.lineSep = '\n'
|
||||
|
||||
def data2str(self):
|
||||
strdata = []
|
||||
for r in self.data:
|
||||
strdata.append([str(c) for c in r])
|
||||
return strdata
|
||||
|
||||
def __str__(self):
|
||||
sd = self.data2str()
|
||||
colwidth = [len(c) for c in self.colnames]
|
||||
for j in range(len(colwidth)):
|
||||
for r in sd:
|
||||
colwidth[j] = max(colwidth[j], len(r[j]))
|
||||
|
||||
gaps = [m - len(c) for (m, c) in zip(colwidth, self.colnames)]
|
||||
rows = [self.colSep.join(
|
||||
[c + ' ' * gap for c, gap in zip(self.colnames, gaps)])]
|
||||
for r in sd:
|
||||
gaps = [m - len(c) for (m, c) in zip(colwidth, r)]
|
||||
rows.append(
|
||||
self.colSep.join([c + ' ' * d for c, d in zip(r, gaps)]))
|
||||
return self.lineSep.join(rows)
|
||||
|
||||
|
||||
class LatexTable(Table):
|
||||
|
||||
def __init__(self, data=[], colnames=[]):
|
||||
Table.__init__(self, data, colnames)
|
||||
nc = len(colnames)
|
||||
self.header = '\\begin{tabular}{%s}' % ('c' * nc)
|
||||
self.tail = '\\end{tabular}'
|
||||
self.colSep = ' & '
|
||||
self.lineSep = '\\\\ \n'
|
||||
|
||||
def __str__(self):
|
||||
table_s = super(LatexTable, self).__str__()
|
||||
table_s = table_s.replace('_', '\_')
|
||||
return '%s\n%s\n%s\n' % (self.header, table_s, self.tail)
|
||||
|
||||
|
||||
def Table2LatexTable(table):
|
||||
return LatexTable(table.data, table.colnames)
|
||||
|
||||
|
||||
def createSmallTable(results):
    d = []
    for k, v in results.iteritems():
        # avoid shadowing the input dict; rt is the list of RT coefficients
        w, s, rt = v
        if w == []:
            minw = None
        else:
            minw = min(w)  # returns tuple if w is list of tuples
        if s == []:
            mins = None
        else:
            mins = min(s)  # returns tuple if s is list of tuples
        d.append([k, mean(rt), minw, mins])
    t = Table(d, ['exp', 'RT coef', 'WER', 'SER'])
    return t
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Parse experiment directory generated by kaldi vystadial recipe and print statistics')
|
||||
|
||||
parser.add_argument('expath', type=str, action='store',
|
||||
help='Path to experiment directory')
|
||||
parser.add_argument('-l', '--latex', default=False, action='store_true',
|
||||
help='Generate also latex format table')
|
||||
args = parser.parse_args()
|
||||
|
||||
raw_d = extractResults(args.expath)
|
||||
|
||||
conn = sqlite3.connect(':memory:')
|
||||
c = conn.cursor()
|
||||
c.execute(
|
||||
'''CREATE TABLE results (exp text, dataset text, lm text, lm_w int, wer float, ser float)''')
|
||||
c.executemany('INSERT INTO results VALUES (?, ?, ?, ?, ?, ?)', raw_d)
|
||||
|
||||
# get all results sorted
|
||||
# c.execute("SELECT * FROM results ORDER BY exp, dataset, lm, lm_w")
|
||||
# d = c.fetchall()
|
||||
# t = Table(data=d, colnames=['exp', 'set', 'lm', 'LMW', 'WER', 'SER'])
|
||||
# print '%s\n==================' % str(t)
|
||||
|
||||
# best experiment
|
||||
# c.execute("SELECT exp, dataset, lm_w, MIN(wer), ser FROM results ORDER BY exp, lm_w, dataset")
|
||||
# d = c.fetchall()
|
||||
# compare dev and test set by picking up the best experiment
|
||||
# c.execute(("SELECT exp, dataset, lm_w, MIN(wer), ser FROM results "
|
||||
# "GROUP BY exp, lm, dataset ORDER BY exp, lm, dataset"))
|
||||
# d = c.fetchall()
|
||||
# t = Table(data=d, colnames=['exp', 'set', 'lm', 'LMW', 'WER', 'SER'])
|
||||
# print '%s\n==================' % str(t)
|
||||
|
||||
# traditional usage of devset
|
||||
dev_set_query = ("SELECT r.exp, r.lm, r.lm_w FROM results AS r "
|
||||
"INNER JOIN ( SELECT dataset, exp, lm, MIN(wer) as min_wer "
|
||||
" FROM results WHERE dataset=? GROUP BY exp, lm) i "
|
||||
"ON r.exp=i.exp AND r.lm=i.lm AND r.dataset=i.dataset AND r.wer <= i.min_wer "
|
||||
)
|
||||
c.execute(dev_set_query, ('dev',))
|
||||
|
||||
min_dev = c.fetchall()
|
||||
|
||||
# remove duplicates: duplicates arise if the minimum WER on the dev set is tied
|
||||
min_dev_un = [(e, lm, lmw) for ((e, lm), lmw) in
|
||||
dict([((e, lm), lmw) for e, lm, lmw in min_dev]).items()]
|
||||
# sort by LM, then by experiment -> results grouped by experiment & LM
|
||||
min_dev_un.sort(key=lambda x: (x[1], x[0]))
|
||||
|
||||
# extract corresponding test results to dev set
|
||||
d = []
|
||||
for exp, lm, lm_w in min_dev_un:
|
||||
c.execute(("SELECT * FROM results WHERE "
|
||||
"dataset='test' AND exp=? AND lm=? AND lm_w=?"),
|
||||
(exp, lm, lm_w))
|
||||
x = c.fetchall()
|
||||
assert (len(x) == 1), "One row should be extracted."
|
||||
d.append(x[0])
|
||||
|
||||
t = Table(data=d, colnames=['exp', 'set', 'LM', 'LMW', 'WER', 'SER'])
|
||||
print str(t)
|
||||
if args.latex:
|
||||
print Table2LatexTable(t)
|
|
@ -0,0 +1,49 @@
|
|||
#!/bin/bash
|
||||
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License. #
|
||||
|
||||
EXP=$1
|
||||
|
||||
# check whether any of the target directories already exist
|
||||
conflict=""
|
||||
for d in $@ ; do
|
||||
if [ -d $d ] || [ -f $d ] ; then
|
||||
conflict="$conflict $d"
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ ! -z "$conflict" ]] ; then
|
||||
echo "Running new experiment will create following directories."
|
||||
echo "Some of them already exists!"
|
||||
echo ""
|
||||
echo "Existing directories:"
|
||||
for d in $conflict ; do
|
||||
echo " $d"
|
||||
done
|
||||
read -p "Should I delete the conflicting directories NOW y/n?"
|
||||
case $REPLY in
|
||||
[Yy]* ) echo "Deleting $conflict directories"; rm -rf $conflict;;
|
||||
* ) echo 'Keeping conflicting directories and exiting ...'; exit 1;;
|
||||
esac
|
||||
fi
|
||||
|
||||
for d in $@ ; do
|
||||
mkdir -p $d
|
||||
done
|
||||
|
||||
# Save the variables set up
|
||||
(set -o posix ; set ) > $EXP/experiment_bash_vars.log
|
||||
# git log -1 > $EXP/alex_gitlog.log
|
||||
# git diff > $EXP/alex_gitdiff.log
|
|
@ -0,0 +1,53 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
|
||||
# 2014 Mff UK, UFAL (modification: Ondrej Platek)
|
||||
# Apache 2.0
|
||||
|
||||
[ -f ./path.sh ] && . ./path.sh
|
||||
|
||||
# begin configuration section.
|
||||
cmd=run.pl
|
||||
min_lmw=9
|
||||
max_lmw=20
|
||||
#end configuration section.
|
||||
|
||||
[ -f ./path.sh ] && . ./path.sh
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
if [ $# -ne 3 ]; then
|
||||
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
|
||||
echo " Options:"
|
||||
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
|
||||
echo " --min_lmw <int> # minumum LM-weight for lattice rescoring "
|
||||
echo " --max_lmw <int> # maximum LM-weight for lattice rescoring "
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
data=$1
|
||||
lang_or_graph=$2
|
||||
dir=$3
|
||||
|
||||
symtab=$lang_or_graph/words.txt
|
||||
|
||||
for f in $symtab $dir/lat.1.gz $data/text; do
|
||||
[ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
|
||||
done
|
||||
|
||||
mkdir -p $dir/scoring/log
|
||||
|
||||
cp $data/text $dir/scoring/test.txt
|
||||
|
||||
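# run.pl/queue.pl expand the LMW=$min_lmw:$max_lmw range below into one job per
# LM weight, writing a separate log and output file for each weight.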
$cmd LMW=$min_lmw:$max_lmw $dir/scoring/log/best_path.LMW.log \
|
||||
lattice-best-path --lm-scale=LMW --word-symbol-table=$symtab \
|
||||
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMW.tra || exit 1;
|
||||
|
||||
$cmd LMW=$min_lmw:$max_lmw $dir/scoring/log/score.LMW.log \
|
||||
cat $dir/scoring/LMW.tra \| \
|
||||
utils/int2sym.pl -f 2- $symtab \| \
|
||||
compute-wer --text --mode=present \
|
||||
ark:$dir/scoring/test.txt ark,p:- ">&" $dir/wer_LMW || exit 1;
|
||||
|
||||
# Show results
|
||||
for f in $dir/wer_*; do echo $f; egrep '(WER)|(SER)' < $f; done
|
||||
|
||||
exit 0;
|
|
@ -0,0 +1,19 @@
|
|||
# Needed for "correct" sorting
|
||||
export LC_ALL=C
|
||||
export KALDI_ROOT=../../..
|
||||
|
||||
# adding Kaldi binaries to path
|
||||
export PATH=$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PWD/utils:$PWD/steps:$PATH
|
||||
|
||||
|
||||
|
||||
srilm_bin=$KALDI_ROOT/tools/srilm/bin/
|
||||
if [ ! -e "$srilm_bin" ] ; then
|
||||
echo "SRILM is not installed in $KALDI_ROOT/tools."
|
||||
echo "May not be able to create LMs!"
|
||||
echo "Please go to $KALDI_ROOT/tools and run ./install_srilm.sh"
|
||||
fi
|
||||
srilm_sub_bin=`find "$srilm_bin" -type d`
|
||||
for d in $srilm_sub_bin ; do
|
||||
export PATH=$d:$PATH
|
||||
done
|
|
@ -0,0 +1,165 @@
|
|||
#!/bin/bash
|
||||
# Copyright Ondrej Platek Apache 2.0
|
||||
renice 20 $$
|
||||
|
||||
# Load training parameters
|
||||
. ./env_voip_cs.sh
|
||||
# Source optional config if exists
|
||||
[ -f env_voip_cs_CUSTOM.sh ] && . ./env_voip_cs_CUSTOM.sh
|
||||
|
||||
. ./path.sh
|
||||
|
||||
# If you have cluster of machines running GridEngine you may want to
|
||||
# change the train and decode commands in the file below
|
||||
. ./cmd.sh
|
||||
|
||||
#######################################################################
|
||||
# Preparing acoustic features, LMs and helper files #
|
||||
#######################################################################
|
||||
|
||||
echo " Copy the configuration files to $EXP directory."
|
||||
local/save_check.sh $EXP $WORK/* || exit 1;
|
||||
|
||||
local/download_cs_data.sh $DATA_ROOT || exit 1;
|
||||
|
||||
local/data_split.sh --every_n $EVERY_N $DATA_ROOT $WORK/local "$LMs" "$TEST_SETS" || exit 1
|
||||
|
||||
local/create_LMs.sh $WORK/local $WORK/local/train/trans.txt \
|
||||
$WORK/local/test/trans.txt $WORK/local/lm "$LMs" || exit 1
|
||||
|
||||
local/prepare_cs_transcription.sh $WORK/local $WORK/local/dict || exit 1
|
||||
|
||||
local/create_phone_lists.sh $WORK/local/dict || exit 1
|
||||
|
||||
utils/prepare_lang.sh $WORK/local/dict '_SIL_' $WORK/local/lang $WORK/lang || exit 1
|
||||
|
||||
local/create_G.sh $WORK/lang "$LMs" $WORK/local/lm $WORK/local/dict/lexicon.txt || exit 1
|
||||
|
||||
echo "Create MFCC features and storing them (Could be large)."
|
||||
for s in train $TEST_SETS ; do
|
||||
steps/make_mfcc.sh --mfcc-config common/mfcc.conf --cmd \
|
||||
"$train_cmd" --nj $njobs $WORK/local/$s $EXP/make_mfcc/$s $WORK/mfcc || exit 1;
|
||||
# Note --fake -> NO CMVN
|
||||
steps/compute_cmvn_stats.sh $fake $WORK/local/$s \
|
||||
$EXP/make_mfcc/$s $WORK/mfcc || exit 1;
|
||||
done
|
||||
|
||||
echo "Decoding is done for each pair (TEST_SET x LMs)"
|
||||
echo "Distribute the links to MFCC feats to all LM variations."
|
||||
cp $WORK/local/train/feats.scp $WORK/train/feats.scp
|
||||
cp $WORK/local/train/cmvn.scp $WORK/train/cmvn.scp
|
||||
for s in $TEST_SETS; do
|
||||
for lm in $LMs; do
|
||||
tgt_dir=${s}_`basename "$lm"`
|
||||
echo "cp $WORK/local/$s/feats.scp $WORK/$tgt_dir/feats.scp"
|
||||
cp $WORK/local/$s/feats.scp $WORK/$tgt_dir/feats.scp
|
||||
echo "cp $WORK/local/$s/cmvn.scp $WORK/$tgt_dir/cmvn.scp"
|
||||
cp $WORK/local/$s/cmvn.scp $WORK/$tgt_dir/cmvn.scp
|
||||
done
|
||||
done
|
||||
|
||||
#######################################################################
|
||||
# Training Acoustic Models #
|
||||
#######################################################################
|
||||
|
||||
echo "Train monophone models on full data -> may be wastefull (can be done on subset)"
|
||||
steps/train_mono.sh --nj $njobs --cmd "$train_cmd" $WORK/train $WORK/lang $EXP/mono || exit 1;
|
||||
|
||||
echo "Get alignments from monophone system."
|
||||
steps/align_si.sh --nj $njobs --cmd "$train_cmd" \
|
||||
$WORK/train $WORK/lang $EXP/mono $EXP/mono_ali || exit 1;
|
||||
|
||||
echo "Train tri1 [first triphone pass]"
|
||||
steps/train_deltas.sh --cmd "$train_cmd" \
|
||||
$pdf $gauss $WORK/train $WORK/lang $EXP/mono_ali $EXP/tri1 || exit 1;
|
||||
|
||||
# draw-tree $WORK/lang/phones.txt $EXP/tri1/tree | dot -Tsvg -Gsize=8,10.5 > graph.svg
|
||||
|
||||
echo "Align tri1"
|
||||
steps/align_si.sh --nj $njobs --cmd "$train_cmd" \
|
||||
--use-graphs true $WORK/train $WORK/lang $EXP/tri1 $EXP/tri1_ali || exit 1;
|
||||
|
||||
echo "Train tri2a [delta+delta-deltas]"
|
||||
steps/train_deltas.sh --cmd "$train_cmd" $pdf $gauss \
|
||||
$WORK/train $WORK/lang $EXP/tri1_ali $EXP/tri2a || exit 1;
|
||||
|
||||
echo "Train tri2b [LDA+MLLT]"
|
||||
steps/train_lda_mllt.sh --cmd "$train_cmd" $pdf $gauss \
|
||||
$WORK/train $WORK/lang $EXP/tri1_ali $EXP/tri2b || exit 1;
|
||||
|
||||
echo "Align all data with LDA+MLLT system (tri2b)"
|
||||
steps/align_si.sh --nj $njobs --cmd "$train_cmd" \
|
||||
--use-graphs true $WORK/train $WORK/lang $EXP/tri2b $EXP/tri2b_ali || exit 1;
|
||||
|
||||
echo "Train MMI on top of LDA+MLLT."
|
||||
steps/make_denlats.sh --nj $njobs --cmd "$train_cmd" \
|
||||
--beam $mmi_beam --lattice-beam $mmi_lat_beam \
|
||||
$WORK/train $WORK/lang $EXP/tri2b $EXP/tri2b_denlats || exit 1;
|
||||
steps/train_mmi.sh $WORK/train $WORK/lang $EXP/tri2b_ali $EXP/tri2b_denlats $EXP/tri2b_mmi || exit 1;
|
||||
|
||||
echo "Train MMI on top of LDA+MLLT with boosting. train_mmi_boost is a e.g. 0.05"
|
||||
steps/train_mmi.sh --boost ${train_mmi_boost} $WORK/train $WORK/lang \
|
||||
$EXP/tri2b_ali $EXP/tri2b_denlats $EXP/tri2b_mmi_b${train_mmi_boost} || exit 1;
|
||||
|
||||
echo "Train MPE."
|
||||
steps/train_mpe.sh $WORK/train $WORK/lang $EXP/tri2b_ali $EXP/tri2b_denlats $EXP/tri2b_mpe || exit 1;
|
||||
|
||||
#######################################################################
|
||||
# Building decoding graph #
|
||||
#######################################################################
|
||||
for lm in $LMs ; do
|
||||
lm=`basename "$lm"`
|
||||
utils/mkgraph.sh --mono $WORK/lang_${lm} $EXP/mono $EXP/mono/graph_${lm} || exit 1
|
||||
utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri1 $EXP/tri1/graph_${lm} || exit 1
|
||||
utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri2a $EXP/tri2a/graph_${lm} || exit 1
|
||||
utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri2b $EXP/tri2b/graph_${lm} || exit 1
|
||||
done
|
||||
|
||||
|
||||
#######################################################################
|
||||
# Decoding #
|
||||
#######################################################################
|
||||
for s in $TEST_SETS ; do
|
||||
for lm in $LMs ; do
|
||||
lm=`basename "$lm"`
|
||||
tgt_dir=${s}_`basename "$lm"`
|
||||
echo "Monophone decoding"
|
||||
# Note: with steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw"
# the scoring is run once for each LM weight in the range,
# and the WERs are afterwards collected (in this case in $EXP/mono/decode/)
|
||||
steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
|
||||
--config common/decode.conf --nj $njobs --cmd "$decode_cmd" \
|
||||
$EXP/mono/graph_${lm} $WORK/${tgt_dir} $EXP/mono/decode_${tgt_dir}
|
||||
echo "Decode tri1"
|
||||
steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
|
||||
--config common/decode.conf --nj $njobs --cmd "$decode_cmd" \
|
||||
$EXP/tri1/graph_${lm} $WORK/$tgt_dir $EXP/tri1/decode_${tgt_dir}
|
||||
echo "Decode tri2a"
|
||||
steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
|
||||
--config common/decode.conf --nj $njobs --cmd "$decode_cmd" \
|
||||
$EXP/tri2a/graph_${lm} $WORK/$tgt_dir $EXP/tri2a/decode_${tgt_dir}
|
||||
echo "Decode tri2b [LDA+MLLT]"
|
||||
steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
|
||||
--config common/decode.conf --nj $njobs --cmd "$decode_cmd" \
|
||||
$EXP/tri2b/graph_${lm} $WORK/$tgt_dir $EXP/tri2b/decode_${tgt_dir}
|
||||
# Note: change --iter option to select the best model. 4.mdl == final.mdl
|
||||
echo "Decode MMI on top of LDA+MLLT."
|
||||
steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
|
||||
--config common/decode.conf --iter 4 --nj $njobs --cmd "$decode_cmd" \
|
||||
$EXP/tri2b/graph_${lm} $WORK/$tgt_dir $EXP/tri2b_mmi/decode_it4_${tgt_dir}
|
||||
echo "Decode MMI on top of LDA+MLLT with boosting. train_mmi_boost is a number e.g. 0.05"
|
||||
steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
|
||||
--config common/decode.conf --iter 4 --nj $njobs --cmd "$decode_cmd" \
|
||||
$EXP/tri2b/graph_${lm} $WORK/$tgt_dir $EXP/tri2b_mmi_b${train_mmi_boost}/decode_it4_${tgt_dir};
|
||||
echo "Decode MPE."
|
||||
steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
|
||||
--config common/decode.conf --iter 4 --nj $njobs --cmd "$decode_cmd" \
|
||||
$EXP/tri2b/graph_${lm} $WORK/$tgt_dir $EXP/tri2b_mpe/decode_it4_${tgt_dir} || exit 1;
|
||||
done
|
||||
done
|
||||
|
||||
|
||||
echo "Successfully trained and evaluated all the experiments"
|
||||
local/results.py $EXP | tee $EXP/results.log
|
||||
|
||||
local/export_models.sh $TGT_MODELS $EXP $WORK/lang
|
|
@ -0,0 +1 @@
|
|||
../../wsj/s5/steps
|
|
@ -0,0 +1 @@
|
|||
../../wsj/s5/utils
|
|
@ -0,0 +1,165 @@
|
|||
Summary
|
||||
-------
|
||||
The data comprise over 41 hours of speech in English.
|
||||
|
||||
The English recordings were collected from humans interacting via telephone
|
||||
calls with statistical dialogue systems, designed to provide the user
|
||||
with information on a suitable dining venue in the town.
|
||||
|
||||
The data collection process is described in detail
|
||||
in article "Free English and Czech telephone speech corpus shared under the CC-BY-SA 3.0 license"
|
||||
published for LREC 2014 (To Appear).
|
||||
|
||||
WE USE COMMON KALDI DECODERS IN THE SCRIPTS (gmm-latgen-faster through steps/decode.sh)
|
||||
However, the main purpose of providing the data and scripts
|
||||
is training acoustic models for real-time speech recognition unit
|
||||
for dialog system ALEX, which uses modified real-time Kaldi OnlineLatgenRecogniser.
|
||||
The modified Kaldi decoders are NOT required for running the scripts!
|
||||
|
||||
The modified OnlineLatgenRecogniser is actively developed at
|
||||
https://github.com/UFAL-DSG/pykaldi/tree/master/src/onl-rec
|
||||
and has a Python wrapper:
|
||||
https://github.com/UFAL-DSG/pykaldi/tree/master/src/pykaldi
|
||||
Note that I am currently moving the online recogniser to:
|
||||
http://sourceforge.net/p/kaldi/code/HEAD/tree/sandbox/oplatek2/
|
||||
|
||||
Credits and license
|
||||
------------------------
|
||||
The scripts are partially based on Voxforge KALDI recipe.
|
||||
The original scripts as well as these scripts are licensed under the Apache 2.0 license.
|
||||
The data are distributed under Attribution-{ShareAlike} 3.0 Unported ({CC} {BY}-{SA} 3.0) license.
|
||||
Czech data: https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-4670-6
|
||||
English data: https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-4671-4
|
||||
|
||||
The data collecting process and development of these training scripts
|
||||
was partly funded by the Ministry of Education, Youth and Sports
|
||||
of the Czech Republic under the grant agreement LK11221
|
||||
and core research funding of Charles University in Prague.
|
||||
For citing, please use the following BibTeX citation:
|
||||
|
||||
@inproceedings{korvas_2014,
|
||||
title={{Free English and Czech telephone speech corpus shared under the CC-BY-SA 3.0 license}},
|
||||
author={Korvas, Mat\v{e}j and Pl\'{a}tek, Ond\v{r}ej and Du\v{s}ek, Ond\v{r}ej and \v{Z}ilka, Luk\'{a}\v{s} and Jur\v{c}\'{i}\v{c}ek, Filip},
|
||||
booktitle={Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2014)},
|
||||
pages={To Appear},
|
||||
year={2014},
|
||||
}
|
||||
|
||||
|
||||
Expected results
|
||||
----------------
|
||||
The expected results were obtained simply by running
bash train_voip_cs.sh OR bash train_voip_en.sh.
Note that you need SRILM installed on your PATH or in the kaldi/tools/ directory!
|
||||
|
||||
build2 - bigram LM from train data, estimated by the scripts using SRILM
|
||||
build0 - zerogram LM from test data, estimated by scripts using Python code
|
||||
LMW - Language model weight, we picked the best from (min_lmw, max_lmw) based on decoding results on DEV set
|
||||
|
||||
Full English data:

exp              set   LM      LMW  WER    SER
mono             test  build0  9    67.52  91.6
tri1             test  build0  10   36.75  74.7
tri2a            test  build0  10   35.8   71.65
tri2b            test  build0  10   32.24  68.35
tri2b_mmi        test  build0  9    24.36  54.5
tri2b_mmi_b0.05  test  build0  9    23.72  53.1
tri2b_mpe        test  build0  10   25.81  59.45
mono             test  build2  14   31.51  64.2
tri1             test  build2  20   15.2   43.55
tri2a            test  build2  16   15.61  43.4
tri2b            test  build2  19   15.27  42.8
tri2b_mmi        test  build2  14   10.2   30.45
tri2b_mmi_b0.05  test  build2  17   10.09  29.85
tri2b_mpe        test  build2  20   15.54  42.2
|
||||
|
||||
Note that the zero-gram LMs for discriminative training
give a significant advantage, because they are estimated on the test set!
|
||||
|
||||
|
||||
Details
|
||||
-------
|
||||
* Requires a Kaldi installation and a Linux environment. (Tested on Ubuntu 10.04, 12.04 and 12.10.)
* The config file s5/env_voip_en.sh sets the data directory,
  the MFCC directory and the experiments directory (see the example after this list).
* Our scripts prepare the data into the expected format in s5/data.
* Experiment files are stored in the $exp directory, e.g. s5/exp.
* The local directory contains data-preparation scripts that prepare the
  lang directory.
* path.sh, cmd.sh and common/* contain configurations for the
  recipe.
* The language model (LM) is either built from the training data using
  [SRILM](http://www.speech.sri.com/projects/srilm/) or supplied by you in
  the ARPA format.
|
||||
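For example, the key directory variables in s5/env_voip_en.sh look roughly like
this (the values shown are the defaults of the Czech variant s5/env_voip_cs.sh
and are illustrative):

    export DATA_ROOT=`pwd`/data   # expects subdirectories train + $TEST_SETS
    export WORK=`pwd`/lang_prep
    export EXP=`pwd`/exp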
|
||||
|
||||
Running experiments
|
||||
-------------------
|
||||
Before running the experiments, check that:
|
||||
|
||||
* you have the Kaldi toolkit compiled:
|
||||
http://sourceforge.net/projects/kaldi/.
|
||||
* you have SRILM compiled. (This is needed for building a language model
|
||||
unless you supply your own LM in the ARPA format.)
|
||||
See http://www.speech.sri.com/projects/srilm/.
|
||||
* the number of parallel jobs njobs is set correctly (njobs is defined in cmd.sh).
* in cmd.sh, you have switched to running the training on an SGE[*] grid if
  required (disabled by default).
|
||||
|
||||
Start the recipe from the s5 directory by running
|
||||
bash run.sh.
|
||||
It will create s5/mfcc, s5/data and s5/exp directories.
|
||||
If any of them exist, it will ask you whether you want them to be overwritten.
|
||||
|
||||
.. [*] Sun Grid Engine
|
||||
|
||||
Extracting the results and trained models
|
||||
-----------------------------------------
|
||||
The main script, s5/run.sh,
performs not only the training of the acoustic models, but also decoding.
The acoustic models are evaluated after the training, and
reports are printed to the standard output.
|
||||
|
||||
The s5/local/results.py exp command extracts the results from the $exp directory
and stores them in exp/results.log.
|
||||
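This mirrors what run.sh itself does at the end of training; for instance (the
-l switch additionally prints a LaTeX version of the table):

    python local/results.py exp -l | tee exp/results.log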
|
||||
If you want to use the trained acoustic model with your language model
|
||||
outside the prepared script, you need to build the HCLG decoding graph yourself.
|
||||
See http://kaldi.sourceforge.net/graph.html for a general introduction to the FST
|
||||
framework in Kaldi.
|
||||
|
||||
The simplest way to start decoding is to use the same LM which
|
||||
was used by the s5/run.sh script.
|
||||
Let's say you want to decode with
|
||||
the acoustic model stored in exp/tri2b_bmmi,
|
||||
then you need the files listed below:
|
||||
|
||||
================================= =====================================================================================
mfcc.conf                         Speech parametrisation (MFCC) settings. Training and decoding setup must match.
exp/tri2b_bmmi/graph/HCLG.fst     Decoding graph. Graph part of AM plus lexicon, phone->3phone & LM representation.
exp/tri2b_bmmi/graph/words.txt    Word symbol table, a mapping between words and integers which are decoded.
exp/tri2b_bmmi/graph/silence.csl  List of phone integer ids which represent silent phones.
exp/tri2b_bmmi/final.mdl          Trained acoustic model (AM).
exp/tri2b_bmmi/final.mat          Trained matrix of feature-space transformations (e.g. LDA and bMMI).
================================= =====================================================================================
|
||||
|
||||
|
||||
We recommend studying the standard Kaldi script steps/decode.sh
for standalone decoding with the gmm-latgen-faster Kaldi decoder.
|
||||
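As an illustration only, a standalone decoding call with the files above might
look roughly as follows. The option values come from common/decode.conf, the
feature pipeline assumes this recipe's no-CMVN (fake CMVN) setup, and wav.scp
and feats.ark are placeholder names, so treat this as a sketch rather than a
verbatim copy of what steps/decode.sh executes:

    # MFCC features matching the training setup, then the LDA+MLLT transform
    compute-mfcc-feats --config=common/mfcc.conf scp:wav.scp ark:- | \
      splice-feats ark:- ark:- | \
      transform-feats exp/tri2b_bmmi/final.mat ark:- ark:feats.ark

    # lattice generation with the common Kaldi decoder
    gmm-latgen-faster --beam=12.0 --lattice-beam=6.0 --max-active=14000 \
      --acoustic-scale=0.0833 --allow-partial=true \
      --word-symbol-table=exp/tri2b_bmmi/graph/words.txt \
      exp/tri2b_bmmi/final.mdl exp/tri2b_bmmi/graph/HCLG.fst \
      ark:feats.ark "ark:|gzip -c > lat.gz"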
|
||||
In order to build your own HCLG decoding graph
you need an LM in the ARPA format and the files in the table below.
|
||||
|
||||
* Note 1: Building the HCLG decoding graph is out of the scope of this README.
* Note 2: Each acoustic model needs a corresponding HCLG graph.
* Note 3: The phonetic dictionary applied to the vocabulary
  should always generate only a subset of the phones seen in the training data!
|
||||
|
||||
=============================== =========================================================================
LM.arpa                         Language model in the ARPA format [You should supply it]
vocabulary.txt                  List of words you want to decode [You should supply it]
OOV_SYMBOL                      String representing an out-of-vocabulary word [You should supply it]
dictionary.txt                  Phonetic dictionary [You should supply it]
exp/tri2b_bmmi/final.mdl        Trained acoustic model (AM).
exp/tri2b_bmmi/final.tree       Phonetic decision tree.
=============================== =========================================================================
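A rough sketch of the graph-building steps, following what this recipe itself
does in run.sh (the directory layout and the LM name are illustrative):

    # build the lang directory from the phonetic dictionary
    utils/prepare_lang.sh data/local/dict '_SIL_' data/local/lang data/lang

    # compile G.fst from the ARPA LM; local/create_G.sh wraps the
    # arpa2fst pipeline used by this recipe
    local/create_G.sh data/lang "LM.arpa" data/local/lm data/local/dict/lexicon.txt

    # compose the HCLG graph for a given acoustic model
    utils/mkgraph.sh data/lang_LM.arpa exp/tri2b_bmmi exp/tri2b_bmmi/graph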
|
|
@ -0,0 +1 @@
|
|||
../vystadial_cz/online_demo/
|
|
@ -0,0 +1,22 @@
|
|||
# "queue.pl" uses qsub. The options to it are
|
||||
# options to qsub. If you have GridEngine installed,
|
||||
# change this to a queue you have access to.
|
||||
# Otherwise, use "run.pl", which will run jobs locally
|
||||
# (make sure your --num-jobs options are no more than
|
||||
# the number of cpus on your machine.
|
||||
|
||||
#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
|
||||
#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
|
||||
# export train_cmd="queue.pl -l mf=5g"
|
||||
# export decode_cmd="queue.pl -l mf=5g"
|
||||
export train_cmd="queue.pl -l arch=*64*"
|
||||
export decode_cmd="queue.pl -l arch=*64*"
|
||||
|
||||
# The number of parallel jobs to be started for some parts of the recipe
|
||||
# Make sure you have enough resources (CPUs and RAM) to accommodate this number of jobs
|
||||
njobs=20
|
||||
|
||||
# If you have no GridEngine you can do:
|
||||
#export train_cmd=run.pl
|
||||
#export decode_cmd=run.pl
|
||||
#njobs=2
|
|
@ -0,0 +1,254 @@
|
|||
A.M. EY EH M
|
||||
AQUIRE AE K W AY R
|
||||
ABOUTS AH B AW T S
|
||||
ADC AE D K
|
||||
ADDEN AE D EH N
|
||||
ADDENBROOKE AE D EH N B R AH K AH
|
||||
ADDENBROOKE'S AE D EH N B R AH K AH Z
|
||||
AHAR AE AH R
|
||||
ALIMENTUM AE L AH M EH N T AH M
|
||||
ALLENBELL AO L AH N B EH L
|
||||
ALPHA-MILTON AE L F AH M IH L T AH N
|
||||
APOLOGISE AH P AA L AH JH AY Z
|
||||
ARBURY AA R B Y AH R IY
|
||||
ASWELL AH S W EH L
|
||||
AUNTIE'S AE N T IY Z
|
||||
AYLESBRAY EY L EH Z R EY IY
|
||||
BACKSTREET B AE K S T R IY T
|
||||
BADMINTONCOURT B AE D M IH N T AH N K OW R T
|
||||
BEACONSFIELD B IY K AH N Z F IY L D
|
||||
BENE'T B AH N EH T
|
||||
BENET B EH N AH T
|
||||
BENET(2) B AH N EY
|
||||
BENETS B IH N EH T S
|
||||
BENNY'S B EH N IY Z
|
||||
BINH B IH N
|
||||
BLD'S B IY EH L D IY S
|
||||
BLU B L UW
|
||||
BOCHKA B AA CH AH
|
||||
BOCHKAS B AA CH AH Z
|
||||
BOLCHOI B OW L CH OY
|
||||
BOSPHORUS B AA S AH R AH S
|
||||
BROOKFIELDS B R UH K F IY L D Z
|
||||
BUDGENS B AH JH AH N Z
|
||||
BURRELL'S B ER EH L Z
|
||||
BURWASH B AH R W AH SH
|
||||
CAFE-BAR K AH F EY B AA R
|
||||
CAIPIRINHA K EY P AH R IH N HH AH
|
||||
CAIPIRINHAS K EY P AH R IH N HH AH Z
|
||||
CAIUS K EY AH S
|
||||
CAMBOURNE K AE M B OW R N
|
||||
CARLUCCIO'S K AA R L AH K S IY AH Z
|
||||
CARNABY K AA R N AE B IY
|
||||
CARRINGTONS K AA R IH NG T AH N Z
|
||||
CASTLEHILL K AE S AH L HH IH L
|
||||
CATHARINE'S K AE TH R IH N Z
|
||||
CAZIMIR K AE Z AH M AH R
|
||||
CERTIFICATED S ER T IH F IH K AH T IH D
|
||||
CHEAPISH CH IY P IH SH
|
||||
CHERSTON CH EH R S T AH N
|
||||
CHESTERON CH EH S T ER R AH N
|
||||
CHERRYHINTON CH EH R IY HH IH N T AH N
|
||||
CHERRYWOOD CH EH R IY W UH D
|
||||
CHIQUITO CH IH K W AH T OW
|
||||
CHIQUITAS CH IH K W AH T AH Z
|
||||
CINEWORLD S IH N Y UW AH R L D
|
||||
CITYROOMZ S IH T AH R UW M Z
|
||||
COBBLES K AA B AH L Z
|
||||
COLADA K OW L AE D AH
|
||||
COLDHAM K OW L D HH AE M
|
||||
COLDHAM'S K OW L D HH AE M Z
|
||||
COLDHAMS K OW L D HH AE M Z
|
||||
COLOURS K AH L ER Z
|
||||
COMPETITVE K AA M P EH T AH T V
|
||||
COMPETITVE K AA M P EH T AH T V
|
||||
CRICKETERS K R IH K AH T ER Z
|
||||
CROSSWAYS K R AO S W EY Z
|
||||
CUCINA K Y UW S AH N AH
|
||||
CURY K Y UH R IY
|
||||
D'ARRY'S D AE R AH Z
|
||||
DOJO D OW JH OW
|
||||
EFES IY F S
|
||||
EMBARASSING IH M B AA R AE S IH NG
|
||||
ENSUITE EH N S W IY T
|
||||
ENSUITES EH N S W IY T S
|
||||
ERAINA AH R EY N AH
|
||||
ERM AH R M
|
||||
EUROS Y UW R OW Z
|
||||
FAMISHED F AE M IH SH T
|
||||
FEN F EH N
|
||||
FENDITTON F EH N D IH T AH N
|
||||
FENNER'S F EH N ER Z
|
||||
FITZBILLIES F IH T S B IH L AY Z
|
||||
FIZZY F IH Z IY
|
||||
FOODWELL F UW D W EH L
|
||||
FUSIONS F Y UW ZH AH N Z
|
||||
FREUNDE F R UW N D
|
||||
FYNE F AY N
|
||||
GARONDE G AE R AH N D
|
||||
GELDART JH EH L D AA R T
|
||||
GIRTONS G ER1 T AH0 N Z
|
||||
GOG G AA G
|
||||
GONVILLE G AA N V AH L
|
||||
GPS JH IY P IY S
|
||||
GRANTA G R AE N T AH
|
||||
GUINNESSES G UW N EH S AH Z
|
||||
GWYDIR G W IH D AH R
|
||||
GWYDIR G W IH D AH R
|
||||
HAHA HH AE HH AH
|
||||
HAKKA HH AE K AH
|
||||
HALLO HH AE L OW
|
||||
HAUXTON HH AO K S T AH N
|
||||
HEDGE'S HH EH JH IH Z
|
||||
HENRYS HH EH N R IY Z
|
||||
HILARYS HH AY L ER IY Z
|
||||
HINCHINBROOK HH IH NG K IH N B R AH K
|
||||
HIPPY HH IH P IY
|
||||
HISTON HH IH S T AH N
|
||||
HIYA HH AY AY AH
|
||||
HOBSONS HH AA B S AH N Z
|
||||
HOMERTON HH OW M ER T AH N
|
||||
HOPBINE HH AA P B AY N
|
||||
HOTPOT HH AA T P AA T
|
||||
HOUNSLOW HH AW N S L OW
|
||||
HSBC EY CH EH S B IY K
|
||||
HUMBERSTONE HH AH M B IY ER S T OW N
|
||||
ITEMISED AY T AH M IY Z D
|
||||
JASONS JH AE S AH N Z
|
||||
JASONVILLE JH AH S AA N V AH L
|
||||
JEDBURGH JH EH D AH R
|
||||
JINLING JH AY N L AH NG
|
||||
KETTLE'S K EH T AH L Z
|
||||
KILOMETRE K IH L AO M AH T ER
|
||||
KILOMETRE(2) K IH L AH M IY T ER
|
||||
KILOMETRE K IH L AO M AH T ER
|
||||
KILOMETRE(2) K IH L AH M IY T ER
|
||||
KILOMETRE K IH L AO M AH T ER
|
||||
KILOMETRE(2) K IH L AH M IY T ER
|
||||
KILOMETRES K IH L AO M AH T ER Z
|
||||
KILOMETRES(2) K IH L AH M IY T ER Z
|
||||
KINGSHEDGES K IH NG Z HH EH JH IH Z
|
||||
KINK K IH NG K
|
||||
KOHINOOR K AH HH IH N OW AH R
|
||||
KYMMOY K IH M OW IY
|
||||
LABELLING L EY B AH L IH NG
|
||||
LABELLING L EY B AH L IH NG
|
||||
LAGERS L AA G ER Z
|
||||
LENSFIELD L EH N Z F IY L D
|
||||
LIVINGSTONES L IH V IH NG S T OW N Z
|
||||
MAGOG M AE G AH G
|
||||
MAHARAJAH M AH HH AE R AH JH AH
|
||||
MAINCOURSE M EY N K OW R S
|
||||
MARGHERITA M AH R EH R AH T AH
|
||||
MASSARO'S M AH S AA R OW Z
|
||||
MASSIMO'S M AE S IH M OW Z
|
||||
MATHS M AE TH S
|
||||
MAYPOLE M EY P OW L
|
||||
MEGHNA M EH N AH
|
||||
MERCERS M ER S ER Z
|
||||
METRES M IY T ER Z
|
||||
METROPOLE M EH T R OW P OW L
|
||||
METROPOLE M EH T R OW P OW L
|
||||
MEZE M IY Z
|
||||
MICHAELHOUSE M IH CH IY L HH AW Z
|
||||
MID-PRICED M IH D P R AY S T
|
||||
MIDDLEEASTERN M IH D AH L IY S T ER N
|
||||
MITCHUM'S M IH CH AH M Z
|
||||
MULTISTOREY M AH L T AY S T OW R IY
|
||||
NANDO'S N AE N D OW Z
|
||||
NANDOS N AE N D AH Z
|
||||
NAZ N AE Z
|
||||
NEATH N IY TH
|
||||
NEWCHESTERTON N UW CH EH S T ER T AH N
|
||||
NEWNHAM N Y UW N HH AH M
|
||||
NHS EH N EY CH S
|
||||
NU-MEGA N UW M EH G AH
|
||||
NUSHA N AH SH AH
|
||||
P.M. P IY EH M
|
||||
PANAHAR P AE N AH AH R
|
||||
PASTY P EY S T IY
|
||||
PATISSERIE P EY T IY Z ER IY
|
||||
PATISSIER P EY T IY Z IY ER
|
||||
PEKINGS P IY1 K IH1 NG Z
|
||||
PERNE P AH R N IY
|
||||
PICTUREHOUSE P IH K CH ER HH AW S
|
||||
PIPASHA P IH P AH SH AH
|
||||
PLAINING P L EY N IH NG
|
||||
PLC P IY EH L K
|
||||
POLONIA P AH L OW N Y AH
|
||||
PREZZO P R IY Z OW
|
||||
PRICERANGE P R AY S R EY N JH
|
||||
PUNTING P AH N T IH NG
|
||||
QUAYSIDE K W EY S AY D
|
||||
QUEENS' K W IY N Z
|
||||
RADEGUND R AE D AH G AH N D
|
||||
ROMWELL R AA M W EH L
|
||||
ROSCO R AO S K OW
|
||||
REALISED R IY L IY Z D
|
||||
RECOGNISE R IH K AH N AY Z
|
||||
RECOGNISED R AH K AA G N AH S T
|
||||
RECOGNISING R IH K AH N AY Z AH NG
|
||||
ROMSEY R AA M Z IY
|
||||
ROSEFORD R OW Z F OW R D
|
||||
RUGBYFIELD R AH G B AY F IY L D
|
||||
SAINSBURY'S S EY N S B EH R IY Z
|
||||
SAVINO'S S AA V IY N OW Z
|
||||
SAVOURY S AH V AW R IY
|
||||
SECS S EH K S
|
||||
SERGU S AH R G Y UW
|
||||
SHADDAI SH AE D EY
|
||||
SHELFORD SH EH L F AH R D
|
||||
SHIRAZ SH IH R AH Z
|
||||
SIDGWICK S IH JH W AH K
|
||||
SLEEPERZ S L IY P AH R Z
|
||||
SNAKATAK S N AE K AH T AH K
|
||||
SORRENTO S AO R EH N T OW
|
||||
SPELT S P EH L T
|
||||
SQUASHCOURT S K W AA SH K OW R T
|
||||
ST. S EY N T
|
||||
STANSTED S T AE N S T AH D
|
||||
STAZIONE S T EY Z IY OW N
|
||||
STOREY'S S T AO R IY Z
|
||||
STOUTS S T AW T S
|
||||
STOW-CUM-QUY S T OW K AH M K W IY
|
||||
STURTON S T AH R T AH N
|
||||
SWIMMINGPOOL S W IH M IH NG P UW L
|
||||
TAKEAWAY T EY K AH W EY
|
||||
TANDOORI T AE N D AH R IY
|
||||
TARIF T AE R AH F
|
||||
TATTIES T AE T IY Z
|
||||
TENISON T EH N IY Z AO N
|
||||
TEXMEX T EH K S M EH K S
|
||||
TEQUILAS T EH K W AH L AH Z
|
||||
TERI-AKI T EH R IY AA K IY
|
||||
THANKYOU TH AE NG K Y UW
|
||||
THATS DH AE T S
|
||||
THODAY TH AA D EY IY
|
||||
THOROUGHBREADS TH ER OW B R EH D Z
|
||||
TIMINGS T AY M IH NG Z
|
||||
TOURISTY T UH R IH S T IY
|
||||
TRAVELLER'S T R AE V AH L ER Z
|
||||
TROCKEL T R AA K AH L
|
||||
TRUMPINGTON T R AH M P IH NG T AH N
|
||||
ULMANN AH L M AH N
|
||||
UPTO AH P T OW
|
||||
VENUE'S V EH N Y UW Z
|
||||
WAGAMAMA W AH G AE M AH M AH
|
||||
WARKWORTH W AO R K W AH R TH
|
||||
WATERBEACH W AO T ER B IY CH
|
||||
WI-FI W IY F AY
|
||||
WIFI W IY F AY
|
||||
WHEATSHEAF W IY T SH IY F
|
||||
WHEREABOUT W IY R AH B AW T
|
||||
WOLLASTON W AA L AE S T AH N
|
||||
ZIZZI Z IH Z IY
|
||||
BARNOBUS B AA R N AH B AH S
|
||||
MARTIN'S M AA R T AH N Z
|
||||
ROUTE R UW T
|
||||
ROUTE(2) R AW T
|
||||
ROUTES R UW T S
|
||||
ROUTES(2) R AW T S
|
||||
ROUTES(3) R UH T S
|
||||
TERRANIAN T EH R EY N IY AH N
|
||||
YIPEE Y IY P IY
|
||||
UK Y K EY
|
|
@ -0,0 +1,3 @@
|
|||
beam=12.0
|
||||
latbeam=6.0
|
||||
max_active=14000
|
|
@ -0,0 +1,7 @@
|
|||
# --use-energy=false # non-default option. False -> use C0 instead of energy
# NUMCEPS in HTK excludes C0: 12 there, 13 here (the default)
|
||||
--low-freq=125
|
||||
--high-freq=3800
|
||||
# --htk-compat
|
||||
# --remove-dc-offset # equivalent to ZMEANSOURCE in HTK
|
||||
# --subtract-mean # not recommended to do it this way
|
|
@ -0,0 +1,42 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Every EVERY_N-th utterance is used for training
# EVERY_N=3 -> we use one third of the training data
|
||||
export EVERY_N=1
|
||||
export TEST_SETS="dev test"
|
||||
|
||||
# Directories set up
|
||||
export DATA_ROOT=`pwd`/data # expects subdirectories train + $TEST_SETS
|
||||
export WORK=`pwd`/lang_prep
|
||||
export EXP=`pwd`/exp
|
||||
export TGT_MODELS=trained_models
|
||||
|
||||
# Specify paths to ARPA models. Paths may not contain spaces.
# Specify build0, build1, build2, ... for building a (zero|uni|bi)-gram LM.
# Note: The LM file name should not contain an underscore "_"!
# Otherwise the results would be reported under a wrong LM name.
|
||||
export LMs="build0 build2"
|
||||
|
||||
# Use path to prebuilt dictionary or 'build' command in order to build dictionary
|
||||
# export DICTIONARY="../../resources/lm/caminfo/dict"
|
||||
export DICTIONARY="build"
|
||||
|
||||
|
||||
# Borders for estimating LM model weight.
|
||||
# LMW is tuned on development set and applied on test set.
|
||||
export min_lmw=9
|
||||
export max_lmw=20
|
||||
|
||||
# Number of states for phoneme training
|
||||
export pdf=1200
|
||||
|
||||
# Maximum number of Gaussians used for training
|
||||
export gauss=19200
|
||||
|
||||
export train_mmi_boost=0.05
|
||||
|
||||
export mmi_beam=16.0
|
||||
export mmi_lat_beam=10.0
|
||||
|
||||
# --fake -> NO CMVN; empty -> CMVN (pykaldi decoders cannot handle CMVN -> fake)
|
||||
export fake="--fake"
|
|
@ -0,0 +1,41 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License. #
|
||||
|
||||
expdir=$1; shift
|
||||
tgtdir=$1; shift  # after the first shift the second argument has become $1
|
||||
|
||||
tgtdir="$tgtdir/$name"
|
||||
date="`date +%F_%T.%N`"
|
||||
|
||||
if [[ -d $tgtdir || -f $tgtdir ]] ; then
|
||||
tgtdir="$tgtdir/backup_$date"
|
||||
fi
|
||||
|
||||
|
||||
# This is an EXAMPLE SCRIPT; you are ENCOURAGED to CHANGE IT!
|
||||
|
||||
mkdir -p "$tgtdir"
|
||||
cp -rf $expdir "$tgtdir"
|
||||
|
||||
# Collect the results
|
||||
|
||||
local/results.py $EXP > "$tgtdir"/results.log
|
||||
echo "Date: $date" >> "$tgtdir"/results.log
|
||||
size=`du -hs "$tgtdir"`
|
||||
echo "Size of backup: $size" >> "$tgtdir"/results.log
|
||||
|
||||
echo; echo "DATA successfully copied to $tgtdir"; echo
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2012 Vassil Panayotov
|
||||
# 2013 Ondrej Platek
|
||||
# Apache 2.0
|
||||
|
||||
echo "===test_sets Formating data ..."
|
||||
langdir=$1; shift
|
||||
LMs=$1; shift
|
||||
lmdir=$1; shift
|
||||
lexicon=$1; shift
|
||||
|
||||
# Next, for each type of language model, create the corresponding FST
|
||||
# and the corresponding lang_test_* directory.
|
||||
for lm in $LMs ; do
|
||||
tgt=${langdir}_`basename "$lm"`
|
||||
lmp=$lmdir/`basename $lm`
|
||||
|
||||
tmpdir=$tgt/tmp
|
||||
mkdir -p $tgt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..."
|
||||
|
||||
for f in phones.txt words.txt L.fst L_disambig.fst phones ; do
|
||||
ln -s $langdir/$f $tgt/$f 2> /dev/null
|
||||
done
|
||||
|
||||
cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at begin/end of utt. These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
|
||||
cat $lmp | \
|
||||
grep -v '<s> <s>\|</s> <s>\|</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \
|
||||
--osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon > $tgt/G.fst
|
||||
fstisstochastic $tgt/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
||||
# nonzero because the backoff weights make the states sum to >1).
|
||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
||||
# as close to zero as it could be.
|
||||
|
||||
# Everything below is only for diagnostic.
|
||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
||||
# this might cause determinization failure of CLG.
|
||||
# #0 is treated as an empty word.
|
||||
mkdir -p $tmpdir/g
|
||||
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
|
||||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
||||
fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \
|
||||
$tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
|
||||
# rm -rf $tmpdir # TODO debugging
|
||||
echo "*** Succeeded in creating G.fst for $tgt"
|
||||
|
||||
done # for lm in $LMs ; do
|
|
@ -0,0 +1,89 @@
|
#!/bin/bash
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

locdata=$1; shift
train_text=$1; shift
test_text=$1; shift
local_lm=$1; shift
lms=$1; shift

mkdir -p $local_lm

echo "=== Preparing the LM ..."

function build_0gram {
    transcr=$1; lm=$2  # assign the arguments before using them in the message
    echo "=== Building zerogram $lm from ${transcr} ..."
    cut -d' ' -f2- $transcr | tr ' ' '\n' | sort -u > $lm
    echo "<s>" >> $lm
    echo "</s>" >> $lm
    python -c """
import math
with open('$lm', 'r+') as f:
    lines = f.readlines()
    p = math.log10(1 / float(len(lines)))
    lines = ['%f\\t%s' % (p, l) for l in lines]
    f.seek(0)
    f.write('\\n\\\\data\\\\\\nngram 1= %d\\n\\n\\\\1-grams:\\n' % len(lines))
    f.write(''.join(lines) + '\\\\end\\\\')
"""
}
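# For illustration only: given a hypothetical transcript list containing the
# words "ahoj", "den" and "dobry" (5 lines once <s> and </s> are appended),
# build_0gram produces a uniform unigram ARPA file along these lines:
#
#   \data\
#   ngram 1= 5
#
#   \1-grams:
#   -0.698970	ahoj
#   -0.698970	den
#   -0.698970	dobry
#   -0.698970	<s>
#   -0.698970	</s>
#   \end\
#
# (log10(1/5) = -0.69897, i.e. every symbol gets the same probability.)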
for lm in $lms ; do
    lm_base=`basename $lm`
    if [ ${lm_base%[0-9]} != 'build' ] ; then  # was %[0-6]; [0-9] matches all buildN names
        cp $lm $local_lm
    else
        # We will build the LM 'build[0-9]' ourselves.
        lm_order=${lm_base#build}

        echo "=== Building LM of order ${lm_order} ..."
        if [ $lm_order -eq 0 ] ; then
            echo "Zerogram $lm_base LM is built from text: $test_text"
            cut -d' ' -f2- $test_text | sed -e 's:^:<s> :' -e 's:$: </s>:' | \
                sort -u > $locdata/lm_test.txt
            build_0gram $locdata/lm_test.txt $local_lm/${lm_base}
        else
            echo "LM $lm_base is built from text: $train_text"
            cut -d' ' -f2- $train_text | sed -e 's:^:<s> :' -e 's:$: </s>:' | \
                sort -u > $locdata/lm_train.txt
            ngram-count -text $locdata/lm_train.txt -order ${lm_order} \
                -wbdiscount -interpolate -lm $local_lm/${lm_base}
        fi
    fi
done
echo "*** LMs preparation finished!"

echo "=== Preparing the vocabulary ..."

if [ "$DICTIONARY" == "build" ]; then
    echo; echo "Building dictionary from train data"; echo
    cut -d' ' -f2- $train_text | tr ' ' '\n' > $locdata/vocab-full-raw.txt
else
    echo; echo "Using predefined dictionary: ${DICTIONARY}"
    echo "Throwing away the first 2 rows."; echo
    tail -n +3 $DICTIONARY | cut -f 1 > $locdata/vocab-full-raw.txt
fi

echo '</s>' >> $locdata/vocab-full-raw.txt
echo "Removing _NOISE_ and all other '_'-marked words when building vocab-full.txt"
cat $locdata/vocab-full-raw.txt | grep -v '_' | \
    sort -u > $locdata/vocab-full.txt
echo "*** Vocabulary preparation finished!"

echo "Removing _NOISE_ and all other '_'-marked words when building vocab-test.txt"
# Note: the original used "cut -f2", which kept only the first word of each
# transcription; "-f2-" takes the whole transcription.
cut -d' ' -f2- $test_text | tr ' ' '\n' | grep -v '_' | sort -u > $locdata/vocab-test.txt
@ -0,0 +1,54 @@
#!/bin/bash
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# The vystadial data are specific in having the following marks in the
# transcriptions:
# _INHALE_
# _LAUGH_
# _EHM_HMM_
# _NOISE_
# _SIL_

locdict=$1; shift

echo "--- Preparing nonsilence phone lists ..."
# We assume the lexicon contains only nonsilence phones at this point.
awk '{for(n=2;n<=NF;n++) { p[$n]=1; }} END{for(x in p) {print x}}' \
    $locdict/lexicon.txt | sort > $locdict/nonsilence_phones.txt

echo "--- Adding silence phones to lexicon ..."
echo "_SIL_ SIL" >> $locdict/lexicon.txt
echo "_EHM_HMM_ EHM" >> $locdict/lexicon.txt
echo "_INHALE_ INH" >> $locdict/lexicon.txt
echo "_LAUGH_ LAU" >> $locdict/lexicon.txt
echo "_NOISE_ NOI" >> $locdict/lexicon.txt

echo "--- Sorting lexicon in place ..."
sort $locdict/lexicon.txt -o $locdict/lexicon.txt

echo "--- Preparing silence phone lists ..."
echo SIL > $locdict/silence_phones.txt
echo EHM >> $locdict/silence_phones.txt
echo INH >> $locdict/silence_phones.txt
echo LAU >> $locdict/silence_phones.txt
echo NOI >> $locdict/silence_phones.txt

echo SIL > $locdict/optional_silence.txt

# Some downstream scripts expect this file to exist, even if empty.
touch $locdict/extra_questions.txt

echo "*** Creating phone lists finished!"
@ -0,0 +1,18 @@
#!/bin/bash
# Example usage:
#   ./local/create_sample.sh /ha/projects/vystadial/data/asr/en/voip/ Results/vystadial-sample/ test 100
# Note that it assumes the source directory contains only *.wav and *.wav.trn
# files; n is the number of files to copy.

src=$1
tgt=$2
typ=$3  # dev/test/train
n=$4

src_dir=$src/$typ
tgt_dir=$tgt/$typ
mkdir -p $tgt_dir
ls $src_dir | head -n $n \
    | while read f ; do
        cp $src_dir/$f $tgt_dir
    done
@ -0,0 +1,108 @@
#!/bin/bash
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
#
# Makes train/test splits.
# local/voxforge_data_prep.sh --nspk_test ${nspk_test} ${SELECTED} || exit 1
# Creates the files (TYPE=train|test):
# a) ${TYPE}_trans.txt: ID transcription (capitalized! no punctuation)
# b) ${TYPE}_wav.scp: ID path2ID.wav
# c) $TYPE.utt2spk: ID-recording ID-speaker
# d) $TYPE.spk2utt
# e) $TYPE.spk2gender: all speakers are male
# We have ID-recording = ID-speaker; see the sketch of the produced files below.
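# For illustration, a hypothetical recording jurcic_001.wav yields the entries
# (the utterance ID doubles as the speaker ID):
#   wav.scp:    jurcic_001.wav /some/path/train/jurcic_001.wav
#   utt2spk:    jurcic_001.wav jurcic_001.wav
#   spk2utt:    jurcic_001.wav jurcic_001.wav
#   trans.txt:  jurcic_001.wav DOBRY DEN
#   spk2gender: jurcic_001.wav M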
# The vystadial data are specific in having the following marks in the
# transcriptions:
# _INHALE_
# _LAUGH_
# _EHM_HMM_
# _NOISE_
# _SIL_

# renice 20 $$

every_n=1

[ -f path.sh ] && . ./path.sh  # source the path
. utils/parse_options.sh || exit 1;

if [ $# -ne 4 ] ; then
    # The original usage line also listed a <tgt-dir> argument, but only 4
    # positional arguments are accepted by this script.
    echo "Usage: local/data_split.sh [--every-n 30] <data-directory> <local-directory> <LMs> <test-sets>";
    exit 1;
fi

DATA=$1; shift
locdata=$1; shift
LMs=$1; shift
test_sets=$1; shift
# NOTE: the original also read "tgt_dir=$1; shift" here, but a fifth argument
# is never passed and tgt_dir is reassigned below before any use.

echo "LMs $LMs test_sets $test_sets"

echo "=== Starting initial Vystadial data preparation ..."
echo "--- Making test/train data split from $DATA taking every $every_n recording ..."

mkdir -p $locdata

i=0
for s in $test_sets train ; do
    mkdir -p $locdata/$s
    ls $DATA/$s/ | sed -n /.*wav$/p |\
    while read wav ; do
        ((i++))  # bash specific
        if [[ $i -ge $every_n ]] ; then
            i=0
            pwav=$DATA/$s/$wav
            trn=`cat $DATA/$s/$wav.trn`
            echo "$wav $pwav" >> $locdata/$s/wav.scp
            echo "$wav $wav" >> $locdata/$s/utt2spk
            echo "$wav $wav" >> $locdata/$s/spk2utt
            echo "$wav $trn" >> $locdata/$s/trans.txt
            # Ignoring gender -> label all recordings as male
            echo "$wav M" >> $locdata/spk2gender
        fi
    done  # while read wav

    for f in wav.scp utt2spk spk2utt trans.txt ; do
        sort "$locdata/$s/$f" -k1 -u -o "$locdata/$s/$f"  # sort in place
    done  # for f

done  # for s in $test_sets train

echo "Speakers are in a 1:1 relation with utterances (spk in $test_sets AND train); sorting spk2gender in place"
sort "$locdata/spk2gender" -k1 -o "$locdata/spk2gender"

echo "--- Distributing the file lists to train and ($test_sets x $LMs) directories ..."
mkdir -p $WORK/train
cp $locdata/train/wav.scp $WORK/train/wav.scp || exit 1;
cp $locdata/train/trans.txt $WORK/train/text || exit 1;
cp $locdata/train/spk2utt $WORK/train/spk2utt || exit 1;
cp $locdata/train/utt2spk $WORK/train/utt2spk || exit 1;
utils/filter_scp.pl $WORK/train/spk2utt $locdata/spk2gender > $WORK/train/spk2gender || exit 1;

for s in $test_sets ; do
    for lm in $LMs; do
        tgt_dir=$WORK/${s}_`basename ${lm}`
        mkdir -p $tgt_dir
        cp $locdata/${s}/wav.scp $tgt_dir/wav.scp || exit 1;
        cp $locdata/${s}/trans.txt $tgt_dir/text || exit 1;
        cp $locdata/${s}/spk2utt $tgt_dir/spk2utt || exit 1;
        cp $locdata/${s}/utt2spk $tgt_dir/utt2spk || exit 1;
        utils/filter_scp.pl $tgt_dir/spk2utt $locdata/spk2gender > $tgt_dir/spk2gender || exit 1;
    done
done
@ -0,0 +1,46 @@
#!/bin/bash
# Copyright Ondrej Platek Apache 2.0

DATA_ROOT=$1

url=https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0023-4671-4/data_voip_en.tgz
name=data_voip_en
extract_file=205859

mkdir -p $DATA_ROOT

if [ ! -f $DATA_ROOT/${name}.tgz ] ; then
    wget $url -O $DATA_ROOT/${name}.tgz || exit 1
    echo "Data successfully downloaded"
fi

if [[ ! -d $DATA_ROOT/$name && -e $DATA_ROOT/$name ]] ; then
    echo "$DATA_ROOT/$name is not a directory; we cannot extract the data!"
    exit 1;
fi

if [ ! -d $DATA_ROOT/$name ] ; then
    mkdir $DATA_ROOT/$name
    tar xfv $DATA_ROOT/${name}.tgz -C $DATA_ROOT | \
    while read line; do
        x=$((x+1))
        echo -en "$x extracted from $extract_file files.\r"
    done
fi

if [ -d $DATA_ROOT/$name ] ; then
    echo "Checking if the data extracted correctly"
    num_files=`find $DATA_ROOT/$name -name '*' | wc -l`
    if [ ! $num_files -eq $extract_file ] ; then
        echo "Data extraction failed! Extracted $num_files instead of $extract_file files"
        exit 1;
    fi
    echo "It seems that the data were extracted correctly"
fi

pushd $DATA_ROOT
for t in test train dev ; do
    ln -s $name/$t
done
ln -s $name/arpa_bigram arpa-bigram
popd
@ -0,0 +1,50 @@
#!/bin/bash

# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

tgt=$1; shift
exp=$1; shift
lang=$1; shift

mkdir -p $tgt

echo "--- Exporting models to $tgt ..."

# See local/save_check.sh, which saves the settings at the beginning, for details.
# cp -f $exp/alex_gitlog.log $exp/alex_gitdiff.log $exp/experiment_bash_vars.log $tgt
cp -f $exp/experiment_bash_vars.log $tgt

# Also store the results
cp -f $exp/results.log $tgt/results.log

cp -f common/mfcc.conf $tgt

cp -f $exp/tri2a/final.mdl $tgt/tri2a.mdl
cp -f $exp/tri2a/tree $tgt/tri2a.tree

cp -f $exp/tri2b/final.mdl $tgt/tri2b.mdl
cp -f $exp/tri2b/tree $tgt/tri2b.tree
cp -f $exp/tri2b/final.mat $tgt/tri2b.mat

cp -f $exp/tri2b_mmi_b*/final.mdl $tgt/tri2b_bmmi.mdl
cp -f $exp/tri2b/tree $tgt/tri2b_bmmi.tree
cp -f $exp/tri2b_mmi_b*/final.mat $tgt/tri2b_bmmi.mat

cp -f $lang/phones.txt $lang/phones/silence.csl $tgt

# FIXME: do I need splice_opts for something?
@ -0,0 +1,172 @@
#!perl -w

#
# ====================================================================
# Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
# Rudnicky. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# This work was supported in part by funding from the Defense Advanced
# Research Projects Agency, the Office of Naval Research and the National
# Science Foundation of the United States of America, and by member
# companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
# the contributions of many volunteers to the expansion and improvement of
# this dictionary.
#
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# ====================================================================
#

# [20050309] (air) Created.
# Strips out stress marks from a cmudict, producing a "SphinxPhones_40" dictionary.
# [20080420] (air) Changed to pass comments.
#            Fixed output collation sequence; DOS eol's.
# [20090309] (air) Fixed duplicate pron and collation bugs.
# [20090331] (air) Restored standard collation order (since other stuff depends on it).
# [20090629] (air) Do not put comments into the SPHINX_40 version; not all software deals with them.
# [20100118] (air) Added $VERBOSE; this should really be a cmdline flag...
#

$VERBOSE = 0;

my $basecount = 0;
my $dupl = 0;
my $base = 0;
my $varia = 0;

if ( scalar @ARGV != 2 ) { die "usage: make_baseform <input> <output>\n"; }

open(IN, $ARGV[0])     || die "can't open $ARGV[0] for reading!\n";
open(OUT, ">$ARGV[1]") || die "can't open $ARGV[1] for writing!\n";

@header = ();  # header comment lines (passed through)
%dict = ();    # words end up in here
%histo = ();   # some statistics on variants

get_dict(\%dict, \@header, IN);  # process the entries

# What have we got?
print STDERR "$basecount forms processed\n";
print STDERR "$base baseforms, $varia variants and $dupl duplicates found.\n";
print STDERR "variant distribution:\n";
foreach $var ( sort keys %histo ) {
    print STDERR "$var\t$histo{$var}\n";
}

# Print special comments (copyright, etc.)
# Removed since it messes some things up...
# foreach $h (@header) { print OUT "$h\n"; }

# Print out each entry
%dict_out = ();
foreach $w (sort keys %dict) {
    $var = 1;  # variants will be numbered starting with 2
    foreach $p ( @{$dict{$w}} ) {
        if ($var eq 1) {
            $dict_out{$w} = $p;
            $var++;
        } else {
            $dict_out{"$w($var)"} = $p;
            $var++;
        }
    }
}

foreach $entry ( sort keys %dict_out ) {
    print OUT "$entry\t$dict_out{$entry}\n";
}

close(IN);
close(OUT);

#
#
# read in a dictionary
sub get_dict {
    my $dict = shift;    # data structure with dictionary entries
    my $header = shift;
    my $target = shift;  # input file handle

    while (<$target>) {
        s/[\r\n]+$//g;  # DOS-robust chomp;

        # Process comments; blank lines are ignored.
        # Presume that ";;; #" lines will be collected and emitted at the top.
        if ($_ =~ /^;;; \#/) { push @$header, $_; next; }  # save header info
        elsif ( $_ =~ /^;;;/ ) { next; }   # ignore plain comments
        elsif ( $_ =~ /^\s*$/ ) { next; }  # ignore blank lines

        # Extract the (word, pron) pair and prepare for processing.
        ($word, $pron) = /(.+?)\s+(.+?)$/;
        if (! defined $word) { print STDERR "bad entry (no head word): $_\n"; next; }

        $basecount++;

        if ($word =~ /\)$/) {  # variant
            ($root, $variant) = ($word =~ m/(.+?)\((.+?)\)/);
        } else {
            $root = $word;
            $variant = 0;
        }
        $pron = &strip_stress($pron);

        # Found a new baseform; set it up.
        if ( ! defined $dict->{$root} ) {
            $dict->{$root}[0] = $pron;
            $base++;
            next;
        }

        # Old baseform; see if, after removing stress, the pron is a duplicate.
        foreach $var ( @{$dict->{$root}} ) {
            if ( $var eq $pron ) {
                if ($VERBOSE) { print STDERR "duplicate entry: $root ($variant) $pron\n"; }
                $dupl++;
                $pron = "";
                last;
            }
        }

        # It's a new variant on an existing baseform; keep it.
        if ( $pron ne "" ) {
            push @{$dict->{$root}}, $pron;
            $varia++;
            $histo{scalar @{$dict->{$root}}}++;  # track variant stats
            if ( scalar @{$dict->{$root}} > 4 ) { print STDERR "$root -- ", scalar @{$dict->{$root}}, "\n"; }
        }
    }
}


# strip stress marks from phonetic symbols
sub strip_stress {
    @pron = split " ", $_[0];
    my $p;
    foreach $p (@pron) { if ( $p =~ /\d$/) { $p =~ s/(\d+)$//; } }
    return ( join(" ", @pron) );
}
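# For example (hypothetical cmudict-style entries), strip_stress maps
#   "HH AH0 L OW1" -> "HH AH L OW"
# so two entries differing only in stress, e.g.
#   HELLO     HH AH0 L OW1
#   HELLO(2)  HH AH0 L OW0
# collapse into a single baseform and the second is counted as a duplicate,
# while a genuinely different pronunciation is kept as a numbered variant.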
#
@ -0,0 +1,35 @@
#!/bin/bash

locdata=$1
locdict=$2

cmu_dict=common/cmudict.0.7a
cmu_ext=common/cmudict.ext

mkdir -p $locdict

if [ ! -f $cmu_dict ] ; then
    echo "--- Downloading CMU dictionary ..."
    svn export http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict.0.7a \
        $cmu_dict || exit 1;
fi

echo; echo "If common/cmudict.ext exists, add its extra pronunciations to the dictionary"; echo
cat $cmu_dict $cmu_ext > $locdict/cmudict_ext.txt 2> /dev/null  # ignore a missing extension file

echo "--- Stripping stress and pronunciation-variant markers from cmudict ..."
perl local/make_baseform.pl \
    $locdict/cmudict_ext.txt /dev/stdout |\
    sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $locdict/cmudict-plain.txt

echo "--- Searching for OOV words ..."
gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \
    $locdict/cmudict-plain.txt $locdata/vocab-full.txt |\
    egrep -v '<.?s>' > $locdict/vocab-oov.txt

gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \
    $locdata/vocab-full.txt $locdict/cmudict-plain.txt |\
    egrep -v '<.?s>' > $locdict/lexicon.txt

wc -l $locdict/vocab-oov.txt
wc -l $locdict/lexicon.txt
@ -0,0 +1,187 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
import argparse
from numpy import mean
import glob
import sys
import sqlite3


def extract_stat(wer_file):
    wer, ser = None, None
    try:
        with open(wer_file, 'r') as f:
            s = f.readlines()
            wer = float(s[1].split()[1])
            ser = float(s[2].split()[1])
    except Exception as e:
        print >> sys.stderr, 'Error parsing file %s' % wer_file
        print >> sys.stderr, str(e)
    return wer, ser
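# Layout assumption: local/score.sh leaves the compute-wer output in wer_<LMW>
# files where (as parsed above) the second line carries '%WER <value> ...' and
# the third line '%SER <value> ...'.  If a file does not match this layout,
# extract_stat reports the parse error and returns (None, None).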
def extractResults(path):
    wer_files = glob.glob('%s/*/decode_*/*wer_*' % path)
    table = []
    for wf in wer_files:
        try:
            exp, decode_dir, wer_f = wf.split('/')[-3:]
            # last split: decode_it3_dev_build0 -> (dev, build0)
            lm = decode_dir.split('_')[-1]
            dataset = decode_dir.split('_')[-2]
            lm_w = int(wer_f[4:])  # strip 'wer_' from 'wer_19'
            wer, ser = extract_stat(wf)
            table.append((exp, dataset, lm, lm_w, wer, ser))
        except Exception as e:
            print >> sys.stderr, 'failed to parse %s' % wf
            print >> sys.stderr, str(e)
    return table


class Table(object):

    def __init__(self, data=[], colnames=[]):
        self.data = data
        self.colnames = colnames
        self.colSep = '\t'
        self.lineSep = '\n'

    def data2str(self):
        strdata = []
        for r in self.data:
            strdata.append([str(c) for c in r])
        return strdata

    def __str__(self):
        sd = self.data2str()
        colwidth = [len(c) for c in self.colnames]
        for j in range(len(colwidth)):
            for r in sd:
                colwidth[j] = max(colwidth[j], len(r[j]))

        gaps = [m - len(c) for (m, c) in zip(colwidth, self.colnames)]
        rows = [self.colSep.join(
            [c + ' ' * gap for c, gap in zip(self.colnames, gaps)])]
        for r in sd:
            gaps = [m - len(c) for (m, c) in zip(colwidth, r)]
            rows.append(
                self.colSep.join([c + ' ' * d for c, d in zip(r, gaps)]))
        return self.lineSep.join(rows)


class LatexTable(Table):

    def __init__(self, data=[], colnames=[]):
        Table.__init__(self, data, colnames)
        nc = len(colnames)
        self.header = '\\begin{tabular}{%s}' % ('c' * nc)
        self.tail = '\\end{tabular}'
        self.colSep = ' & '
        self.lineSep = '\\\\ \n'

    def __str__(self):
        table_s = super(LatexTable, self).__str__()
        table_s = table_s.replace('_', '\_')
        return '%s\n%s\n%s\n' % (self.header, table_s, self.tail)


def Table2LatexTable(table):
    return LatexTable(table.data, table.colnames)


def createSmallTable(r):
    d = []
    for k, v in r.iteritems():
        # renamed the unpacked value from 'r' to 'rt' to avoid shadowing the
        # dict argument mid-loop
        w, s, rt = v
        if w == []:
            minw = None
        else:
            minw = min(w)  # returns a tuple if w is a list of tuples
        if s == []:
            mins = None
        else:
            mins = min(s)  # returns a tuple if s is a list of tuples
        d.append([k, mean(rt), minw, mins])
    t = Table(d, ['exp', 'RT coef', 'WER', 'SER'])
    return t


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Parse an experiment directory generated by the kaldi vystadial recipe and print statistics')

    parser.add_argument('expath', type=str, action='store',
                        help='Path to experiment directory')
    parser.add_argument('-l', '--latex', default=False, action='store_true',
                        help='Also generate a latex-format table')
    args = parser.parse_args()

    raw_d = extractResults(args.expath)

    conn = sqlite3.connect(':memory:')
    c = conn.cursor()
    c.execute(
        '''CREATE TABLE results (exp text, dataset text, lm text, lm_w int, wer float, ser float)''')
    c.executemany('INSERT INTO results VALUES (?, ?, ?, ?, ?, ?)', raw_d)

    # get all results sorted
    # c.execute("SELECT * FROM results ORDER BY exp, dataset, lm, lm_w")
    # d = c.fetchall()
    # t = Table(data=d, colnames=['exp', 'set', 'lm', 'LMW', 'WER', 'SER'])
    # print '%s\n==================' % str(t)

    # best experiment
    # c.execute("SELECT exp, dataset, lm_w, MIN(wer), ser FROM results ORDER BY exp, lm_w, dataset")
    # d = c.fetchall()
    # compare dev and test set by picking the best experiment
    # c.execute(("SELECT exp, dataset, lm_w, MIN(wer), ser FROM results "
    #            "GROUP BY exp, lm, dataset ORDER BY exp, lm, dataset"))
    # d = c.fetchall()
    # t = Table(data=d, colnames=['exp', 'set', 'lm', 'LMW', 'WER', 'SER'])
    # print '%s\n==================' % str(t)

    # traditional usage of the dev set: pick the LM weight minimizing dev WER
    dev_set_query = ("SELECT r.exp, r.lm, r.lm_w FROM results AS r "
                     "INNER JOIN ( SELECT dataset, exp, lm, MIN(wer) as min_wer "
                     "  FROM results WHERE dataset=? GROUP BY exp, lm) i "
                     "ON r.exp=i.exp AND r.lm=i.lm AND r.dataset=i.dataset AND r.wer <= i.min_wer "
                     )
    c.execute(dev_set_query, ('dev',))

    min_dev = c.fetchall()

    # remove duplicates: these arise if the minimum WER on the dev set is
    # reached for several LM weights
    min_dev_un = [(e, lm, lmw) for ((e, lm), lmw) in
                  dict([((e, lm), lmw) for e, lm, lmw in min_dev]).items()]
    # sort according to LM -> sort results according to experiment & LMs
    min_dev_un.sort(key=lambda x: (x[1], x[0]))

    # extract the test results corresponding to the dev set
    d = []
    for exp, lm, lm_w in min_dev_un:
        c.execute(("SELECT * FROM results WHERE "
                   "dataset='test' AND exp=? AND lm=? AND lm_w=?"),
                  (exp, lm, lm_w))
        x = c.fetchall()
        assert (len(x) == 1), "One row should be extracted."
        d.append(x[0])

    t = Table(data=d, colnames=['exp', 'set', 'LM', 'LMW', 'WER', 'SER'])
    print str(t)
    if args.latex:
        print Table2LatexTable(t)
@ -0,0 +1,49 @@
#!/bin/bash
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

EXP=$1

# Make sure the target directories do not exist yet
conflict=""
for d in $@ ; do
    if [ -d $d ] || [ -f $d ] ; then
        conflict="$conflict $d"
    fi
done

if [[ ! -z "$conflict" ]] ; then
    echo "Running a new experiment will create the following directories."
    echo "Some of them already exist!"
    echo ""
    echo "Existing directories:"
    for d in $conflict ; do
        echo "  $d"
    done
    read -p "Should I delete the conflicting directories NOW y/n? "
    case $REPLY in
        [Yy]* ) echo "Deleting $conflict directories"; rm -rf $conflict;;
        * ) echo 'Keeping conflicting directories and exiting ...'; exit 1;;
    esac
fi

for d in $@ ; do
    mkdir -p $d
done

# Save the variable setup
(set -o posix ; set ) > $EXP/experiment_bash_vars.log
# git log -1 > $EXP/alex_gitlog.log
# git diff > $EXP/alex_gitdiff.log
@ -0,0 +1,53 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
#           2014 Mff UK, UFAL (modification: Ondrej Platek)
# Apache 2.0

[ -f ./path.sh ] && . ./path.sh  # (the original sourced path.sh twice)

# begin configuration section.
cmd=run.pl
min_lmw=9
max_lmw=20
# end configuration section.

. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
    echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
    echo " Options:"
    echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
    echo "    --min_lmw <int>                 # minimum LM-weight for lattice rescoring "
    echo "    --max_lmw <int>                 # maximum LM-weight for lattice rescoring "
    exit 1;
fi

data=$1
lang_or_graph=$2
dir=$3

symtab=$lang_or_graph/words.txt

for f in $symtab $dir/lat.1.gz $data/text; do
    [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
done

mkdir -p $dir/scoring/log

cp $data/text $dir/scoring/test.txt

$cmd LMW=$min_lmw:$max_lmw $dir/scoring/log/best_path.LMW.log \
    lattice-best-path --lm-scale=LMW --word-symbol-table=$symtab \
    "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMW.tra || exit 1;

$cmd LMW=$min_lmw:$max_lmw $dir/scoring/log/score.LMW.log \
    cat $dir/scoring/LMW.tra \| \
    utils/int2sym.pl -f 2- $symtab \| \
    compute-wer --text --mode=present \
    ark:$dir/scoring/test.txt ark,p:- ">&" $dir/wer_LMW || exit 1;

# Show results
for f in $dir/wer_*; do echo $f; egrep '(WER)|(SER)' < $f; done

exit 0;
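# With the default min_lmw=9 and max_lmw=20, the two parallel jobs above leave
# one transcript file per LM weight ($dir/scoring/9.tra ... $dir/scoring/20.tra)
# and one scoring file per weight ($dir/wer_9 ... $dir/wer_20);
# local/results.py later globs and parses the wer_* files.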
@ -0,0 +1,18 @@
# Needed for "correct" sorting
export LC_ALL=C
export KALDI_ROOT=../../..

# adding Kaldi binaries to path
export PATH=$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PWD/utils:$PWD/steps:$PATH

srilm_bin=$KALDI_ROOT/tools/srilm/bin/
if [ ! -e "$srilm_bin" ] ; then
    echo "SRILM is not installed in $KALDI_ROOT/tools."
    echo "May not be able to create LMs!"
    echo "Please go to $KALDI_ROOT/tools and run ./install_srilm.sh"
fi
srilm_sub_bin=`find "$srilm_bin" -type d`
for d in $srilm_sub_bin ; do
    export PATH=$d:$PATH
done
@ -0,0 +1,165 @@
#!/bin/bash
# Copyright Ondrej Platek Apache 2.0
renice 20 $$

# Load training parameters
. ./env_voip_en.sh
# Source an optional local config if it exists
[ -f env_voip_en_CUSTOM.sh ] && . ./env_voip_en_CUSTOM.sh

. ./path.sh

# If you have a cluster of machines running GridEngine you may want to
# change the train and decode commands in the file below
. ./cmd.sh

#######################################################################
#       Preparing acoustic features, LMs and helper files             #
#######################################################################

echo "Copying the configuration files to the $EXP directory."
local/save_check.sh $EXP $WORK/* || exit 1;

local/download_en_data.sh $DATA_ROOT || exit 1;

local/data_split.sh --every_n $EVERY_N $DATA_ROOT $WORK/local "$LMs" "$TEST_SETS" || exit 1

local/create_LMs.sh $WORK/local $WORK/local/train/trans.txt \
    $WORK/local/test/trans.txt $WORK/local/lm "$LMs" || exit 1

local/prepare_en_transcription.sh $WORK/local $WORK/local/dict || exit 1

local/create_phone_lists.sh $WORK/local/dict || exit 1

utils/prepare_lang.sh $WORK/local/dict '_SIL_' $WORK/local/lang $WORK/lang || exit 1

local/create_G.sh $WORK/lang "$LMs" $WORK/local/lm $WORK/local/dict/lexicon.txt || exit 1

echo "Creating MFCC features and storing them (could be large)."
for s in train $TEST_SETS ; do
    steps/make_mfcc.sh --mfcc-config common/mfcc.conf --cmd \
        "$train_cmd" --nj $njobs $WORK/local/$s $EXP/make_mfcc/$s $WORK/mfcc || exit 1;
    # Note: --fake -> NO CMVN
    steps/compute_cmvn_stats.sh $fake $WORK/local/$s \
        $EXP/make_mfcc/$s $WORK/mfcc || exit 1;
done

echo "Decoding is done for each pair (TEST_SET x LMs)"
echo "Distribute the links to MFCC feats to all LM variations."
cp $WORK/local/train/feats.scp $WORK/train/feats.scp
cp $WORK/local/train/cmvn.scp $WORK/train/cmvn.scp
for s in $TEST_SETS; do
    for lm in $LMs; do
        tgt_dir=${s}_`basename "$lm"`
        echo "cp $WORK/local/$s/feats.scp $WORK/$tgt_dir/feats.scp"
        cp $WORK/local/$s/feats.scp $WORK/$tgt_dir/feats.scp
        echo "cp $WORK/local/$s/cmvn.scp $WORK/$tgt_dir/cmvn.scp"
        cp $WORK/local/$s/cmvn.scp $WORK/$tgt_dir/cmvn.scp
    done
done

#######################################################################
#                      Training Acoustic Models                       #
#######################################################################

echo "Train monophone models on full data -> may be wasteful (could be done on a subset)"
steps/train_mono.sh --nj $njobs --cmd "$train_cmd" $WORK/train $WORK/lang $EXP/mono || exit 1;

echo "Get alignments from the monophone system."
steps/align_si.sh --nj $njobs --cmd "$train_cmd" \
    $WORK/train $WORK/lang $EXP/mono $EXP/mono_ali || exit 1;

echo "Train tri1 [first triphone pass]"
steps/train_deltas.sh --cmd "$train_cmd" \
    $pdf $gauss $WORK/train $WORK/lang $EXP/mono_ali $EXP/tri1 || exit 1;

# draw-tree $WORK/lang/phones.txt $EXP/tri1/tree | dot -Tsvg -Gsize=8,10.5 > graph.svg

echo "Align tri1"
steps/align_si.sh --nj $njobs --cmd "$train_cmd" \
    --use-graphs true $WORK/train $WORK/lang $EXP/tri1 $EXP/tri1_ali || exit 1;

echo "Train tri2a [deltas + delta-deltas]"
steps/train_deltas.sh --cmd "$train_cmd" $pdf $gauss \
    $WORK/train $WORK/lang $EXP/tri1_ali $EXP/tri2a || exit 1;

echo "Train tri2b [LDA+MLLT]"
steps/train_lda_mllt.sh --cmd "$train_cmd" $pdf $gauss \
    $WORK/train $WORK/lang $EXP/tri1_ali $EXP/tri2b || exit 1;

echo "Align all data with the LDA+MLLT system (tri2b)"
steps/align_si.sh --nj $njobs --cmd "$train_cmd" \
    --use-graphs true $WORK/train $WORK/lang $EXP/tri2b $EXP/tri2b_ali || exit 1;

echo "Train MMI on top of LDA+MLLT."
steps/make_denlats.sh --nj $njobs --cmd "$train_cmd" \
    --beam $mmi_beam --lattice-beam $mmi_lat_beam \
    $WORK/train $WORK/lang $EXP/tri2b $EXP/tri2b_denlats || exit 1;
steps/train_mmi.sh $WORK/train $WORK/lang $EXP/tri2b_ali $EXP/tri2b_denlats $EXP/tri2b_mmi || exit 1;

echo "Train MMI on top of LDA+MLLT with boosting. train_mmi_boost is a number, e.g. 0.05"
steps/train_mmi.sh --boost ${train_mmi_boost} $WORK/train $WORK/lang \
    $EXP/tri2b_ali $EXP/tri2b_denlats $EXP/tri2b_mmi_b${train_mmi_boost} || exit 1;

echo "Train MPE."
steps/train_mpe.sh $WORK/train $WORK/lang $EXP/tri2b_ali $EXP/tri2b_denlats $EXP/tri2b_mpe || exit 1;

#######################################################################
#                      Building decoding graph                        #
#######################################################################
for lm in $LMs ; do
    lm=`basename "$lm"`
    utils/mkgraph.sh --mono $WORK/lang_${lm} $EXP/mono $EXP/mono/graph_${lm} || exit 1
    utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri1 $EXP/tri1/graph_${lm} || exit 1
    utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri2a $EXP/tri2a/graph_${lm} || exit 1
    utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri2b $EXP/tri2b/graph_${lm} || exit 1
done

#######################################################################
#                              Decoding                               #
#######################################################################
for s in $TEST_SETS ; do
    for lm in $LMs ; do
        lm=`basename "$lm"`
        tgt_dir=${s}_`basename "$lm"`
        echo "Monophone decoding"
        # Note: steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
        # calls the command line once for each test, and afterwards averages
        # the WERs (in this case into $EXP/mono/decode/)
        steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
            --config common/decode.conf --nj $njobs --cmd "$decode_cmd" \
            $EXP/mono/graph_${lm} $WORK/${tgt_dir} $EXP/mono/decode_${tgt_dir}
        echo "Decode tri1"
        steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
            --config common/decode.conf --nj $njobs --cmd "$decode_cmd" \
            $EXP/tri1/graph_${lm} $WORK/$tgt_dir $EXP/tri1/decode_${tgt_dir}
        echo "Decode tri2a"
        steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
            --config common/decode.conf --nj $njobs --cmd "$decode_cmd" \
            $EXP/tri2a/graph_${lm} $WORK/$tgt_dir $EXP/tri2a/decode_${tgt_dir}
        echo "Decode tri2b [LDA+MLLT]"
        steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
            --config common/decode.conf --nj $njobs --cmd "$decode_cmd" \
            $EXP/tri2b/graph_${lm} $WORK/$tgt_dir $EXP/tri2b/decode_${tgt_dir}
        # Note: change the --iter option to select the best model. 4.mdl == final.mdl
        echo "Decode MMI on top of LDA+MLLT."
        steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
            --config common/decode.conf --iter 4 --nj $njobs --cmd "$decode_cmd" \
            $EXP/tri2b/graph_${lm} $WORK/$tgt_dir $EXP/tri2b_mmi/decode_it4_${tgt_dir}
        echo "Decode MMI on top of LDA+MLLT with boosting. train_mmi_boost is a number, e.g. 0.05"
        steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
            --config common/decode.conf --iter 4 --nj $njobs --cmd "$decode_cmd" \
            $EXP/tri2b/graph_${lm} $WORK/$tgt_dir $EXP/tri2b_mmi_b${train_mmi_boost}/decode_it4_${tgt_dir};
        echo "Decode MPE."
        steps/decode.sh --scoring-opts "--min-lmw $min_lmw --max-lmw $max_lmw" \
            --config common/decode.conf --iter 4 --nj $njobs --cmd "$decode_cmd" \
            $EXP/tri2b/graph_${lm} $WORK/$tgt_dir $EXP/tri2b_mpe/decode_it4_${tgt_dir} || exit 1;
    done
done

echo "Successfully trained and evaluated all the experiments"
local/results.py $EXP | tee $EXP/results.log

local/export_models.sh $TGT_MODELS $EXP $WORK/lang
@ -0,0 +1 @@
../../wsj/s5/steps
@ -0,0 +1 @@
../../wsj/s5/utils