зеркало из https://github.com/mozilla/kaldi.git
trunk: Added FFV installation script (from Bagher) and modified sacc one; improvement to sgmm2 efficiency when spk-vec dim is large; additional documentation on I/O; minor fixes.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2736 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
ff9cb38538
Коммит
8b1bfb1d8d
|
@ -52,7 +52,17 @@ scp=$data/wav.scp
|
|||
|
||||
[ ! -s $KALDI_ROOT ] && KALDI_ROOT=../../..
|
||||
|
||||
sacc_dir=$KALDI_ROOT/tools/sacc/SAcC_GLNXA64/
|
||||
( # this is for backward compatibility:
|
||||
cd $KALDI_ROOT/tools
|
||||
if [ -d sacc ] && [ ! -d pitch_trackers/sacc ]; then
|
||||
echo "Linking sacc directory to new location."
|
||||
mkdir -p pitch_trackers
|
||||
cd pitch_trackers
|
||||
ln -s ../sacc ..
|
||||
fi
|
||||
)
|
||||
|
||||
sacc_dir=$KALDI_ROOT/tools/pitch_trackers/sacc/SAcC_GLNXA64/
|
||||
# make $sacc_dir an absolute pathname.
|
||||
sacc_dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $sacc_dir ${PWD}`
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/perl
|
||||
# Copyright 2010-2012 Microsoft Corporation
|
||||
# Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
|
||||
// Copyright 2009-2011 Microsoft Corporation
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
|
@ -20,7 +21,9 @@
|
|||
namespace kaldi {
|
||||
/** \page io Kaldi I/O mechanisms
|
||||
|
||||
This page gives an overview of input-output mechanisms in Kaldi.
|
||||
This page gives an overview of input-output mechanisms in Kaldi.
|
||||
This section of the documentation is oriented towards the code-level mechanisms
|
||||
for I/O; for documentation more oriented towards the command-line, see \ref io_tut.
|
||||
|
||||
\section io_sec_style The input/output style of Kaldi classes
|
||||
|
||||
|
@ -234,8 +237,9 @@ namespace kaldi {
|
|||
For a list of classes and types that relate to Tables, see \ref table_group.
|
||||
|
||||
A Table can be accessed in three ways: using a TableWriter, a
|
||||
SequentialTableReader, and a RandomAccessTableReader. The names are
|
||||
self-explanatory. These are all templates; they are templated not on the
|
||||
SequentialTableReader, and a RandomAccessTableReader (there is also
|
||||
RandomAccessTableReaderMapped, which is a special case we will introduce later).
|
||||
These are all templates; they are templated not on the
|
||||
object in the table, but on a Holder type (see below, \ref io_sec_holders) that
|
||||
tells the Table code how to read and write that type of object. To open
|
||||
a Table type, you must provide a string called a wspecifier or rspecifier (see below, \ref
|
||||
|
@ -564,6 +568,23 @@ namespace kaldi {
|
|||
If there is a gap in the second archive, it can use the fact that the second archive is sorted
|
||||
to avoid searching till the end of the file (this is the job of the "s" option).
|
||||
|
||||
\subsection io_sec_mapped Mapped random-access table readers
|
||||
|
||||
In order to condense a particular code pattern that was recurring in many programs, we have introduced the template type
|
||||
RandomAccessTableReaderMapped. Unlike RandomAccessTableReader, this takes two initializer arguments, for instance:
|
||||
\verbatim
|
||||
std::string rspecifier, utt2spk_map_rspecifier; // get these from somewhere.
|
||||
RandomAccessTableReaderMapped<BaseFloatMatrixHolder> transform_reader(rspecifier,
|
||||
utt2spk_map_rspecifier);
|
||||
\endverbatim
|
||||
If utt2spk_map_rspecifier is the empty string, this will behave just like a
|
||||
regular RandomAccessTableReader. If it is nonempty, e.g. ark:data/train/utt2spk,
|
||||
it will read an utterance-to-speaker map from that location and whenever a particular
|
||||
string e.g. utt1 is queried, it will use that map to convert the utterance-id
|
||||
to a speaker-id (e.g. spk1) and use that as the key to query the table being
|
||||
read from rspecifier. The utterance-to-speaker map is also an archive
|
||||
because it happens that the Table code is the easiest way to read in such maps.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
|
|
|
@ -0,0 +1,192 @@
|
|||
// doc/io_tut.dox
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
/** \page io_tut Kaldi I/O from a command-line perspective.
|
||||
|
||||
This page describes the I/O mechanisms in Kaldi from the perspective of
|
||||
a user of the command line tools. See \ref io for a more code-level overview.
|
||||
|
||||
\section io_tut_overview Overview
|
||||
|
||||
\subsection io_tut_nontable Non-table I/O
|
||||
|
||||
We first describe "non-table" I/O. This refers to files or streams containing just
|
||||
one or two objects (e.g. acoustic model files; transformation matrices), rather than a
|
||||
collection of objects indexed by strings.
|
||||
|
||||
- Kaldi file formats are binary by default but programs will output non-binary
|
||||
if you supply the flag --binary=false.
|
||||
- Many objects have corresponding "copy" programs, e.g. copy-matrix or gmm-copy,
|
||||
which can be used with the --binary=false flag to convert to text form, e.g.
|
||||
``copy-matrix --binary=false foo.mat -''.
|
||||
- There is typically a one-to-one correspondence between a file on disk and a C++ object
|
||||
in memory, e.g. a matrix of floats, although some files contain more than one object
|
||||
(Case in point: for acoustic model files, typically a TransitionModel object and then
|
||||
an acoustic model).
|
||||
- Kaldi programs typically know which type of object they are expecting to read, rather
|
||||
than working it out from the stream.
|
||||
- Similarly to perl, a filename can be replaced with - (for standard input/output) or
|
||||
a string such as ``|gzip -c >foo.gz'' or ``gunzip -c foo.gz|''
|
||||
- For reading files, we also support things like foo:1045, meaning character-offset
|
||||
1045 within file foo.
|
||||
- In order to refer to the above concept, we generally use the special terms 'rxfilename' for
|
||||
a string describing a stream to be read (i.e. a file, stream or the standard input),
|
||||
and 'wxfilename' for an output stream. See \ref io_sec_xfilename.
|
||||
|
||||
To illustrate the concepts above, make sure $KALDI_ROOT/src/bin is on your path,
|
||||
where $KALDI_ROOT is the top of the repository, and type the following:
|
||||
\verbatim
|
||||
echo '[ 0 1 ]' | copy-matrix - -
|
||||
\endverbatim
|
||||
It will print out a log message and some binary data corresponding to that matrix.
|
||||
\verbatim
|
||||
echo '[ 0 1 ]' | copy-matrix --binary=false - -
|
||||
\endverbatim
|
||||
The output will look like this:
|
||||
\verbatim
|
||||
# copy-matrix --binary=false - -
|
||||
copy-matrix --binary=false - -
|
||||
[
|
||||
0 1 ]
|
||||
LOG (copy-matrix:main():copy-matrix.cc:68) Copied matrix to -
|
||||
\endverbatim
|
||||
Although it looks like the matrix and log messages are mixed up, the log messages
|
||||
are on the standard error and would not be passed into a pipe; to avoid seeing
|
||||
the log messages you could redirect stderr to /dev/null by adding 2>/dev/null to the
|
||||
command line.
|
||||
|
||||
Kaldi programs may be connected using pipes or by using the
|
||||
stream-as-a-file mechanism of Kaldi I/O. Here is a pipe example:
|
||||
\verbatim
|
||||
echo '[ 0 1 ]' | copy-matrix - - | copy-matrix --binary=false - -
|
||||
\endverbatim
|
||||
This outputs the matrix in text form (the first copy-matrix command converts
|
||||
to binary form and the second to text form, which is of course pointless).
|
||||
You could accomplish the same thing in a more convoluted way by doing this:
|
||||
\verbatim
|
||||
copy-matrix 'echo [ 0 1 ]|' '|copy-matrix --binary=false - -'
|
||||
\endverbatim
|
||||
There is no reason to do this here, but it can sometimes be useful when
|
||||
programs have multiple inputs or outputs so the stdin or stdout is
|
||||
already being used. It is particularly useful with tables (see next section).
|
||||
|
||||
|
||||
\subsection io_tut_table Table I/O
|
||||
|
||||
Kaldi has special I/O mechanisms for dealing with collections of objects
|
||||
indexed by strings. Examples of this are feature matrices indexed by
|
||||
utterance-ids, or speaker-adaptation transformation matrices indexed
|
||||
by speaker-ids. The strings that index the collection must be nonempty
|
||||
and whitespace free. See \ref io_sec_tables for a more in-depth
|
||||
discussion.
|
||||
|
||||
A Table may exist in two forms: an "archive" or a "script file". The
|
||||
difference is that the archive actually contains the data, while
|
||||
a script file points to the location of the data.
|
||||
|
||||
Programs that read from Tables expect a string we call an "rspecifier" that
|
||||
says how to read the indexed data, and programs that write to Tables expect
|
||||
a string we call a "wspecifier" to write it. These are strings that specify
|
||||
whether to expect a script file or an archive, and the file location, along
|
||||
with various options. Common types of
|
||||
rspecifiers include ``ark:-'', meaning read the data as an archive
|
||||
from the standard input, or ``scp:foo.scp'', meaning the script file
|
||||
foo.scp says where to read the data from. Points to bear in
|
||||
mind are:
|
||||
|
||||
- The part after the colon is interpreted as a wxfilename or rxfilename (as
|
||||
in \ref io_tut_nontable), meaning that things like pipes and standard
|
||||
input/output are supported.
|
||||
- A Table always contains just one type of object (e.g., a matrix of floats).
|
||||
- You may see options on rspecifiers and wspecifiers, principally:
|
||||
- In rspecifiers, ark,s,cs:- means that when we read (from the standard input in this case)
|
||||
we expect the keys to be in sorted order (,s) and we assert that they will be accessed
|
||||
in sorted order (,cs) meaning that we know the program will
|
||||
access them in sorted order (the program will crash if these conditions do not hold).
|
||||
This allows Kaldi to emulate random access without using up a lot of memory.
|
||||
- For data that isn't too large and for which it's inconvenient to ensure sorted order
|
||||
(e.g. transforms for speaker adaptation), there is little harm in omitting the ,s,cs.
|
||||
- Typically programs that take multiple rspecifiers will iterate over the objects in the
|
||||
first one (sequential access) and do random access on the later ones, so ,s,cs is
|
||||
generally not needed for the first rspecifier.
|
||||
- In scp,p:foo.scp, the ,p means we should not crash if some of the
|
||||
referenced files do not exist (for archives, ,p will prevent a crash if
|
||||
the archive is corrupted or truncated.)
|
||||
- For writing, the option ,t means text mode, e.g. in ark,t:-.
|
||||
The --binary command-line option has no effect for archives.
|
||||
- The script-file format is, on each line, ``<key> <rspecifier|wspecifier>'', e.g.
|
||||
utt1 /foo/bar/utt1.mat. It is OK for the rspecifier or wspecifier to contain
|
||||
spaces, e.g.: ``utt1 gunzip -c /foo/bar/utt1.mat.gz|''.
|
||||
- The archive format is: <key1> <object1> <newline> <key2> <object2> <newline> ...
|
||||
- Archives may be concatenated and they will still be valid archives, but be careful about
|
||||
the order of concatenation, e.g. avoid ``cat a/b/*.ark''
|
||||
if you need the sorted order.
|
||||
- Although not often used, script files may be used for output, e.g. if we write to
|
||||
the wspecifier scp:foo.scp, and the program tries to write to key utt1,
|
||||
it looks for a line like utt1 some_file.mat in foo.scp. It will crash
|
||||
if there is no such line.
|
||||
- It is possible to write to both an archive and script at the same time,
|
||||
e.g. ark,scp:foo.ark,foo.scp. The script file will be written with offsets
|
||||
like utt1 foo.ark:1016. This is useful when data is to be accessed in random order
|
||||
or in parts, but you don't want to produce lots of small files.
|
||||
- It is possible to trick the archive mechanism into operating on single files. For instance,
|
||||
\verbatim
|
||||
echo '[ 0 1 ]' | copy-matrix 'scp:echo foo -|' 'scp,t:echo foo -|'
|
||||
\endverbatim
|
||||
This deserves a little explanation. Firstly, the rspecifier ``scp:echo foo -|'' is equivalent
|
||||
to scp:bar.scp if the file bar.scp contained just the line ``foo -''. This
|
||||
tells it to read the object indexed by "foo" from the standard input. Similarly, for
|
||||
the wspecifier ``scp,t:echo foo -|'', it writes the data for ``foo'' to the standard
|
||||
output. This trick should not be overused. In this particular case, it is unnecessary
|
||||
because we have made the copy-matrix program support regular files (rxfilenames|wxfilenames),
|
||||
as well as tables so you could have written just ``copy-matrix - -''. If you have to use
|
||||
this trick too much, it's better to modify the program concerned.
|
||||
- In certain cases the archive-reading code allows for limited type conversion, e.g.
|
||||
between float and double for matrices, or Lattice and CompactLattice for lattices.
|
||||
|
||||
\subsection io_tut_maps Utterance-to-speaker and speaker-to-utterance maps.
|
||||
|
||||
Many Kaldi programs take utterance-to-speaker and speaker-to-utterance maps -- files
|
||||
called ``utt2spk'' or ``spk2utt''. These are generally specified by command-line options
|
||||
--utt2spk and --spk2utt. The utt2spk map has the format
|
||||
\verbatim
|
||||
utt1 spk_of_utt1
|
||||
utt2 spk_of_utt2
|
||||
...
|
||||
\endverbatim
|
||||
and the spk2utt map has the format
|
||||
\verbatim
|
||||
spk1 utt1_of_spk1 utt2_of_spk1 utt3_of_spk1
|
||||
spk2 utt1_of_spk2 utt2_of_spk2
|
||||
...
|
||||
\endverbatim
|
||||
These files are used for speaker adaptation, e.g. for finding which speaker corresponds
|
||||
to an utterance, or to iterate over speakers.
|
||||
For reasons that relate mostly to the way the Kaldi example scripts are set up
|
||||
and the way we split data up into multiple pieces, it's important to ensure
|
||||
that the speakers in the utterance-to-speaker map are in sorted order (see \ref data_prep).
|
||||
Anyway, these files are actually treated as archives, and for this reason
|
||||
you will see command-line options like --utt2spk=ark:data/train/utt2spk.
|
||||
At the code level, the utt2spk file is treated as a table containing a string, and the spk2utt
|
||||
file is treated as a table containing a list of strings.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
}
|
|
@ -50,6 +50,7 @@
|
|||
- \ref matrix
|
||||
- \ref matrixwrap
|
||||
- \ref io
|
||||
- \ref io_tut
|
||||
- \ref error
|
||||
- \ref parse_options
|
||||
- \ref util
|
||||
|
|
|
@ -383,7 +383,7 @@ TE -1 3 ( CE 12 CE 13 CE 14 )
|
|||
|
||||
\section tree_ilabel The ilabel_info object
|
||||
|
||||
The final graph (HCLG in the standard notation, see \ref graph) has symbols
|
||||
The CLG graph (see \ref graph) has symbols
|
||||
on its input side that represent context-dependent phones (as well as
|
||||
disambiguation symbols and possibly epsilon symbols). In the graph, as always,
|
||||
these are represented by integer labels. We use an object that, in code
|
||||
|
@ -402,7 +402,7 @@ input label the corresponding phonetic context window (see above,
|
|||
have
|
||||
\code
|
||||
// not valid C++
|
||||
ilabel_info[1500] == { 4, 1500, 12 };
|
||||
ilabel_info[1500] == { 4, 30, 12 };
|
||||
\endcode
|
||||
In the monophone case, we would have things like:
|
||||
\code
|
||||
|
|
|
@ -385,17 +385,12 @@ void AmSgmm2::InitializeFromFullGmm(const FullGmm &full_gmm,
|
|||
full_ubm_.CopyFromFullGmm(full_gmm);
|
||||
diag_ubm_.CopyFromFullGmm(full_gmm);
|
||||
if (phn_subspace_dim < 1 || phn_subspace_dim > full_gmm.Dim() + 1) {
|
||||
KALDI_WARN << "Initial phone-subspace dimension must be in [1, "
|
||||
<< full_gmm.Dim() + 1 << "]. Changing from " << phn_subspace_dim
|
||||
<< " to " << full_gmm.Dim() + 1;
|
||||
KALDI_WARN << "Initial phone-subspace dimension must be >= 1, value is "
|
||||
<< phn_subspace_dim << "; setting to " << full_gmm.Dim() + 1;
|
||||
phn_subspace_dim = full_gmm.Dim() + 1;
|
||||
}
|
||||
if (spk_subspace_dim < 0 || spk_subspace_dim > full_gmm.Dim()) {
|
||||
KALDI_WARN << "Initial spk-subspace dimension must be in [1, "
|
||||
<< full_gmm.Dim() << "]. Changing from " << spk_subspace_dim
|
||||
<< " to " << full_gmm.Dim();
|
||||
spk_subspace_dim = full_gmm.Dim();
|
||||
}
|
||||
KALDI_ASSERT(spk_subspace_dim >= 0);
|
||||
|
||||
w_.Resize(0, 0);
|
||||
N_.clear();
|
||||
c_.clear();
|
||||
|
@ -1118,7 +1113,7 @@ void AmSgmm2::ComputeH(std::vector< SpMatrix<double> > *H_i) const;
|
|||
|
||||
// Initializes the matrices M_{i} and w_i
|
||||
void AmSgmm2::InitializeMw(int32 phn_subspace_dim,
|
||||
const Matrix<BaseFloat> &norm_xform) {
|
||||
const Matrix<BaseFloat> &norm_xform) {
|
||||
int32 ddim = full_ubm_.Dim();
|
||||
KALDI_ASSERT(phn_subspace_dim <= ddim + 1);
|
||||
KALDI_ASSERT(phn_subspace_dim <= norm_xform.NumCols() + 1);
|
||||
|
@ -1134,8 +1129,14 @@ void AmSgmm2::InitializeMw(int32 phn_subspace_dim,
|
|||
thisM.Resize(ddim, phn_subspace_dim);
|
||||
// Eq. (27): M_{i} = [ \bar{\mu}_{i} (J)_{1:D, 1:(S-1)}]
|
||||
thisM.CopyColFromVec(mean, 0);
|
||||
thisM.Range(0, ddim, 1, phn_subspace_dim-1).CopyFromMat(
|
||||
norm_xform.Range(0, ddim, 0, phn_subspace_dim-1), kNoTrans);
|
||||
int32 nonrandom_dim = std::min(phn_subspace_dim - 1, ddim),
|
||||
random_dim = phn_subspace_dim - 1 - nonrandom_dim;
|
||||
thisM.Range(0, ddim, 1, nonrandom_dim).CopyFromMat(
|
||||
norm_xform.Range(0, ddim, 0, nonrandom_dim), kNoTrans);
|
||||
// The following extension to the original paper allows us to
|
||||
// initialize the model with a larger dimension of phone-subspace vector.
|
||||
if (random_dim > 0)
|
||||
thisM.Range(0, ddim, nonrandom_dim + 1, random_dim).SetRandn();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1144,16 +1145,22 @@ void AmSgmm2::InitializeNu(int32 spk_subspace_dim,
|
|||
const Matrix<BaseFloat> &norm_xform,
|
||||
bool speaker_dependent_weights) {
|
||||
int32 ddim = full_ubm_.Dim();
|
||||
KALDI_ASSERT(spk_subspace_dim <= ddim);
|
||||
KALDI_ASSERT(spk_subspace_dim <= norm_xform.NumCols());
|
||||
KALDI_ASSERT(ddim <= norm_xform.NumRows());
|
||||
|
||||
|
||||
int32 num_gauss = full_ubm_.NumGauss();
|
||||
N_.resize(num_gauss);
|
||||
for (int32 i = 0; i < num_gauss; i++) {
|
||||
N_[i].Resize(ddim, spk_subspace_dim);
|
||||
// Eq. (28): N_{i} = [ (J)_{1:D, 1:T)}]
|
||||
N_[i].CopyFromMat(norm_xform.Range(0, ddim, 0, spk_subspace_dim), kNoTrans);
|
||||
|
||||
int32 nonrandom_dim = std::min(spk_subspace_dim, ddim),
|
||||
random_dim = spk_subspace_dim - nonrandom_dim;
|
||||
|
||||
N_[i].Range(0, ddim, 0, nonrandom_dim).
|
||||
CopyFromMat(norm_xform.Range(0, ddim, 0, nonrandom_dim), kNoTrans);
|
||||
// The following extension to the original paper allows us to
|
||||
// initialize the model with a larger dimension of speaker-subspace vector.
|
||||
if (random_dim > 0)
|
||||
N_[i].Range(0, ddim, nonrandom_dim, random_dim).SetRandn();
|
||||
}
|
||||
if (speaker_dependent_weights) {
|
||||
u_.Resize(num_gauss, spk_subspace_dim); // will set to zero.
|
||||
|
|
|
@ -169,6 +169,7 @@ class Sgmm2PerSpkDerivedVars {
|
|||
log_b_is.Resize(0);
|
||||
log_d_jms.resize(0);
|
||||
}
|
||||
bool Empty() { return v_s.Dim() == 0; }
|
||||
// caution: after SetSpeakerVector you typically want to
|
||||
// use the function AmSgmm2::ComputePerSpkDerivedVars
|
||||
const Vector<BaseFloat> &GetSpeakerVector() { return v_s; }
|
||||
|
|
|
@ -584,7 +584,8 @@ void MleAmSgmm2Accs::CommitStatsForSpk(const AmSgmm2 &model,
|
|||
for (int32 i = 0; i < num_gaussians_; i++)
|
||||
// Accumulate Statistics R_{ki}
|
||||
if (gamma_s_(i) != 0.0)
|
||||
R_[i].AddVec2(static_cast<BaseFloat>(gamma_s_(i)), v_s);
|
||||
R_[i].AddVec2(gamma_s_(i),
|
||||
Vector<double>(v_s));
|
||||
}
|
||||
if (a_s_.Dim() != 0) {
|
||||
Vector<BaseFloat> tmp(gamma_s_);
|
||||
|
@ -592,7 +593,8 @@ void MleAmSgmm2Accs::CommitStatsForSpk(const AmSgmm2 &model,
|
|||
tmp.AddVecVec(-1.0, Vector<BaseFloat>(a_s_), spk_vars.b_is, 1.0);
|
||||
t_.AddVecVec(1.0, tmp, v_s); // eq. 53 of techreport.
|
||||
for (int32 i = 0; i < num_gaussians_; i++) {
|
||||
U_[i].AddVec2(a_s_(i) * spk_vars.b_is(i), v_s); // eq. 54 of techreport.
|
||||
U_[i].AddVec2(a_s_(i) * spk_vars.b_is(i),
|
||||
Vector<double>(v_s)); // eq. 54 of techreport.
|
||||
}
|
||||
}
|
||||
gamma_s_.SetZero();
|
||||
|
|
|
@ -68,7 +68,8 @@ int main(int argc, char *argv[]) {
|
|||
RandomAccessSgmm2GauPostReader gpost_reader(gpost_rspecifier);
|
||||
RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
|
||||
utt2spk_rspecifier);
|
||||
|
||||
RandomAccessTokenReader utt2spk_map(utt2spk_rspecifier);
|
||||
|
||||
AmSgmm2 am_sgmm;
|
||||
TransitionModel trans_model;
|
||||
{
|
||||
|
@ -87,8 +88,37 @@ int main(int argc, char *argv[]) {
|
|||
kaldi::Sgmm2PerFrameDerivedVars per_frame_vars;
|
||||
|
||||
int32 num_done = 0, num_err = 0;
|
||||
std::string cur_spk;
|
||||
Sgmm2PerSpkDerivedVars spk_vars;
|
||||
|
||||
for (; !feature_reader.Done(); feature_reader.Next()) {
|
||||
std::string utt = feature_reader.Key();
|
||||
std::string spk = utt;
|
||||
|
||||
if (!utt2spk_rspecifier.empty()) {
|
||||
if (!utt2spk_map.HasKey(utt)) {
|
||||
KALDI_WARN << "utt2spk map does not have value for " << utt
|
||||
<< ", ignoring this utterance.";
|
||||
continue;
|
||||
} else { spk = utt2spk_map.Value(utt); }
|
||||
}
|
||||
if (spk != cur_spk || spk_vars.Empty()) {
|
||||
spk_vars.Clear();
|
||||
if (spkvecs_reader.IsOpen()) {
|
||||
if (spkvecs_reader.HasKey(utt)) {
|
||||
spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
|
||||
am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
|
||||
} else {
|
||||
KALDI_WARN << "Cannot find speaker vector for " << utt;
|
||||
num_err++;
|
||||
continue;
|
||||
}
|
||||
} // else spk_vars is "empty"
|
||||
}
|
||||
if (spk != cur_spk)
|
||||
sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
|
||||
cur_spk = spk;
|
||||
|
||||
const Matrix<BaseFloat> &mat = feature_reader.Value();
|
||||
if (!gpost_reader.HasKey(utt) ||
|
||||
gpost_reader.Value(utt).size() != mat.NumRows()) {
|
||||
|
@ -98,19 +128,7 @@ int main(int argc, char *argv[]) {
|
|||
continue;
|
||||
}
|
||||
const Sgmm2GauPost &gpost = gpost_reader.Value(utt);
|
||||
|
||||
Sgmm2PerSpkDerivedVars spk_vars;
|
||||
if (spkvecs_reader.IsOpen()) {
|
||||
if (spkvecs_reader.HasKey(utt)) {
|
||||
spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
|
||||
am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
|
||||
} else {
|
||||
KALDI_WARN << "Cannot find speaker vector for " << utt;
|
||||
num_err++;
|
||||
continue;
|
||||
}
|
||||
} // else spk_vars is "empty"
|
||||
|
||||
|
||||
num_done++;
|
||||
BaseFloat tot_weight = 0.0;
|
||||
|
||||
|
@ -130,15 +148,14 @@ int main(int argc, char *argv[]) {
|
|||
pdf_id, &spk_vars);
|
||||
tot_weight += weight;
|
||||
}
|
||||
|
||||
sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars); // no harm doing it per
|
||||
// utterance.
|
||||
|
||||
tot_t += tot_weight;
|
||||
if (num_done % 50 == 0)
|
||||
KALDI_LOG << "Processed " << num_done << " utterances";
|
||||
}
|
||||
|
||||
tot_t += tot_weight;
|
||||
if (num_done % 50 == 0)
|
||||
KALDI_LOG << "Processed " << num_done << " utterances";
|
||||
}
|
||||
sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars); // for last speaker
|
||||
|
||||
KALDI_LOG << "Overall number of frames is " << tot_t;
|
||||
|
||||
KALDI_LOG << "Done " << num_done << " files, " << num_err;
|
||||
|
|
|
@ -85,7 +85,8 @@ int main(int argc, char *argv[]) {
|
|||
RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
|
||||
RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
|
||||
utt2spk_rspecifier);
|
||||
|
||||
RandomAccessTokenReader utt2spk_map(utt2spk_rspecifier);
|
||||
|
||||
AmSgmm2 am_sgmm;
|
||||
TransitionModel trans_model;
|
||||
{
|
||||
|
@ -103,9 +104,37 @@ int main(int argc, char *argv[]) {
|
|||
double tot_t = 0;
|
||||
|
||||
kaldi::Sgmm2PerFrameDerivedVars per_frame_vars;
|
||||
|
||||
std::string cur_spk;
|
||||
Sgmm2PerSpkDerivedVars spk_vars;
|
||||
|
||||
for (; !feature_reader.Done(); feature_reader.Next()) {
|
||||
std::string utt = feature_reader.Key();
|
||||
std::string spk = utt;
|
||||
if (!utt2spk_rspecifier.empty()) {
|
||||
if (!utt2spk_map.HasKey(utt)) {
|
||||
KALDI_WARN << "utt2spk map does not have value for " << utt
|
||||
<< ", ignoring this utterance.";
|
||||
continue;
|
||||
} else { spk = utt2spk_map.Value(utt); }
|
||||
}
|
||||
if (spk != cur_spk || spk_vars.Empty()) {
|
||||
spk_vars.Clear();
|
||||
if (spkvecs_reader.IsOpen()) {
|
||||
if (spkvecs_reader.HasKey(utt)) {
|
||||
spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
|
||||
am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
|
||||
} else {
|
||||
KALDI_WARN << "Cannot find speaker vector for " << utt;
|
||||
num_err++;
|
||||
continue;
|
||||
}
|
||||
} // else spk_vars is "empty"
|
||||
}
|
||||
|
||||
if (spk != cur_spk)
|
||||
sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
|
||||
cur_spk = spk;
|
||||
|
||||
const Matrix<BaseFloat> &features = feature_reader.Value();
|
||||
if (!posteriors_reader.HasKey(utt) ||
|
||||
posteriors_reader.Value(utt).size() != features.NumRows()) {
|
||||
|
@ -125,17 +154,6 @@ int main(int argc, char *argv[]) {
|
|||
const std::vector<std::vector<int32> > &gselect =
|
||||
gselect_reader.Value(utt);
|
||||
|
||||
Sgmm2PerSpkDerivedVars spk_vars;
|
||||
if (spkvecs_reader.IsOpen()) {
|
||||
if (spkvecs_reader.HasKey(utt)) {
|
||||
spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
|
||||
am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
|
||||
} else {
|
||||
KALDI_WARN << "Cannot find speaker vector for " << utt;
|
||||
num_err++;
|
||||
continue;
|
||||
}
|
||||
} // else spk_vars is "empty"
|
||||
num_done++;
|
||||
|
||||
BaseFloat tot_like_this_file = 0.0, tot_weight = 0.0;
|
||||
|
@ -155,8 +173,6 @@ int main(int argc, char *argv[]) {
|
|||
tot_weight += weight;
|
||||
}
|
||||
}
|
||||
|
||||
sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars); // no harm doing it per utterance.
|
||||
|
||||
KALDI_VLOG(2) << "Average like for this file is "
|
||||
<< (tot_like_this_file/tot_weight) << " over "
|
||||
|
@ -170,12 +186,15 @@ int main(int argc, char *argv[]) {
|
|||
<< " over " << tot_weight <<" frames.";
|
||||
}
|
||||
}
|
||||
sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars); // commit stats for
|
||||
// last speaker.
|
||||
|
||||
KALDI_LOG << "Overall like per frame (Gaussian only) = "
|
||||
<< (tot_like/tot_t) << " over " << tot_t << " frames.";
|
||||
|
||||
KALDI_LOG << "Done " << num_done << " files, " << num_err
|
||||
<< " with errors.";
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
Output ko(accs_wxfilename, binary);
|
||||
|
|
|
@ -77,6 +77,7 @@ int main(int argc, char *argv[]) {
|
|||
RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
|
||||
RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
|
||||
utt2spk_rspecifier);
|
||||
RandomAccessTokenReader utt2spk_map(utt2spk_rspecifier);
|
||||
|
||||
AmSgmm2 am_sgmm;
|
||||
TransitionModel trans_model;
|
||||
|
@ -114,8 +115,38 @@ int main(int argc, char *argv[]) {
|
|||
kaldi::Sgmm2PerFrameDerivedVars per_frame_vars;
|
||||
|
||||
int32 num_done = 0, num_err = 0;
|
||||
std::string cur_spk;
|
||||
Sgmm2PerSpkDerivedVars spk_vars;
|
||||
|
||||
for (; !feature_reader.Done(); feature_reader.Next()) {
|
||||
std::string utt = feature_reader.Key();
|
||||
std::string spk = utt;
|
||||
if (!utt2spk_rspecifier.empty()) {
|
||||
if (!utt2spk_map.HasKey(utt)) {
|
||||
KALDI_WARN << "utt2spk map does not have value for " << utt
|
||||
<< ", ignoring this utterance.";
|
||||
continue;
|
||||
} else { spk = utt2spk_map.Value(utt); }
|
||||
}
|
||||
if (spk != cur_spk || spk_vars.Empty()) {
|
||||
spk_vars.Clear();
|
||||
if (spkvecs_reader.IsOpen()) {
|
||||
if (spkvecs_reader.HasKey(utt)) {
|
||||
spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
|
||||
am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
|
||||
} else {
|
||||
KALDI_WARN << "Cannot find speaker vector for " << utt;
|
||||
num_err++;
|
||||
continue;
|
||||
}
|
||||
} // else spk_vars is "empty"
|
||||
}
|
||||
if (spk != cur_spk) {
|
||||
num_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
|
||||
den_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
|
||||
}
|
||||
cur_spk = spk;
|
||||
|
||||
const Matrix<BaseFloat> &features = feature_reader.Value();
|
||||
if (!posteriors_reader.HasKey(utt) ||
|
||||
posteriors_reader.Value(utt).size() != features.NumRows()) {
|
||||
|
@ -124,6 +155,7 @@ int main(int argc, char *argv[]) {
|
|||
num_err++;
|
||||
continue;
|
||||
}
|
||||
|
||||
const Posterior &posterior = posteriors_reader.Value(utt);
|
||||
if (!gselect_reader.HasKey(utt)
|
||||
&& gselect_reader.Value(utt).size() != features.NumRows()) {
|
||||
|
@ -134,18 +166,6 @@ int main(int argc, char *argv[]) {
|
|||
const std::vector<std::vector<int32> > &gselect =
|
||||
gselect_reader.Value(utt);
|
||||
|
||||
Sgmm2PerSpkDerivedVars spk_vars;
|
||||
if (spkvecs_reader.IsOpen()) {
|
||||
if (spkvecs_reader.HasKey(utt)) {
|
||||
spk_vars.SetSpeakerVector(spkvecs_reader.Value(utt));
|
||||
am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
|
||||
} else {
|
||||
KALDI_WARN << "Cannot find speaker vector for " << utt;
|
||||
num_err++;
|
||||
continue;
|
||||
}
|
||||
} // else spk_vars is "empty"
|
||||
|
||||
num_done++;
|
||||
BaseFloat tot_like_this_file = 0.0, tot_weight_this_file = 0.0,
|
||||
tot_abs_weight_this_file = 0.0;
|
||||
|
@ -172,8 +192,10 @@ int main(int argc, char *argv[]) {
|
|||
tot_abs_weight_this_file += abs_weight;
|
||||
}
|
||||
}
|
||||
num_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars); // no harm doing it per utterance.
|
||||
// Commit stats for the last speaker.
|
||||
num_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
|
||||
den_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
|
||||
|
||||
|
||||
tot_like += tot_like_this_file;
|
||||
tot_weight += tot_weight_this_file;
|
||||
|
@ -182,6 +204,10 @@ int main(int argc, char *argv[]) {
|
|||
if (num_done % 50 == 0)
|
||||
KALDI_LOG << "Processed " << num_done << " utterances.";
|
||||
}
|
||||
// Commit stats for last speaker.
|
||||
num_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
|
||||
den_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
|
||||
|
||||
KALDI_LOG << "Overall weighted acoustic likelihood per frame was "
|
||||
<< (tot_like/tot_frames) << " over " << tot_frames << " frames; "
|
||||
<< "average weight per frame is " << (tot_weight/tot_frames)
|
||||
|
@ -208,5 +234,3 @@ int main(int argc, char *argv[]) {
|
|||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ std::string PrintableRxfilename(std::string rxfilename) {
|
|||
|
||||
|
||||
std::string PrintableWxfilename(std::string wxfilename) {
|
||||
if (wxfilename == "" || wxfilename == "-") return "standard input";
|
||||
if (wxfilename == "" || wxfilename == "-") return "standard output";
|
||||
else {
|
||||
// If this call to Escape later causes compilation issues,
|
||||
// just replace it with "return wxfilename"; it's only a
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Make sure we are in the tools/ directory.
|
||||
if [ `basename $PWD` == extras ]; then
|
||||
cd ..
|
||||
fi
|
||||
|
||||
! [ `basename $PWD` == tools ] && \
|
||||
echo "You must call this script from the tools/ directory" && exit 1;
|
||||
|
||||
mkdir -p pitch_trackers
|
||||
cd pitch_trackers
|
||||
|
||||
echo "Installing a package for FFV feature extraction."
|
||||
|
||||
if [ -s ffv-1.0.1.tar.gz ]; then
|
||||
echo "*ffv-1.0.1.tar.gz already exists, not getting it."
|
||||
else
|
||||
! wget -t 2 http://www.cs.cmu.edu/~kornel/software/ffv-1.0.1.tar.gz && \
|
||||
echo "Error wgetting ffv-1.0.1.tar.gz" && exit 1;
|
||||
fi
|
||||
|
||||
if [ -d ffv-1.0.1 ]; then
|
||||
echo "*It looks like ffv-1.0.1.tar.gz has already been unpacked, not unpacking it."
|
||||
else
|
||||
! tar -zxvf ffv-1.0.1.tar.gz && \
|
||||
echo "Error unpacking ffv-1.0.1.tar.gz [e.g. unpack not installed?]" && exit 1;
|
||||
fi
|
||||
cd ffv-1.0.1
|
||||
|
||||
if [ -f Makefile ]; then
|
||||
echo "Makefile already exists, no creating it."
|
||||
else
|
||||
echo "Makefile does not exist, creating it."
|
||||
cat<<'EOF' > ./Makefile
|
||||
CC = gcc
|
||||
# CFLAGS = -c -O3 -Wall -pedantic -std=c99
|
||||
CFLAGS = -c -g -Wall -pedantic -std=c99
|
||||
LIBS = -lm
|
||||
|
||||
LIBOBJECTS = \
|
||||
\
|
||||
windowpair.o \
|
||||
filterbank.o \
|
||||
dcorrxform.o \
|
||||
ffv.o \
|
||||
mutils.o \
|
||||
sutils.o
|
||||
|
||||
all : ffv
|
||||
|
||||
ffv : ffv_main.o ${LIBOBJECTS}
|
||||
${CC} -o $@ $^ ${LIBS}
|
||||
|
||||
%.o : %.c
|
||||
${CC} ${CFLAGS} $<
|
||||
|
||||
clean :
|
||||
rm -f *.o ffv
|
||||
EOF
|
||||
chmod +x Makefile
|
||||
fi
|
||||
make;
|
||||
cd ..
|
||||
|
||||
echo "Installing ffv package is done."
|
||||
exit 0;
|
||||
|
||||
|
||||
|
|
@ -8,8 +8,9 @@ fi
|
|||
! [ `basename $PWD` == tools ] && \
|
||||
echo "You must call this script from the tools/ directory" && exit 1;
|
||||
|
||||
mkdir -p sacc
|
||||
cd sacc
|
||||
mkdir -p pitch_trackers/sacc
|
||||
|
||||
cd pitch_trackers/sacc
|
||||
if [ -s SAcC_GLNXA64.zip ]; then
|
||||
echo "*SAcC_GLNXA64.zip already exists, not getting it."
|
||||
else
|
||||
|
|
Загрузка…
Ссылка в новой задаче