src/util: Enabled reading partial matrices for TableReader (initial work by xzhang, rebased by dan)

This commit is contained in:
xiaohui-zhang 2016-03-22 23:11:56 -04:00 коммит произвёл Daniel Povey
Родитель adb67c4e01
Коммит 0a60eaf8c4
10 изменённых файлов: 435 добавлений и 74 удалений

Просмотреть файл

@ -103,6 +103,7 @@ class WaveData {
data_.Swap(&(other->data_));
std::swap(samp_freq_, other->samp_freq_);
}
private:
static const uint32 kBlockSize = 1024 * 1024; // Use 1M bytes.
Matrix<BaseFloat> data_;
@ -171,6 +172,11 @@ class WaveHolder {
t_.Swap(&(other->t_));
}
// Range extraction is not supported for WaveHolder: this unconditionally
// reports an error through KALDI_ERR and never touches 'other' or 'range'.
bool ExtractRange(WaveHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
private:
T t_;
};
@ -216,6 +222,11 @@ class WaveInfoHolder {
void Swap(WaveInfoHolder *other) {
t_.Swap(&(other->t_));
}
// Range extraction is not supported for WaveInfoHolder: this unconditionally
// reports an error through KALDI_ERR and never touches 'other' or 'range'.
bool ExtractRange(WaveInfoHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
private:
T t_;
};

Просмотреть файл

@ -114,6 +114,11 @@ class VectorFstTplHolder {
std::swap(t_, other->t_);
}
// Range extraction is not supported for VectorFstTplHolder: this
// unconditionally reports an error through KALDI_ERR and never touches
// 'other' or 'range'.
bool ExtractRange(VectorFstTplHolder<Arc> *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
~VectorFstTplHolder() { Clear(); }
// No destructor. Assignment and
// copy constructor take their default implementations.

Просмотреть файл

@ -77,6 +77,11 @@ class PosteriorHolder {
void Swap(PosteriorHolder *other) {
t_.swap(other->t_);
}
// Range extraction is not supported for PosteriorHolder: this unconditionally
// reports an error through KALDI_ERR and never touches 'other' or 'range'.
bool ExtractRange(PosteriorHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(PosteriorHolder);
T t_;
@ -116,6 +121,11 @@ class GaussPostHolder {
void Swap(GaussPostHolder *other) {
t_.swap(other->t_);
}
// Range extraction is not supported for GaussPostHolder: this unconditionally
// reports an error through KALDI_ERR and never touches 'other' or 'range'.
bool ExtractRange(GaussPostHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(GaussPostHolder);
T t_;

Просмотреть файл

@ -94,6 +94,11 @@ class CompactLatticeHolder {
std::swap(t_, other->t_);
}
// Range extraction is not supported for CompactLatticeHolder: this
// unconditionally reports an error through KALDI_ERR and never touches
// 'other' or 'range'.
bool ExtractRange(CompactLatticeHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
~CompactLatticeHolder() { Clear(); }
private:
T *t_;
@ -127,6 +132,11 @@ class LatticeHolder {
std::swap(t_, other->t_);
}
// Range extraction is not supported for LatticeHolder: this unconditionally
// reports an error through KALDI_ERR and never touches 'other' or 'range'.
bool ExtractRange(LatticeHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
~LatticeHolder() { Clear(); }
private:
T *t_;

Просмотреть файл

@ -10,7 +10,7 @@ TESTFILES = const-integer-set-test stl-utils-test text-utils-test \
kaldi-table-test simple-options-test
OBJFILES = text-utils.o kaldi-io.o \
kaldi-table.o parse-options.o simple-options.o simple-io-funcs.o
kaldi-holder.o kaldi-table.o parse-options.o simple-options.o simple-io-funcs.o
LIBNAME = kaldi-util

Просмотреть файл

@ -1,6 +1,7 @@
// util/kaldi-holder-inl.h
// Copyright 2009-2011 Microsoft Corporation
// 2016 Xiaohui Zhang
// See ../../COPYING for clarification regarding multiple authors
//
@ -101,6 +102,12 @@ template<class KaldiType> class KaldiObjectHolder {
std::swap(t_, other->t_);
}
// Extracts the part of the object held by this holder that is selected by
// 'range' (e.g. "1:2,2:10") and stores it in 'other'.
//
// Whatever 'other' held before is deleted and replaced by a freshly
// allocated T, which ExtractObjectRange() then fills in.  The return value
// is that of ExtractObjectRange(); for object types without a range-aware
// overload the generic version reports an error.
//
// This holder must currently hold an object.
bool ExtractRange(KaldiObjectHolder<T> *other, const std::string &range) {
  // Guard against dereferencing a null pointer below; t_ is presumably null
  // when nothing has been read into this holder -- verify against Clear().
  if (!t_)
    KALDI_ERR << "ExtractRange called on a holder that holds no object.";
  delete other->t_;
  other->t_ = new T;
  return ExtractObjectRange(*t_, range, other->t_);
}
~KaldiObjectHolder() { delete t_; }
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(KaldiObjectHolder);
@ -194,6 +201,11 @@ template<class BasicType> class BasicHolder {
std::swap(t_, other->t_);
}
// Range extraction is not supported for BasicHolder: this unconditionally
// reports an error through KALDI_ERR and never touches 'other' or 'range'.
bool ExtractRange(BasicHolder<T> *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
~BasicHolder() { }
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(BasicHolder);
@ -309,6 +321,12 @@ template<class BasicType> class BasicVectorHolder {
t_.swap(other->t_);
}
// Range extraction is not supported for BasicVectorHolder: this
// unconditionally reports an error through KALDI_ERR and never touches
// 'other' or 'range'.
bool ExtractRange(BasicVectorHolder<BasicType> *other,
const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
~BasicVectorHolder() { }
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(BasicVectorHolder);
@ -456,6 +474,12 @@ template<class BasicType> class BasicVectorVectorHolder {
t_.swap(other->t_);
}
// Range extraction is not supported for BasicVectorVectorHolder: this
// unconditionally reports an error through KALDI_ERR and never touches
// 'other' or 'range'.
bool ExtractRange(BasicVectorVectorHolder<BasicType> *other,
const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
~BasicVectorVectorHolder() { }
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(BasicVectorVectorHolder);
@ -597,6 +621,12 @@ template<class BasicType> class BasicPairVectorHolder {
t_.swap(other->t_);
}
// Range extraction is not supported for BasicPairVectorHolder: this
// unconditionally reports an error through KALDI_ERR and never touches
// 'other' or 'range'.
bool ExtractRange(BasicPairVectorHolder<BasicType> *other,
const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
~BasicPairVectorHolder() { }
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(BasicPairVectorHolder);
@ -652,6 +682,11 @@ class TokenHolder {
t_.swap(other->t_);
}
// Range extraction is not supported for TokenHolder: this unconditionally
// reports an error through KALDI_ERR and never touches 'other' or 'range'.
bool ExtractRange(TokenHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(TokenHolder);
T t_;
@ -709,6 +744,11 @@ class TokenVectorHolder {
t_.swap(other->t_);
}
// Range extraction is not supported for TokenVectorHolder: this
// unconditionally reports an error through KALDI_ERR and never touches
// 'other' or 'range'.
bool ExtractRange(TokenVectorHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(TokenVectorHolder);
T t_;
@ -752,6 +792,11 @@ class HtkMatrixHolder {
std::swap(t_.second, other->t_.second);
}
// Range extraction is not supported for HtkMatrixHolder: this unconditionally
// reports an error through KALDI_ERR and never touches 'other' or 'range'.
bool ExtractRange(HtkMatrixHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
// No destructor.
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(HtkMatrixHolder);
@ -857,6 +902,11 @@ template<int kFeatDim> class SphinxMatrixHolder {
feats_.Swap(&(other->feats_));
}
// Range extraction is not supported for SphinxMatrixHolder: this
// unconditionally reports an error through KALDI_ERR and never touches
// 'other' or 'range'.
bool ExtractRange(SphinxMatrixHolder *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached if KALDI_ERR throws -- TODO confirm.
return false;
}
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(SphinxMatrixHolder);
T feats_;

77
src/util/kaldi-holder.cc Normal file
Просмотреть файл

@ -0,0 +1,77 @@
// util/kaldi-holder.cc
// Copyright 2009-2011 Microsoft Corporation
// 2016 Xiaohui Zhang
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "util/kaldi-holder.h"
#include "matrix/kaldi-matrix.h"
namespace kaldi {
/// Extracts the sub-matrix of 'input' selected by 'range' into '*output'.
///
/// 'range' is the text found inside the square brackets of a script-file
/// entry, in one of these forms (all indices are zero-based and inclusive):
///   "r0:r1"        rows r0..r1, all columns
///   ":"            all rows, all columns
///   "r0:r1,c0:c1"  rows r0..r1, columns c0..c1
///   ":,c0:c1"      all rows, columns c0..c1
/// A single row or column is selected by repeating its index, e.g. "3:3".
///
/// On success '*output' is resized to the selected shape, filled with a copy
/// of the selected region of 'input', and true is returned.  An empty or
/// malformed 'range', or one that does not lie inside 'input', is reported
/// through KALDI_ERR.
template<class Real>
bool ExtractObjectRange(const Matrix<Real> &input, const std::string &range,
                        Matrix<Real> *output) {
  if (range.empty()) {
    KALDI_ERR << "Empty range specifier.";
    return false;
  }
  // Split into the row part and the optional column part.  Empty pieces are
  // kept (omit_empty_strings == false) so that ",2:3" or "1:2," are rejected.
  std::vector<std::string> splits;
  SplitStringToVector(range, ",", false, &splits);
  if (!((splits.size() == 1 && !splits[0].empty()) ||
        (splits.size() == 2 && !splits[0].empty() && !splits[1].empty()))) {
    KALDI_ERR << "Invalid range specifier: " << range;
    return false;
  }
  std::vector<int32> row_range, col_range;
  bool status = true;
  // A bare ":" means "everything along this dimension"; the corresponding
  // vector is then left empty and filled with the full extent below.
  if (splits[0] != ":")
    status = SplitStringToIntegers(splits[0], ":", false, &row_range);
  if (splits.size() == 2 && splits[1] != ":") {
    status = status && SplitStringToIntegers(splits[1], ":", false, &col_range);
  }
  if (row_range.size() == 0) {
    row_range.push_back(0);
    row_range.push_back(input.NumRows() - 1);
  }
  if (col_range.size() == 0) {
    col_range.push_back(0);
    col_range.push_back(input.NumCols() - 1);
  }
  // first <= last (not first < last): a range covering exactly one row or
  // column is valid, and it is also what the defaults above produce for a
  // matrix that has a single row or column.  An empty input still fails
  // here because its default range is 0:-1.
  if (!(status && row_range.size() == 2 && col_range.size() == 2 &&
        row_range[0] >= 0 && row_range[0] <= row_range[1] &&
        row_range[1] < input.NumRows() && col_range[0] >= 0 &&
        col_range[0] <= col_range[1] && col_range[1] < input.NumCols())) {
    KALDI_ERR << "Invalid range specifier: " << range;
    return false;
  }
  // Both ends of each range are inclusive, hence the "+ 1".
  int32 row_size = row_range[1] - row_range[0] + 1,
        col_size = col_range[1] - col_range[0] + 1;
  // kUndefined: every element is overwritten by CopyFromMat() right after.
  output->Resize(row_size, col_size, kUndefined);
  output->CopyFromMat(input.Range(row_range[0], row_size,
                                  col_range[0], col_size));
  return true;
}
// template instantiation
template bool ExtractObjectRange(const Matrix<double> &, const std::string &,
Matrix<double> *);
template bool ExtractObjectRange(const Matrix<BaseFloat> &, const std::string &,
Matrix<BaseFloat> *);
} // end namespace kaldi

Просмотреть файл

@ -2,6 +2,7 @@
// Copyright 2009-2011 Microsoft Corporation
// 2016 Johns Hopkins University (author: Daniel Povey)
// 2016 Xiaohui Zhang
// See ../../COPYING for clarification regarding multiple authors
//
@ -134,6 +135,15 @@ template<class SomeType> class GenericHolder {
/// of holder, not with some nonexistent base-class.
void Swap(GenericHolder<T> *other) { std::swap(t_, other->t_); }
/// Extracts the part of the held object selected by 'range' into 'other'.
/// This is only meaningfully defined for KaldiObjectHolder holding matrix
/// objects, where it produces a holder holding the sub-matrix specified by
/// 'range', e.g. [1:2,2:10], and returns true on successful extraction.
/// For other types of holder, such as this generic one, it just throws an
/// error.
bool ExtractRange(GenericHolder<T> *other, const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder.";
// Presumably only present to give the function a return value on every
// path; not expected to be reached once the error above is thrown.
return false;
}
/// If the object held pointers, the destructor would free them.
~GenericHolder() { }
@ -205,6 +215,25 @@ class HtkMatrixHolder;
/// A class for reading/writing Sphinx format matrices.
template<int kFeatDim = 13> class SphinxMatrixHolder;
/// This templated function exists so that we can write .scp files with
/// 'object ranges' specified: the canonical example is a [first:last] range
/// of rows of a matrix, or [first-row:last-row,first-column:last-column]
/// of a matrix. We can also support [begin-time:end-time] of a wave
/// file. The string 'range' is whatever is in the square brackets; it is
/// parsed inside this function.
/// This function returns true if the partial object was successfully extracted,
/// and false if there was an error such as an invalid range.
/// The generic version of this function just fails; we overload the template
/// whenever we need it for a specific class.
template <class T>
bool ExtractObjectRange(const T &input, const std::string &range, T *output) {
// Generic fallback: neither 'input', 'range' nor 'output' is inspected.
KALDI_ERR << "Ranges not supported for objects of this type.";
return false;
}
/// Overload of ExtractObjectRange() for matrices: copies the sub-matrix of
/// 'input' selected by 'range' (e.g. "1:2,2:10") into '*output'.
/// Only declared here; the definition and its explicit instantiations live
/// in kaldi-holder.cc.
template <class Real>
bool ExtractObjectRange(const Matrix<Real> &input, const std::string &range,
Matrix<Real> *output);
/// @} end "addtogroup holders"

Просмотреть файл

@ -28,6 +28,7 @@
#include <utility>
#include <vector>
#include "util/kaldi-io.h"
#include "util/kaldi-holder.h"
#include "util/text-utils.h"
#include "util/stl-utils.h" // for StringHasher.
#include "thread/kaldi-thread.h"
@ -39,6 +40,16 @@ namespace kaldi {
/// \addtogroup table_impl_types
/// @{
// In SequentialTableReaderScriptImpl and RandomAccessTableReaderScriptImpl,
// we use this function to separate the input string 'line'
// (e.g. "1.ark:100[1:2,2:10]"), or (e.g. "sph2pipe -f wav -p -c 1 1.sph |")
// into the data_rxfilename (e.g. "1.ark:100") and the optional range specifier
// (valid cases are: row range e.g. [1:2], column range e.g. [:,2:10], or both
// e.g. [1:2,2:10]).  The range is returned in 'range' without its enclosing
// square brackets.  It returns true if successful.
bool ExtractRangeSpecifier(const std::string &line,
std::string *data_rxfilename,
std::string *range);
template<class Holder> class SequentialTableReaderImplBase {
public:
typedef typename Holder::T T;
@ -79,7 +90,6 @@ template<class Holder> class SequentialTableReaderImplBase {
KALDI_DISALLOW_COPY_AND_ASSIGN(SequentialTableReaderImplBase);
};
// This is the implementation for SequentialTableReader
// when it's actually a script file.
template<class Holder> class SequentialTableReaderScriptImpl:
@ -105,7 +115,8 @@ template<class Holder> class SequentialTableReaderScriptImpl:
state_ = kUninitialized;
return false;
} else { // Open succeeded.
if (binary) { // script file should not be binary file..
if (binary) {
KALDI_WARN << "Script file should not be binary file.";
state_ = kError; // bad script file.
script_input_.Close();
return false;
@ -140,9 +151,9 @@ template<class Holder> class SequentialTableReaderScriptImpl:
virtual bool Done() const {
switch (state_) {
case kHaveScpLine: return false;
case kLoadSucceeded: case kLoadFailed: return false;
// These cases are because we want LoadCurrent()
case kHaveScpLine: case kLoadSucceeded: case kLoadFailed:
case kRangeExtracted: case kRangeExtractionFailed: return false;
// These cases are because we want EnsureObjectLoaded()
// to be callable after Next() and to not change the Done() status
// [only Next() should change the Done() status].
case kEof: case kError: return true; // Error condition, like Eof, counts
@ -156,7 +167,8 @@ template<class Holder> class SequentialTableReaderScriptImpl:
virtual std::string Key() {
// Valid to call this whenever Done() returns false.
switch (state_) {
case kHaveScpLine: case kLoadSucceeded: case kLoadFailed: break;
case kHaveScpLine: case kLoadSucceeded: case kLoadFailed:
case kRangeExtracted: case kRangeExtractionFailed: break;
default:
// coding error.
KALDI_ERR << "Key() called on TableReader object at the wrong time.";
@ -165,9 +177,9 @@ template<class Holder> class SequentialTableReaderScriptImpl:
}
const T &Value() {
StateType orig_state = state_;
if (state_ == kHaveScpLine) LoadCurrent(); // Takes
// state_ to kLoadSucceeded or kLoadFailed.
if (state_ == kLoadFailed) { // this can happen due to
EnsureObjectLoaded(); // Takes state_ to kLoadSucceeded/kRangeExtracted
// or kLoadFailed/kRangeExtractionFailed.
if (state_ == kLoadFailed) {
// a file listed in an scp file not existing, or
// read failure, failure of a command, etc.
if (orig_state == kHaveScpLine)
@ -179,32 +191,53 @@ template<class Holder> class SequentialTableReaderScriptImpl:
else // orig_state_ was kLoadFailed, which only could have happened
// if the user called FreeCurrent().
KALDI_ERR << "You called Value() after FreeCurrent().";
} else if (state_ != kLoadSucceeded) {
} else if (state_ == kRangeExtractionFailed) {
KALDI_ERR << "Failed to load object from "
<< PrintableRxfilename(data_rxfilename_)
<< " [" << range_ << "] "
<< " (to suppress this error, add the permissive "
<< "(p, ) option to the rspecifier.";
} else if (state_ != kLoadSucceeded && state_ != kRangeExtracted) {
// This would be a coding error.
KALDI_ERR << "Value() called at the wrong time.";
}
return holder_.Value();
if (state_ == kRangeExtracted) {
return range_holder_.Value();
} else {
// Here state_ is kLoadSucceeded.
return holder_.Value();
}
}
// Releases the object that the current state makes available.
//  - kLoadSucceeded: clears holder_ and moves to kLoadFailed.
//  - kRangeExtracted: clears only range_holder_ and moves to
//    kRangeExtractionFailed; holder_ is left untouched.
//  - any other state: nothing is freed, only a warning is logged.
void FreeCurrent() {
if (state_ == kLoadSucceeded) {
holder_.Clear();
state_ = kLoadFailed;
} else if (state_ == kRangeExtracted) {
range_holder_.Clear();
state_ = kRangeExtractionFailed;
} else {
KALDI_WARN << "FreeCurrent called at the wrong time.";
}
}
void SwapHolder(Holder *other_holder) {
// call Value() to ensure we have a value, and ignore its return value while
// suppressing compiler warnings by casting to void.
// call Value() to ensure we have a value, and ignore its return value
// while suppressing compiler warnings by casting to void.
(void) Value();
if (state_ == kLoadSucceeded) {
holder_.Swap(other_holder);
state_ = kLoadFailed;
} else if (state_ == kRangeExtracted) {
range_holder_.Swap(other_holder);
state_ = kLoadSucceeded;
// This indicates that we still have the base object (but no range).
} else {
KALDI_ERR << "SwapHolder called at the wrong time "
"(error related to ',bg' modifier).";
}
}
void Next() {
while (1) {
NextScpLine();
@ -212,7 +245,7 @@ template<class Holder> class SequentialTableReaderScriptImpl:
if (opts_.permissive) {
// Permissive mode means, when reading scp files, we treat keys whose
// scp entry cannot be read as nonexistent. This means trying to read.
if (LoadCurrent()) return; // Success.
if (EnsureObjectLoaded()) return; // Success.
// else try the next scp line.
} else {
return; // We go the next key; Value() will crash if we can't
@ -231,8 +264,11 @@ template<class Holder> class SequentialTableReaderScriptImpl:
status = script_input_.Close();
if (data_input_.IsOpen())
data_input_.Close();
if (state_ == kLoadSucceeded)
if (state_ == kLoadSucceeded ||
state_ == kRangeExtracted || state_ == kRangeExtractionFailed) {
range_holder_.Clear();
holder_.Clear();
}
if (!this->IsOpen())
KALDI_ERR << "Close() called on input that was not open.";
StateType old_state = state_;
@ -248,48 +284,88 @@ template<class Holder> class SequentialTableReaderScriptImpl:
} else {
return true;
}
// Possible states Return value
// kLoadSucceeded/kRangeExtracted/kRangeExtractionFailed true
// kError (if opts_.permissive) true
// kError (if !opts_.permissive) false
// kEof (if script_input_.Close() && !opts_.permissive) false
// kEof (if !script_input_.Close() || opts_.permissive) true
// kUninitialized/kFileStart/kHaveScpLine true
// kUninitialized true
}
virtual ~SequentialTableReaderScriptImpl() {
if (!Close())
if (this->IsOpen() && !Close())
KALDI_ERR << "TableReader: reading script file failed: from scp "
<< PrintableRxfilename(script_rxfilename_);
}
private:
bool LoadCurrent() {
// Attempts to load object whose rxfilename is on the current scp line.
if (state_ != kHaveScpLine)
KALDI_ERR << "LoadCurrent() called at the wrong time.";
bool ans;
// note, NULL means it doesn't read the binary-mode header
if (Holder::IsReadInBinary()) {
ans = data_input_.Open(data_rxfilename_, NULL);
} else {
ans = data_input_.OpenTextMode(data_rxfilename_);
}
if (!ans) {
// May want to make this warning a VLOG at some point
KALDI_WARN << "Failed to open file "
<< PrintableRxfilename(data_rxfilename_);
state_ = kLoadFailed;
bool EnsureObjectLoaded() {
// state_ == kRangeExtracted or kRangeExtractionFailed means
// everything that needs to be done has been already done.
if (state_ == kRangeExtracted)
return true;
if (state_ == kRangeExtractionFailed)
return false;
} else {
if (holder_.Read(data_input_.Stream())) {
state_ = kLoadSucceeded;
return true;
} else { // holder_ will not contain data.
KALDI_WARN << "Failed to load object from "
// Attempts to load object whose rxfilename is on the current scp line.
if (state_ == kHaveScpLine) {
bool ans;
// note, NULL means it doesn't read the binary-mode header
if (Holder::IsReadInBinary()) {
KALDI_LOG << data_rxfilename_;
ans = data_input_.Open(data_rxfilename_, NULL);
} else {
ans = data_input_.OpenTextMode(data_rxfilename_);
}
if (!ans) {
// May want to make this warning a VLOG at some point
KALDI_WARN << "Failed to open file "
<< PrintableRxfilename(data_rxfilename_);
state_ = kLoadFailed;
return false;
} else {
if (holder_.Read(data_input_.Stream())) {
if (range_.empty()) {
state_ = kLoadSucceeded;
return true;
} // o.w. we'll go ahead to extract the partial object
// into range_holder_.
} else { // holder_ will not contain data.
KALDI_WARN << "Failed to load object from "
<< PrintableRxfilename(data_rxfilename_);
state_ = kLoadFailed;
return false;
}
}
} else if (state_ != kLoadSucceeded) {
KALDI_ERR << "EnsureObjectLoaded() called at the wrong time.";
}
}
// Here state_ == kLoadSucceeded, we will just
// extract the range_holder_ from holder_ according to the range_ specifier,
// e.g. [1:3,4:8], or just return true if range_ is empty.
if (range_.empty()) return true;
if (!holder_.ExtractRange(&range_holder_, range_)) {
KALDI_WARN << "Failed to load object from "
<< PrintableRxfilename(data_rxfilename_)
<< "[" << range_ << "]";
state_ = kRangeExtractionFailed;
return false;
} else {
state_ = kRangeExtracted;
return true;
}
// Possible states Return value
// kLoadSucceeded/kRangeExtracted true
// kLoadFailed/kRangeExtractionFailed false
}
// Reads the next line in the script file.
void NextScpLine() {
StateType old_state = state_;
switch (state_) {
case kLoadSucceeded: holder_.Clear(); break;
case kLoadSucceeded:
case kRangeExtracted: case kRangeExtractionFailed:
case kHaveScpLine: case kLoadFailed: case kFileStart: break;
default:
// No other states are valid to call Next() from.
@ -297,11 +373,38 @@ template<class Holder> class SequentialTableReaderScriptImpl:
}
std::string line;
if (getline(script_input_.Stream(), line)) {
SplitStringOnFirstSpace(line, &key_, &data_rxfilename_);
if (!key_.empty() && !data_rxfilename_.empty()) {
// After extracting "key" from "line", we put the rest
// of "line" into "rest", and then extract data_rxfilename_
// (e.g. 1.ark:100) and possibly the range_ specifier
// (e.g. [1:2,2:10]) from "rest".
std::string data_rxfilename_next, rest;
SplitStringOnFirstSpace(line, &key_, &rest);
if (!key_.empty() && !rest.empty()) {
// Got a valid line.
state_ = kHaveScpLine;
if (rest[rest.size()-1] == ']') {
if(!ExtractRangeSpecifier(rest, &data_rxfilename_next, &range_)) {
// Got an invalid line.
state_ = kError; // we can't make sense of this
// scp file and will now die.
}
} else {
data_rxfilename_next = rest;
range_ = "";
}
range_holder_.Clear();
if (old_state == kLoadSucceeded || old_state == kRangeExtracted ||
old_state == kRangeExtractionFailed) {
if (data_rxfilename_ == data_rxfilename_next) {
state_ = kLoadSucceeded;
} else {
holder_.Clear();
}
}
data_rxfilename_ = data_rxfilename_next;
} else {
KALDI_WARN << "We got an invalid line in the scp file. "
<< "(it should be like: some_key 1.ark:10)";
// Got an invalid line.
state_ = kError; // we can't make sense of this
// scp file and will now die.
@ -315,30 +418,37 @@ template<class Holder> class SequentialTableReaderScriptImpl:
}
}
Input script_input_; // Input object for the .scp file
Input data_input_; // Input object for the entries in
// the script file.
Holder holder_; // Holds the object.
Holder range_holder_; // Holds the partial object corresponding to the object
// range specifier 'range_'. this is only used when 'range_' is specified.
bool binary_; // Binary-mode archive.
std::string key_;
std::string rspecifier_;
std::string script_rxfilename_; // of the script file.
RspecifierOptions opts_; // options.
std::string data_rxfilename_; // of the file we're reading.
std::string range_; // range with which we extract range_holder_ from holder_.
enum StateType {
// [The state of the reading process] [does holder_ [is script_inp_
// have object] open]
kUninitialized, // Uninitialized or closed. no no
kEof, // We did Next() and found eof in script file. no no
kUninitialized, // Uninitialized or closed. no no
kEof, // We did Next() and found eof in script file. no no
kError, // Some other error no yes
kHaveScpLine, // Just called Open() or Next() and have a no yes
// line of the script file but no data.
kLoadSucceeded, // Called LoadCurrent() and it succeeded. yes yes
kLoadFailed, // Called LoadCurrent() and it failed, no yes
kLoadSucceeded, // Called EnsureObjectLoaded() and
// it succeeded. yes yes
kLoadFailed, // Called EnsureObjectLoaded() and it failed, no yes
// or the user called FreeCurrent().. note,
// if when called by user we are in this state,
// it means the user called FreeCurrent().
kRangeExtracted, // we successfully extracte the partial yes yes
// object.
kRangeExtractionFailed, // we failed to extract the partial yes yes
// object.
kFileStart, // [state we only use internally] no yes
} state_;
private:
@ -486,6 +596,7 @@ template<class Holder> class SequentialTableReaderArchiveImpl:
}
return key_;
}
const T &Value() {
switch (state_) {
case kHaveObject:
@ -496,6 +607,7 @@ template<class Holder> class SequentialTableReaderArchiveImpl:
}
return holder_.Value();
}
virtual void FreeCurrent() {
if (state_ == kHaveObject) {
holder_.Clear();
@ -504,6 +616,7 @@ template<class Holder> class SequentialTableReaderArchiveImpl:
KALDI_WARN << "FreeCurrent called at the wrong time.";
}
}
void SwapHolder(Holder *other_holder) {
// call Value() to ensure we have a value, and ignore its return value while
// suppressing compiler warnings by casting to void.
@ -546,7 +659,7 @@ template<class Holder> class SequentialTableReaderArchiveImpl:
}
virtual ~SequentialTableReaderArchiveImpl() {
if (!Close())
if (this->IsOpen() && !Close())
KALDI_ERR << "TableReader: error detected closing archive "
<< PrintableRxfilename(archive_rxfilename_);
}
@ -821,7 +934,7 @@ template<class Holder>
const typename SequentialTableReader<Holder>::T &
SequentialTableReader<Holder>::Value() {
CheckImpl();
return impl_->Value(); // This may throw (if LoadCurrent() returned false you
return impl_->Value(); // This may throw (if EnsureObjectLoaded() returned false you
// are safe.).
}
@ -1540,10 +1653,13 @@ class RandomAccessTableReaderScriptImpl:
KALDI_ERR << "Close() called on RandomAccessTableReader that was not"
" open.";
holder_.Clear();
range_holder_.Clear();
state_ = kUninitialized;
last_found_ = 0;
script_.clear();
current_key_ = "";
range_ = "";
data_rxfilename_ = "";
// This one cannot fail because any errors of a "global"
// nature would have been detected when we did Open().
// With archives it's different.
@ -1564,8 +1680,8 @@ class RandomAccessTableReaderScriptImpl:
if (!IsOpen())
KALDI_ERR << "Value() called on non-open object.";
if (!((state_ == kHaveObject || state_ == kGaveObject)
&& key == current_key_)) { // Not already stored...
if (!((state_ == kHaveObject || state_ == kGaveObject) &&
key == current_key_)) { // Not already stored...
bool has_key = HasKeyInternal(key, true); // preload.
if (!has_key)
KALDI_ERR << "Could not get item for key " << key
@ -1573,22 +1689,31 @@ class RandomAccessTableReaderScriptImpl:
<< "add the p, (permissive) option to the rspecifier.";
KALDI_ASSERT(state_ == kHaveObject && key == current_key_);
}
if (state_ == kHaveObject) {
state_ = kGaveObject;
if (opts_.once) MakeTombstone(key); // make sure that future lookups fail
return holder_.Value();
} else { // state_ == kGaveObject
if (opts_.once)
KALDI_ERR << "Value called twice for the same key and ,o (once) option "
<< "is used: rspecifier is " << rspecifier_;
}
if (!range_.empty()) {
// we extract range_holder_ from holder_ according to the range_ specifier,
// e.g. [1:3,4:8].
if (!holder_.ExtractRange(&range_holder_, range_)) {
KALDI_ERR << "Failed to extract partial object with range " << range_;
}
return range_holder_.Value();
} else {
return holder_.Value();
}
}
virtual ~RandomAccessTableReaderScriptImpl() {
if (state_ == kHaveObject || state_ == kGaveObject)
if (state_ == kHaveObject || state_ == kGaveObject) {
holder_.Clear();
data_rxfilename_ = "";
}
}
private:
@ -1626,25 +1751,49 @@ class RandomAccessTableReaderScriptImpl:
return true; // we have the key.
} else { // preload specified, so we have to pre-load the object before
// returning true.
if (!input_.Open(script_[key_pos].second)) {
KALDI_WARN << "Error opening stream "
<< PrintableRxfilename(script_[key_pos].second);
return false;
} else {
// Make sure holder empty.
if (state_ == kHaveObject || state_ == kGaveObject)
holder_.Clear();
if (holder_.Read(input_.Stream())) {
state_ = kHaveObject;
current_key_ = key;
return true;
} else {
KALDI_WARN << "Error reading object from "
"stream " << PrintableRxfilename(script_[key_pos].second);
state_ = kNotHaveObject;
std::string data_rxfilename; // We will split filename
// script_[key_pos].second into data_rxfilename (e.g. 1.ark:100)
// and range_(if any, e.g. [1:2,2:10]).
if (script_[key_pos].second[script_[key_pos].second.size()-1] == ']') {
if(!ExtractRangeSpecifier(script_[key_pos].second,
&data_rxfilename,
&range_)) {
KALDI_WARN << "TableReader: fail to parse file name "
<< PrintableRxfilename(script_[key_pos].second);
return false;
}
} else {
data_rxfilename = script_[key_pos].second;
if (!range_.empty()) {
range_ = "";
range_holder_.Clear();
}
}
// we only need to update holder_ if the new data_rxfilename is
// different from the current data_rxfilename we have.
if (data_rxfilename != data_rxfilename_) {
if (!input_.Open(data_rxfilename)) {
KALDI_WARN << "Error opening stream "
<< PrintableRxfilename(data_rxfilename);
return false;
} else {
data_rxfilename_ = data_rxfilename;
if (holder_.Read(input_.Stream())) {
state_ = kHaveObject;
current_key_ = key;
return true;
} else {
KALDI_WARN << "Error reading object from "
"stream " << PrintableRxfilename(script_[key_pos].second);
state_ = kNotHaveObject;
return false;
}
}
}
current_key_ = key;
state_ = kHaveObject;
return true;
}
}
}
@ -1693,6 +1842,11 @@ class RandomAccessTableReaderScriptImpl:
std::string current_key_; // Key of object in holder_
Holder holder_;
Holder range_holder_; // Holds the partial object corresponding to the object
// range specifier 'range_'. this is only used when 'range_' is specified.
std::string range_; // range within which we read the object from holder_.
std::string data_rxfilename_; // the rxfilename that's used to read the
// object into holder_, and will be nonempty whenever holder_ is nonempty.
// the script_ variable contains pairs of (key, filename), sorted using
// std::sort. This can be used with binary_search to look up filenames for
@ -1706,9 +1860,9 @@ class RandomAccessTableReaderScriptImpl:
enum { // [Do we have [Does holder_
// script_ set up?] contain object?]
kUninitialized, // no no
kNotReadScript, // no no
kNotHaveObject, // yes no
kUninitialized, // no no
kNotReadScript, // no no
kNotHaveObject, // yes no
kHaveObject, // yes yes
kGaveObject, // yes yes
// [kGaveObject is as kHaveObject but we note that the

Просмотреть файл

@ -22,6 +22,21 @@
namespace kaldi {
// Splits 'line', which must have the form "<rxfilename>[<range>]", into its
// two parts.
//
//   line             e.g. "1.ark:100[1:2,2:10]"
//   data_rxfilename  receives the text before '[', e.g. "1.ark:100"
//   range            receives the text between '[' and the final ']',
//                    e.g. "1:2,2:10"
//
// Returns true on success.  Returns false, leaving both outputs untouched,
// if 'line' is empty, does not end in ']', does not contain exactly one '[',
// has nothing in front of the '[', or has nothing between the brackets.
bool ExtractRangeSpecifier(const std::string &line,
                           std::string *data_rxfilename,
                           std::string *range) {
  // A range specifier always terminates the line with ']'.
  if (line.empty() || line[line.size() - 1] != ']')
    return false;
  // Exactly one '[' is allowed, and it must be preceded by a nonempty
  // rxfilename.
  const std::string::size_type open = line.find('[');
  if (open == std::string::npos || open == 0 ||
      line.find('[', open + 1) != std::string::npos)
    return false;
  // Number of characters strictly between '[' and the closing ']'.  Cannot
  // underflow: the last character is ']', so open <= line.size() - 2.
  const std::string::size_type range_len = line.size() - open - 2;
  // "file[]" carries no range at all; reject it rather than handing back an
  // empty range that callers would silently treat as "no range given".
  if (range_len == 0)
    return false;
  // Build both results before assigning, so the call stays well-defined even
  // if an output argument refers to the same string as 'line'.
  const std::string filename_part(line, 0, open);
  const std::string range_part(line, open + 1, range_len);
  *data_rxfilename = filename_part;
  *range = range_part;
  return true;
}
bool ReadScriptFile(const std::string &rxfilename,
bool warn,