//
// <copyright file="utterancesourcemulti.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// utterancesourcemulti.h -- implementation of utterancesource.h that supports multiple feature and label sets
//
#pragma once
#include "basetypes.h" // for attempt()
#include "htkfeatio.h" // for htkmlfreader
#include "latticearchive.h" // for reading HTK phoneme lattices (MMI training)
#include "minibatchsourcehelpers.h"
#include "minibatchiterator.h"
namespace msra { namespace dbn {
// ---------------------------------------------------------------------------
// minibatchutterancesource -- feature source to provide randomized utterances
// This also implements a frame-wise mode, which is layered on top of the utterance-wise mode
// and thus benefits from its goodies such as corpus-wide high-level randomization and chunk paging.
// ---------------------------------------------------------------------------
class minibatchutterancesourcemulti : public minibatchsource
{
void operator=(const minibatchutterancesourcemulti & other); // non-assignable
std::vector<size_t> vdim; // feature dimension after augmenting neighbors
std::vector<size_t> leftcontext; // number of frames to the left of the target frame in the context window
std::vector<size_t> rightcontext; // number of frames to the right of the target frame in the context window
std::vector<unsigned int> sampperiod; // (for reference and to check against model)
std::vector<string> featkind;
std::vector<size_t> featdim;
const bool framemode; // true -> actually return frame-level randomized frames (not possible in lattice mode)
std::vector<std::vector<size_t>> counts; // [s] occurrence count for all states (used for priors)
int verbosity;
// lattice reader
//const std::vector<unique_ptr<latticesource>> &lattices;
const latticesource & lattices;
//std::vector<latticesource> lattices;
// word-level transcripts (for MMI mode when adding best path to lattices)
const map<wstring,msra::lattices::lattice::htkmlfwordsequence> & allwordtranscripts; // (used for getting word-level transcripts)
//std::vector<map<wstring,msra::lattices::lattice::htkmlfwordsequence>> allwordtranscripts;
// data store (incl. paging in/out of features and lattices)
struct utterancedesc // data descriptor for one utterance
{
msra::asr::htkfeatreader::parsedpath parsedpath; // archive filename and frame range in that file
size_t classidsbegin; // index into allclassids[] array (first frame)
utterancedesc (msra::asr::htkfeatreader::parsedpath && ppath, size_t classidsbegin) : parsedpath (ppath), classidsbegin (classidsbegin) {}
const wstring & logicalpath() const { return parsedpath; /*type cast will return logical path*/ }
size_t numframes() const { return parsedpath.numframes(); }
const wstring key() const // key used for looking up lattice (not stored to save space)
{
#ifdef _WIN32
static const wstring emptywstring;
static const wregex deleteextensionre (L"\\.[^\\.\\\\/:]*$");
return regex_replace (logicalpath(), deleteextensionre, emptywstring); // delete extension (or not if none)
#endif
#ifdef __unix__
return removeExtension(basename(logicalpath()));
#endif
}
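// Note: the two branches behave slightly differently -- the Windows branch only strips the extension
// and keeps any directory components of the logical path, while the Unix branch also strips the
// directory via basename(); MLF and lattice keys must follow the convention of whichever branch is built.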
};
struct utterancechunkdata // data for a chunk of utterances
{
std::vector<utterancedesc> utteranceset; // utterances in this set
size_t numutterances() const { return utteranceset.size(); }
std::vector<size_t> firstframes; // [utteranceindex] first frame for given utterance
mutable msra::dbn::matrix frames; // stores all frames consecutively (mutable since this is a cache)
size_t totalframes; // total #frames for all utterances in this chunk
mutable std::vector<shared_ptr<const latticesource::latticepair>> lattices; // (may be empty if none)
// construction
utterancechunkdata() : totalframes (0) {}
//utterancechunkdata (const utterancechunkdata& other) : utteranceset(other.utteranceset), firstframes(other.firstframes), frames (other.frames), totalframes (other.totalframes), lattices (other.lattices){};
void push_back (utterancedesc &&/*destructive*/ utt)
{
//printf ("start push %d %d\n",frames.rows(), frames.cols());
if (isinram())
{
throw std::logic_error ("utterancechunkdata: frames already paged into RAM--too late to add data");
}
firstframes.push_back (totalframes);
totalframes += utt.numframes();
utteranceset.push_back (utt);
}
// accessors to an utterance's data
size_t numframes (size_t i) const { return utteranceset[i].numframes(); }
size_t getclassidsbegin (size_t i) const { return utteranceset[i].classidsbegin; }
msra::dbn::matrixstripe getutteranceframes (size_t i) const // return the frame set for a given utterance
{
if (!isinram())
throw std::logic_error ("getutteranceframes: called when data have not been paged in");
const size_t ts = firstframes[i];
const size_t n = numframes(i);
return msra::dbn::matrixstripe (frames, ts, n);
}
shared_ptr<const latticesource::latticepair> getutterancelattice (size_t i) const // return the lattice for a given utterance
{
if (!isinram())
throw std::logic_error ("getutterancelattice: called when data have not been paged in");
return lattices[i];
}
// paging
// test if data is in memory at the moment
bool isinram() const {
return !frames.empty();
}
// page in data for this chunk
// We pass in the feature info variables by ref which will be filled lazily upon first read
void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const
{
if (numutterances() == 0)
throw std::logic_error ("requiredata: cannot page in virgin block");
if (isinram())
throw std::logic_error ("requiredata: called when data is already in memory");
try // this function supports retrying since we read from the unreliable network, i.e. do not return in a broken state
{
msra::asr::htkfeatreader reader; // feature reader (we reinstantiate it for each block, i.e. we reopen the file actually)
// if this is the first feature read ever, we explicitly open the first file to get the information such as feature dimension
if (featdim == 0)
{
reader.getinfo (utteranceset[0].parsedpath, featkind, featdim, sampperiod);
fprintf (stderr, "requiredata: determined feature kind as %zu-dimensional '%s' with frame shift %.1f ms\n", featdim, featkind.c_str(), sampperiod / 1e4);
}
// read all utterances; if they are in the same archive, htkfeatreader will be efficient in not closing the file
frames.resize (featdim, totalframes);
if (!latticesource.empty())
lattices.resize (utteranceset.size());
foreach_index (i, utteranceset)
{
//fprintf (stderr, ".");
// read features for this file
auto uttframes = getutteranceframes (i); // matrix stripe for this utterance (currently unfilled)
reader.read (utteranceset[i].parsedpath, (const string &) featkind, sampperiod, uttframes); // note: file info here used for checking only
// page in lattice data
if (!latticesource.empty())
latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols());
}
//fprintf (stderr, "\n");
if (verbosity)
fprintf (stderr, "requiredata: %zu utterances read\n", utteranceset.size());
}
catch (...)
{
releasedata();
throw;
}
}
// page out data for this chunk
void releasedata() const
{
if (numutterances() == 0)
throw std::logic_error ("releasedata: cannot page out virgin block");
if (!isinram())
throw std::logic_error ("releasedata: called when data is not memory");
// release frames
frames.resize (0, 0);
// release lattice data
lattices.clear();
}
};
std::vector<std::vector<utterancechunkdata>> allchunks; // set of utterances organized in chunks, referred to by an iterator (not an index)
std::vector<unique_ptr<biggrowablevector<CLASSIDTYPE>>> classids; // [classidsbegin+t] concatenation of all state sequences
bool issupervised() const { return !classids.empty(); }
size_t numutterances; // total number of utterances
size_t _totalframes; // total frames (same as classids.size() if we have labels)
double timegetbatch; // [v-hansu] for time measurement
// sequence in random order of actual use (randomized, where randomization is cached)
const size_t randomizationrange;// parameter remembered; this is the full window (e.g. 48 hours), not the half window
size_t currentsweep; // randomization is currently cached for this sweep; if it changes, rebuild all below
struct chunk // chunk as used in actual processing order (randomized sequence)
{
// the underlying chunk (as a non-indexed reference into the chunk set)
std::vector<utterancechunkdata>::const_iterator uttchunkdata;
const utterancechunkdata & getchunkdata() const { return *uttchunkdata; }
size_t numutterances() const { return uttchunkdata->numutterances(); }
size_t numframes() const { return uttchunkdata->totalframes; }
// position in utterance-position space
size_t utteranceposbegin;
size_t utteranceposend() const { return utteranceposbegin + numutterances(); }
// position on global time line
size_t globalts; // start frame on global timeline (after randomization)
size_t globalte() const { return globalts + numframes(); }
// randomization range limits
size_t windowbegin; // randomizedchunk index of earliest chunk that utterances in here can be randomized with
size_t windowend; // and end index [windowbegin, windowend)
chunk (std::vector<utterancechunkdata>::const_iterator uttchunkdata, size_t utteranceposbegin, size_t globalts) : uttchunkdata (uttchunkdata), utteranceposbegin (utteranceposbegin), globalts (globalts) {}
};
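// Each randomized chunk is thus located on three axes: the utterance-position axis
// [utteranceposbegin, utteranceposend()), the global frame timeline [globalts, globalte()),
// and the paging window [windowbegin, windowend) over the randomized chunk sequence.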
std::vector<std::vector<chunk>> randomizedchunks; // utterance chunks after being brought into random order (we randomize within a rolling window over them)
size_t chunksinram; // (for diagnostics messages)
struct utteranceref // describes the underlying random utterance associated with an utterance position
{
size_t chunkindex; // lives in this chunk (index into randomizedchunks[])
size_t utteranceindex; // utterance index in that chunk
size_t numframes; // (cached since we cannot directly access the underlying data from here)
size_t globalts; // start frame in global space after randomization (for mapping frame index to utterance position)
size_t globalte() const { return globalts + numframes; } // end frame
utteranceref (size_t chunkindex, size_t utteranceindex) : chunkindex (chunkindex), utteranceindex (utteranceindex), globalts (SIZE_MAX), numframes (0) {}
void swap (utteranceref & other) // used in randomization
{
::swap (chunkindex, other.chunkindex);
::swap (utteranceindex, other.utteranceindex);
assert (globalts == SIZE_MAX && other.globalts == SIZE_MAX && numframes == 0 && other.numframes == 0); // can only swap before assigning these
}
};
std::vector<utteranceref> randomizedutterancerefs; // [pos] randomized utterance ids
std::hash_map<size_t,size_t> randomizedutteranceposmap; // [globalts] -> pos lookup table
struct positionchunkwindow // chunk window required in memory when at a certain position, for controlling paging
{
std::vector<chunk>::const_iterator definingchunk; // the chunk in randomizedchunks[] that defined the utterance position of this utterance
size_t windowbegin() const { return definingchunk->windowbegin; }
size_t windowend() const { return definingchunk->windowend; }
bool isvalidforthisposition (const utteranceref & utt) const
{
return utt.chunkindex >= windowbegin() && utt.chunkindex < windowend(); // check if 'utt' lives in the allowed range for this position
}
positionchunkwindow (std::vector<chunk>::iterator definingchunk) : definingchunk (definingchunk) {}
};
std::vector<positionchunkwindow> positionchunkwindows; // [utterance position] -> [windowbegin, windowend) for controlling paging
// frame-level randomization layered on top of utterance chunking (randomized, where randomization is cached)
struct frameref
{
#ifdef _WIN64 // (sadly, the compiler makes this 8 bytes, not 6)
unsigned short chunkindex; // lives in this chunk (index into randomizedchunks[])
unsigned short utteranceindex; // utterance index in that chunk
static const size_t maxutterancesperchunk = 65535;
unsigned short frameindex; // frame index within the utterance
static const size_t maxframesperutterance = 65535;
#elif __unix__ // (sadly, the compiler makes this 8 bytes, not 6)
unsigned short chunkindex; // lives in this chunk (index into randomizedchunks[])
unsigned short utteranceindex; // utterance index in that chunk
static const size_t maxutterancesperchunk = 65535;
unsigned short frameindex; // frame index within the utterance
static const size_t maxframesperutterance = 65535;
#else // For Win32, we care to keep it inside 32 bits. We have already encountered setups where that's not enough.
unsigned int chunkindex : 13; // lives in this chunk (index into randomizedchunks[])
unsigned int utteranceindex : 8; // utterance index in that chunk
static const size_t maxutterancesperchunk = 255;
unsigned int frameindex : 11; // frame index within the utterance
static const size_t maxframesperutterance = 2047;
#endif
frameref (size_t ci, size_t ui, size_t fi) : chunkindex ((unsigned short) ci), utteranceindex ((unsigned short) ui), frameindex ((unsigned short) fi)
{
#ifdef _WIN32
static_assert (sizeof (frameref) == 4, "frameref: bit fields too large to fit into 32-bit integer");
#endif
if (ci == chunkindex && ui == utteranceindex && fi == frameindex)
return;
throw std::logic_error ("frameref: bit fields too small");
}
frameref() : chunkindex (0), utteranceindex (0), frameindex (0) {}
};
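// Capacity implied by the constants above: on 64-bit builds each field is a 16-bit value, i.e. up to
// 65535 utterances per chunk and 65535 frames per utterance. On 32-bit Windows the packed layout is
// 13 + 8 + 11 = 32 bits, i.e. at most 8192 chunks, 256 utterances per chunk (index 0..255), and 2048
// frames per utterance (about 20 s at the assumed 100 frames/s); longer utterances are skipped by the
// maxframesperutterance check in the constructor below.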
biggrowablevector<frameref> randomizedframerefs; // [globalt-sweepts] -> (chunk, utt, frame) lookup table for randomized frames --this can be REALLY big!
// TODO: this may go away if we store classids directly in the utterance data
template<class VECTOR> class shiftedvector // accessing a vector with a non-0 starting index
{
void operator= (const shiftedvector &);
VECTOR & v;
size_t first;
size_t n;
void check (size_t i) const { if (i >= n) throw std::logic_error ("shiftedvector: index out of bounds"); }
public:
shiftedvector (VECTOR & v, size_t first, size_t n) : v (v), first (first), n (n) { }
// TODO: the following is not templated--do it if needed; also should return a const reference then
size_t operator[] (size_t i) const { check (i); return v[first + i]; }
};
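// (shiftedvector lets getclassids() below expose a per-utterance window into the big concatenated
// classids[] arrays without copying the labels out.)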
template<class UTTREF> std::vector<shiftedvector<biggrowablevector<CLASSIDTYPE>>> getclassids (const UTTREF & uttref) // return sub-vector of classids[] for a given utterance
{
std::vector<shiftedvector<biggrowablevector<CLASSIDTYPE>>> allclassids;
if (!issupervised())
{
foreach_index(i,classids)
allclassids.push_back(std::move(shiftedvector<biggrowablevector<CLASSIDTYPE>> ((*classids[i]), 0, 0)));
return allclassids; // nothing to return
}
const auto & chunk = randomizedchunks[0][uttref.chunkindex];
const auto & chunkdata = chunk.getchunkdata();
const size_t classidsbegin = chunkdata.getclassidsbegin (uttref.utteranceindex); // index of first state label in global concatenated classids[] array
const size_t n = chunkdata.numframes (uttref.utteranceindex);
foreach_index(i,classids)
{
if ((*classids[i])[classidsbegin + n] != (CLASSIDTYPE) -1)
throw std::logic_error ("getclassids: expected boundary marker not found, internal data structure screwed up");
allclassids.push_back(std::move(shiftedvector<biggrowablevector<CLASSIDTYPE>> ((*classids[i]), classidsbegin, n)));
}
return allclassids; // return the sub-vectors of classids[] for this utterance, one per label stream
}
public:
// constructor
// Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
// This mode requires utterances with time stamps.
minibatchutterancesourcemulti (const std::vector<std::vector<wstring>> & infiles, const std::vector<map<wstring,std::vector<msra::asr::htkmlfentry>>> & labels,
std::vector<size_t> vdim, std::vector<size_t> udim, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext, size_t randomizationrange,
const latticesource & lattices, const map<wstring,msra::lattices::lattice::htkmlfwordsequence> & allwordtranscripts, const bool framemode)
: vdim (vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod (0), featdim (0), randomizationrange (randomizationrange), currentsweep (SIZE_MAX),
lattices (lattices), allwordtranscripts (allwordtranscripts), framemode (framemode), chunksinram (0), timegetbatch (0), verbosity(2)
// [v-hansu] change framemode (lattices.empty()) into framemode (false) to run utterance mode without lattice
// you also need to change another line, search : [v-hansu] comment out to run utterance mode without lattice
{
// process infiles to know dimensions of things (but not loading features)
std::vector<utterancedesc> utteranceset;// read all utterances to here first; at the end, distribute to chunks
utteranceset.reserve (infiles.size());
size_t nomlf = 0; // number of entries missing in MLF (diagnostics)
size_t nolat = 0; // number of entries missing in lattice archive (diagnostics)
std::vector<size_t> numclasses; // number of output classes as found in the label file (diagnostics)
_totalframes = 0;
wstring key;
size_t numutts=0;
std::vector<bool> uttisvalid; // flag per utterance; valid means the number of frames is consistent across all feature and label streams
std::vector<size_t> uttduration; // track utterance durations to determine utterance validity
std::vector<size_t> classidsbegin;
if (!lattices.empty())
{
LogicError("lattices not supported in utterancereadermulti");
}
allchunks = std::vector<std::vector<utterancechunkdata>>(infiles.size(), std::vector<utterancechunkdata>());
featdim = std::vector<size_t>(infiles.size(), 0);
sampperiod = std::vector<unsigned int>(infiles.size(), 0);
featkind = std::vector<string>(infiles.size(), "");
numclasses = std::vector<size_t>(labels.size(), 0);
counts = std::vector<std::vector<size_t>>(labels.size(), std::vector<size_t>());
foreach_index (i, labels)
{
classids.push_back(unique_ptr<biggrowablevector<CLASSIDTYPE>>(new biggrowablevector<CLASSIDTYPE>()));
//std::pair<std::vector<wstring>,std::vector<wstring>> latticetocs;
//std::unordered_map<std::string,size_t> modelsymmap;
//lattices.push_back(shared_ptr<latticesource>(new latticesource(latticetocs, modelsymmap)));
}
// first check consistency across feature streams
// We'll go through the SCP files for each stream to make sure the duration is consistent
// If not, we'll plan to ignore the utterance, and inform the user
foreach_index(m, infiles){
if (m == 0){
numutts = infiles[m].size();
uttisvalid = std::vector<bool>(numutts, true);
uttduration = std::vector<size_t>(numutts, 0);
}
else if (infiles[m].size()!=numutts)
throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances");
foreach_index(i, infiles[m]){
utterancedesc utterance(msra::asr::htkfeatreader::parsedpath(infiles[m][i]), 0); //mseltzer - is this foolproof for multiio? is classids always non-empty?
const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
// we need at least 2 frames for boundary markers to work
if (uttframes < 2)
throw std::runtime_error("minibatchutterancesource: utterances < 2 frames not supported");
if (uttframes > frameref::maxframesperutterance)
{
fprintf(stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S\n", i, uttframes, frameref::maxframesperutterance, key.c_str());
uttduration[i] = 0;
uttisvalid[i] = false;
}
else{
if (m == 0){
uttduration[i] = uttframes;
uttisvalid[i] = true;
}
else if (uttduration[i] != uttframes){
fprintf(stderr, "minibatchutterancesource: skipping %d-th file due to inconsistency in duration in different feature streams (%d vs %d frames)\n", i, uttduration[i], uttframes);
uttduration[i] = 0;
uttisvalid[i] = false;
}
}
}
}
size_t invalidutts=0;
foreach_index(i, uttisvalid){
if (!uttisvalid[i])
invalidutts++;
}
if (invalidutts > uttisvalid.size() / 2)
throw std::runtime_error("minibatchutterancesource: too many files with inconsistent durations, assuming broken configuration\n");
else if (invalidutts>0)
fprintf(stderr, "Found inconsistent durations across feature streams in %d out of %d files\n", invalidutts, uttisvalid.size());
// now process the features and labels
size_t utterancesetsize = 0;
foreach_index (m, infiles)
{
utteranceset.clear();
//if (m==0)
// numutts = infiles[m].size();
//else
// if (infiles[m].size()!=numutts)
// throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances\n");
if (m==0)
classidsbegin.clear();
foreach_index (i, infiles[m])
{
if (i % (infiles[m].size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); }
// build utterance descriptor
if (m == 0 && !labels.empty())
classidsbegin.push_back(classids[0]->size());
if (uttisvalid[i]){
utterancedesc utterance (msra::asr::htkfeatreader::parsedpath (infiles[m][i]), labels.empty() ? 0 : classidsbegin[i] ); //mseltzer - is this foolproof for multiio? is classids always non-empty?
const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
assert(uttframes == uttduration[i]); // ensure nothing funky happened
// already performed these checks above
// we need at least 2 frames for boundary markers to work
//if (uttframes < 2)
// throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
//if (uttframes > frameref::maxframesperutterance)
//{
// fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
// continue;
//}
// check whether we have the ref transcript
//auto labelsiter = labels[0].end();
bool lacksmlf = true;
if (!labels.empty()) // empty means unsupervised mode (don't load any)
{
key = utterance.key();
// check if labels are available (if not, it normally means that no path was found in realignment)
auto labelsiter = labels[0].find (key);
//const bool lacksmlf = (labelsiter == labels[0].end());
lacksmlf = (labelsiter == labels[0].end());
if (lacksmlf)
if (nomlf++ < 5)
fprintf (stderr, " [no labels for %S]", key.c_str());
// check if lattice is available (when in lattice mode)
// TODO: also check the #frames here; requires a design change of the TOC format & a rerun
const bool lackslat = !lattices.empty() && !lattices.haslattice (key); // ('true' if we have no lattices)
if (lackslat)
if (nolat++ < 5)
fprintf (stderr, " [no lattice for %S]", key.c_str());
// skip if either one is missing
if (lacksmlf || lackslat){
uttisvalid[i] = false;
continue; // skip this utterance at all
}
}
// push the label sequence into classids[], since we already looked it up
// TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.
// OK, utterance has all we need --remember it
if (m==0)
{
if (!labels.empty() && !lacksmlf)
//if (!labels.empty() && labelsiter != labels[0].end())
{
// first verify that all the label files have the proper duration
foreach_index (j, labels)
{
const auto & labseq = labels[j].find(key)->second;
// check if durations match; skip if not
size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
if (labframes != uttframes)
{
fprintf (stderr, " [duration mismatch (%zu in label vs. %zu in feat file), skipping %S]", labframes, uttframes, key.c_str());
nomlf++;
uttisvalid[i] = false;
break; // continue; // skip this utterance at all
}
}
if (uttisvalid[i])
{
utteranceset.push_back(std::move(utterance));
_totalframes += uttframes;
// then parse each mlf if the durations are consistent
foreach_index(j, labels)
{
const auto & labseq = labels[j].find(key)->second;
// expand classid sequence into flat array
foreach_index (i, labseq)
{
const auto & e = labseq[i];
if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
if (e.classid >= udim[j])
{
throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id exceeds model output dimension"));
}
if (e.classid != (CLASSIDTYPE) e.classid)
throw std::runtime_error ("CLASSIDTYPE has too few bits");
for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
classids[j]->push_back ((CLASSIDTYPE) e.classid);
numclasses[j] = max (numclasses[j], (size_t)(1u + e.classid));
counts[j].resize (numclasses[j], 0);
counts[j][e.classid] += e.numframes;
}
classids[j]->push_back ((CLASSIDTYPE) -1); // append a boundary marker for checking
if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
}
}
}
else{
assert(classids.empty() && labels.empty());
utteranceset.push_back(std::move(utterance));
_totalframes += uttframes;
}
}
else
{
utteranceset.push_back(std::move(utterance));
}
}
}
if (m == 0)
utterancesetsize = utteranceset.size();
else
assert(utteranceset.size() == utterancesetsize);
fprintf (stderr, "feature set %d: %zu frames in %zu out of %zu utterances\n", m, _totalframes, utteranceset.size(),infiles[m].size());
if (!labels.empty()){
foreach_index (j, labels){
biggrowablevector<CLASSIDTYPE> & cid = *classids[j];
foreach_index (i, utteranceset){
//if ((*classids[j])[utteranceset[i].classidsbegin + utteranceset[i].numframes()] != (CLASSIDTYPE) -1)
//printf("index = %d\n",utteranceset[i].classidsbegin + utteranceset[i].numframes());
//printf("cid[index] = %d\n",cid[utteranceset[i].classidsbegin + utteranceset[i].numframes()]);
//printf("CLASSIDTYPE(-1) = %d\n",(CLASSIDTYPE) -1);
if (cid[utteranceset[i].classidsbegin + utteranceset[i].numframes()] != (CLASSIDTYPE) -1)
throw std::logic_error ("minibatchutterancesource: classids[] out of sync");
}
}
}
if (nomlf + nolat > 0)
{
fprintf (stderr, "minibatchutterancesource: out of %zu files, %zu files not found in label set and %zu have no lattice\n", infiles[0].size(), nomlf, nolat);
if (nomlf + nolat > infiles[m].size() / 2)
throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
}
if (m==0) {foreach_index(j, numclasses) { fprintf(stderr,"label set %d: %zu classes\n",j, numclasses[j]); } }
// distribute them over chunks
// We simply count off frames until we reach the chunk size.
// Note that we first randomize the chunks, i.e. when used, chunks are non-consecutive and thus cause the disk head to seek for each chunk.
const size_t framespersec = 100; // we just assume this; our efficiency calculation is based on this
const size_t chunkframes = 15 * 60 * framespersec; // number of frames to target for each chunk
// Loading an initial 24-hour range will involve 96 disk seeks, acceptable.
// When paging chunk by chunk, chunk size ~14 MB.
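// (Illustrative arithmetic: 15 min * 60 s * 100 frames/s = 90,000 frames per chunk; assuming
// ~39-dimensional float features, that is about 90,000 * 39 * 4 bytes ~ 14 MB per chunk, and a
// 24-hour range spans 24 * 3600 * 100 / 90,000 = 96 chunks, matching the numbers quoted above.)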
std::vector<utterancechunkdata> & thisallchunks = allchunks[m];
//std::vector<utterancechunkdata> thisallchunks;
thisallchunks.resize (0);
thisallchunks.reserve (_totalframes / chunkframes);
foreach_index (i, utteranceset)
{
// if exceeding current entry--create a new one
// I.e. our chunks are a little larger than wanted (on av. half the av. utterance length).
if (thisallchunks.empty() || thisallchunks.back().totalframes > chunkframes || thisallchunks.back().numutterances() >= frameref::maxutterancesperchunk)
{
thisallchunks.push_back (utterancechunkdata());
}
// append utterance to last chunk
utterancechunkdata & currentchunk = thisallchunks.back();
//std::move(utteranceset[i]);
currentchunk.push_back (std::move (utteranceset[i])); // move it out from our temp array into the chunk
// TODO: above push_back does not actually 'move' because the internal push_back does not accept that
}
numutterances = utteranceset.size();
fprintf (stderr, "minibatchutterancesource: %zu utterances grouped into %zu chunks, av. chunk size: %.1f utterances, %.1f frames\n",
numutterances, thisallchunks.size(), numutterances / (double) thisallchunks.size(), _totalframes / (double) thisallchunks.size());
// Now utterances are stored exclusively in allchunks[]. They are never referred to by a sequential utterance id at this point, only by chunk/within-chunk index.
}
// preliminary mem allocation for frame references (if in frame mode)
if (framemode)
randomizedframerefs.resize (_totalframes);
}
private:
// shuffle a vector into random order by randomly swapping elements
template<typename VECTOR> static void randomshuffle (VECTOR & v, size_t randomseed)
{
if (v.size() > RAND_MAX * (size_t) RAND_MAX)
throw std::runtime_error ("randomshuffle: too large set: need to change to different random generator!");
srand ((unsigned int) randomseed);
foreach_index (i, v)
{
// pick a random location
const size_t irand = msra::dbn::rand (0, v.size());
// swap element i with it
if (irand == (size_t) i)
continue;
::swap (v[i], v[irand]);
}
}
#if 0
template<typename VECTOR> static void randomshuffle(std::vector<VECTOR &> v, size_t randomseed)
{
foreach_index(j, v)
{
if (v[j].size() > RAND_MAX * (size_t) RAND_MAX)
throw std::runtime_error ("randomshuffle: too large set: need to change to different random generator!");
}
srand ((unsigned int) randomseed);
foreach_index (i, v[0])
{
// pick a random location
const size_t irand = msra::dbn::rand (0, v[0].size());
foreach_index(j, v){
// swap element i with it
if (irand == (size_t) i)
continue;
::swap (v[j][i], v[j][irand]);
}
}
}
#endif //0
static void checkoverflow (size_t fieldval, size_t targetval, const char * fieldname)
{
if (fieldval != targetval)
throw std::runtime_error (msra::strfun::strprintf ("checkoverflow: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, targetval, fieldval));
}
// helper for testing whether a swapped frame position is valid (w.r.t. being in RAM when being at position 't')
bool isframepositionvalid (const size_t t, const biggrowablevector<unsigned short> & ttochunk) const
{
// look up valid range for time position
const size_t positionchunkindex = ttochunk[t]; // position 't' lies within this original chunk (relationship is monotonic, not random)
const auto & chunk = randomizedchunks[0][positionchunkindex];
// get in-RAM chunk range for this frame position (shared across all frame positions within the same chunk)
const size_t poswindowbegin = chunk.windowbegin; // rolling window over chunks (which under the hood have been randomized)
const size_t poswindowend = chunk.windowend;
// The chunk window implies that if we are at position 't', chunks [poswindowbegin, poswindowend) are guaranteed to be in RAM.
// now see if the randomized location is within that window
const size_t actualchunkindexforpos = randomizedframerefs[t].chunkindex; // where this frame pos has been mapped to
return actualchunkindexforpos >= poswindowbegin && actualchunkindexforpos < poswindowend;
// We only need to test the chunk index. Utterance and frame can be randomized within a chunk as we want, as long as it is in RAM.
}
// big long helper to update all cached randomization information
// This is a rather complex process since we randomize on two levels:
// - chunks of consecutive data in the feature archive
// - within a range of chunks that is paged into RAM
// - utterances (in utt mode), or
// - frames (in frame mode)
// The 'globalts' parameter is the start time that triggered the rerandomization; it is NOT the base time of the randomized area.
size_t lazyrandomization (const size_t globalts)
{
const size_t sweep = globalts / _totalframes; // which sweep (this determines randomization)
if (sweep == currentsweep) // already got this one--nothing to do
return sweep;
currentsweep = sweep;
if (verbosity>0)
fprintf (stderr, "lazyrandomization: re-randomizing for sweep %zu in %s mode\n", currentsweep, framemode ? "frame" : "utterance");
const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep
// first randomize chunks
std::vector<std::vector<std::vector<utterancechunkdata>::const_iterator>> randomizedchunkrefs;
foreach_index (i, allchunks)
randomizedchunkrefs.push_back(std::vector<std::vector<utterancechunkdata>::const_iterator>());
foreach_index (i, allchunks)
randomizedchunkrefs[i].reserve (allchunks[i].size());
foreach_index (i, allchunks) // TODO: this cries for iterating using the iterator!
{
foreach_index(j, allchunks[i])
randomizedchunkrefs[i].push_back (allchunks[i].begin() + j);
assert (randomizedchunkrefs[i].size() == allchunks[i].size());
// note that since randomshuffle() uses sweep as seed, this will keep the randomization common across all feature streams
randomshuffle (randomizedchunkrefs[i], sweep); // bring into random order (with random seed depending on sweep)
}
// place them onto the global timeline -> randomizedchunks[]
// We are processing with randomization within a rolling window over this chunk sequence.
// Paging will happen on a chunk-by-chunk basis.
// The global time stamp is needed to determine the paging window.
randomizedchunks.clear(); // data chunks after being brought into random order (we randomize within a rolling window over them)
foreach_index(i, allchunks)
randomizedchunks.push_back(std::vector<chunk>());
foreach_index(i, allchunks)
{
randomizedchunks[i].reserve (randomizedchunkrefs[i].size());
foreach_index (k, randomizedchunkrefs[i])
randomizedchunks[i].push_back (chunk (randomizedchunkrefs[i][k], randomizedchunks[i].empty() ? 0 : randomizedchunks[i].back().utteranceposend(), randomizedchunks[i].empty() ? sweepts : randomizedchunks[i].back().globalte()));
assert (randomizedchunks[i].size() == allchunks[i].size());
assert (randomizedchunks[i].empty() || (randomizedchunks[i].back().utteranceposend() == numutterances && randomizedchunks[i].back().globalte() == sweepts + _totalframes));
}
// for each chunk, compute the randomization range (w.r.t. the randomized chunk sequence)
foreach_index (i, randomizedchunks)
{
foreach_index (k, randomizedchunks[i])
{
chunk & chunk = randomizedchunks[i][k];
// start with the range of left neighbor
if (k == 0)
{
chunk.windowbegin = 0;
chunk.windowend = 1;
}
else
{
chunk.windowbegin = randomizedchunks[i][k-1].windowbegin; // might be too early
chunk.windowend = randomizedchunks[i][k-1].windowend; // might have more space
}
while (chunk.globalts - randomizedchunks[i][chunk.windowbegin].globalts > randomizationrange/2)
chunk.windowbegin++; // too early
while (chunk.windowend < randomizedchunks[i].size() && randomizedchunks[i][chunk.windowend].globalte() - chunk.globalts < randomizationrange/2)
chunk.windowend++; // got more space
}
}
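// Example: with randomizationrange covering 48 hours of frames (it is the full window, per the comment
// at its declaration), a chunk's [windowbegin, windowend) spans roughly the chunks whose start lies no
// more than 24 hours before this chunk's start and whose end lies no more than 24 hours after it
// (randomizationrange/2 on each side, as computed above).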
if (!framemode) // utterance mode
{
// This sets up the following members:
// - positionchunkwindows
// - randomizedutterancerefs
// - randomizedutteranceposmap
// We will now introduce the concept of utterance *position*.
// During processing, utterances will be indexed by position (which is in turn derived from a frame index in getbatch()),
// and it is assumed (required) that positions are requested consecutively.
// Each utterance position has an underlying associated utterance, which is represented as (chunkid, within-chunk index) and randomly assigned.
// Each utterance position also has an associated range of chunks that are kept in memory,
// and the associated underlying utterance is guaranteed to be found within that associated range of chunks.
// That allows us to page data in and out as utterance positions are processed consecutively.
// compute chunk windows for every utterance position -> positionchunkwindows[]
// Utterance positions can only reference underlying utterance data within the chunk window.
// Utterance positions are defined by the randomized chunk sequence (i.e. their underlying 'defining' chunk differs from sweep to sweep).
positionchunkwindows.clear(); // [utterance position] -> [windowbegin, windowend) for controlling paging
positionchunkwindows.reserve (numutterances);
// positionchunkwindows should be consistent for all inputs (distinct feature streams), so just build based on feature[0]
// contains pointer to chunk elements but only to compute index
foreach_index (k, randomizedchunks[0]) // TODO: this really cries for iterating using iterators!
{
chunk & chunk = randomizedchunks[0][k];
for (size_t i = chunk.utteranceposbegin; i < chunk.utteranceposend(); i++) // loop over utterances in this chunk
{
positionchunkwindows.push_back (randomizedchunks[0].begin() + k);
}
// to look up the chunk range in memory for a position, look up the defining chunk and its range
}
assert (positionchunkwindows.size() == numutterances);
// build the randomized utterances array -> randomizedutterancerefs[]
// start by assigning all utterance positions to utterances in non-random consecutive manner
randomizedutterancerefs.clear(); // [pos] randomized utterance ids
randomizedutterancerefs.reserve (numutterances);
foreach_index (k, randomizedchunks[0])
{
chunk & chunk = randomizedchunks[0][k];
for (size_t i = 0; i < chunk.numutterances(); i++) // loop over utterances in this chunk
randomizedutterancerefs.push_back (utteranceref (k, i));
}
assert (randomizedutterancerefs.size() == numutterances);
foreach_index (i, randomizedutterancerefs)
{
auto & uttref = randomizedutterancerefs[i];
assert (positionchunkwindows[i].isvalidforthisposition(uttref)); uttref;
}
// check we got those setup right
// we now randomly shuffle randomizedutterancerefs[pos], while considering the constraints of what chunk range needs to be in memory
srand ((unsigned int) sweep + 1);
for (size_t i = 0; i < randomizedutterancerefs.size(); i++)
{
// get valid randomization range, expressed in chunks
const size_t windowbegin = positionchunkwindows[i].windowbegin();
const size_t windowend = positionchunkwindows[i].windowend();
// get valid randomization range, expressed in utterance positions
// Remember, utterance positions are defined by chunks.
const size_t posbegin = randomizedchunks[0][windowbegin].utteranceposbegin;
const size_t posend = randomizedchunks[0][windowend-1].utteranceposend();
// randomization range for this utterance position is [posbegin, posend)
for(;;)
{
// pick a random location
const size_t j = msra::dbn::rand (posbegin, posend); // a random number within the window
if (i == j)
break; // the random gods say "this one points to its original position"... nothing wrong about that, but better not try to swap
// We want to swap utterances at i and j, but need to make sure they remain in their allowed range.
// This is guaranteed for a so-far untouched utterance, but both i and j may have been touched by a previous swap.
// We want to use the utterance previously referenced at utterance position j at position i. Is that allowed?
if (!positionchunkwindows[i].isvalidforthisposition (randomizedutterancerefs[j]))
continue; // nope --try another
// Likewise may we use the utterance previously referenced at utterance position i at position j?
if (!positionchunkwindows[j].isvalidforthisposition (randomizedutterancerefs[i]))
continue; // nope --try another
// yep--swap them
randomizedutterancerefs[i].swap (randomizedutterancerefs[j]);
break;
}
}
// place the randomized utterances on the global timeline so we can find them by globalts
size_t t = sweepts;
foreach_index (i, randomizedutterancerefs)
{
auto & uttref = randomizedutterancerefs[i];
uttref.globalts = t;
uttref.numframes = randomizedchunks[0][uttref.chunkindex].getchunkdata().numframes (uttref.utteranceindex);
t = uttref.globalte();
}
assert (t == sweepts + _totalframes);
// verify that we got it right (I got a knot in my head!)
foreach_index (i, randomizedutterancerefs)
{
// get utterance referenced at this position
const auto & uttref = randomizedutterancerefs[i];
// check if it is valid for this position
if (uttref.chunkindex < positionchunkwindows[i].windowbegin() || uttref.chunkindex >= positionchunkwindows[i].windowend())
throw std::logic_error ("lazyrandomization: randomization logic mangled!");
}
// create lookup table for (globalts values -> pos) -> randomizedutteranceposmap[]
randomizedutteranceposmap.clear(); // [globalts] -> pos lookup table
foreach_index (pos, randomizedutterancerefs)
{
auto & uttref = randomizedutterancerefs[pos];
randomizedutteranceposmap[uttref.globalts] = (size_t) pos;
}
}
else // frame mode
{
// This sets up the following members:
// - randomizedframerefs
srand ((unsigned int) sweep + 1);
// An original timeline is established by the randomized chunks, denoted by 't'.
// Returned frames are indexed by frame position j = (globalt - sweept), which have an associated underlying 't'.
// It is guaranteed that utterance frame position j maps to an underlying frame within the corresponding chunk window.
biggrowablevector<unsigned short> ttochunk; // randomized chunk index associated with frame position
ttochunk.resize (_totalframes);
size_t t = 0;
frameref frameref;
// enumerate chunks in their randomized order and assign frame indices in that order -> randomizedframerefs[t]
// At this point, chunks are in randomized order, but utterances and frames within utterances are not randomized.
// Later we will randomize those as well.
foreach_index (i, randomizedchunks[0])
{
frameref.chunkindex = (unsigned short)i;
checkoverflow (frameref.chunkindex, i, "frameref::chunkindex");
const auto & chunk = randomizedchunks[0][i];
const auto & chunkdata = chunk.getchunkdata();
const size_t numutt = chunkdata.numutterances();
for (size_t k = 0; k < numutt; k++)
{
frameref.utteranceindex = (unsigned short)k;
checkoverflow (frameref.utteranceindex, k, "frameref::utteranceindex");
const size_t n = chunkdata.numframes (k);
for (size_t m = 0; m < n; m++)
{
frameref.frameindex = (unsigned short)m;
checkoverflow (frameref.frameindex, m, "frameref::frameindex");
randomizedframerefs[t] = frameref; // hopefully this is a memory copy, not a bit-wise assignment! If not, then code it explicitly
ttochunk[t] = (unsigned short) i;
checkoverflow (ttochunk[t], i, "ttochunk[]");
t++;
}
}
}
assert (t == _totalframes);
// now randomize them --we use the nested loop again to avoid storing a backpointer
// The condition is that a randomized frame may not be moved out of its associated chunk window.
foreach_index (t, randomizedframerefs)
{
const size_t positionchunkindex = ttochunk[t]; // position 't' lies within this chunk (relationship is monotonic, not random)
const auto & chunk = randomizedchunks[0][positionchunkindex]; // for window
// get in-RAM chunk range for this frame position (shared across all frame positions within the same chunk)
const size_t poswindowbegin = chunk.windowbegin; // rolling window over chunks (which under the hood have been randomized)
const size_t poswindowend = chunk.windowend;
// The chunk window implies that if we are at position 't', chunks [poswindowbegin, poswindowend) are guaranteed to be in RAM.
// These chunks are associated with a range of frame positions.
// It is implied that if we are at position 't', the frames covered by chunks [poswindowbegin, poswindowend) are in RAM.
const size_t postbegin = randomizedchunks[0][poswindowbegin].globalts - sweepts;
const size_t postend = randomizedchunks[0][poswindowend-1].globalte() - sweepts;
// The position that this frame gets randomized to must be guaranteed to belong to a chunk within [postbegin, postend).
for (;;) // (randomization retry loop)
{
size_t tswap = msra::dbn::rand (postbegin, postend); // random frame position within allowed range
// We want to swap 't' to 'tswap' and 'tswap' to 't'.
// - Both may have been swapped before.
// - Both must stay within the randomization window of their respective position.
// check admissibility of where the element at 'tswap' gets swapped to 't' (range = [windowbegin,windowend))
size_t tswapchunkindex = randomizedframerefs[tswap].chunkindex;
if (tswapchunkindex < poswindowbegin || tswapchunkindex >= poswindowend)
continue;
// check admissibility of where the element at t gets swapped to (which is frame position 'tswap')
const size_t sourcechunkindex = randomizedframerefs[t].chunkindex;
size_t targetchunkindex = ttochunk[tswap]; // chunk associated with this frame position defines value range
const auto & targetchunk = randomizedchunks[0][targetchunkindex];
const size_t targetwindowbegin = targetchunk.windowbegin;
const size_t targetwindowend = targetchunk.windowend;
if (sourcechunkindex < targetwindowbegin || sourcechunkindex >= targetwindowend)
continue;
// admissible--swap the two
::swap (randomizedframerefs[t], randomizedframerefs[tswap]);
#if 0
break;
#else // post-check --so far did not trigger, can be removed
// do a post-check if we got it right --we seem not to
if (isframepositionvalid (t, ttochunk) && isframepositionvalid (tswap, ttochunk))
break;
// not valid: swap them back and try again --we actually discovered a bug in the code above
::swap (randomizedframerefs[t], randomizedframerefs[tswap]);
fprintf (stderr, "lazyrandomization: BUGBUG --invalid swapping condition detected\n");
#endif
}
}
// check it --my head spins
t = 0;
foreach_index (i, randomizedchunks[0])
{
const auto & chunk = randomizedchunks[0][i]; // for window and chunkdata
const size_t poswindowbegin = chunk.windowbegin;
const size_t poswindowend = chunk.windowend;
const auto & chunkdata = chunk.getchunkdata(); // for numutterances/numframes
const size_t numutt = chunkdata.numutterances();
for (size_t k = 0; k < numutt; k++)
{
const size_t n = chunkdata.numframes (k);
for (size_t m = 0; m < n; m++)
{
const size_t randomizedchunkindex = randomizedframerefs[t].chunkindex;
if (randomizedchunkindex < poswindowbegin || randomizedchunkindex >= poswindowend)
throw std::logic_error ("lazyrandomization: nope, you got frame randomization wrong, dude");
t++;
}
}
}
assert (t == _totalframes);
}
return sweep;
}
// helper to page out a chunk with log message
void releaserandomizedchunk (size_t k)
{
size_t numreleased=0;
foreach_index(m, randomizedchunks){
auto & chunkdata = randomizedchunks[m][k].getchunkdata();
if (chunkdata.isinram())
{
if (verbosity)
fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
k, randomizedchunks[m][k].globalts, randomizedchunks[m][k].globalte()-1, chunksinram-1);
chunkdata.releasedata();
numreleased++;
}
}
if (numreleased>0 && numreleased<randomizedchunks.size())
{
LogicError ("releaserandomizedchunk: inconsistency detected - some inputs have chunks in ram, some not");
}
else if (numreleased==randomizedchunks.size())
{
chunksinram--;
}
return;
}
// helper to page in a chunk for a given utterance
// (window range passed in for checking only)
// Returns true if we actually did read something.
bool requirerandomizedchunk (const size_t chunkindex, const size_t windowbegin, const size_t windowend)
{
size_t numinram=0;
if (chunkindex < windowbegin || chunkindex >= windowend)
throw std::logic_error ("requirerandomizedchunk: requested utterance outside in-memory chunk range");
foreach_index(m, randomizedchunks)
{
auto & chunk = randomizedchunks[m][chunkindex];
auto & chunkdata = chunk.getchunkdata();
if (chunkdata.isinram())
numinram++;
}
if (numinram==randomizedchunks.size())
{
return false;
}
else if (numinram==0)
{
foreach_index(m, randomizedchunks)
{
auto & chunk = randomizedchunks[m][chunkindex];
auto & chunkdata = chunk.getchunkdata();
if (verbosity)
fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
msra::util::attempt (5, [&]() // (reading from network)
{
chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity);
});
}
chunksinram++;
return true;
}
else{
LogicError ("requirerandomizedchunk: inconsistency detected - some inputs need chunks paged in, some not");
}
}
class matrixasvectorofvectors // wrapper around a matrix that views it as a vector of column vectors
{
void operator= (const matrixasvectorofvectors &); // non-assignable
msra::dbn::matrixbase & m;
public:
matrixasvectorofvectors (msra::dbn::matrixbase & m) : m (m) {}
size_t size() const { return m.cols(); }
const_array_ref<float> operator[] (size_t j) const { return array_ref<float> (&m(0,j), m.rows()); }
};
size_t chunkforframepos (const size_t t) const // find chunk for a given frame position
{
//inspect chunk of first feature stream only
auto iter = std::lower_bound (randomizedchunks[0].begin(), randomizedchunks[0].end(), t, [&] (const chunk & chunk, size_t t) { return chunk.globalte() <= t; });
const size_t chunkindex = iter - randomizedchunks[0].begin();
if (t < randomizedchunks[0][chunkindex].globalts || t >= randomizedchunks[0][chunkindex].globalte())
throw std::logic_error ("chunkforframepos: dude, learn STL!");
return chunkindex;
}
public:
void setverbosity(int newverbosity){ verbosity = newverbosity; }
// get the next minibatch
// A minibatch is made up of one or more utterances.
// We will return fewer than 'framesrequested' frames unless the first utterance alone is too long.
// Note that this may return frames that are beyond the epoch end, but the first frame is always within the epoch.
// We specify the utterance by its global start time (in the space of an infinitely repeated training set).
// This is efficient since getbatch() is called with sequential 'globalts' except at epoch start.
// Note that the start of an epoch does not necessarily fall onto an utterance boundary. The caller must use firstvalidglobalts() to find the first valid globalts at or after a given time.
/*implement*/ bool getbatch (const size_t globalts, const size_t framesrequested, std::vector<msra::dbn::matrix> & feat, std::vector<std::vector<size_t>> & uids,
std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & transcripts,
std::vector<shared_ptr<const latticesource::latticepair>> & latticepairs)
{
bool readfromdisk = false; // return value: shall be 'true' if we paged in anything
auto_timer timergetbatch;
assert (_totalframes > 0);
// update randomization if a new sweep is entered --this is a complex operation that updates many of the data members used below
const size_t sweep = lazyrandomization (globalts);
const std::vector<char> noboundaryflags; // dummy
if (!framemode) // regular utterance mode
{
// find utterance position for globalts
// There must be a precise match; it is not possible to specify frames that are not on boundaries.
auto positer = randomizedutteranceposmap.find (globalts);
if (positer == randomizedutteranceposmap.end())
throw std::logic_error ("getbatch: invalid 'globalts' parameter; must match an existing utterance boundary");
const size_t spos = positer->second;
// determine how many utterances will fit into the requested minibatch size
size_t mbframes = randomizedutterancerefs[spos].numframes; // at least one utterance, even if too long
size_t epos;
for (epos = spos + 1; epos < numutterances && mbframes + randomizedutterancerefs[epos].numframes < framesrequested; epos++) // add more utterances as long as they fit within requested minibatch size
mbframes += randomizedutterancerefs[epos].numframes;
// do some paging housekeeping
// This will also set the feature-kind information if it's the first time.
// Free all chunks left of the range.
// Page-in all chunks right of the range.
// We are a little more blunt for now: Free all outside the range, and page in only what is touched. We could save some loop iterations.
const size_t windowbegin = positionchunkwindows[spos].windowbegin();
const size_t windowend = positionchunkwindows[epos-1].windowend();
for (size_t k = 0; k < windowbegin; k++)
releaserandomizedchunk (k);
for (size_t k = windowend; k < randomizedchunks[0].size(); k++)
releaserandomizedchunk (k);
for (size_t pos = spos; pos < epos; pos++)
readfromdisk |= requirerandomizedchunk (randomizedutterancerefs[pos].chunkindex, windowbegin, windowend); // (window range passed in for checking only)
// resize feat and uids
feat.resize(vdim.size());
uids.resize(classids.size());
assert(feat.size()==vdim.size());
assert(feat.size()==randomizedchunks.size());
foreach_index(i, feat)
{
feat[i].resize (vdim[i], mbframes);
if (i==0)
{
foreach_index(j, uids)
{
if (issupervised()) // empty means unsupervised training -> return empty uids
uids[j].resize (mbframes);
else
uids[j].clear();
latticepairs.clear(); // will push_back() below
transcripts.clear();
}
}
}
// return these utterances
if (verbosity > 0)
fprintf (stderr, "getbatch: getting utterances %zu..%zu (%zu frames out of %zu requested) in sweep %zu\n", spos, epos -1, mbframes, framesrequested, sweep);
size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
for (size_t pos = spos; pos < epos; pos++)
{
const auto & uttref = randomizedutterancerefs[pos];
size_t n=0;
foreach_index(i, randomizedchunks)
{
const auto & chunk = randomizedchunks[i][uttref.chunkindex];
const auto & chunkdata = chunk.getchunkdata();
assert (uttref.globalts == globalts + tspos);
auto uttframes = chunkdata.getutteranceframes (uttref.utteranceindex);
matrixasvectorofvectors uttframevectors (uttframes); // (wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors())
n = uttframevectors.size();
assert (n == uttframes.cols() && uttref.numframes == n && chunkdata.numframes (uttref.utteranceindex) == n);
// copy the frames and class labels
for (size_t t = 0; t < n; t++) // t = time index into source utterance
{
size_t leftextent, rightextent;
// page in the needed range of frames
if (leftcontext[i] == 0 && rightcontext[i] == 0)
{
leftextent = rightextent = augmentationextent(uttframevectors[t].size(), vdim[i]);
}
else
{
leftextent = leftcontext[i];
rightextent = rightcontext[i];
}
augmentneighbors(uttframevectors, noboundaryflags, t, leftextent, rightextent, feat[i], t + tspos);
//augmentneighbors(uttframevectors, noboundaryflags, t, feat[i], t + tspos);
}
// copy the frames and class labels
if (i==0)
{
auto uttclassids = getclassids (uttref);
foreach_index(j, uttclassids)
{
for (size_t t = 0; t < n; t++) // t = time index into source utterance
{
if (issupervised())
uids[j][t + tspos] = uttclassids[j][t];
}
if (!this->lattices.empty())
{
auto latticepair = chunkdata.getutterancelattice (uttref.utteranceindex);
latticepairs.push_back (latticepair);
// look up reference
const auto & key = latticepair->getkey();
if (!allwordtranscripts.empty())
{
const auto & transcript = allwordtranscripts.find (key)->second;
transcripts.push_back (transcript.words);
}
}
}
}
}
tspos += n;
}
assert (tspos == mbframes);
}
else // frame mode: return individual randomized frames (originally a debug mode to test convergence; we don't ensure non-repetition at this point)
{
const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep
const size_t sweepte = sweepts + _totalframes; // and its end
const size_t globalte = min (globalts + framesrequested, sweepte); // we return as much as requested, but not exceeding sweep end
const size_t mbframes = globalte - globalts; // that's our mb size
// determine window range
// We enumerate all frames--can this be done more efficiently?
const size_t firstchunk = chunkforframepos (globalts);
const size_t lastchunk = chunkforframepos (globalte-1);
const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin;
const size_t windowend = randomizedchunks[0][lastchunk].windowend;
if (verbosity > 0)
fprintf (stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n",
globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
// release all data outside, and page in all data inside
for (size_t k = 0; k < windowbegin; k++)
releaserandomizedchunk (k);
for (size_t k = windowbegin; k < windowend; k++)
readfromdisk |= requirerandomizedchunk (k, windowbegin, windowend); // (window range passed in for checking only, redundant here)
for (size_t k = windowend; k < randomizedchunks[0].size(); k++)
releaserandomizedchunk (k);
// resize feat and uids
feat.resize(vdim.size());
uids.resize(classids.size());
assert(feat.size()==vdim.size());
assert(feat.size()==randomizedchunks.size());
foreach_index(i, feat)
{
feat[i].resize (vdim[i], mbframes);
if (i==0)
{
foreach_index(j, uids)
{
if (issupervised()) // empty means unsupervised training -> return empty uids
uids[j].resize (mbframes);
else
uids[j].clear();
latticepairs.clear(); // will push_back() below
transcripts.clear();
}
}
}
// return randomized frames for the time range of those utterances
for (size_t j = 0; j < mbframes; j++)
{
// map to time index inside arrays
const size_t framepos = (globalts + j) % _totalframes; // using mod because we may actually run beyond the sweep for the last call
const frameref & frameref = randomizedframerefs[framepos];
// random utterance
readfromdisk |= requirerandomizedchunk (frameref.chunkindex, windowbegin, windowend); // (this is just a check; should not actually page in anything)
foreach_index(i, randomizedchunks)
{
const auto & chunk = randomizedchunks[i][frameref.chunkindex];
const auto & chunkdata = chunk.getchunkdata();
auto uttframes = chunkdata.getutteranceframes (frameref.utteranceindex);
matrixasvectorofvectors uttframevectors (uttframes); // (wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors())
const size_t n = uttframevectors.size();
assert (n == uttframes.cols() && chunkdata.numframes (frameref.utteranceindex) == n); n;
// copy frame and class labels
const size_t t = frameref.frameindex;
size_t leftextent, rightextent;
// page in the needed range of frames
if (leftcontext[i] == 0 && rightcontext[i] == 0)
{
leftextent = rightextent = augmentationextent(uttframevectors[t].size(), vdim[i]);
}
else
{
leftextent = leftcontext[i];
rightextent = rightcontext[i];
}
augmentneighbors(uttframevectors, noboundaryflags, t, leftextent, rightextent, feat[i], j);
//augmentneighbors(uttframevectors, noboundaryflags, t, feat[i], j);
if (issupervised() && i == 0)
{
auto frameclassids = getclassids(frameref);
foreach_index(k, uids)
uids[k][j] = frameclassids[k][t];
}
}
}
}
timegetbatch = timergetbatch;
return readfromdisk;
}
double gettimegetbatch() { return timegetbatch;}
// alternate (updated) definition for multiple inputs/outputs - read as a vector of feature matrixes or a vector of label strings
/*implement*/ bool getbatch (const size_t /*globalts*/,
const size_t /*framesrequested*/, msra::dbn::matrix & /*feat*/, std::vector<size_t> & /*uids*/,
std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & /*transcripts*/,
std::vector<shared_ptr<const latticesource::latticepair>> & /*latticepairs*/)
{
// should never get here
throw runtime_error("minibatchframesourcemulti: getbatch() being called for single input feature and single output feature, should use minibatchutterancesource instead\n");
// for single input/output set size to be 1 and run old getbatch
//feat.resize(1);
//uids.resize(1);
//return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, latticepairs);
}
size_t totalframes() const { return _totalframes; }
// return first valid globalts to ask getbatch() for
// In utterance mode, the epoch start may fall in the middle of an utterance.
// We return the end time of that utterance (which, in pathological cases, may in turn be outside the epoch; handle that).
/*implement*/ size_t firstvalidglobalts (const size_t globalts)
{
// update randomization if a new sweep is entered --this is a complex operation that updates many of the data members used below
const size_t sweep = lazyrandomization (globalts);
// frame mode: start at sweep boundary directly
if (framemode)
return globalts;
// utterance mode
assert (globalts >= sweep * _totalframes && globalts < (sweep + 1) * _totalframes); sweep;
foreach_index (pos, randomizedutterancerefs)
if (randomizedutterancerefs[pos].globalts >= globalts)
return randomizedutterancerefs[pos].globalts; // exact or inexact match
return randomizedutterancerefs.back().globalte(); // boundary case: requested time falls within the last utterance
}
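// Typical calling pattern (a sketch; the actual driver is the minibatch iterator, not this class):
// at epoch start call firstvalidglobalts(epochstart) to snap to the next utterance boundary (utterance
// mode), then repeatedly call getbatch(globalts, framesrequested, ...) with consecutive 'globalts'
// values, advancing by the number of frames actually returned (feat[0].cols()).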
const std::vector<size_t> & unitcounts() const { return counts[0]; }
const std::vector<size_t> & unitcounts(size_t index) const { return counts[index]; }
};
};};