//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// utterancesourcemulti.h -- implementation of utterancesource.h that supports multiple feature and label sets
//

#pragma once

#include "basetypes.h"                  // for attempt()
#include "htkfeatio.h"                  // for htkmlfreader
#include "latticearchive.h"             // for reading HTK phoneme lattices (MMI training)
#include "minibatchsourcehelpers.h"
#include "minibatchiterator.h"

namespace msra { namespace dbn {

// ---------------------------------------------------------------------------
// minibatchutterancesourcemulti -- feature source to provide randomized utterances
// This also implements a frame-wise mode, which is layered on top of the utterance-wise mode
// and thus benefits from its goodies such as corpus-wide high-level randomization and chunk paging.
// ---------------------------------------------------------------------------
class minibatchutterancesourcemulti : public minibatchsource
{
    void operator= (const minibatchutterancesourcemulti & other);  // non-assignable
    std::vector<size_t> vdim;                   // feature dimension after augmenting neighbors
    std::vector<size_t> leftcontext;            // number of frames to the left of the target frame in the context window
    std::vector<size_t> rightcontext;           // number of frames to the right of the target frame in the context window
    std::vector<unsigned int> sampperiod;       // (for reference and to check against model)
    std::vector<string> featkind;
    std::vector<size_t> featdim;
    const bool framemode;                       // true -> actually return frame-level randomized frames (not possible in lattice mode)
    std::vector<std::vector<size_t>> counts;    // [s] occurrence count for all states (used for priors)
    int verbosity;
    // lattice reader
    //const std::vector> &lattices;
    const latticesource & lattices;
    //std::vector lattices;
    // word-level transcripts (for MMI mode when adding best path to lattices)
    const map<wstring, msra::lattices::lattice::htkmlfwordsequence> & allwordtranscripts;   // (used for getting word-level transcripts)
    //std::vector> allwordtranscripts;
    // data store (incl.
paging in/out of features and lattices) struct utterancedesc // data descriptor for one utterance { msra::asr::htkfeatreader::parsedpath parsedpath; // archive filename and frame range in that file size_t classidsbegin; // index into allclassids[] array (first frame) utterancedesc (msra::asr::htkfeatreader::parsedpath && ppath, size_t classidsbegin) : parsedpath (ppath), classidsbegin (classidsbegin) {} const wstring & logicalpath() const { return parsedpath; /*type cast will return logical path*/ } size_t numframes() const { return parsedpath.numframes(); } const wstring key() const // key used for looking up lattice (not stored to save space) { #ifdef _WIN32 static const wstring emptywstring; static const wregex deleteextensionre (L"\\.[^\\.\\\\/:]*$"); return regex_replace (logicalpath(), deleteextensionre, emptywstring); // delete extension (or not if none) #endif #ifdef __unix__ return removeExtension(basename(logicalpath())); #endif } }; struct utterancechunkdata // data for a chunk of utterances { std::vector utteranceset; // utterances in this set size_t numutterances() const { return utteranceset.size(); } std::vector firstframes; // [utteranceindex] first frame for given utterance mutable msra::dbn::matrix frames; // stores all frames consecutively (mutable since this is a cache) size_t totalframes; // total #frames for all utterances in this chunk mutable std::vector> lattices; // (may be empty if none) // construction utterancechunkdata() : totalframes (0) {} //utterancechunkdata (const utterancechunkdata& other) : utteranceset(other.utteranceset), firstframes(other.firstframes), frames (other.frames), totalframes (other.totalframes), lattices (other.lattices){}; void push_back (utterancedesc &&/*destructive*/ utt) { //printf ("start push %d %d\n",frames.rows(), frames.cols()); if (isinram()) { throw std::logic_error ("utterancechunkdata: frames already paged into RAM--too late to add data"); } firstframes.push_back (totalframes); totalframes += utt.numframes(); utteranceset.push_back (utt); } // accessors to an utterance's data size_t numframes (size_t i) const { return utteranceset[i].numframes(); } size_t getclassidsbegin (size_t i) const { return utteranceset[i].classidsbegin; } msra::dbn::matrixstripe getutteranceframes (size_t i) const // return the frame set for a given utterance { if (!isinram()) throw std::logic_error ("getutteranceframes: called when data have not been paged in"); const size_t ts = firstframes[i]; const size_t n = numframes(i); return msra::dbn::matrixstripe (frames, ts, n); } shared_ptr getutterancelattice (size_t i) const // return the frame set for a given utterance { if (!isinram()) throw std::logic_error ("getutteranceframes: called when data have not been paged in"); return lattices[i]; } // paging // test if data is in memory at the moment bool isinram() const { return !frames.empty(); } // page in data for this chunk // We pass in the feature info variables by ref which will be filled lazily upon first read void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const { if (numutterances() == 0) throw std::logic_error ("requiredata: cannot page in virgin block"); if (isinram()) throw std::logic_error ("requiredata: called when data is already in memory"); try // this function supports retrying since we read from the unrealible network, i.e. do not return in a broken state { msra::asr::htkfeatreader reader; // feature reader (we reinstantiate it for each block, i.e. 
we reopen the file actually) // if this is the first feature read ever, we explicitly open the first file to get the information such as feature dimension if (featdim == 0) { reader.getinfo (utteranceset[0].parsedpath, featkind, featdim, sampperiod); fprintf (stderr, "requiredata: determined feature kind as %zu-dimensional '%s' with frame shift %.1f ms\n", featdim, featkind.c_str(), sampperiod / 1e4); } // read all utterances; if they are in the same archive, htkfeatreader will be efficient in not closing the file frames.resize (featdim, totalframes); if (!latticesource.empty()) lattices.resize (utteranceset.size()); foreach_index (i, utteranceset) { //fprintf (stderr, "."); // read features for this file auto uttframes = getutteranceframes (i); // matrix stripe for this utterance (currently unfilled) reader.read (utteranceset[i].parsedpath, (const string &) featkind, sampperiod, uttframes); // note: file info here used for checkuing only // page in lattice data if (!latticesource.empty()) latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols()); } //fprintf (stderr, "\n"); if (verbosity) fprintf (stderr, "requiredata: %zu utterances read\n", utteranceset.size()); } catch (...) { releasedata(); throw; } } // page out data for this chunk void releasedata() const { if (numutterances() == 0) throw std::logic_error ("releasedata: cannot page out virgin block"); if (!isinram()) throw std::logic_error ("releasedata: called when data is not memory"); // release frames frames.resize (0, 0); // release lattice data lattices.clear(); } }; std::vector> allchunks; // set of utterances organized in chunks, referred to by an iterator (not an index) std::vector>> classids; // [classidsbegin+t] concatenation of all state sequences bool issupervised() const { return !classids.empty(); } size_t numutterances; // total number of utterances size_t _totalframes; // total frames (same as classids.size() if we have labels) double timegetbatch; // [v-hansu] for time measurement // sequence in random order of actual use (randomized, where randomization is cached) const size_t randomizationrange;// parameter remembered; this is the full window (e.g. 
48 hours), not the half window size_t currentsweep; // randomization is currently cached for this sweep; if it changes, rebuild all below struct chunk // chunk as used in actual processing order (randomized sequence) { // the underlying chunk (as a non-indexed reference into the chunk set) std::vector::const_iterator uttchunkdata; const utterancechunkdata & getchunkdata() const { return *uttchunkdata; } size_t numutterances() const { return uttchunkdata->numutterances(); } size_t numframes() const { return uttchunkdata->totalframes; } // position in utterance-position space size_t utteranceposbegin; size_t utteranceposend() const { return utteranceposbegin + numutterances(); } // position on global time line size_t globalts; // start frame on global timeline (after randomization) size_t globalte() const { return globalts + numframes(); } // randomization range limits size_t windowbegin; // randomizedchunk index of earliest chunk that utterances in here can be randomized with size_t windowend; // and end index [windowbegin, windowend) chunk (std::vector::const_iterator uttchunkdata, size_t utteranceposbegin, size_t globalts) : uttchunkdata (uttchunkdata), utteranceposbegin (utteranceposbegin), globalts (globalts) {} }; std::vector> randomizedchunks; // utterance chunks after being brought into random order (we randomize within a rolling window over them) size_t chunksinram; // (for diagnostics messages) struct utteranceref // describes the underlying random utterance associated with an utterance position { size_t chunkindex; // lives in this chunk (index into randomizedchunks[]) size_t utteranceindex; // utterance index in that chunk size_t numframes; // (cached since we cannot directly access the underlying data from here) size_t globalts; // start frame in global space after randomization (for mapping frame index to utterance position) size_t globalte() const { return globalts + numframes; } // end frame utteranceref (size_t chunkindex, size_t utteranceindex) : chunkindex (chunkindex), utteranceindex (utteranceindex), globalts (SIZE_MAX), numframes (0) {} void swap (utteranceref & other) // used in randomization { ::swap (chunkindex, other.chunkindex); ::swap (utteranceindex, other.utteranceindex); assert (globalts == SIZE_MAX && other.globalts == SIZE_MAX && numframes == 0 && other.numframes == 0); // can only swap before assigning these } }; std::vector randomizedutterancerefs; // [pos] randomized utterance ids std::hash_map randomizedutteranceposmap; // [globalts] -> pos lookup table struct positionchunkwindow // chunk window required in memory when at a certain position, for controlling paging { std::vector::const_iterator definingchunk; // the chunk in randomizedchunks[] that defined the utterance position of this utterance size_t windowbegin() const { return definingchunk->windowbegin; } size_t windowend() const { return definingchunk->windowend; } bool isvalidforthisposition (const utteranceref & utt) const { return utt.chunkindex >= windowbegin() && utt.chunkindex < windowend(); // check if 'utt' lives in is in allowed range for this position } positionchunkwindow (std::vector::iterator definingchunk) : definingchunk (definingchunk) {} }; std::vector positionchunkwindows; // [utterance position] -> [windowbegin, windowend) for controlling paging // frame-level randomization layered on top of utterance chunking (randomized, where randomization is cached) struct frameref { #ifdef _WIN64 // (sadly, the compiler makes this 8 bytes, not 6) unsigned short chunkindex; // lives in this 
chunk (index into randomizedchunks[]) unsigned short utteranceindex; // utterance index in that chunk static const size_t maxutterancesperchunk = 65535; unsigned short frameindex; // frame index within the utterance static const size_t maxframesperutterance = 65535; #elif __unix__ // (sadly, the compiler makes this 8 bytes, not 6) unsigned short chunkindex; // lives in this chunk (index into randomizedchunks[]) unsigned short utteranceindex; // utterance index in that chunk static const size_t maxutterancesperchunk = 65535; unsigned short frameindex; // frame index within the utterance static const size_t maxframesperutterance = 65535; #else // For Win32, we care to keep it inside 32 bits. We have already encountered setups where that's not enough. unsigned int chunkindex : 13; // lives in this chunk (index into randomizedchunks[]) unsigned int utteranceindex : 8; // utterance index in that chunk static const size_t maxutterancesperchunk = 255; unsigned int frameindex : 11; // frame index within the utterance static const size_t maxframesperutterance = 2047; #endif frameref (size_t ci, size_t ui, size_t fi) : chunkindex ((unsigned short) ci), utteranceindex ((unsigned short) ui), frameindex ((unsigned short) fi) { #ifdef _WIN32 static_assert (sizeof (frameref) == 4, "frameref: bit fields too large to fit into 32-bit integer"); #endif if (ci == chunkindex && ui == utteranceindex && fi == frameindex) return; throw std::logic_error ("frameref: bit fields too small"); } frameref() : chunkindex (0), utteranceindex (0), frameindex (0) {} }; biggrowablevector randomizedframerefs; // [globalt-sweepts] -> (chunk, utt, frame) lookup table for randomized frames --this can be REALLY big! // TODO: this may go away if we store classids directly in the utterance data template class shiftedvector // accessing a vector with a non-0 starting index { void operator= (const shiftedvector &); VECTOR & v; size_t first; size_t n; void check (size_t i) const { if (i >= n) throw std::logic_error ("shiftedvector: index out of bounds"); } public: shiftedvector (VECTOR & v, size_t first, size_t n) : v (v), first (first), n (n) { } // TODO: the following is not templated--do it if needed; also should return a const reference then size_t operator[] (size_t i) const { check (i); return v[first + i]; } }; template std::vector>> getclassids (const UTTREF & uttref) // return sub-vector of classids[] for a given utterance { std::vector>> allclassids; allclassids.empty(); if (!issupervised()) { foreach_index(i,classids) allclassids.push_back(std::move(shiftedvector> ((*classids[i]), 0, 0))); return allclassids; // nothing to return } const auto & chunk = randomizedchunks[0][uttref.chunkindex]; const auto & chunkdata = chunk.getchunkdata(); const size_t classidsbegin = chunkdata.getclassidsbegin (uttref.utteranceindex); // index of first state label in global concatenated classids[] array const size_t n = chunkdata.numframes (uttref.utteranceindex); foreach_index(i,classids) { if ((*classids[i])[classidsbegin + n] != (CLASSIDTYPE) -1) throw std::logic_error ("getclassids: expected boundary marker not found, internal data structure screwed up"); allclassids.push_back(std::move(shiftedvector> ((*classids[i]), classidsbegin, n))); } return allclassids; // nothing to return } public: // constructor // Pass empty labels to denote unsupervised training (so getbatch() will not return uids). // This mode requires utterances with time stamps. 
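    // [Illustrative sketch, not part of the original source] The constructor below skips utterances longer than
    // frameref::maxframesperutterance because a frameref packs (chunk, utterance, frame) into a small fixed-width
    // record (13 + 8 + 11 = 32 bits on Win32); out-of-range indices make the frameref constructor throw.
#if 0   // example only; the Win32 limits are assumed here (on 64-bit builds all three limits are 65535)
    static void exampleframerefpacking()
    {
        frameref ok (4000, 200, 1500);          // fits: 4000 < 2^13, 200 < 2^8, 1500 < 2^11
        ok;                                     // (suppress unused-variable warning)
        try { frameref toolong (0, 0, 5000); toolong; }     // 5000 > maxframesperutterance on Win32
        catch (const std::logic_error &) { }                // "frameref: bit fields too small"
    }
#endif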
minibatchutterancesourcemulti (const std::vector> & infiles, const std::vector>> & labels, std::vector vdim, std::vector udim, std::vector leftcontext, std::vector rightcontext, size_t randomizationrange, const latticesource & lattices, const map & allwordtranscripts, const bool framemode) : vdim (vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod (0), featdim (0), randomizationrange (randomizationrange), currentsweep (SIZE_MAX), lattices (lattices), allwordtranscripts (allwordtranscripts), framemode (framemode), chunksinram (0), timegetbatch (0), verbosity(2) // [v-hansu] change framemode (lattices.empty()) into framemode (false) to run utterance mode without lattice // you also need to change another line, search : [v-hansu] comment out to run utterance mode without lattice { // process infiles to know dimensions of things (but not loading features) std::vector utteranceset;// read all utterances to here first; at the end, distribute to chunks utteranceset.reserve (infiles.size()); size_t nomlf = 0; // number of entries missing in MLF (diagnostics) size_t nolat = 0; // number of entries missing in lattice archive (diagnostics) std::vector numclasses; // number of output classes as found in the label file (diagnostics) _totalframes = 0; wstring key; size_t numutts=0; std::vectoruttisvalid; // boolean flag to check that utterance is valid. valid means number of //frames is consistent across all feature and label streams std::vectoruttduration; // track utterance durations to determine utterance validity std::vector classidsbegin; if (!lattices.empty()) { LogicError("lattices not supported in utterancereadermulti"); } allchunks = std::vector>(infiles.size(), std::vector()); featdim = std::vector(infiles.size(), 0); sampperiod = std::vector(infiles.size(), 0); featkind = std::vector(infiles.size(), ""); numclasses = std::vector(labels.size(), 0); counts = std::vector>(labels.size(), std::vector()); foreach_index (i, labels) { classids.push_back(unique_ptr>(new biggrowablevector())); //std::pair,std::vector> latticetocs; //std::unordered_map modelsymmap; //lattices.push_back(shared_ptr(new latticesource(latticetocs, modelsymmap))); } // first check consistency across feature streams // We'll go through the SCP files for each stream to make sure the duration is consistent // If not, we'll plan to ignore the utterance, and inform the user foreach_index(m, infiles){ if (m == 0){ numutts = infiles[m].size(); uttisvalid = std::vector(numutts, true); uttduration = std::vector(numutts, 0); } else if (infiles[m].size()!=numutts) throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances"); foreach_index(i, infiles[m]){ utterancedesc utterance(msra::asr::htkfeatreader::parsedpath(infiles[m][i]), 0); //mseltzer - is this foolproof for multiio? is classids always non-empty? const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode // we need at least 2 frames for boundary markers to work if (uttframes < 2) throw std::runtime_error("minibatchutterancesource: utterances < 2 frames not supported"); if (uttframes > frameref::maxframesperutterance) { fprintf(stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. 
frames (%d) for frameref bit field: %S\n", i, uttframes, frameref::maxframesperutterance, key.c_str()); uttduration[i] = 0; uttisvalid[i] = false; } else{ if (m == 0){ uttduration[i] = uttframes; uttisvalid[i] = true; } else if (uttduration[i] != uttframes){ fprintf(stderr, "minibatchutterancesource: skipping %d-th file due to inconsistency in duration in different feature streams (%d vs %d frames)\n", i, uttduration[i], uttframes); uttduration[i] = 0; uttisvalid[i] = false; } } } } size_t invalidutts=0; foreach_index(i, uttisvalid){ if (!uttisvalid[i]) invalidutts++; } if (invalidutts > uttisvalid.size() / 2) throw std::runtime_error("minibatchutterancesource: too many files with inconsistent durations, assuming broken configuration\n"); else if (invalidutts>0) fprintf(stderr, "Found inconsistent durations across feature streams in %d out of %d files\n", invalidutts, uttisvalid.size()); // now process the features and labels size_t utterancesetsize = 0; foreach_index (m, infiles) { utteranceset.clear(); //if (m==0) // numutts = infiles[m].size(); //else // if (infiles[m].size()!=numutts) // throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances\n"); if (m==0) classidsbegin.clear(); foreach_index (i, infiles[m]) { if (i % (infiles[m].size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); } // build utterance descriptor if (m == 0 && !labels.empty()) classidsbegin.push_back(classids[0]->size()); if (uttisvalid[i]){ utterancedesc utterance (msra::asr::htkfeatreader::parsedpath (infiles[m][i]), labels.empty() ? 0 : classidsbegin[i] ); //mseltzer - is this foolproof for multiio? is classids always non-empty? const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode assert(uttframes == uttduration[i]); // ensure nothing funky happened // already performed these checks above // we need at least 2 frames for boundary markers to work //if (uttframes < 2) // throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported"); //if (uttframes > frameref::maxframesperutterance) //{ // fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str()); // continue; //} // check whether we have the ref transcript //auto labelsiter = labels[0].end(); bool lacksmlf = true; if (!labels.empty()) // empty means unsupervised mode (don't load any) { key = utterance.key(); // check if labels are available (if not, it normally means that no path was found in realignment) auto labelsiter = labels[0].find (key); //const bool lacksmlf = (labelsiter == labels[0].end()); lacksmlf = (labelsiter == labels[0].end()); if (lacksmlf) if (nomlf++ < 5) fprintf (stderr, " [no labels for %S]", key.c_str()); // check if lattice is available (when in lattice mode) // TODO: also check the #frames here; requires a design change of the TOC format & a rerun const bool lackslat = !lattices.empty() && !lattices.haslattice (key); // ('true' if we have no lattices) if (lackslat) if (nolat++ < 5) fprintf (stderr, " [no lattice for %S]", key.c_str()); // skip if either one is missing if (lacksmlf || lackslat){ uttisvalid[i] = false; continue; // skip this utterance at all } } // push the label sequence into classids[], since we already looked it up // TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore. 
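                // [Illustrative sketch, not part of the original source] The duration check right below compares the
                // feature file's frame count against the span covered by the MLF label sequence, i.e.
                // last.firstframe + last.numframes; as a stand-alone (hypothetical) helper that computation would be:
#if 0           // example only
                template <class LABELSEQ> static size_t labelspanframes (const LABELSEQ & labseq)
                {
                    return labseq.empty() ? 0 : labseq.back().firstframe + labseq.back().numframes;     // frames covered by the MLF entries
                }
#endif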
// OK, utterance has all we need --remember it if (m==0) { if (!labels.empty() && !lacksmlf) //if (!labels.empty() && labelsiter != labels[0].end()) { // first verify that all the label files have the proper duration foreach_index (j, labels) { const auto & labseq = labels[j].find(key)->second; // check if durations match; skip if not size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes); if (labframes != uttframes) { fprintf (stderr, " [duration mismatch (%zu in label vs. %zu in feat file), skipping %S]", labframes, uttframes, key.c_str()); nomlf++; uttisvalid[i] = false; break; // continue; // skip this utterance at all } } if (uttisvalid[i]) { utteranceset.push_back(std::move(utterance)); _totalframes += uttframes; // then parse each mlf if the durations are consistent foreach_index(j, labels) { const auto & labseq = labels[j].find(key)->second; // expand classid sequence into flat array foreach_index (i, labseq) { const auto & e = labseq[i]; if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str())); if (e.classid >= udim[j]) { throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id exceeds model output dimension")); } if (e.classid != (CLASSIDTYPE) e.classid) throw std::runtime_error ("CLASSIDTYPE has too few bits"); for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) classids[j]->push_back ((CLASSIDTYPE) e.classid); numclasses[j] = max (numclasses[j], (size_t)(1u + e.classid)); counts[j].resize (numclasses[j], 0); counts[j][e.classid] += e.numframes; } classids[j]->push_back ((CLASSIDTYPE) -1); // append a boundary marker marker for checking if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size()) throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str())); assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size()); } } } else{ assert(classids.empty() && labels.empty()); utteranceset.push_back(std::move(utterance)); _totalframes += uttframes; } } else { utteranceset.push_back(std::move(utterance)); } } } if (m == 0) utterancesetsize = utteranceset.size(); else assert(utteranceset.size() == utterancesetsize); fprintf (stderr, "feature set %d: %zu frames in %zu out of %zu utterances\n", m, _totalframes, utteranceset.size(),infiles[m].size()); if (!labels.empty()){ foreach_index (j, labels){ biggrowablevector & cid = *classids[j]; foreach_index (i, utteranceset){ //if ((*classids[j])[utteranceset[i].classidsbegin + utteranceset[i].numframes()] != (CLASSIDTYPE) -1) //printf("index = %d\n",utteranceset[i].classidsbegin + utteranceset[i].numframes()); //printf("cid[index] = %d\n",cid[utteranceset[i].classidsbegin + utteranceset[i].numframes()]); //printf("CLASSIDTYPE(-1) = %d\n",(CLASSIDTYPE) -1); if (cid[utteranceset[i].classidsbegin + utteranceset[i].numframes()] != (CLASSIDTYPE) -1) throw std::logic_error ("minibatchutterancesource: classids[] out of sync"); } } } if (nomlf + nolat > 0) { fprintf (stderr, "minibatchutterancesource: out of %zu files, %zu files not found in label set and %zu have no lattice\n", infiles[0].size(), nomlf, nolat); if (nomlf + nolat > infiles[m].size() / 2) throw std::runtime_error 
("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n"); } if (m==0) {foreach_index(j, numclasses) { fprintf(stderr,"label set %d: %zu classes\n",j, numclasses[j]); } } // distribute them over chunks // We simply count off frames until we reach the chunk size. // Note that we first randomize the chunks, i.e. when used, chunks are non-consecutive and thus cause the disk head to seek for each chunk. const size_t framespersec = 100; // we just assume this; our efficiency calculation is based on this const size_t chunkframes = 15 * 60 * framespersec; // number of frames to target for each chunk // Loading an initial 24-hour range will involve 96 disk seeks, acceptable. // When paging chunk by chunk, chunk size ~14 MB. std::vector & thisallchunks = allchunks[m]; //std::vector thisallchunks; thisallchunks.resize (0); thisallchunks.reserve (_totalframes / chunkframes); foreach_index (i, utteranceset) { // if exceeding current entry--create a new one // I.e. our chunks are a little larger than wanted (on av. half the av. utterance length). if (thisallchunks.empty() || thisallchunks.back().totalframes > chunkframes || thisallchunks.back().numutterances() >= frameref::maxutterancesperchunk) { thisallchunks.push_back (utterancechunkdata()); } // append utterance to last chunk utterancechunkdata & currentchunk = thisallchunks.back(); //std::move(utteranceset[i]); currentchunk.push_back (std::move (utteranceset[i])); // move it out from our temp array into the chunk // TODO: above push_back does not actually 'move' because the internal push_back does not accept that } numutterances = utteranceset.size(); fprintf (stderr, "minibatchutterancesource: %zu utterances grouped into %zu chunks, av. chunk size: %.1f utterances, %.1f frames\n", numutterances, thisallchunks.size(), numutterances / (double) thisallchunks.size(), _totalframes / (double) thisallchunks.size()); // Now utterances are stored exclusively in allchunks[]. They are never referred to by a sequential utterance id at this point, only by chunk/within-chunk index. 
} // preliminary mem allocation for frame references (if in frame mode) if (framemode) randomizedframerefs.resize (_totalframes); } private: // shuffle a vector into random order by randomly swapping elements template static void randomshuffle (VECTOR & v, size_t randomseed) { if (v.size() > RAND_MAX * (size_t) RAND_MAX) throw std::runtime_error ("randomshuffle: too large set: need to change to different random generator!"); srand ((unsigned int) randomseed); foreach_index (i, v) { // pick a random location const size_t irand = msra::dbn::rand (0, v.size()); // swap element i with it if (irand == (size_t) i) continue; ::swap (v[i], v[irand]); } } #if 0 template static void randomshuffle(std::vector v, size_t randomseed) { foreach_index(j, v) { if (v[j].size() > RAND_MAX * (size_t) RAND_MAX) throw std::runtime_error ("randomshuffle: too large set: need to change to different random generator!"); } srand ((unsigned int) randomseed); foreach_index (i, v[0]) { // pick a random location const size_t irand = msra::dbn::rand (0, v[0].size()); foreach_index(j, v){ // swap element i with it if (irand == (size_t) i) continue; ::swap (v[j][i], v[j][irand]); } } } #endif //0 static void checkoverflow (size_t fieldval, size_t targetval, const char * fieldname) { if (fieldval != targetval) throw std::runtime_error (msra::strfun::strprintf ("checkoverflow: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, targetval, fieldval)); } // helper for testing whether a swapped frame position is valid (w.r.t. beign in RAM when being at position 't') bool isframepositionvalid (const size_t t, const biggrowablevector & ttochunk) const { // look up valid range for time position const size_t positionchunkindex = ttochunk[t]; // position 't' lies within this original chunk (relationship is monotonous, not random) const auto & chunk = randomizedchunks[0][positionchunkindex]; // get in-RAM chunk range for this frame position (shared across all frame positions within the same chunk) const size_t poswindowbegin = chunk.windowbegin; // rolling window over chunks (which under the hood have been randomized) const size_t poswindowend = chunk.windowend; // Chunk implies that if we are at position 't', we are guaranteed to have chunks [poswindowbegin, poswindowend) in RAM. // now see if the randomized location is within that window const size_t actualchunkindexforpos = randomizedframerefs[t].chunkindex; // where this frame pos has been mapped to return actualchunkindexforpos >= poswindowbegin && actualchunkindexforpos < poswindowend; // We only need to test the chunk index. Utterance and frame can be randomized within a chunk as we want, as long it is in RAM. } // big long helper to update all cached randomization information // This is a rather complex process since we randomize on two levels: // - chunks of consecutive data in the feature archive // - within a range of chunks that is paged into RAM // - utterances (in utt mode), or // - frames (in frame mode) // The 'globalts' parameter is the start time that triggered the rerandomization; it is NOT the base time of the randomized area. size_t lazyrandomization (const size_t globalts) { const size_t sweep = globalts / _totalframes; // which sweep (this determines randomization) if (sweep == currentsweep) // already got this one--nothing to do return sweep; currentsweep = sweep; if (verbosity>0) fprintf (stderr, "lazyrandomization: re-randomizing for sweep %zu in %s mode\n", currentsweep, framemode ? 
"frame" : "utterance"); const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep // first randomize chunks std::vector::const_iterator>> randomizedchunkrefs; foreach_index (i, allchunks) randomizedchunkrefs.push_back(std::vector::const_iterator>()); foreach_index (i, allchunks) randomizedchunkrefs[i].reserve (allchunks[i].size()); foreach_index (i, allchunks) // TODO: this cries for iterating using the iterator! { foreach_index(j, allchunks[i]) randomizedchunkrefs[i].push_back (allchunks[i].begin() + j); assert (randomizedchunkrefs[i].size() == allchunks[i].size()); // note that sincew randomshuffle() uses sweep as seed, this will keep the randomization common across all feature streams randomshuffle (randomizedchunkrefs[i], sweep); // bring into random order (with random seed depending on sweep) } // place them onto the global timeline -> randomizedchunks[] // We are processing with randomization within a rolling window over this chunk sequence. // Paging will happen on a chunk-by-chunk basis. // The global time stamp is needed to determine the paging window. randomizedchunks.clear(); // data chunks after being brought into random order (we randomize within a rolling window over them) foreach_index(i, allchunks) randomizedchunks.push_back(std::vector()); foreach_index(i, allchunks) { randomizedchunks[i].reserve (randomizedchunkrefs[i].size()); foreach_index (k, randomizedchunkrefs[i]) randomizedchunks[i].push_back (chunk (randomizedchunkrefs[i][k], randomizedchunks[i].empty() ? 0 : randomizedchunks[i].back().utteranceposend(), randomizedchunks[i].empty() ? sweepts : randomizedchunks[i].back().globalte())); assert (randomizedchunks[i].size() == allchunks[i].size()); assert (randomizedchunks[i].empty() || (randomizedchunks[i].back().utteranceposend() == numutterances && randomizedchunks[i].back().globalte() == sweepts + _totalframes)); } // for each chunk, compute the randomization range (w.r.t. the randomized chunk sequence) foreach_index (i, randomizedchunks) { foreach_index (k, randomizedchunks[i]) { chunk & chunk = randomizedchunks[i][k]; // start with the range of left neighbor if (k == 0) { chunk.windowbegin = 0; chunk.windowend = 1; } else { chunk.windowbegin = randomizedchunks[i][k-1].windowbegin; // might be too early chunk.windowend = randomizedchunks[i][k-1].windowend; // might have more space } while (chunk.globalts - randomizedchunks[i][chunk.windowbegin].globalts > randomizationrange/2) chunk.windowbegin++; // too early while (chunk.windowend < randomizedchunks[i].size() && randomizedchunks[i][chunk.windowend].globalte() - chunk.globalts < randomizationrange/2) chunk.windowend++; // got more space } } if (!framemode) // utterance mode { // This sets up the following members: // - positionchunkwindows // - randomizedutterancerefs // - randomizedutteranceposmap // We will now introduce the concept of utterance *position*. // During processing, utterances will be indexed by position (which is in turn derived from a frame index in getbatch()), // and it is assumed (required) that positions are requested consecutively. // Each utterance position has an underlying associated utterance, which is represented as (chunkid, within-chunk index) and randomly assigned. // Each utterance position also has an associated range of chunks that are kept in memory, // and the associated underlying utterance is guaranteed to be found within that associated range of chunks. // That allows to page out/in data when processing utterance positions in a consecutive manner. 
// compute chunk windows for every utterance position -> positionchunkwindows[] // Utterance positions can only reference underlying utterance data within the chunk window. // Utterance positions are defined by the randomized chunk sequence (i.e. their underlying 'defining' chunk differs from sweep to sweep). positionchunkwindows.clear(); // [utterance position] -> [windowbegin, windowend) for controlling paging positionchunkwindows.reserve (numutterances); // positionchunkwindows should be consistent for all inputs (distinct feature streams), so just build based on feature[0] // contains pointer to chunk elements but only to compute index foreach_index (k, randomizedchunks[0]) // TODO: this really cries for iterating using iterators! { chunk & chunk = randomizedchunks[0][k]; for (size_t i = chunk.utteranceposbegin; i < chunk.utteranceposend(); i++) // loop over utterances in this chunk { positionchunkwindows.push_back (randomizedchunks[0].begin() + k); } // to look up the chunk range in memory for a position, look up the defining chunk and its range } assert (positionchunkwindows.size() == numutterances); // build the randomized utterances array -> randomizedutterancerefs[] // start by assigning all utterance positions to utterances in non-random consecutive manner randomizedutterancerefs.clear(); // [pos] randomized utterance ids randomizedutterancerefs.reserve (numutterances); foreach_index (k, randomizedchunks[0]) { chunk & chunk = randomizedchunks[0][k]; for (size_t i = 0; i < chunk.numutterances(); i++) // loop over utterances in this chunk randomizedutterancerefs.push_back (utteranceref (k, i)); } assert (randomizedutterancerefs.size() == numutterances); foreach_index (i, randomizedutterancerefs) { auto & uttref = randomizedutterancerefs[i]; assert (positionchunkwindows[i].isvalidforthisposition(uttref)); uttref; } // check we got those setup right // we now randomly shuffle randomizedutterancerefs[pos], while considering the constraints of what chunk range needs to be in memory srand ((unsigned int) sweep + 1); for (size_t i = 0; i < randomizedutterancerefs.size(); i++) { // get valid randomization range, expressed in chunks const size_t windowbegin = positionchunkwindows[i].windowbegin(); const size_t windowend = positionchunkwindows[i].windowend(); // get valid randomization range, expressed in utterance positions // Remember, utterance positions are defined by chunks. const size_t posbegin = randomizedchunks[0][windowbegin].utteranceposbegin; const size_t posend = randomizedchunks[0][windowend-1].utteranceposend(); // randomization range for this utterance position is [posbegin, posend) for(;;) { // pick a random location const size_t j = msra::dbn::rand (posbegin, posend); // a random number within the window if (i == j) break; // the random gods say "this one points to its original position"... nothing wrong about that, but better not try to swap // We want to swap utterances at i and j, but need to make sure they remain in their allowed range. // This is guaranteed for a so-far untouched utterance, but both i and j may have been touched by a previous swap. // We want to use the utterance previously referenced at utterance position j at position i. Is that allowed? if (!positionchunkwindows[i].isvalidforthisposition (randomizedutterancerefs[j])) continue; // nope --try another // Likewise may we use the utterance previously referenced at utterance position i at position j? 
if (!positionchunkwindows[j].isvalidforthisposition (randomizedutterancerefs[i])) continue; // nope --try another // yep--swap them randomizedutterancerefs[i].swap (randomizedutterancerefs[j]); break; } } // place the randomized utterances on the global timeline so we can find them by globalts size_t t = sweepts; foreach_index (i, randomizedutterancerefs) { auto & uttref = randomizedutterancerefs[i]; uttref.globalts = t; uttref.numframes = randomizedchunks[0][uttref.chunkindex].getchunkdata().numframes (uttref.utteranceindex); t = uttref.globalte(); } assert (t == sweepts + _totalframes); // verify that we got it right (I got a knot in my head!) foreach_index (i, randomizedutterancerefs) { // get utterance referenced at this position const auto & uttref = randomizedutterancerefs[i]; // check if it is valid for this position if (uttref.chunkindex < positionchunkwindows[i].windowbegin() || uttref.chunkindex >= positionchunkwindows[i].windowend()) throw std::logic_error ("lazyrandomization: randomization logic mangled!"); } // create lookup table for (globalts values -> pos) -> randomizedutteranceposmap[] randomizedutteranceposmap.clear(); // [globalts] -> pos lookup table foreach_index (pos, randomizedutterancerefs) { auto & uttref = randomizedutterancerefs[pos]; randomizedutteranceposmap[uttref.globalts] = (size_t) pos; } } else // frame mode { // This sets up the following members: // - randomizedframerefs srand ((unsigned int) sweep + 1); // An original timeline is established by the randomized chunks, denoted by 't'. // Returned frames are indexed by frame position j = (globalt - sweept), which have an associated underlying 't'. // It is guaranteed that uttterance frame position j maps to an underlying frame within the corresponding chunk window. biggrowablevector ttochunk; // randomized chunk index associated with frame position ttochunk.resize (_totalframes); size_t t = 0; frameref frameref; // enumerate chunks in their randomized order and assign frame indices in that order -> randomizedframerefs[t] // At this point, chunks are in randomized order, but utterances and frames within utterances are not randomized. // Later we will randomize those as well. foreach_index (i, randomizedchunks[0]) { frameref.chunkindex = (unsigned short)i; checkoverflow (frameref.chunkindex, i, "frameref::chunkindex"); const auto & chunk = randomizedchunks[0][i]; const auto & chunkdata = chunk.getchunkdata(); const size_t numutt = chunkdata.numutterances(); for (size_t k = 0; k < numutt; k++) { frameref.utteranceindex = (short)k; checkoverflow (frameref.utteranceindex, k, "frameref::utteranceindex"); const size_t n = chunkdata.numframes (k); for (size_t m = 0; m < n; m++) { frameref.frameindex = (short)m; checkoverflow (frameref.frameindex, m, "frameref::utteranceindex"); randomizedframerefs[t] = frameref; // hopefully this is a memory copy, not a bit-wise assignment! If not, then code it explicitly ttochunk[t] = (unsigned short) i; checkoverflow (ttochunk[t], i, "ttochunk[]"); t++; } } } assert (t == _totalframes); // now randomize them --we use the nested loop again to avoid storing a backpointer // The condition is that a randomized frame may not be moved out of its associated chunk window. 
foreach_index (t, randomizedframerefs) { const size_t positionchunkindex = ttochunk[t]; // position 't' lies within this chunk (relationship is monotonous, not random) const auto & chunk = randomizedchunks[0][positionchunkindex]; // for window // get in-RAM chunk range for this frame position (shared across all frame positions within the same chunk) const size_t poswindowbegin = chunk.windowbegin; // rolling window over chunks (which under the hood have been randomized) const size_t poswindowend = chunk.windowend; // Chunk implies that if we are at position 't', we are guaranteed to have chunks [poswindowbegin, poswindowend) in RAM. // These chunks are associated with a range of frame positions. // It is implied that if we are at position 't', the frames covered by chunks [poswindowbegin, poswindowend) are in RAM. const size_t postbegin = randomizedchunks[0][poswindowbegin].globalts - sweepts; const size_t postend = randomizedchunks[0][poswindowend-1].globalte() - sweepts; // The position that this frame gets randomized to must be guaranteed to belong to a chunk within [postbegin, postend). for (;;) // (randomization retry loop) { size_t tswap = msra::dbn::rand (postbegin, postend); // random frame position within allowed range // We want to swap 't' to 'tswap' and 'tswap' to 't'. // - Both may have been swapped before. // - Both must stay within the randomization window of their respective position. // check admissibility of where the element at 'tswap' gets swapped to 't' (range = [windowbegin,windowend)) size_t tswapchunkindex = randomizedframerefs[tswap].chunkindex; if (tswapchunkindex < poswindowbegin || tswapchunkindex >= poswindowend) continue; // check admissibility of where the element at t gets swapped to (which is frame position 'tswap') const size_t sourcechunkindex = randomizedframerefs[t].chunkindex; size_t targetchunkindex = ttochunk[tswap]; // chunk associated with this frame position defines value range const auto & targetchunk = randomizedchunks[0][targetchunkindex]; const size_t targetwindowbegin = targetchunk.windowbegin; const size_t targetwindowend = targetchunk.windowend; if (sourcechunkindex < targetwindowbegin || sourcechunkindex >= targetwindowend) continue; // admissible--swap the two ::swap (randomizedframerefs[t], randomizedframerefs[tswap]); #if 0 break; #else // post-check --so far did not trigger, can be removed // do a post-check if we got it right --we seem not to if (isframepositionvalid (t, ttochunk) && isframepositionvalid (tswap, ttochunk)) break; // not valid: swap them back and try again --we actually discovered a bug in the code above ::swap (randomizedframerefs[t], randomizedframerefs[tswap]); fprintf (stderr, "lazyrandomization: BUGBUG --invalid swapping condition detected\n"); #endif } } // check it --my head spins t = 0; foreach_index (i, randomizedchunks[0]) { const auto & chunk = randomizedchunks[0][i]; // for window and chunkdata const size_t poswindowbegin = chunk.windowbegin; const size_t poswindowend = chunk.windowend; const auto & chunkdata = chunk.getchunkdata(); // for numutterances/numframes const size_t numutt = chunkdata.numutterances(); for (size_t k = 0; k < numutt; k++) { const size_t n = chunkdata.numframes (k); for (size_t m = 0; m < n; m++) { const size_t randomizedchunkindex = randomizedframerefs[t].chunkindex; if (randomizedchunkindex < poswindowbegin || randomizedchunkindex >= poswindowend) throw std::logic_error ("lazyrandomization: nope, you got frame randomization wrong, dude"); t++; } } } assert (t == _totalframes); } 
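        // [Illustrative sketch, not part of the original source] The frame-swap retry loop above accepts a swap of
        // positions t and tswap only if each frame's chunk stays inside the chunk window of the position it moves to.
        // Spelled out as a (hypothetical) predicate over the same data structures:
#if 0   // example only
        bool isswapadmissible (size_t t, size_t tswap, const biggrowablevector<unsigned short> & ttochunk) const
        {
            const auto & twindow     = randomizedchunks[0][ttochunk[t]];        // chunk window of position t
            const auto & tswapwindow = randomizedchunks[0][ttochunk[tswap]];    // chunk window of position tswap
            const size_t ctot    = randomizedframerefs[tswap].chunkindex;       // chunk of the frame moving to t
            const size_t ctoswap = randomizedframerefs[t].chunkindex;           // chunk of the frame moving to tswap
            return ctot    >= twindow.windowbegin     && ctot    < twindow.windowend
                && ctoswap >= tswapwindow.windowbegin && ctoswap < tswapwindow.windowend;
        }
#endif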
        return sweep;
    }

    // helper to page out a chunk with log message
    void releaserandomizedchunk (size_t k)
    {
        size_t numreleased = 0;
        foreach_index (m, randomizedchunks)
        {
            auto & chunkdata = randomizedchunks[m][k].getchunkdata();
            if (chunkdata.isinram())
            {
                if (verbosity)
                    fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
                             k, randomizedchunks[m][k].globalts, randomizedchunks[m][k].globalte()-1, chunksinram-1);
                chunkdata.releasedata();
                numreleased++;
            }
        }
        if (numreleased > 0 && numreleased < randomizedchunks.size())
        {
            // either all feature streams have this chunk in RAM or none does; anything else indicates a bookkeeping error
            LogicError ("releaserandomizedchunk: inconsistency detected - some inputs have chunks in RAM, some not");
        }
        else if (numreleased == randomizedchunks.size())
        {
            chunksinram--;
        }
    }

    // helper to page in a chunk for a given utterance
    // (window range passed in for checking only)
    // Returns true if we actually did read something.
    bool requirerandomizedchunk (const size_t chunkindex, const size_t windowbegin, const size_t windowend)
    {
        size_t numinram = 0;
        if (chunkindex < windowbegin || chunkindex >= windowend)
            throw std::logic_error ("requirerandomizedchunk: requested utterance outside in-memory chunk range");
        foreach_index (m, randomizedchunks)
        {
            auto & chunk = randomizedchunks[m][chunkindex];
            auto & chunkdata = chunk.getchunkdata();
            if (chunkdata.isinram())
                numinram++;
        }
        if (numinram == randomizedchunks.size())
        {
            return false;
        }
        else if (numinram == 0)
        {
            foreach_index (m, randomizedchunks)
            {
                auto & chunk = randomizedchunks[m][chunkindex];
                auto & chunkdata = chunk.getchunkdata();
                if (verbosity)
                    fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
                             m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
                msra::util::attempt (5, [&]()   // (reading from network)
                {
                    chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity);
                });
            }
            chunksinram++;
            return true;
        }
        else
        {
            LogicError ("requirerandomizedchunk: inconsistency detected - some inputs need chunks paged in, some not");
        }
    }

    class matrixasvectorofvectors   // wrapper around a matrix that views it as a vector of column vectors
    {
        void operator= (const matrixasvectorofvectors &);  // non-assignable
        msra::dbn::matrixbase & m;
    public:
        matrixasvectorofvectors (msra::dbn::matrixbase & m) : m (m) {}
        size_t size() const { return m.cols(); }
        const_array_ref<float> operator[] (size_t j) const { return array_ref<float> (&m(0,j), m.rows()); }
    };

    size_t chunkforframepos (const size_t t) const  // find chunk for a given frame position
    {
        // inspect chunk of first feature stream only
        auto iter = std::lower_bound (randomizedchunks[0].begin(), randomizedchunks[0].end(), t, [&] (const chunk & chunk, size_t t) { return chunk.globalte() <= t; });
        const size_t chunkindex = iter - randomizedchunks[0].begin();
        if (t < randomizedchunks[0][chunkindex].globalts || t >= randomizedchunks[0][chunkindex].globalte())
            throw std::logic_error ("chunkforframepos: dude, learn STL!");
        return chunkindex;
    }

public:

    void setverbosity (int newverbosity) { verbosity = newverbosity; }

    // get the next minibatch
    // A minibatch is made up of one or more utterances.
    // We will return less than 'framesrequested' unless the first utterance is too long.
    // Note that this may return frames that are beyond the epoch end, but the first frame is always within the epoch.
    // We specify the utterance by its global start time (in a space of an infinitely repeated training set).
    // This is efficient since getbatch() is called with sequential 'globalts' except at epoch start.
    // Note that the start of an epoch does not necessarily fall onto an utterance boundary. The caller must use firstvalidglobalts() to find the first valid globalts at or after a given time.
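    // [Illustrative sketch, not part of the original source] A typical (hypothetical) caller drives getbatch() with
    // strictly consecutive 'globalts' values, starting from firstvalidglobalts() at epoch start and advancing by the
    // number of frames actually returned. The container element types below are assumptions matching the getbatch()
    // signature as read from this file.
#if 0   // example only
    void exampleepoch (minibatchutterancesourcemulti & source, size_t epochstart, size_t epochframes, size_t mbsize)
    {
        std::vector<msra::dbn::matrix> feat;
        std::vector<std::vector<size_t>> uids;
        std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> transcripts;
        std::vector<shared_ptr<const latticesource::latticepair>> latticepairs;
        for (size_t t = source.firstvalidglobalts (epochstart); t < epochstart + epochframes; )
        {
            source.getbatch (t, mbsize, feat, uids, transcripts, latticepairs);
            t += feat[0].cols();    // getbatch() requires the next call to start right after the frames just returned
        }
    }
#endif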
/*implement*/ bool getbatch (const size_t globalts, const size_t framesrequested, std::vector & feat, std::vector> & uids, std::vector> & transcripts, std::vector> & latticepairs) { bool readfromdisk = false; // return value: shall be 'true' if we paged in anything auto_timer timergetbatch; assert (_totalframes > 0); // update randomization if a new sweep is entered --this is a complex operation that updates many of the data members used below const size_t sweep = lazyrandomization (globalts); const std::vector noboundaryflags; // dummy if (!framemode) // regular utterance mode { // find utterance position for globalts // There must be a precise match; it is not possible to specify frames that are not on boundaries. auto positer = randomizedutteranceposmap.find (globalts); if (positer == randomizedutteranceposmap.end()) throw std::logic_error ("getbatch: invalid 'globalts' parameter; must match an existing utterance boundary"); const size_t spos = positer->second; // determine how many utterances will fit into the requested minibatch size size_t mbframes = randomizedutterancerefs[spos].numframes; // at least one utterance, even if too long size_t epos; for (epos = spos + 1; epos < numutterances && mbframes + randomizedutterancerefs[epos].numframes < framesrequested; epos++) // add more utterances as long as they fit within requested minibatch size mbframes += randomizedutterancerefs[epos].numframes; // do some paging housekeeping // This will also set the feature-kind information if it's the first time. // Free all chunks left of the range. // Page-in all chunks right of the range. // We are a little more blunt for now: Free all outside the range, and page in only what is touched. We could save some loop iterations. const size_t windowbegin = positionchunkwindows[spos].windowbegin(); const size_t windowend = positionchunkwindows[epos-1].windowend(); for (size_t k = 0; k < windowbegin; k++) releaserandomizedchunk (k); for (size_t k = windowend; k < randomizedchunks[0].size(); k++) releaserandomizedchunk (k); for (size_t pos = spos; pos < epos; pos++) readfromdisk |= requirerandomizedchunk (randomizedutterancerefs[pos].chunkindex, windowbegin, windowend); // (window range passed in for checking only) // resize feat and uids feat.resize(vdim.size()); uids.resize(classids.size()); assert(feat.size()==vdim.size()); assert(feat.size()==randomizedchunks.size()); foreach_index(i, feat) { feat[i].resize (vdim[i], mbframes); if (i==0) { foreach_index(j, uids) { if (issupervised()) // empty means unsupervised training -> return empty uids uids[j].resize (mbframes); else uids[i].clear(); latticepairs.clear(); // will push_back() below transcripts.clear(); } } } // return these utterances if (verbosity > 0) fprintf (stderr, "getbatch: getting utterances %zu..%zu (%zu frames out of %zu requested) in sweep %zu\n", spos, epos -1, mbframes, framesrequested, sweep); size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch for (size_t pos = spos; pos < epos; pos++) { const auto & uttref = randomizedutterancerefs[pos]; size_t n=0; foreach_index(i, randomizedchunks) { const auto & chunk = randomizedchunks[i][uttref.chunkindex]; const auto & chunkdata = chunk.getchunkdata(); assert (uttref.globalts == globalts + tspos); auto uttframes = chunkdata.getutteranceframes (uttref.utteranceindex); matrixasvectorofvectors uttframevectors (uttframes); // (wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors()) n = uttframevectors.size(); assert (n == uttframes.cols() && 
uttref.numframes == n && chunkdata.numframes (uttref.utteranceindex) == n); // copy the frames and class labels for (size_t t = 0; t < n; t++) // t = time index into source utterance { size_t leftextent, rightextent; // page in the needed range of frames if (leftcontext[i] == 0 && rightcontext[i] == 0) { leftextent = rightextent = augmentationextent(uttframevectors[t].size(), vdim[i]); } else { leftextent = leftcontext[i]; rightextent = rightcontext[i]; } augmentneighbors(uttframevectors, noboundaryflags, t, leftextent, rightextent, feat[i], t + tspos); //augmentneighbors(uttframevectors, noboundaryflags, t, feat[i], t + tspos); } // copy the frames and class labels if (i==0) { auto uttclassids = getclassids (uttref); foreach_index(j, uttclassids) { for (size_t t = 0; t < n; t++) // t = time index into source utterance { if (issupervised()) uids[j][t + tspos] = uttclassids[j][t]; } if (!this->lattices.empty()) { auto latticepair = chunkdata.getutterancelattice (uttref.utteranceindex); latticepairs.push_back (latticepair); // look up reference const auto & key = latticepair->getkey(); if (!allwordtranscripts.empty()) { const auto & transcript = allwordtranscripts.find (key)->second; transcripts.push_back (transcript.words); } } } } } tspos += n; } assert (tspos == mbframes); } else // // debug mode returning randomized frames again, to see whether convergence is better (we don't ensure non-repetition at this point) { const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep const size_t sweepte = sweepts + _totalframes; // and its end const size_t globalte = min (globalts + framesrequested, sweepte); // we return as much as requested, but not exceeding sweep end const size_t mbframes = globalte - globalts; // that's our mb size // determine window range // We enumerate all frames--can this be done more efficiently? 
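                    // [Illustrative sketch, not part of the original source] In the frame-copying loops above and below,
                    // an explicit asymmetric context window implies an augmented feature dimension of
                    // featdim * (leftcontext + 1 + rightcontext); a (hypothetical) consistency check of that assumption:
#if 0               // example only
                    static void checkaugmenteddim (size_t vdim, size_t featdim, size_t leftcontext, size_t rightcontext)
                    {
                        if (leftcontext == 0 && rightcontext == 0)
                            return;                                     // symmetric window derived via augmentationextent()
                        if (vdim != featdim * (leftcontext + 1 + rightcontext))
                            throw std::runtime_error ("checkaugmenteddim: vdim inconsistent with context window");
                    }
#endif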
const size_t firstchunk = chunkforframepos (globalts); const size_t lastchunk = chunkforframepos (globalte-1); const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin; const size_t windowend = randomizedchunks[0][lastchunk].windowend; if (verbosity > 0) fprintf (stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n", globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend); // release all data outside, and page in all data inside for (size_t k = 0; k < windowbegin; k++) releaserandomizedchunk (k); for (size_t k = windowbegin; k < windowend; k++) readfromdisk |= requirerandomizedchunk (k, windowbegin, windowend); // (window range passed in for checking only, redundant here) for (size_t k = windowend; k < randomizedchunks[0].size(); k++) releaserandomizedchunk (k); // resize feat and uids feat.resize(vdim.size()); uids.resize(classids.size()); assert(feat.size()==vdim.size()); assert(feat.size()==randomizedchunks.size()); foreach_index(i, feat) { feat[i].resize (vdim[i], mbframes); if (i==0) { foreach_index(j, uids) { if (issupervised()) // empty means unsupervised training -> return empty uids uids[j].resize (mbframes); else uids[i].clear(); latticepairs.clear(); // will push_back() below transcripts.clear(); } } } // return randomized frames for the time range of those utterances for (size_t j = 0; j < mbframes; j++) { // map to time index inside arrays const size_t framepos = (globalts + j) % _totalframes; // using mod because we may actually run beyond the sweep for the last call const frameref & frameref = randomizedframerefs[framepos]; // random utterance readfromdisk |= requirerandomizedchunk (frameref.chunkindex, windowbegin, windowend); // (this is just a check; should not actually page in anything) foreach_index(i, randomizedchunks) { const auto & chunk = randomizedchunks[i][frameref.chunkindex]; const auto & chunkdata = chunk.getchunkdata(); auto uttframes = chunkdata.getutteranceframes (frameref.utteranceindex); matrixasvectorofvectors uttframevectors (uttframes); // (wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors()) const size_t n = uttframevectors.size(); assert (n == uttframes.cols() && chunkdata.numframes (frameref.utteranceindex) == n); n; // copy frame and class labels const size_t t = frameref.frameindex; size_t leftextent, rightextent; // page in the needed range of frames if (leftcontext[i] == 0 && rightcontext[i] == 0) { leftextent = rightextent = augmentationextent(uttframevectors[t].size(), vdim[i]); } else { leftextent = leftcontext[i]; rightextent = rightcontext[i]; } augmentneighbors(uttframevectors, noboundaryflags, t, leftextent, rightextent, feat[i], j); //augmentneighbors(uttframevectors, noboundaryflags, t, feat[i], j); if (issupervised() && i == 0) { auto frameclassids = getclassids(frameref); foreach_index(k, uids) uids[k][j] = frameclassids[k][t]; } } } } timegetbatch = timergetbatch; return readfromdisk; } double gettimegetbatch() { return timegetbatch;} // alternate (updated) definition for multiple inputs/outputs - read as a vector of feature matrixes or a vector of label strings /*implement*/ bool getbatch (const size_t /*globalts*/, const size_t /*framesrequested*/, msra::dbn::matrix & /*feat*/, std::vector & /*uids*/, std::vector> & /*transcripts*/, std::vector> & /*latticepairs*/) { // should never get here throw runtime_error("minibatchframesourcemulti: getbatch() 
being called for single input feature and single output feature, should use minibatchutterancesource instead\n");
        // for single input/output set size to be 1 and run old getbatch
        //feat.resize(1);
        //uids.resize(1);
        //return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, latticepairs);
    }

    size_t totalframes() const { return _totalframes; }

    // return first valid globalts to ask getbatch() for
    // In utterance mode, the epoch start may fall in the middle of an utterance.
    // We return the end time of that utterance (which, in pathological cases, may in turn be outside the epoch; handle that).
    /*implement*/ size_t firstvalidglobalts (const size_t globalts)
    {
        // update randomization if a new sweep is entered --this is a complex operation that updates many of the data members used below
        const size_t sweep = lazyrandomization (globalts);
        // frame mode: start at sweep boundary directly
        if (framemode)
            return globalts;
        // utterance mode
        assert (globalts >= sweep * _totalframes && globalts < (sweep + 1) * _totalframes); sweep;  // ('sweep;' suppresses an unused-variable warning in release builds)
        foreach_index (pos, randomizedutterancerefs)
            if (randomizedutterancerefs[pos].globalts >= globalts)
                return randomizedutterancerefs[pos].globalts;   // exact or inexact match
        return randomizedutterancerefs.back().globalte();       // boundary case: requested time falls within the last utterance
    }

    const std::vector<size_t> & unitcounts() const { return counts[0]; }
    const std::vector<size_t> & unitcounts (size_t index) const { return counts[index]; }
};

};};
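// [Illustrative sketch, not part of the original source] The per-state occurrence counts exposed by unitcounts()
// are typically converted into log priors on the consumer side; a (hypothetical) helper might look like this:
#if 0   // example only
static std::vector<float> countstologpriors (const std::vector<size_t> & counts)
{
    size_t total = 0;
    for (size_t c : counts)
        total += c;
    std::vector<float> logpriors (counts.size());
    for (size_t k = 0; k < counts.size(); k++)
        logpriors[k] = (float) std::log ((counts[k] + 0.5) / (total + 0.5 * counts.size()));    // add-0.5 smoothing
    return logpriors;
}
#endif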