This commit is contained in:
Guoli Ye 2018-06-19 23:22:27 -07:00
Родитель 4018e1e724
Коммит fca46fc5a5
1 изменённых файлов: 3 добавлений и 152 удалений

Просмотреть файл

@ -69,7 +69,7 @@ enum mbrclassdefinition // used to identify definition of class in minimum bayes
// ===========================================================================
class lattice
{
mutable int verbosity;
// mutable int verbosity;
/* guoye: start */
@ -840,156 +840,7 @@ public:
void frommlf(const std::wstring& key, const std::unordered_map<std::string, size_t>& unitmap, const msra::asr::htkmlfreader<msra::asr::htkmlfentry, lattice::htkmlfwordsequence>& labels,
const msra::lm::CMGramLM& lm, const msra::lm::CSymbolSet& unigramsymbols);
/* guoye: start */
// template <class IDMAP>
// void fread(FILE* f, const IDMAP& idmap, size_t spunit, std::set<int>& specialwordids);
// move from latticearchive.cpp to .h, it requires template definition and delcaration are both in .h file
template <class IDMAP>
void fread(FILE* f, const IDMAP& idmap, size_t spunit, std::set<int>& specialwordids)
/* guoye: end */
{
size_t version = freadtag(f, "LAT ");
if (version == 1)
{
freadOrDie(&info, sizeof(info), 1, f);
freadvector(f, "NODE", nodes, info.numnodes);
if (nodes.back().t != info.numframes)
/* guoye: start */
{
// RuntimeError("fread: mismatch between info.numframes and last node's time");
// sometimes, the data is corrputed, let's try to live with it
fprintf(stderr, "fread: mismatch between info.numframes and last node's time: nodes.back().t = %d vs. info.numframes = %d \n", int(nodes.back().t), int(info.numframes));
}
/* guoye: end */
freadvector(f, "EDGE", edges, info.numedges);
freadvector(f, "ALIG", align);
fcheckTag(f, "END ");
// map align ids to user's symmap --the lattice gets updated in place here
foreach_index(k, align)
align[k].updateunit(idmap); // updates itself
}
else if (version == 2)
{
freadOrDie(&info, sizeof(info), 1, f);
freadvector(f, "NODS", nodes, info.numnodes);
if (nodes.back().t != info.numframes)
{
/* guoye: start */
{
// RuntimeError("fread: mismatch between info.numframes and last node's time");
// sometimes, the data is corrputed, let's try to live with it
fprintf(stderr, "fread: mismatch between info.numframes and last node's time: nodes.back().t = %d vs. info.numframes = %d \n", int(nodes.back().t), int(info.numframes));
}
/* guoye: end */
}
freadvector(f, "EDGS", edges2, info.numedges); // uniqued edges
freadvector(f, "ALNS", uniquededgedatatokens); // uniqued alignments
fcheckTag(f, "END ");
/* guoye: start */
vt_node_out_edge_indices.resize(info.numnodes);
for (size_t j = 0; j < info.numedges; j++)
{
// an edge with !NULL pointing to not <s>
// this code make sure if you always start from <s> in the sampled path.
// mask here: we delay the processing in EMBRsamplepaths controlled by flag: enforceValidPathEMBR
// if (edges2[j].S == 0 && nodes[edges2[j].E].wid != 1) continue;
vt_node_out_edge_indices[edges2[j].S].push_back(j);
}
is_special_words.resize(info.numnodes);
for (size_t i = 0; i < info.numnodes; i++)
{
/*
if (nodes[i].wid == 0xfffff)
{
nodes[i].wid;
}
*/
if (specialwordids.find(int(nodes[i].wid)) != specialwordids.end()) is_special_words[i] = true;
else is_special_words[i] = false;
}
/* guoye: end */
// check if we need to map
#if 1 // post-bugfix for incorrect inference of spunit
if (info.impliedspunitid != SIZE_MAX && info.impliedspunitid >= idmap.size()) // we have buggy lattices like that--what do they mean??
{
fprintf(stderr, "fread: detected buggy spunit id %d which is out of range (%d entries in map)\n", (int)info.impliedspunitid, (int)idmap.size());
RuntimeError("fread: out of bounds spunitid");
}
#endif
// This is critical--we have a buggy lattice set that requires no mapping where mapping would fail
bool needsmapping = false;
foreach_index(k, idmap)
{
if (idmap[k] != (size_t)k
#if 1
&& (k != (int)idmap.size() - 1 || idmap[k] != spunit) // that HACK that we add one more /sp/ entry at the end...
#endif
)
{
needsmapping = true;
break;
}
}
// map align ids to user's symmap --the lattice gets updated in place here
if (needsmapping)
{
if (info.impliedspunitid != SIZE_MAX)
info.impliedspunitid = idmap[info.impliedspunitid];
// deal with broken (zero-token) edges
std::vector<bool> isendworkaround;
if (info.impliedspunitid != spunit)
{
fprintf(stderr, "fread: lattice with broken spunit, using workaround to handle potentially broken zero-token edges\n");
inferends(isendworkaround);
}
size_t uniquealignments = 1;
const size_t skipscoretokens = info.hasacscores ? 2 : 1;
for (size_t k = skipscoretokens; k < uniquededgedatatokens.size(); k++)
{
if (!isendworkaround.empty() && isendworkaround[k]) // secondary criterion to detect ends in broken lattices
{
k--; // don't advance, since nothing to advance over
}
else
{
// this is a regular token: update it in-place
auto& ai = uniquededgedatatokens[k];
if (ai.unit >= idmap.size())
RuntimeError("fread: broken-file heuristics failed");
ai.updateunit(idmap); // updates itself
if (!ai.last)
continue;
}
// if last then skip over the lm and ac scores
k += skipscoretokens;
uniquealignments++;
}
fprintf(stderr, "fread: mapped %d unique alignments\n", (int)uniquealignments);
}
if (info.impliedspunitid != spunit)
{
// fprintf (stderr, "fread: inconsistent spunit id in file %d vs. expected %d; due to erroneous heuristic\n", info.impliedspunitid, spunit); // [v-hansu] comment out becaues it takes up most of the log
// it's actually OK, we can live with this, since we only decompress and then move on without any assumptions
// RuntimeError("fread: mismatching /sp/ units");
}
// reconstruct old lattice format from this --TODO: remove once we change to new data representation
rebuildedges(info.impliedspunitid != spunit /*to be able to read somewhat broken V2 lattice archives*/);
}
else
RuntimeError("fread: unsupported lattice format version");
}
/* guoye: end */
// check consistency
// - only one end node
// - only forward edges
@ -1284,7 +1135,7 @@ public:
// Helper method to process v2 Lattice format
template <class IDMAP>
void ProcessV2Lattice(size_t spunit, header_v1_v2& info, std::vector<aligninfo>& uniquededgedatatokens, const IDMAP& idmap, std::set<int>& specialwordids = {} )
void ProcessV2Lattice(size_t spunit, header_v1_v2& info, std::vector<aligninfo>& uniquededgedatatokens, const IDMAP& idmap, std::set<int>& specialwordids = NULL )
{
/* guoye: start */
vt_node_out_edge_indices.resize(info.numnodes);