Refactor binary MLF
This commit is contained in:
Родитель
78165413b8
Коммит
931193d735
|
@ -6,6 +6,7 @@
|
|||
#include "stdafx.h"
|
||||
#include <limits>
|
||||
#include "MLFBinaryDeserializer.h"
|
||||
#include "MLFDeserializer.h"
|
||||
#include "ConfigHelper.h"
|
||||
#include "SequenceData.h"
|
||||
#include "StringUtil.h"
|
||||
|
@ -19,117 +20,20 @@ namespace CNTK {
|
|||
using namespace std;
|
||||
using namespace Microsoft::MSR::CNTK;
|
||||
|
||||
static float s_oneFloat = 1.0;
|
||||
static double s_oneDouble = 1.0;
|
||||
|
||||
// Sparse labels for an utterance.
|
||||
template <class ElemType>
|
||||
struct MLFSequenceData : SparseSequenceData
|
||||
{
|
||||
vector<ElemType> m_values;
|
||||
vector<IndexType> m_indexBuffer;
|
||||
const NDShape& m_frameShape;
|
||||
|
||||
MLFSequenceData(size_t numberOfSamples, const NDShape& frameShape) :
|
||||
m_values(numberOfSamples, 1), m_frameShape(frameShape)
|
||||
{
|
||||
if (numberOfSamples > numeric_limits<IndexType>::max())
|
||||
{
|
||||
RuntimeError("Number of samples in an MLFSequenceData (%zu) "
|
||||
"exceeds the maximum allowed value (%zu)\n",
|
||||
numberOfSamples, (size_t)numeric_limits<IndexType>::max());
|
||||
}
|
||||
|
||||
m_indexBuffer.resize(numberOfSamples);
|
||||
m_nnzCounts.resize(numberOfSamples, static_cast<IndexType>(1));
|
||||
m_numberOfSamples = (uint32_t)numberOfSamples;
|
||||
m_totalNnzCount = static_cast<IndexType>(numberOfSamples);
|
||||
m_indices = &m_indexBuffer[0];
|
||||
}
|
||||
|
||||
const void* GetDataBuffer() override
|
||||
{
|
||||
return m_values.data();
|
||||
}
|
||||
|
||||
const NDShape& GetSampleShape() override
|
||||
{
|
||||
return m_frameShape;
|
||||
}
|
||||
};
|
||||
|
||||
// Base class for chunks in frame and sequence mode.
|
||||
// The lifetime is always less than the lifetime of the parent deserializer.
|
||||
class MLFBinaryDeserializer::ChunkBase : public Chunk
|
||||
{
|
||||
protected:
|
||||
vector<char> m_buffer; // Buffer for the whole chunk
|
||||
vector<bool> m_valid; // Bit mask whether the parsed sequence is valid.
|
||||
MLFUtteranceParser m_parser;
|
||||
|
||||
const MLFBinaryDeserializer& m_deserializer;
|
||||
const ChunkDescriptor& m_descriptor; // Current chunk descriptor.
|
||||
|
||||
ChunkBase(const MLFBinaryDeserializer& deserializer, const ChunkDescriptor& descriptor, const wstring& fileName, const StateTablePtr& states)
|
||||
: m_parser(states),
|
||||
m_descriptor(descriptor),
|
||||
m_deserializer(deserializer)
|
||||
{
|
||||
if (descriptor.NumberOfSequences() == 0 || descriptor.SizeInBytes() == 0)
|
||||
LogicError("Empty chunks are not supported.");
|
||||
|
||||
auto f = FileWrapper::OpenOrDie(fileName, L"rbS");
|
||||
size_t sizeInBytes = descriptor.SizeInBytes();
|
||||
|
||||
// Make sure we always have 0 at the end for buffer overrun.
|
||||
m_buffer.resize(sizeInBytes + 1);
|
||||
m_buffer[sizeInBytes] = 0;
|
||||
|
||||
// Seek and read chunk into memory.
|
||||
f.SeekOrDie(descriptor.StartOffset(), SEEK_SET);
|
||||
|
||||
f.ReadOrDie(m_buffer.data(), sizeInBytes, 1);
|
||||
|
||||
// all sequences are valid by default.
|
||||
m_valid.resize(m_descriptor.NumberOfSequences(), true);
|
||||
}
|
||||
|
||||
string KeyOf(const SequenceDescriptor& s)
|
||||
{
|
||||
return m_deserializer.m_corpus->IdToKey(s.m_key);
|
||||
}
|
||||
|
||||
void CleanBuffer()
|
||||
{
|
||||
// Make sure we do not keep unnecessary memory after sequences have been parsed.
|
||||
vector<char> tmp;
|
||||
m_buffer.swap(tmp);
|
||||
}
|
||||
};
|
||||
|
||||
// MLF chunk when operating in sequence mode.
|
||||
class MLFBinaryDeserializer::SequenceChunk : public MLFBinaryDeserializer::ChunkBase
|
||||
class MLFBinaryDeserializer::BinarySequenceChunk : public MLFDeserializer::SequenceChunk
|
||||
{
|
||||
vector<vector<MLFFrameRange>> m_sequences; // Each sequence is a vector of sequential frame ranges.
|
||||
|
||||
public:
|
||||
SequenceChunk(const MLFBinaryDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states)
|
||||
: ChunkBase(parent, descriptor, fileName, states)
|
||||
BinarySequenceChunk(const MLFBinaryDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states)
|
||||
: MLFDeserializer::SequenceChunk(parent, descriptor, fileName, states)
|
||||
{
|
||||
m_sequences.resize(m_descriptor.Sequences().size());
|
||||
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (int i = 0; i < descriptor.Sequences().size(); ++i)
|
||||
CacheSequence(descriptor.Sequences()[i], i);
|
||||
|
||||
CleanBuffer();
|
||||
}
|
||||
|
||||
void CacheSequence(const SequenceDescriptor& sequence, size_t index)
|
||||
{
|
||||
vector<MLFFrameRange> utterance;
|
||||
|
||||
auto start = m_buffer.data() + sequence.OffsetInChunk();
|
||||
auto start = this->m_buffer.data() + sequence.OffsetInChunk();
|
||||
|
||||
ushort stateCount = *(ushort*)start;
|
||||
utterance.resize(stateCount);
|
||||
|
@ -148,166 +52,21 @@ public:
|
|||
firstFrame += stateCount;
|
||||
}
|
||||
|
||||
m_sequences[index] = move(utterance);
|
||||
}
|
||||
|
||||
void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result) override
|
||||
{
|
||||
if (m_deserializer.m_elementType == DataType::Float)
|
||||
return GetSequence<float>(sequenceIndex, result);
|
||||
else
|
||||
{
|
||||
assert(m_deserializer.m_elementType == DataType::Double);
|
||||
return GetSequence<double>(sequenceIndex, result);
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElementType>
|
||||
void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result)
|
||||
{
|
||||
if (!m_valid[sequenceIndex])
|
||||
{
|
||||
SparseSequenceDataPtr s = make_shared<MLFSequenceData<ElementType>>(0, m_deserializer.m_streams.front().m_sampleLayout);
|
||||
s->m_isValid = false;
|
||||
result.push_back(s);
|
||||
return;
|
||||
}
|
||||
|
||||
const auto& utterance = m_sequences[sequenceIndex];
|
||||
const auto& sequence = m_descriptor.Sequences()[sequenceIndex];
|
||||
|
||||
|
||||
auto s = make_shared<MLFSequenceData<ElementType>>(sequence.m_numberOfSamples, m_deserializer.m_streams.front().m_sampleLayout);
|
||||
auto* startRange = s->m_indices;
|
||||
for (const auto& range : utterance)
|
||||
{
|
||||
if (range.ClassId() >= m_deserializer.m_dimension)
|
||||
// TODO: Possibly set m_valid to false, but currently preserving the old behavior.
|
||||
RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension);
|
||||
|
||||
// Filling all range of frames with the corresponding class id.
|
||||
fill(startRange, startRange + range.NumFrames(), static_cast<IndexType>(range.ClassId()));
|
||||
startRange += range.NumFrames();
|
||||
}
|
||||
|
||||
result.push_back(s);
|
||||
}
|
||||
};
|
||||
|
||||
// MLF chunk when operating in frame mode.
|
||||
// Implementation is different because frames of the same sequence can be accessed
|
||||
// in parallel by the randomizer, so all parsing/preprocessing should be done during
|
||||
// sequence caching, so that GetSequence only works with read only data structures.
|
||||
class MLFBinaryDeserializer::FrameChunk : public MLFBinaryDeserializer::ChunkBase
|
||||
{
|
||||
// Actual values of frames.
|
||||
vector<ClassIdType> m_classIds;
|
||||
|
||||
//For each sequence this vector contains the sequence offset in samples from the beginning of the chunk.
|
||||
std::vector<uint32_t> m_sequenceOffsetInChunkInSamples;
|
||||
|
||||
public:
|
||||
FrameChunk(const MLFBinaryDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states)
|
||||
: ChunkBase(parent, descriptor, fileName, states)
|
||||
{
|
||||
uint32_t numSamples = static_cast<uint32_t>(m_descriptor.NumberOfSamples());
|
||||
|
||||
// The current assumption is that the number of samples in a chunk fits in uint32,
|
||||
// therefore we can save 4 bytes per sequence, storing offsets in samples as uint32.
|
||||
if (numSamples != m_descriptor.NumberOfSamples())
|
||||
RuntimeError("Exceeded maximum number of samples in a chunk");
|
||||
|
||||
// Preallocate a big array for filling in class ids for the whole chunk.
|
||||
m_classIds.resize(numSamples);
|
||||
m_sequenceOffsetInChunkInSamples.resize(m_descriptor.NumberOfSequences());
|
||||
|
||||
|
||||
uint32_t offset = 0;
|
||||
for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i)
|
||||
{
|
||||
m_sequenceOffsetInChunkInSamples[i] = offset;
|
||||
offset += descriptor[i].m_numberOfSamples;
|
||||
}
|
||||
|
||||
if (numSamples != offset)
|
||||
RuntimeError("Unexpected number of samples in a FrameChunk.");
|
||||
|
||||
// Parse the data on different threads to avoid locking during GetSequence calls.
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i)
|
||||
CacheSequence(descriptor[i], i);
|
||||
|
||||
|
||||
CleanBuffer();
|
||||
}
|
||||
|
||||
// Get utterance by the absolute frame index in chunk.
|
||||
// Uses the upper bound to do the binary search among sequences of the chunk.
|
||||
size_t GetUtteranceForChunkFrameIndex(size_t frameIndex) const
|
||||
{
|
||||
auto result = upper_bound(
|
||||
m_sequenceOffsetInChunkInSamples.begin(),
|
||||
m_sequenceOffsetInChunkInSamples.end(),
|
||||
frameIndex,
|
||||
[](size_t fi, const size_t& a) { return fi < a; });
|
||||
return result - 1 - m_sequenceOffsetInChunkInSamples.begin();
|
||||
}
|
||||
|
||||
void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result) override
|
||||
{
|
||||
size_t utteranceId = GetUtteranceForChunkFrameIndex(sequenceIndex);
|
||||
if (!m_valid[utteranceId])
|
||||
{
|
||||
SparseSequenceDataPtr s = make_shared<MLFSequenceData<float>>(0, m_deserializer.m_streams.front().m_sampleLayout);
|
||||
s->m_isValid = false;
|
||||
result.push_back(s);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t label = m_classIds[sequenceIndex];
|
||||
assert(label < m_deserializer.m_categories.size());
|
||||
result.push_back(m_deserializer.m_categories[label]);
|
||||
}
|
||||
|
||||
// Parses and caches sequence in the buffer for GetSequence fast retrieval.
|
||||
void CacheSequence(const SequenceDescriptor& sequence, size_t index)
|
||||
{
|
||||
auto start = m_buffer.data() + sequence.OffsetInChunk();
|
||||
auto end = start + sequence.SizeInBytes();
|
||||
|
||||
vector<MLFFrameRange> utterance;
|
||||
auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk();
|
||||
bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset);
|
||||
if (!parsed)
|
||||
{
|
||||
m_valid[index] = false;
|
||||
fprintf(stderr, "WARNING: Cannot parse the utterance %s\n", KeyOf(sequence).c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
auto startRange = m_classIds.begin() + m_sequenceOffsetInChunkInSamples[index];
|
||||
for(size_t i = 0; i < utterance.size(); ++i)
|
||||
{
|
||||
const auto& range = utterance[i];
|
||||
if (range.ClassId() >= m_deserializer.m_dimension)
|
||||
// TODO: Possibly set m_valid to false, but currently preserving the old behavior.
|
||||
RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension);
|
||||
|
||||
fill(startRange, startRange + range.NumFrames(), range.ClassId());
|
||||
startRange += range.NumFrames();
|
||||
}
|
||||
this->m_sequences[index] = move(utterance);
|
||||
}
|
||||
};
|
||||
|
||||
MLFBinaryDeserializer::MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& cfg, bool primary)
|
||||
: DataDeserializerBase(primary),
|
||||
m_corpus(corpus)
|
||||
: MLFDeserializer(corpus, primary)
|
||||
{
|
||||
if (primary)
|
||||
RuntimeError("MLFBinaryDeserializer currently does not support primary mode.");
|
||||
RuntimeError("MLFDeserializer currently does not support primary mode.");
|
||||
|
||||
m_frameMode = (ConfigValue)cfg("frameMode", "true");
|
||||
|
||||
if (m_frameMode)
|
||||
LogicError("TODO: support frame mode in Binary MLF deserializer.");
|
||||
|
||||
wstring precision = cfg(L"precision", L"float");
|
||||
m_elementType = AreEqualIgnoreCase(precision, L"float") ? DataType::Float : DataType::Double;
|
||||
|
||||
|
@ -328,55 +87,12 @@ MLFBinaryDeserializer::MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const C
|
|||
|
||||
m_withPhoneBoundaries = streamConfig(L"phoneBoundaries", false);
|
||||
if (m_withPhoneBoundaries)
|
||||
{
|
||||
LogicError("TODO: enable phoneBoundaries in binary MLF format.");
|
||||
}
|
||||
LogicError("TODO: implement phoneBoundaries setting.");
|
||||
|
||||
InitializeStream(inputName);
|
||||
InitializeChunkInfos(corpus, config);
|
||||
}
|
||||
|
||||
// TODO: Should be removed. Currently a lot of end to end tests still use this one.
|
||||
MLFBinaryDeserializer::MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& labelConfig, const wstring& name)
|
||||
: DataDeserializerBase(false)
|
||||
{
|
||||
// The frame mode is currently specified once per configuration,
|
||||
// not in the configuration of a particular deserializer, but on a higher level in the configuration.
|
||||
// Because of that we are using find method below.
|
||||
m_frameMode = labelConfig.Find("frameMode", "true");
|
||||
|
||||
ConfigHelper config(labelConfig);
|
||||
|
||||
config.CheckLabelType();
|
||||
m_dimension = config.GetLabelDimension();
|
||||
|
||||
if (m_dimension > numeric_limits<ClassIdType>::max())
|
||||
{
|
||||
RuntimeError("Label dimension (%zu) exceeds the maximum allowed "
|
||||
"value (%zu)\n", m_dimension, (size_t)numeric_limits<ClassIdType>::max());
|
||||
}
|
||||
|
||||
// Same behavior as for the old deserializer - keep almost all in memory,
|
||||
// because there are a lot of none aligned sets.
|
||||
m_chunkSizeBytes = labelConfig(L"chunkSizeInBytes", g_64MB);
|
||||
|
||||
wstring precision = labelConfig(L"precision", L"float");;
|
||||
m_elementType = AreEqualIgnoreCase(precision, L"float") ? DataType::Float : DataType::Double;
|
||||
|
||||
m_withPhoneBoundaries = labelConfig(L"phoneBoundaries", "false");
|
||||
if (m_withPhoneBoundaries) {
|
||||
LogicError("TODO: enable phoneBoundaries for binary MLF format");
|
||||
}
|
||||
|
||||
InitializeStream(name);
|
||||
InitializeChunkInfos(corpus, config);
|
||||
}
|
||||
|
||||
// Strict-weak ordering on the first tuple element only; the remaining two
// elements are ignored.
static inline bool LessByFirstItem(const std::tuple<size_t, size_t, size_t>& a, const std::tuple<size_t, size_t, size_t>& b)
{
    const size_t lhsKey = std::get<0>(a);
    const size_t rhsKey = std::get<0>(b);
    return lhsKey < rhsKey;
}
|
||||
|
||||
void MLFBinaryDeserializer::InitializeChunkInfos(CorpusDescriptorPtr corpus, const ConfigHelper& config)
|
||||
{
|
||||
// Similarly to the old reader, currently we assume all Mlfs will have same root name (key)
|
||||
|
@ -422,7 +138,7 @@ void MLFBinaryDeserializer::InitializeChunkInfos(CorpusDescriptorPtr corpus, con
|
|||
}
|
||||
}
|
||||
|
||||
std::sort(m_keyToChunkLocation.begin(), m_keyToChunkLocation.end(), LessByFirstItem);
|
||||
std::sort(m_keyToChunkLocation.begin(), m_keyToChunkLocation.end(), MLFDeserializer::LessByFirstItem);
|
||||
|
||||
fprintf(stderr, "MLFBinaryDeserializer: '%zu' utterances with '%zu' frames\n",
|
||||
totalNumSequences,
|
||||
|
@ -432,64 +148,6 @@ void MLFBinaryDeserializer::InitializeChunkInfos(CorpusDescriptorPtr corpus, con
|
|||
InitializeReadOnlyArrayOfLabels();
|
||||
}
|
||||
|
||||
void MLFBinaryDeserializer::InitializeReadOnlyArrayOfLabels()
|
||||
{
|
||||
m_categories.reserve(m_dimension);
|
||||
m_categoryIndices.reserve(m_dimension);
|
||||
for (size_t i = 0; i < m_dimension; ++i)
|
||||
{
|
||||
auto category = make_shared<CategorySequenceData>(m_streams.front().m_sampleLayout);
|
||||
m_categoryIndices.push_back(static_cast<IndexType>(i));
|
||||
category->m_indices = &(m_categoryIndices[i]);
|
||||
category->m_nnzCounts.resize(1);
|
||||
category->m_nnzCounts[0] = 1;
|
||||
category->m_totalNnzCount = 1;
|
||||
category->m_numberOfSamples = 1;
|
||||
if (m_elementType == DataType::Float)
|
||||
category->m_data = &s_oneFloat;
|
||||
else
|
||||
category->m_data = &s_oneDouble;
|
||||
m_categories.push_back(category);
|
||||
}
|
||||
}
|
||||
|
||||
void MLFBinaryDeserializer::InitializeStream(const wstring& name)
|
||||
{
|
||||
// Initializing stream description - a single stream of MLF data.
|
||||
StreamInformation stream;
|
||||
stream.m_id = 0;
|
||||
stream.m_name = name;
|
||||
stream.m_sampleLayout = NDShape({ m_dimension });
|
||||
stream.m_storageFormat = StorageFormat::SparseCSC;
|
||||
stream.m_elementType = m_elementType;
|
||||
m_streams.push_back(stream);
|
||||
}
|
||||
|
||||
std::vector<ChunkInfo> MLFBinaryDeserializer::ChunkInfos()
|
||||
{
|
||||
std::vector<ChunkInfo> chunks;
|
||||
chunks.reserve(m_chunks.size());
|
||||
for (size_t i = 0; i < m_chunks.size(); ++i)
|
||||
{
|
||||
ChunkInfo cd;
|
||||
cd.m_id = static_cast<ChunkIdType>(i);
|
||||
if (cd.m_id != i)
|
||||
RuntimeError("ChunkIdType overflow during creation of a chunk description.");
|
||||
|
||||
cd.m_numberOfSequences = m_frameMode ? m_chunks[i]->NumberOfSamples() : m_chunks[i]->NumberOfSequences();
|
||||
cd.m_numberOfSamples = m_chunks[i]->NumberOfSamples();
|
||||
chunks.push_back(cd);
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
|
||||
void MLFBinaryDeserializer::SequenceInfosForChunk(ChunkIdType, vector<SequenceInfo>& result)
|
||||
{
|
||||
UNUSED(result);
|
||||
LogicError("MLF deserializer does not support primary mode, it cannot control chunking. "
|
||||
"Please specify HTK deserializer as the first deserializer in your config file.");
|
||||
}
|
||||
|
||||
ChunkPtr MLFBinaryDeserializer::GetChunk(ChunkIdType chunkId)
|
||||
{
|
||||
ChunkPtr result;
|
||||
|
@ -498,48 +156,10 @@ ChunkPtr MLFBinaryDeserializer::GetChunk(ChunkIdType chunkId)
|
|||
auto chunk = m_chunks[chunkId];
|
||||
auto& fileName = m_mlfFiles[m_chunkToFileIndex[chunk]];
|
||||
|
||||
if (m_frameMode)
|
||||
result = make_shared<FrameChunk>(*this, *chunk, fileName, m_stateTable);
|
||||
else
|
||||
result = make_shared<SequenceChunk>(*this, *chunk, fileName, m_stateTable);
|
||||
result = make_shared<BinarySequenceChunk>(*this, *chunk, fileName, m_stateTable);
|
||||
});
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
bool MLFBinaryDeserializer::GetSequenceInfoByKey(const SequenceKey& key, SequenceInfo& result)
|
||||
{
|
||||
auto found = std::lower_bound(m_keyToChunkLocation.begin(), m_keyToChunkLocation.end(), std::make_tuple(key.m_sequence, 0, 0),
|
||||
LessByFirstItem);
|
||||
|
||||
if (found == m_keyToChunkLocation.end() || std::get<0>(*found) != key.m_sequence)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
auto chunkId = std::get<1>(*found);
|
||||
auto sequenceIndexInChunk = std::get<2>(*found);
|
||||
|
||||
|
||||
result.m_chunkId = std::get<1>(*found);
|
||||
result.m_key = key;
|
||||
|
||||
if (m_frameMode)
|
||||
{
|
||||
// in frame mode sequenceIndexInChunk == sequence offset in chunk in samples
|
||||
result.m_indexInChunk = sequenceIndexInChunk + key.m_sample;
|
||||
result.m_numberOfSamples = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(result.m_key.m_sample == 0);
|
||||
|
||||
const auto* chunk = m_chunks[chunkId];
|
||||
const auto& sequence = chunk->Sequences()[sequenceIndexInChunk];
|
||||
result.m_indexInChunk = sequenceIndexInChunk;
|
||||
result.m_numberOfSamples = sequence.m_numberOfSamples;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,83 +6,29 @@
|
|||
#pragma once
|
||||
|
||||
#include <boost/noncopyable.hpp>
|
||||
#include "HTKDeserializer.h"
|
||||
#include "CorpusDescriptor.h"
|
||||
#include "MLFUtils.h"
|
||||
#include "MLFDeserializer.h"
|
||||
#include "Index.h"
|
||||
|
||||
namespace CNTK {
|
||||
|
||||
// Class represents an MLF deserializer.
|
||||
// Provides a set of chunks/sequences to the upper layers.
|
||||
class MLFBinaryDeserializer : public DataDeserializerBase, boost::noncopyable
|
||||
{
|
||||
public:
|
||||
// Class represents an MLF deserializer.
|
||||
// Provides a set of chunks/sequences to the upper layers.
|
||||
class MLFBinaryDeserializer : public MLFDeserializer
|
||||
{
|
||||
public:
|
||||
// Expects new configuration.
|
||||
MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, bool primary);
|
||||
|
||||
// TODO: Should be removed, when all readers go away, expects configuration in a legacy mode.
|
||||
MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, const std::wstring& streamName);
|
||||
|
||||
// Retrieves sequence description by its key. Used for deserializers that are not in "primary"/"driving" mode.
|
||||
bool GetSequenceInfoByKey(const SequenceKey& key, SequenceInfo& s) override;
|
||||
|
||||
// Gets description of all chunks.
|
||||
virtual std::vector<ChunkInfo> ChunkInfos() override;
|
||||
|
||||
// Get sequence descriptions of a particular chunk.
|
||||
virtual void SequenceInfosForChunk(ChunkIdType chunkId, std::vector<SequenceInfo>& s) override;
|
||||
|
||||
// Retrieves a chunk with data.
|
||||
virtual ChunkPtr GetChunk(ChunkIdType) override;
|
||||
|
||||
private:
|
||||
class ChunkBase;
|
||||
class SequenceChunk;
|
||||
class FrameChunk;
|
||||
|
||||
private:
|
||||
// Initializes chunk descriptions.
|
||||
void InitializeChunkInfos(CorpusDescriptorPtr corpus, const ConfigHelper& config);
|
||||
|
||||
// Initializes a single stream this deserializer exposes.
|
||||
void InitializeStream(const std::wstring& name);
|
||||
|
||||
// In frame mode initializes data for all categories/labels in order to
|
||||
// avoid memory copy.
|
||||
void InitializeReadOnlyArrayOfLabels();
|
||||
|
||||
// Sorted vector that maps SequenceKey.m_sequence into an utterance ID (or type max() if the key is not assigned).
|
||||
std::vector<std::tuple<size_t, ChunkIdType, uint32_t>> m_keyToChunkLocation;
|
||||
|
||||
// Type of the data this serializer provides.
|
||||
DataType m_elementType;
|
||||
|
||||
// Array of available categories.
|
||||
// We do not allocate data for all input sequences, only returning a pointer to an existing category.
|
||||
std::vector<SparseSequenceDataPtr> m_categories;
|
||||
|
||||
// A list of category indices
|
||||
// (a list of numbers from 0 to N, where N = (number of categories -1))
|
||||
std::vector<IndexType> m_categoryIndices;
|
||||
|
||||
// Flag that indicates whether each single speech frame should be exposed as a sequence.
|
||||
bool m_frameMode;
|
||||
|
||||
CorpusDescriptorPtr m_corpus;
|
||||
|
||||
std::vector<const ChunkDescriptor*> m_chunks;
|
||||
std::map<const ChunkDescriptor*, size_t> m_chunkToFileIndex;
|
||||
|
||||
size_t m_dimension;
|
||||
size_t m_chunkSizeBytes;
|
||||
|
||||
// Track phone boundaries
|
||||
bool m_withPhoneBoundaries;
|
||||
|
||||
StateTablePtr m_stateTable;
|
||||
|
||||
std::vector<std::shared_ptr<Index>> m_indices;
|
||||
std::vector<std::wstring> m_mlfFiles;
|
||||
};
|
||||
|
||||
class BinarySequenceChunk;
|
||||
};
|
||||
}
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
#include "MLFBinaryIndexBuilder.h"
|
||||
#include "MLFUtils.h"
|
||||
#include "ReaderUtil.h"
|
||||
#include <iostream>
|
||||
|
||||
namespace CNTK {
|
||||
|
||||
|
|
|
@ -19,296 +19,6 @@ namespace CNTK {
|
|||
using namespace std;
|
||||
using namespace Microsoft::MSR::CNTK;
|
||||
|
||||
static float s_oneFloat = 1.0;
|
||||
static double s_oneDouble = 1.0;
|
||||
|
||||
// A constant used in 1-hot vectors to identify the first frame of a phone.
|
||||
// Used only in CTC-type training.
|
||||
static float s_phoneBoundary = 2.0f;
|
||||
|
||||
// Sparse labels for an utterance.
|
||||
template <class ElemType>
|
||||
struct MLFSequenceData : SparseSequenceData
|
||||
{
|
||||
vector<ElemType> m_values;
|
||||
vector<IndexType> m_indexBuffer;
|
||||
const NDShape& m_frameShape;
|
||||
|
||||
MLFSequenceData(size_t numberOfSamples, const NDShape& frameShape) :
|
||||
m_values(numberOfSamples, 1), m_frameShape(frameShape)
|
||||
{
|
||||
if (numberOfSamples > numeric_limits<IndexType>::max())
|
||||
{
|
||||
RuntimeError("Number of samples in an MLFSequenceData (%zu) "
|
||||
"exceeds the maximum allowed value (%zu)\n",
|
||||
numberOfSamples, (size_t)numeric_limits<IndexType>::max());
|
||||
}
|
||||
|
||||
m_indexBuffer.resize(numberOfSamples);
|
||||
m_nnzCounts.resize(numberOfSamples, static_cast<IndexType>(1));
|
||||
m_numberOfSamples = (uint32_t)numberOfSamples;
|
||||
m_totalNnzCount = static_cast<IndexType>(numberOfSamples);
|
||||
m_indices = &m_indexBuffer[0];
|
||||
}
|
||||
|
||||
MLFSequenceData(size_t numberOfSamples, const vector<size_t>& phoneBoundaries, const NDShape& frameShape) :
|
||||
MLFSequenceData(numberOfSamples, frameShape)
|
||||
{
|
||||
for (auto boundary : phoneBoundaries)
|
||||
m_values[boundary] = s_phoneBoundary;
|
||||
}
|
||||
|
||||
const void* GetDataBuffer() override
|
||||
{
|
||||
return m_values.data();
|
||||
}
|
||||
|
||||
const NDShape& GetSampleShape() override
|
||||
{
|
||||
return m_frameShape;
|
||||
}
|
||||
};
|
||||
|
||||
// Base class for chunks in frame and sequence mode.
|
||||
// The lifetime is always less than the lifetime of the parent deserializer.
|
||||
class MLFDeserializer::ChunkBase : public Chunk
|
||||
{
|
||||
protected:
|
||||
vector<char> m_buffer; // Buffer for the whole chunk
|
||||
vector<bool> m_valid; // Bit mask whether the parsed sequence is valid.
|
||||
MLFUtteranceParser m_parser;
|
||||
|
||||
const MLFDeserializer& m_deserializer;
|
||||
const ChunkDescriptor& m_descriptor; // Current chunk descriptor.
|
||||
|
||||
ChunkBase(const MLFDeserializer& deserializer, const ChunkDescriptor& descriptor, const wstring& fileName, const StateTablePtr& states)
|
||||
: m_parser(states),
|
||||
m_descriptor(descriptor),
|
||||
m_deserializer(deserializer)
|
||||
{
|
||||
if (descriptor.NumberOfSequences() == 0 || descriptor.SizeInBytes() == 0)
|
||||
LogicError("Empty chunks are not supported.");
|
||||
|
||||
auto f = FileWrapper::OpenOrDie(fileName, L"rbS");
|
||||
size_t sizeInBytes = descriptor.SizeInBytes();
|
||||
|
||||
// Make sure we always have 0 at the end for buffer overrun.
|
||||
m_buffer.resize(sizeInBytes + 1);
|
||||
m_buffer[sizeInBytes] = 0;
|
||||
|
||||
// Seek and read chunk into memory.
|
||||
f.SeekOrDie(descriptor.StartOffset(), SEEK_SET);
|
||||
|
||||
f.ReadOrDie(m_buffer.data(), sizeInBytes, 1);
|
||||
|
||||
// all sequences are valid by default.
|
||||
m_valid.resize(m_descriptor.NumberOfSequences(), true);
|
||||
}
|
||||
|
||||
string KeyOf(const SequenceDescriptor& s)
|
||||
{
|
||||
return m_deserializer.m_corpus->IdToKey(s.m_key);
|
||||
}
|
||||
|
||||
void CleanBuffer()
|
||||
{
|
||||
// Make sure we do not keep unnecessary memory after sequences have been parsed.
|
||||
vector<char> tmp;
|
||||
m_buffer.swap(tmp);
|
||||
}
|
||||
};
|
||||
|
||||
// MLF chunk when operating in sequence mode.
|
||||
class MLFDeserializer::SequenceChunk : public MLFDeserializer::ChunkBase
|
||||
{
|
||||
vector<vector<MLFFrameRange>> m_sequences; // Each sequence is a vector of sequential frame ranges.
|
||||
|
||||
public:
|
||||
SequenceChunk(const MLFDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states)
|
||||
: ChunkBase(parent, descriptor, fileName, states)
|
||||
{
|
||||
m_sequences.resize(m_descriptor.Sequences().size());
|
||||
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (int i = 0; i < descriptor.Sequences().size(); ++i)
|
||||
CacheSequence(descriptor.Sequences()[i], i);
|
||||
|
||||
CleanBuffer();
|
||||
}
|
||||
|
||||
void CacheSequence(const SequenceDescriptor& sequence, size_t index)
|
||||
{
|
||||
auto start = m_buffer.data() + sequence.OffsetInChunk();
|
||||
auto end = start + sequence.SizeInBytes();
|
||||
|
||||
vector<MLFFrameRange> utterance;
|
||||
auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk();
|
||||
bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset);
|
||||
if (!parsed) // cannot parse
|
||||
{
|
||||
fprintf(stderr, "WARNING: Cannot parse the utterance '%s'\n", KeyOf(sequence).c_str());
|
||||
m_valid[index] = false;
|
||||
return;
|
||||
}
|
||||
|
||||
m_sequences[index] = move(utterance);
|
||||
}
|
||||
|
||||
void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result) override
|
||||
{
|
||||
if (m_deserializer.m_elementType == DataType::Float)
|
||||
return GetSequence<float>(sequenceIndex, result);
|
||||
else
|
||||
{
|
||||
assert(m_deserializer.m_elementType == DataType::Double);
|
||||
return GetSequence<double>(sequenceIndex, result);
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElementType>
|
||||
void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result)
|
||||
{
|
||||
if (!m_valid[sequenceIndex])
|
||||
{
|
||||
SparseSequenceDataPtr s = make_shared<MLFSequenceData<ElementType>>(0, m_deserializer.m_streams.front().m_sampleLayout);
|
||||
s->m_isValid = false;
|
||||
result.push_back(s);
|
||||
return;
|
||||
}
|
||||
|
||||
const auto& utterance = m_sequences[sequenceIndex];
|
||||
const auto& sequence = m_descriptor.Sequences()[sequenceIndex];
|
||||
|
||||
// Packing labels for the utterance into sparse sequence.
|
||||
vector<size_t> sequencePhoneBoundaries(m_deserializer.m_withPhoneBoundaries ? utterance.size() : 0);
|
||||
if (m_deserializer.m_withPhoneBoundaries)
|
||||
{
|
||||
for (size_t i = 0; i < utterance.size(); ++i)
|
||||
sequencePhoneBoundaries[i] = utterance[i].FirstFrame();
|
||||
}
|
||||
|
||||
auto s = make_shared<MLFSequenceData<ElementType>>(sequence.m_numberOfSamples, sequencePhoneBoundaries, m_deserializer.m_streams.front().m_sampleLayout);
|
||||
auto* startRange = s->m_indices;
|
||||
for (const auto& range : utterance)
|
||||
{
|
||||
if (range.ClassId() >= m_deserializer.m_dimension)
|
||||
// TODO: Possibly set m_valid to false, but currently preserving the old behavior.
|
||||
RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension);
|
||||
|
||||
// Filling all range of frames with the corresponding class id.
|
||||
fill(startRange, startRange + range.NumFrames(), static_cast<IndexType>(range.ClassId()));
|
||||
startRange += range.NumFrames();
|
||||
}
|
||||
|
||||
result.push_back(s);
|
||||
}
|
||||
};
|
||||
|
||||
// MLF chunk when operating in frame mode.
|
||||
// Implementation is different because frames of the same sequence can be accessed
|
||||
// in parallel by the randomizer, so all parsing/preprocessing should be done during
|
||||
// sequence caching, so that GetSequence only works with read only data structures.
|
||||
class MLFDeserializer::FrameChunk : public MLFDeserializer::ChunkBase
|
||||
{
|
||||
// Actual values of frames.
|
||||
vector<ClassIdType> m_classIds;
|
||||
|
||||
//For each sequence this vector contains the sequence offset in samples from the beginning of the chunk.
|
||||
std::vector<uint32_t> m_sequenceOffsetInChunkInSamples;
|
||||
|
||||
public:
|
||||
FrameChunk(const MLFDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states)
|
||||
: ChunkBase(parent, descriptor, fileName, states)
|
||||
{
|
||||
uint32_t numSamples = static_cast<uint32_t>(m_descriptor.NumberOfSamples());
|
||||
|
||||
// The current assumption is that the number of samples in a chunk fits in uint32,
|
||||
// therefore we can save 4 bytes per sequence, storing offsets in samples as uint32.
|
||||
if (numSamples != m_descriptor.NumberOfSamples())
|
||||
RuntimeError("Exceeded maximum number of samples in a chunk");
|
||||
|
||||
// Preallocate a big array for filling in class ids for the whole chunk.
|
||||
m_classIds.resize(numSamples);
|
||||
m_sequenceOffsetInChunkInSamples.resize(m_descriptor.NumberOfSequences());
|
||||
|
||||
|
||||
uint32_t offset = 0;
|
||||
for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i)
|
||||
{
|
||||
m_sequenceOffsetInChunkInSamples[i] = offset;
|
||||
offset += descriptor[i].m_numberOfSamples;
|
||||
}
|
||||
|
||||
if (numSamples != offset)
|
||||
RuntimeError("Unexpected number of samples in a FrameChunk.");
|
||||
|
||||
// Parse the data on different threads to avoid locking during GetSequence calls.
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i)
|
||||
CacheSequence(descriptor[i], i);
|
||||
|
||||
|
||||
CleanBuffer();
|
||||
}
|
||||
|
||||
// Get utterance by the absolute frame index in chunk.
|
||||
// Uses the upper bound to do the binary search among sequences of the chunk.
|
||||
size_t GetUtteranceForChunkFrameIndex(size_t frameIndex) const
|
||||
{
|
||||
auto result = upper_bound(
|
||||
m_sequenceOffsetInChunkInSamples.begin(),
|
||||
m_sequenceOffsetInChunkInSamples.end(),
|
||||
frameIndex,
|
||||
[](size_t fi, const size_t& a) { return fi < a; });
|
||||
return result - 1 - m_sequenceOffsetInChunkInSamples.begin();
|
||||
}
|
||||
|
||||
void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result) override
|
||||
{
|
||||
size_t utteranceId = GetUtteranceForChunkFrameIndex(sequenceIndex);
|
||||
if (!m_valid[utteranceId])
|
||||
{
|
||||
SparseSequenceDataPtr s = make_shared<MLFSequenceData<float>>(0, m_deserializer.m_streams.front().m_sampleLayout);
|
||||
s->m_isValid = false;
|
||||
result.push_back(s);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t label = m_classIds[sequenceIndex];
|
||||
assert(label < m_deserializer.m_categories.size());
|
||||
result.push_back(m_deserializer.m_categories[label]);
|
||||
}
|
||||
|
||||
// Parses and caches sequence in the buffer for GetSequence fast retrieval.
|
||||
void CacheSequence(const SequenceDescriptor& sequence, size_t index)
|
||||
{
|
||||
auto start = m_buffer.data() + sequence.OffsetInChunk();
|
||||
auto end = start + sequence.SizeInBytes();
|
||||
|
||||
vector<MLFFrameRange> utterance;
|
||||
auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk();
|
||||
bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset);
|
||||
if (!parsed)
|
||||
{
|
||||
m_valid[index] = false;
|
||||
fprintf(stderr, "WARNING: Cannot parse the utterance %s\n", KeyOf(sequence).c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
auto startRange = m_classIds.begin() + m_sequenceOffsetInChunkInSamples[index];
|
||||
for(size_t i = 0; i < utterance.size(); ++i)
|
||||
{
|
||||
const auto& range = utterance[i];
|
||||
if (range.ClassId() >= m_deserializer.m_dimension)
|
||||
// TODO: Possibly set m_valid to false, but currently preserving the old behavior.
|
||||
RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension);
|
||||
|
||||
fill(startRange, startRange + range.NumFrames(), range.ClassId());
|
||||
startRange += range.NumFrames();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
MLFDeserializer::MLFDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& cfg, bool primary)
|
||||
: DataDeserializerBase(primary),
|
||||
m_corpus(corpus)
|
||||
|
@ -379,11 +89,11 @@ MLFDeserializer::MLFDeserializer(CorpusDescriptorPtr corpus, const ConfigParamet
|
|||
InitializeChunkInfos(corpus, config, labelMappingFile);
|
||||
}
|
||||
|
||||
static inline bool LessByFirstItem(const std::tuple<size_t, size_t, size_t>& a, const std::tuple<size_t, size_t, size_t>& b)
|
||||
MLFDeserializer::MLFDeserializer(CorpusDescriptorPtr corpus, bool primary)
|
||||
: DataDeserializerBase(primary),
|
||||
m_corpus(corpus)
|
||||
{
|
||||
return std::get<0>(a) < std::get<0>(b);
|
||||
}
|
||||
|
||||
void MLFDeserializer::InitializeChunkInfos(CorpusDescriptorPtr corpus, const ConfigHelper& config, const wstring& stateListPath)
|
||||
{
|
||||
// Similarly to the old reader, currently we assume all Mlfs will have same root name (key)
|
||||
|
@ -512,9 +222,9 @@ ChunkPtr MLFDeserializer::GetChunk(ChunkIdType chunkId)
|
|||
auto& fileName = m_mlfFiles[m_chunkToFileIndex[chunk]];
|
||||
|
||||
if (m_frameMode)
|
||||
result = make_shared<FrameChunk>(*this, *chunk, fileName, m_stateTable);
|
||||
result = make_shared<MLFDeserializer::FrameChunk>(*this, *chunk, fileName, m_stateTable);
|
||||
else
|
||||
result = make_shared<SequenceChunk>(*this, *chunk, fileName, m_stateTable);
|
||||
result = make_shared<MLFDeserializer::SequenceChunk>(*this, *chunk, fileName, m_stateTable);
|
||||
});
|
||||
|
||||
return result;
|
||||
|
|
|
@ -9,10 +9,61 @@
|
|||
#include "HTKDeserializer.h"
|
||||
#include "CorpusDescriptor.h"
|
||||
#include "MLFUtils.h"
|
||||
#include "FileWrapper.h"
|
||||
#include "Index.h"
|
||||
|
||||
namespace CNTK {
|
||||
|
||||
// The value 1 in both supported element precisions.
// NOTE(review): the use sites are not visible in this part of the file - presumably the
// value of a label entry in a 1-hot vector; confirm against the callers.
// NOTE(review): these are non-const statics; if this file is a header (it appears to be
// MLFDeserializer.h), every translation unit including it gets its own copy.
static float s_oneFloat = 1.0;
|
||||
static double s_oneDouble = 1.0;
|
||||
|
||||
// A constant used in 1-hot vectors to identify the first frame of a phone.
|
||||
// Used only in CTC-type training.
|
||||
static float s_phoneBoundary = 2.0f;
|
||||
|
||||
// Sparse labels for an utterance.
|
||||
template <class ElemType>
|
||||
struct MLFSequenceData : SparseSequenceData
|
||||
{
|
||||
vector<ElemType> m_values;
|
||||
vector<IndexType> m_indexBuffer;
|
||||
const NDShape& m_frameShape;
|
||||
|
||||
MLFSequenceData(size_t numberOfSamples, const NDShape& frameShape) :
|
||||
m_values(numberOfSamples, 1), m_frameShape(frameShape)
|
||||
{
|
||||
if (numberOfSamples > numeric_limits<IndexType>::max())
|
||||
{
|
||||
RuntimeError("Number of samples in an MLFSequenceData (%zu) "
|
||||
"exceeds the maximum allowed value (%zu)\n",
|
||||
numberOfSamples, (size_t)numeric_limits<IndexType>::max());
|
||||
}
|
||||
|
||||
m_indexBuffer.resize(numberOfSamples);
|
||||
m_nnzCounts.resize(numberOfSamples, static_cast<IndexType>(1));
|
||||
m_numberOfSamples = (uint32_t)numberOfSamples;
|
||||
m_totalNnzCount = static_cast<IndexType>(numberOfSamples);
|
||||
m_indices = &m_indexBuffer[0];
|
||||
}
|
||||
|
||||
MLFSequenceData(size_t numberOfSamples, const vector<size_t>& phoneBoundaries, const NDShape& frameShape) :
|
||||
MLFSequenceData(numberOfSamples, frameShape)
|
||||
{
|
||||
for (auto boundary : phoneBoundaries)
|
||||
m_values[boundary] = s_phoneBoundary;
|
||||
}
|
||||
|
||||
const void* GetDataBuffer() override
|
||||
{
|
||||
return m_values.data();
|
||||
}
|
||||
|
||||
const NDShape& GetSampleShape() override
|
||||
{
|
||||
return m_frameShape;
|
||||
}
|
||||
};
|
||||
|
||||
// Class represents an MLF deserializer.
|
||||
// Provides a set of chunks/sequences to the upper layers.
|
||||
class MLFDeserializer : public DataDeserializerBase, boost::noncopyable
|
||||
|
@ -24,6 +75,8 @@ public:
|
|||
// TODO: Should be removed, when all readers go away, expects configuration in a legacy mode.
|
||||
MLFDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, const std::wstring& streamName);
|
||||
|
||||
MLFDeserializer(CorpusDescriptorPtr corpus, bool primary);
|
||||
|
||||
// Retrieves sequence description by its key. Used for deserializers that are not in "primary"/"driving" mode.
|
||||
bool GetSequenceInfoByKey(const SequenceKey& key, SequenceInfo& s) override;
|
||||
|
||||
|
@ -36,10 +89,250 @@ public:
|
|||
// Retrieves a chunk with data.
|
||||
virtual ChunkPtr GetChunk(ChunkIdType) override;
|
||||
|
||||
private:
|
||||
class ChunkBase;
|
||||
class SequenceChunk;
|
||||
class FrameChunk;
|
||||
// Ordering predicate for triples: compares the first elements only,
// the second and third elements do not take part in the comparison.
static inline bool LessByFirstItem(const std::tuple<size_t, size_t, size_t>& a, const std::tuple<size_t, size_t, size_t>& b)
{
    const size_t firstOfA = std::get<0>(a);
    const size_t firstOfB = std::get<0>(b);
    return firstOfA < firstOfB;
}
|
||||
|
||||
// Base class for chunks in frame and sequence mode.
|
||||
// The lifetime is always less than the lifetime of the parent deserializer.
|
||||
class ChunkBase : public Chunk
|
||||
{
|
||||
public:
|
||||
ChunkBase(const MLFDeserializer& deserializer, const ChunkDescriptor& descriptor, const wstring& fileName, const StateTablePtr& states)
|
||||
: m_parser(states),
|
||||
m_descriptor(descriptor),
|
||||
m_deserializer(deserializer)
|
||||
{
|
||||
if (descriptor.NumberOfSequences() == 0 || descriptor.SizeInBytes() == 0)
|
||||
LogicError("Empty chunks are not supported.");
|
||||
|
||||
auto f = FileWrapper::OpenOrDie(fileName, L"rbS");
|
||||
size_t sizeInBytes = descriptor.SizeInBytes();
|
||||
|
||||
// Make sure we always have 0 at the end for buffer overrun.
|
||||
m_buffer.resize(sizeInBytes + 1);
|
||||
m_buffer[sizeInBytes] = 0;
|
||||
|
||||
// Seek and read chunk into memory.
|
||||
f.SeekOrDie(descriptor.StartOffset(), SEEK_SET);
|
||||
|
||||
f.ReadOrDie(m_buffer.data(), sizeInBytes, 1);
|
||||
|
||||
// all sequences are valid by default.
|
||||
m_valid.resize(m_descriptor.NumberOfSequences(), true);
|
||||
}
|
||||
|
||||
string KeyOf(const SequenceDescriptor& s)
|
||||
{
|
||||
return m_deserializer.m_corpus->IdToKey(s.m_key);
|
||||
}
|
||||
|
||||
void CleanBuffer()
|
||||
{
|
||||
// Make sure we do not keep unnecessary memory after sequences have been parsed.
|
||||
vector<char> tmp;
|
||||
m_buffer.swap(tmp);
|
||||
}
|
||||
|
||||
vector<char> m_buffer; // Buffer for the whole chunk
|
||||
vector<bool> m_valid; // Bit mask whether the parsed sequence is valid.
|
||||
MLFUtteranceParser m_parser;
|
||||
|
||||
const MLFDeserializer& m_deserializer;
|
||||
const ChunkDescriptor& m_descriptor; // Current chunk descriptor.
|
||||
};
|
||||
|
||||
// MLF chunk when operating in sequence mode.
|
||||
class SequenceChunk : public ChunkBase
|
||||
{
|
||||
public:
|
||||
vector<vector<MLFFrameRange>> m_sequences; // Each sequence is a vector of sequential frame ranges.
|
||||
|
||||
SequenceChunk(const MLFDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states)
|
||||
: ChunkBase(parent, descriptor, fileName, states)
|
||||
{
|
||||
m_sequences.resize(m_descriptor.Sequences().size());
|
||||
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (int i = 0; i < descriptor.Sequences().size(); ++i)
|
||||
CacheSequence(descriptor.Sequences()[i], i);
|
||||
|
||||
CleanBuffer();
|
||||
}
|
||||
|
||||
void CacheSequence(const SequenceDescriptor& sequence, size_t index)
|
||||
{
|
||||
auto start = m_buffer.data() + sequence.OffsetInChunk();
|
||||
auto end = start + sequence.SizeInBytes();
|
||||
|
||||
vector<MLFFrameRange> utterance;
|
||||
auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk();
|
||||
bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset);
|
||||
if (!parsed) // cannot parse
|
||||
{
|
||||
fprintf(stderr, "WARNING: Cannot parse the utterance '%s'\n", KeyOf(sequence).c_str());
|
||||
m_valid[index] = false;
|
||||
return;
|
||||
}
|
||||
|
||||
m_sequences[index] = move(utterance);
|
||||
}
|
||||
|
||||
void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result) override
|
||||
{
|
||||
if (m_deserializer.m_elementType == DataType::Float)
|
||||
return GetSequence<float>(sequenceIndex, result);
|
||||
else
|
||||
{
|
||||
assert(m_deserializer.m_elementType == DataType::Double);
|
||||
return GetSequence<double>(sequenceIndex, result);
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElementType>
|
||||
void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result)
|
||||
{
|
||||
if (!m_valid[sequenceIndex])
|
||||
{
|
||||
SparseSequenceDataPtr s = make_shared<MLFSequenceData<ElementType>>(0, m_deserializer.m_streams.front().m_sampleLayout);
|
||||
s->m_isValid = false;
|
||||
result.push_back(s);
|
||||
return;
|
||||
}
|
||||
|
||||
const auto& utterance = m_sequences[sequenceIndex];
|
||||
const auto& sequence = m_descriptor.Sequences()[sequenceIndex];
|
||||
|
||||
// Packing labels for the utterance into sparse sequence.
|
||||
vector<size_t> sequencePhoneBoundaries(m_deserializer.m_withPhoneBoundaries ? utterance.size() : 0);
|
||||
if (m_deserializer.m_withPhoneBoundaries)
|
||||
{
|
||||
for (size_t i = 0; i < utterance.size(); ++i)
|
||||
sequencePhoneBoundaries[i] = utterance[i].FirstFrame();
|
||||
}
|
||||
|
||||
auto s = make_shared<MLFSequenceData<ElementType>>(sequence.m_numberOfSamples, sequencePhoneBoundaries, m_deserializer.m_streams.front().m_sampleLayout);
|
||||
auto* startRange = s->m_indices;
|
||||
for (const auto& range : utterance)
|
||||
{
|
||||
if (range.ClassId() >= m_deserializer.m_dimension)
|
||||
// TODO: Possibly set m_valid to false, but currently preserving the old behavior.
|
||||
RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension);
|
||||
|
||||
// Filling all range of frames with the corresponding class id.
|
||||
fill(startRange, startRange + range.NumFrames(), static_cast<IndexType>(range.ClassId()));
|
||||
startRange += range.NumFrames();
|
||||
}
|
||||
|
||||
result.push_back(s);
|
||||
}
|
||||
};
|
||||
|
||||
// MLF chunk when operating in frame mode.
|
||||
// Implementation is different because frames of the same sequence can be accessed
|
||||
// in parallel by the randomizer, so all parsing/preprocessing should be done during
|
||||
// sequence caching, so that GetSequence only works with read only data structures.
|
||||
class FrameChunk : public ChunkBase
|
||||
{
|
||||
// Actual values of frames.
|
||||
vector<ClassIdType> m_classIds;
|
||||
|
||||
//For each sequence this vector contains the sequence offset in samples from the beginning of the chunk.
|
||||
std::vector<uint32_t> m_sequenceOffsetInChunkInSamples;
|
||||
|
||||
public:
|
||||
FrameChunk(const MLFDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states)
|
||||
: ChunkBase(parent, descriptor, fileName, states)
|
||||
{
|
||||
uint32_t numSamples = static_cast<uint32_t>(m_descriptor.NumberOfSamples());
|
||||
|
||||
// The current assumption is that the number of samples in a chunk fits in uint32,
|
||||
// therefore we can save 4 bytes per sequence, storing offsets in samples as uint32.
|
||||
if (numSamples != m_descriptor.NumberOfSamples())
|
||||
RuntimeError("Exceeded maximum number of samples in a chunk");
|
||||
|
||||
// Preallocate a big array for filling in class ids for the whole chunk.
|
||||
m_classIds.resize(numSamples);
|
||||
m_sequenceOffsetInChunkInSamples.resize(m_descriptor.NumberOfSequences());
|
||||
|
||||
|
||||
uint32_t offset = 0;
|
||||
for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i)
|
||||
{
|
||||
m_sequenceOffsetInChunkInSamples[i] = offset;
|
||||
offset += descriptor[i].m_numberOfSamples;
|
||||
}
|
||||
|
||||
if (numSamples != offset)
|
||||
RuntimeError("Unexpected number of samples in a FrameChunk.");
|
||||
|
||||
// Parse the data on different threads to avoid locking during GetSequence calls.
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i)
|
||||
CacheSequence(descriptor[i], i);
|
||||
|
||||
|
||||
CleanBuffer();
|
||||
}
|
||||
|
||||
// Get utterance by the absolute frame index in chunk.
|
||||
// Uses the upper bound to do the binary search among sequences of the chunk.
|
||||
size_t GetUtteranceForChunkFrameIndex(size_t frameIndex) const
|
||||
{
|
||||
auto result = upper_bound(
|
||||
m_sequenceOffsetInChunkInSamples.begin(),
|
||||
m_sequenceOffsetInChunkInSamples.end(),
|
||||
frameIndex,
|
||||
[](size_t fi, const size_t& a) { return fi < a; });
|
||||
return result - 1 - m_sequenceOffsetInChunkInSamples.begin();
|
||||
}
|
||||
|
||||
void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result) override
|
||||
{
|
||||
size_t utteranceId = GetUtteranceForChunkFrameIndex(sequenceIndex);
|
||||
if (!m_valid[utteranceId])
|
||||
{
|
||||
SparseSequenceDataPtr s = make_shared<MLFSequenceData<float>>(0, m_deserializer.m_streams.front().m_sampleLayout);
|
||||
s->m_isValid = false;
|
||||
result.push_back(s);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t label = m_classIds[sequenceIndex];
|
||||
assert(label < m_deserializer.m_categories.size());
|
||||
result.push_back(m_deserializer.m_categories[label]);
|
||||
}
|
||||
|
||||
// Parses and caches sequence in the buffer for GetSequence fast retrieval.
|
||||
void CacheSequence(const SequenceDescriptor& sequence, size_t index)
|
||||
{
|
||||
auto start = m_buffer.data() + sequence.OffsetInChunk();
|
||||
auto end = start + sequence.SizeInBytes();
|
||||
|
||||
vector<MLFFrameRange> utterance;
|
||||
auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk();
|
||||
bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset);
|
||||
if (!parsed)
|
||||
{
|
||||
m_valid[index] = false;
|
||||
fprintf(stderr, "WARNING: Cannot parse the utterance %s\n", KeyOf(sequence).c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
auto startRange = m_classIds.begin() + m_sequenceOffsetInChunkInSamples[index];
|
||||
for (size_t i = 0; i < utterance.size(); ++i)
|
||||
{
|
||||
const auto& range = utterance[i];
|
||||
if (range.ClassId() >= m_deserializer.m_dimension)
|
||||
// TODO: Possibly set m_valid to false, but currently preserving the old behavior.
|
||||
RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension);
|
||||
|
||||
fill(startRange, startRange + range.NumFrames(), range.ClassId());
|
||||
startRange += range.NumFrames();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Initializes chunk descriptions.
|
||||
void InitializeChunkInfos(CorpusDescriptorPtr corpus, const ConfigHelper& config, const std::wstring& stateListPath);
|
||||
|
|
Загрузка…
Ссылка в новой задаче