diff --git a/Source/Readers/HTKDeserializers/MLFBinaryDeserializer.cpp b/Source/Readers/HTKDeserializers/MLFBinaryDeserializer.cpp index 8caadaf4e..b325f85e7 100644 --- a/Source/Readers/HTKDeserializers/MLFBinaryDeserializer.cpp +++ b/Source/Readers/HTKDeserializers/MLFBinaryDeserializer.cpp @@ -6,6 +6,7 @@ #include "stdafx.h" #include #include "MLFBinaryDeserializer.h" +#include "MLFDeserializer.h" #include "ConfigHelper.h" #include "SequenceData.h" #include "StringUtil.h" @@ -19,117 +20,20 @@ namespace CNTK { using namespace std; using namespace Microsoft::MSR::CNTK; -static float s_oneFloat = 1.0; -static double s_oneDouble = 1.0; - -// Sparse labels for an utterance. -template -struct MLFSequenceData : SparseSequenceData -{ - vector m_values; - vector m_indexBuffer; - const NDShape& m_frameShape; - - MLFSequenceData(size_t numberOfSamples, const NDShape& frameShape) : - m_values(numberOfSamples, 1), m_frameShape(frameShape) - { - if (numberOfSamples > numeric_limits::max()) - { - RuntimeError("Number of samples in an MLFSequenceData (%zu) " - "exceeds the maximum allowed value (%zu)\n", - numberOfSamples, (size_t)numeric_limits::max()); - } - - m_indexBuffer.resize(numberOfSamples); - m_nnzCounts.resize(numberOfSamples, static_cast(1)); - m_numberOfSamples = (uint32_t)numberOfSamples; - m_totalNnzCount = static_cast(numberOfSamples); - m_indices = &m_indexBuffer[0]; - } - - const void* GetDataBuffer() override - { - return m_values.data(); - } - - const NDShape& GetSampleShape() override - { - return m_frameShape; - } -}; - -// Base class for chunks in frame and sequence mode. -// The lifetime is always less than the lifetime of the parent deserializer. -class MLFBinaryDeserializer::ChunkBase : public Chunk -{ -protected: - vector m_buffer; // Buffer for the whole chunk - vector m_valid; // Bit mask whether the parsed sequence is valid. - MLFUtteranceParser m_parser; - - const MLFBinaryDeserializer& m_deserializer; - const ChunkDescriptor& m_descriptor; // Current chunk descriptor. - - ChunkBase(const MLFBinaryDeserializer& deserializer, const ChunkDescriptor& descriptor, const wstring& fileName, const StateTablePtr& states) - : m_parser(states), - m_descriptor(descriptor), - m_deserializer(deserializer) - { - if (descriptor.NumberOfSequences() == 0 || descriptor.SizeInBytes() == 0) - LogicError("Empty chunks are not supported."); - - auto f = FileWrapper::OpenOrDie(fileName, L"rbS"); - size_t sizeInBytes = descriptor.SizeInBytes(); - - // Make sure we always have 0 at the end for buffer overrun. - m_buffer.resize(sizeInBytes + 1); - m_buffer[sizeInBytes] = 0; - - // Seek and read chunk into memory. - f.SeekOrDie(descriptor.StartOffset(), SEEK_SET); - - f.ReadOrDie(m_buffer.data(), sizeInBytes, 1); - - // all sequences are valid by default. - m_valid.resize(m_descriptor.NumberOfSequences(), true); - } - - string KeyOf(const SequenceDescriptor& s) - { - return m_deserializer.m_corpus->IdToKey(s.m_key); - } - - void CleanBuffer() - { - // Make sure we do not keep unnecessary memory after sequences have been parsed. - vector tmp; - m_buffer.swap(tmp); - } -}; - // MLF chunk when operating in sequence mode. -class MLFBinaryDeserializer::SequenceChunk : public MLFBinaryDeserializer::ChunkBase +class MLFBinaryDeserializer::BinarySequenceChunk : public MLFDeserializer::SequenceChunk { - vector> m_sequences; // Each sequence is a vector of sequential frame ranges. - public: - SequenceChunk(const MLFBinaryDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states) - : ChunkBase(parent, descriptor, fileName, states) + BinarySequenceChunk(const MLFBinaryDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states) + : MLFDeserializer::SequenceChunk(parent, descriptor, fileName, states) { - m_sequences.resize(m_descriptor.Sequences().size()); - -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < descriptor.Sequences().size(); ++i) - CacheSequence(descriptor.Sequences()[i], i); - - CleanBuffer(); } void CacheSequence(const SequenceDescriptor& sequence, size_t index) { vector utterance; - auto start = m_buffer.data() + sequence.OffsetInChunk(); + auto start = this->m_buffer.data() + sequence.OffsetInChunk(); ushort stateCount = *(ushort*)start; utterance.resize(stateCount); @@ -148,166 +52,21 @@ public: firstFrame += stateCount; } - m_sequences[index] = move(utterance); - } - - void GetSequence(size_t sequenceIndex, vector& result) override - { - if (m_deserializer.m_elementType == DataType::Float) - return GetSequence(sequenceIndex, result); - else - { - assert(m_deserializer.m_elementType == DataType::Double); - return GetSequence(sequenceIndex, result); - } - } - - template - void GetSequence(size_t sequenceIndex, vector& result) - { - if (!m_valid[sequenceIndex]) - { - SparseSequenceDataPtr s = make_shared>(0, m_deserializer.m_streams.front().m_sampleLayout); - s->m_isValid = false; - result.push_back(s); - return; - } - - const auto& utterance = m_sequences[sequenceIndex]; - const auto& sequence = m_descriptor.Sequences()[sequenceIndex]; - - - auto s = make_shared>(sequence.m_numberOfSamples, m_deserializer.m_streams.front().m_sampleLayout); - auto* startRange = s->m_indices; - for (const auto& range : utterance) - { - if (range.ClassId() >= m_deserializer.m_dimension) - // TODO: Possibly set m_valid to false, but currently preserving the old behavior. - RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension); - - // Filling all range of frames with the corresponding class id. - fill(startRange, startRange + range.NumFrames(), static_cast(range.ClassId())); - startRange += range.NumFrames(); - } - - result.push_back(s); - } -}; - -// MLF chunk when operating in frame mode. -// Implementation is different because frames of the same sequence can be accessed -// in parallel by the randomizer, so all parsing/preprocessing should be done during -// sequence caching, so that GetSequence only works with read only data structures. -class MLFBinaryDeserializer::FrameChunk : public MLFBinaryDeserializer::ChunkBase -{ - // Actual values of frames. - vector m_classIds; - - //For each sequence this vector contains the sequence offset in samples from the beginning of the chunk. - std::vector m_sequenceOffsetInChunkInSamples; - -public: - FrameChunk(const MLFBinaryDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states) - : ChunkBase(parent, descriptor, fileName, states) - { - uint32_t numSamples = static_cast(m_descriptor.NumberOfSamples()); - - // The current assumption is that the number of samples in a chunk fits in uint32, - // therefore we can save 4 bytes per sequence, storing offsets in samples as uint32. - if (numSamples != m_descriptor.NumberOfSamples()) - RuntimeError("Exceeded maximum number of samples in a chunk"); - - // Preallocate a big array for filling in class ids for the whole chunk. - m_classIds.resize(numSamples); - m_sequenceOffsetInChunkInSamples.resize(m_descriptor.NumberOfSequences()); - - - uint32_t offset = 0; - for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i) - { - m_sequenceOffsetInChunkInSamples[i] = offset; - offset += descriptor[i].m_numberOfSamples; - } - - if (numSamples != offset) - RuntimeError("Unexpected number of samples in a FrameChunk."); - - // Parse the data on different threads to avoid locking during GetSequence calls. -#pragma omp parallel for schedule(dynamic) - for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i) - CacheSequence(descriptor[i], i); - - - CleanBuffer(); - } - - // Get utterance by the absolute frame index in chunk. - // Uses the upper bound to do the binary search among sequences of the chunk. - size_t GetUtteranceForChunkFrameIndex(size_t frameIndex) const - { - auto result = upper_bound( - m_sequenceOffsetInChunkInSamples.begin(), - m_sequenceOffsetInChunkInSamples.end(), - frameIndex, - [](size_t fi, const size_t& a) { return fi < a; }); - return result - 1 - m_sequenceOffsetInChunkInSamples.begin(); - } - - void GetSequence(size_t sequenceIndex, vector& result) override - { - size_t utteranceId = GetUtteranceForChunkFrameIndex(sequenceIndex); - if (!m_valid[utteranceId]) - { - SparseSequenceDataPtr s = make_shared>(0, m_deserializer.m_streams.front().m_sampleLayout); - s->m_isValid = false; - result.push_back(s); - return; - } - - size_t label = m_classIds[sequenceIndex]; - assert(label < m_deserializer.m_categories.size()); - result.push_back(m_deserializer.m_categories[label]); - } - - // Parses and caches sequence in the buffer for GetSequence fast retrieval. - void CacheSequence(const SequenceDescriptor& sequence, size_t index) - { - auto start = m_buffer.data() + sequence.OffsetInChunk(); - auto end = start + sequence.SizeInBytes(); - - vector utterance; - auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk(); - bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset); - if (!parsed) - { - m_valid[index] = false; - fprintf(stderr, "WARNING: Cannot parse the utterance %s\n", KeyOf(sequence).c_str()); - return; - } - - auto startRange = m_classIds.begin() + m_sequenceOffsetInChunkInSamples[index]; - for(size_t i = 0; i < utterance.size(); ++i) - { - const auto& range = utterance[i]; - if (range.ClassId() >= m_deserializer.m_dimension) - // TODO: Possibly set m_valid to false, but currently preserving the old behavior. - RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension); - - fill(startRange, startRange + range.NumFrames(), range.ClassId()); - startRange += range.NumFrames(); - } + this->m_sequences[index] = move(utterance); } }; MLFBinaryDeserializer::MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& cfg, bool primary) - : DataDeserializerBase(primary), - m_corpus(corpus) + : MLFDeserializer(corpus, primary) { if (primary) - RuntimeError("MLFBinaryDeserializer currently does not support primary mode."); + RuntimeError("MLFDeserializer currently does not support primary mode."); m_frameMode = (ConfigValue)cfg("frameMode", "true"); + if (m_frameMode) + LogicError("TODO: support frame mode in Binary MLF deserializer."); + wstring precision = cfg(L"precision", L"float"); m_elementType = AreEqualIgnoreCase(precision, L"float") ? DataType::Float : DataType::Double; @@ -328,55 +87,12 @@ MLFBinaryDeserializer::MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const C m_withPhoneBoundaries = streamConfig(L"phoneBoundaries", false); if (m_withPhoneBoundaries) - { - LogicError("TODO: enable phoneBoundaries in binary MLF format."); - } + LogicError("TODO: implement phoneBoundaries setting."); InitializeStream(inputName); InitializeChunkInfos(corpus, config); } -// TODO: Should be removed. Currently a lot of end to end tests still use this one. -MLFBinaryDeserializer::MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& labelConfig, const wstring& name) - : DataDeserializerBase(false) -{ - // The frame mode is currently specified once per configuration, - // not in the configuration of a particular deserializer, but on a higher level in the configuration. - // Because of that we are using find method below. - m_frameMode = labelConfig.Find("frameMode", "true"); - - ConfigHelper config(labelConfig); - - config.CheckLabelType(); - m_dimension = config.GetLabelDimension(); - - if (m_dimension > numeric_limits::max()) - { - RuntimeError("Label dimension (%zu) exceeds the maximum allowed " - "value (%zu)\n", m_dimension, (size_t)numeric_limits::max()); - } - - // Same behavior as for the old deserializer - keep almost all in memory, - // because there are a lot of none aligned sets. - m_chunkSizeBytes = labelConfig(L"chunkSizeInBytes", g_64MB); - - wstring precision = labelConfig(L"precision", L"float");; - m_elementType = AreEqualIgnoreCase(precision, L"float") ? DataType::Float : DataType::Double; - - m_withPhoneBoundaries = labelConfig(L"phoneBoundaries", "false"); - if (m_withPhoneBoundaries) { - LogicError("TODO: enable phoneBoundaries for binary MLF format"); - } - - InitializeStream(name); - InitializeChunkInfos(corpus, config); -} - -static inline bool LessByFirstItem(const std::tuple& a, const std::tuple& b) -{ - return std::get<0>(a) < std::get<0>(b); -} - void MLFBinaryDeserializer::InitializeChunkInfos(CorpusDescriptorPtr corpus, const ConfigHelper& config) { // Similarly to the old reader, currently we assume all Mlfs will have same root name (key) @@ -422,7 +138,7 @@ void MLFBinaryDeserializer::InitializeChunkInfos(CorpusDescriptorPtr corpus, con } } - std::sort(m_keyToChunkLocation.begin(), m_keyToChunkLocation.end(), LessByFirstItem); + std::sort(m_keyToChunkLocation.begin(), m_keyToChunkLocation.end(), MLFDeserializer::LessByFirstItem); fprintf(stderr, "MLFBinaryDeserializer: '%zu' utterances with '%zu' frames\n", totalNumSequences, @@ -432,64 +148,6 @@ void MLFBinaryDeserializer::InitializeChunkInfos(CorpusDescriptorPtr corpus, con InitializeReadOnlyArrayOfLabels(); } -void MLFBinaryDeserializer::InitializeReadOnlyArrayOfLabels() -{ - m_categories.reserve(m_dimension); - m_categoryIndices.reserve(m_dimension); - for (size_t i = 0; i < m_dimension; ++i) - { - auto category = make_shared(m_streams.front().m_sampleLayout); - m_categoryIndices.push_back(static_cast(i)); - category->m_indices = &(m_categoryIndices[i]); - category->m_nnzCounts.resize(1); - category->m_nnzCounts[0] = 1; - category->m_totalNnzCount = 1; - category->m_numberOfSamples = 1; - if (m_elementType == DataType::Float) - category->m_data = &s_oneFloat; - else - category->m_data = &s_oneDouble; - m_categories.push_back(category); - } -} - -void MLFBinaryDeserializer::InitializeStream(const wstring& name) -{ - // Initializing stream description - a single stream of MLF data. - StreamInformation stream; - stream.m_id = 0; - stream.m_name = name; - stream.m_sampleLayout = NDShape({ m_dimension }); - stream.m_storageFormat = StorageFormat::SparseCSC; - stream.m_elementType = m_elementType; - m_streams.push_back(stream); -} - -std::vector MLFBinaryDeserializer::ChunkInfos() -{ - std::vector chunks; - chunks.reserve(m_chunks.size()); - for (size_t i = 0; i < m_chunks.size(); ++i) - { - ChunkInfo cd; - cd.m_id = static_cast(i); - if (cd.m_id != i) - RuntimeError("ChunkIdType overflow during creation of a chunk description."); - - cd.m_numberOfSequences = m_frameMode ? m_chunks[i]->NumberOfSamples() : m_chunks[i]->NumberOfSequences(); - cd.m_numberOfSamples = m_chunks[i]->NumberOfSamples(); - chunks.push_back(cd); - } - return chunks; -} - -void MLFBinaryDeserializer::SequenceInfosForChunk(ChunkIdType, vector& result) -{ - UNUSED(result); - LogicError("MLF deserializer does not support primary mode, it cannot control chunking. " - "Please specify HTK deserializer as the first deserializer in your config file."); -} - ChunkPtr MLFBinaryDeserializer::GetChunk(ChunkIdType chunkId) { ChunkPtr result; @@ -498,48 +156,10 @@ ChunkPtr MLFBinaryDeserializer::GetChunk(ChunkIdType chunkId) auto chunk = m_chunks[chunkId]; auto& fileName = m_mlfFiles[m_chunkToFileIndex[chunk]]; - if (m_frameMode) - result = make_shared(*this, *chunk, fileName, m_stateTable); - else - result = make_shared(*this, *chunk, fileName, m_stateTable); + result = make_shared(*this, *chunk, fileName, m_stateTable); }); return result; }; -bool MLFBinaryDeserializer::GetSequenceInfoByKey(const SequenceKey& key, SequenceInfo& result) -{ - auto found = std::lower_bound(m_keyToChunkLocation.begin(), m_keyToChunkLocation.end(), std::make_tuple(key.m_sequence, 0, 0), - LessByFirstItem); - - if (found == m_keyToChunkLocation.end() || std::get<0>(*found) != key.m_sequence) - { - return false; - } - - auto chunkId = std::get<1>(*found); - auto sequenceIndexInChunk = std::get<2>(*found); - - - result.m_chunkId = std::get<1>(*found); - result.m_key = key; - - if (m_frameMode) - { - // in frame mode sequenceIndexInChunk == sequence offset in chunk in samples - result.m_indexInChunk = sequenceIndexInChunk + key.m_sample; - result.m_numberOfSamples = 1; - } - else - { - assert(result.m_key.m_sample == 0); - - const auto* chunk = m_chunks[chunkId]; - const auto& sequence = chunk->Sequences()[sequenceIndexInChunk]; - result.m_indexInChunk = sequenceIndexInChunk; - result.m_numberOfSamples = sequence.m_numberOfSamples; - } - return true; -} - } diff --git a/Source/Readers/HTKDeserializers/MLFBinaryDeserializer.h b/Source/Readers/HTKDeserializers/MLFBinaryDeserializer.h index 333bc3036..1a9895fa6 100644 --- a/Source/Readers/HTKDeserializers/MLFBinaryDeserializer.h +++ b/Source/Readers/HTKDeserializers/MLFBinaryDeserializer.h @@ -6,83 +6,29 @@ #pragma once #include -#include "HTKDeserializer.h" -#include "CorpusDescriptor.h" -#include "MLFUtils.h" +#include "MLFDeserializer.h" #include "Index.h" namespace CNTK { -// Class represents an MLF deserializer. -// Provides a set of chunks/sequences to the upper layers. -class MLFBinaryDeserializer : public DataDeserializerBase, boost::noncopyable -{ -public: - // Expects new configuration. - MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, bool primary); + // Class represents an MLF deserializer. + // Provides a set of chunks/sequences to the upper layers. + class MLFBinaryDeserializer : public MLFDeserializer + { + public: + // Expects new configuration. + MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, bool primary); - // TODO: Should be removed, when all readers go away, expects configuration in a legacy mode. - MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, const std::wstring& streamName); + // TODO: Should be removed, when all readers go away, expects configuration in a legacy mode. + MLFBinaryDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, const std::wstring& streamName); - // Retrieves sequence description by its key. Used for deserializers that are not in "primary"/"driving" mode. - bool GetSequenceInfoByKey(const SequenceKey& key, SequenceInfo& s) override; + // Retrieves a chunk with data. + virtual ChunkPtr GetChunk(ChunkIdType) override; - // Gets description of all chunks. - virtual std::vector ChunkInfos() override; - - // Get sequence descriptions of a particular chunk. - virtual void SequenceInfosForChunk(ChunkIdType chunkId, std::vector& s) override; - - // Retrieves a chunk with data. - virtual ChunkPtr GetChunk(ChunkIdType) override; - -private: - class ChunkBase; - class SequenceChunk; - class FrameChunk; - - // Initializes chunk descriptions. - void InitializeChunkInfos(CorpusDescriptorPtr corpus, const ConfigHelper& config); - - // Initializes a single stream this deserializer exposes. - void InitializeStream(const std::wstring& name); - - // In frame mode initializes data for all categories/labels in order to - // avoid memory copy. - void InitializeReadOnlyArrayOfLabels(); - - // Sorted vector that maps SequenceKey.m_sequence into an utterance ID (or type max() if the key is not assigned). - std::vector> m_keyToChunkLocation; - - // Type of the data this serializer provides. - DataType m_elementType; - - // Array of available categories. - // We do no allocate data for all input sequences, only returning a pointer to existing category. - std::vector m_categories; - - // A list of category indices - // (a list of numbers from 0 to N, where N = (number of categories -1)) - std::vector m_categoryIndices; - - // Flag that indicates whether a single speech frames should be exposed as a sequence. - bool m_frameMode; - - CorpusDescriptorPtr m_corpus; - - std::vector m_chunks; - std::map m_chunkToFileIndex; - - size_t m_dimension; - size_t m_chunkSizeBytes; - - // Track phone boundaries - bool m_withPhoneBoundaries; - - StateTablePtr m_stateTable; - - std::vector> m_indices; - std::vector m_mlfFiles; -}; + private: + // Initializes chunk descriptions. + void InitializeChunkInfos(CorpusDescriptorPtr corpus, const ConfigHelper& config); + class BinarySequenceChunk; + }; } diff --git a/Source/Readers/HTKDeserializers/MLFBinaryIndexBuilder.cpp b/Source/Readers/HTKDeserializers/MLFBinaryIndexBuilder.cpp index 8b2b1758d..68277db57 100644 --- a/Source/Readers/HTKDeserializers/MLFBinaryIndexBuilder.cpp +++ b/Source/Readers/HTKDeserializers/MLFBinaryIndexBuilder.cpp @@ -8,7 +8,6 @@ #include "MLFBinaryIndexBuilder.h" #include "MLFUtils.h" #include "ReaderUtil.h" -#include namespace CNTK { diff --git a/Source/Readers/HTKDeserializers/MLFDeserializer.cpp b/Source/Readers/HTKDeserializers/MLFDeserializer.cpp index c4e0b8c42..d437b54be 100644 --- a/Source/Readers/HTKDeserializers/MLFDeserializer.cpp +++ b/Source/Readers/HTKDeserializers/MLFDeserializer.cpp @@ -19,296 +19,6 @@ namespace CNTK { using namespace std; using namespace Microsoft::MSR::CNTK; -static float s_oneFloat = 1.0; -static double s_oneDouble = 1.0; - -// A constant used in 1-hot vectors to identify the first frame of a phone. -// Used only in CTC-type training. -static float s_phoneBoundary = 2.0f; - -// Sparse labels for an utterance. -template -struct MLFSequenceData : SparseSequenceData -{ - vector m_values; - vector m_indexBuffer; - const NDShape& m_frameShape; - - MLFSequenceData(size_t numberOfSamples, const NDShape& frameShape) : - m_values(numberOfSamples, 1), m_frameShape(frameShape) - { - if (numberOfSamples > numeric_limits::max()) - { - RuntimeError("Number of samples in an MLFSequenceData (%zu) " - "exceeds the maximum allowed value (%zu)\n", - numberOfSamples, (size_t)numeric_limits::max()); - } - - m_indexBuffer.resize(numberOfSamples); - m_nnzCounts.resize(numberOfSamples, static_cast(1)); - m_numberOfSamples = (uint32_t)numberOfSamples; - m_totalNnzCount = static_cast(numberOfSamples); - m_indices = &m_indexBuffer[0]; - } - - MLFSequenceData(size_t numberOfSamples, const vector& phoneBoundaries, const NDShape& frameShape) : - MLFSequenceData(numberOfSamples, frameShape) - { - for (auto boundary : phoneBoundaries) - m_values[boundary] = s_phoneBoundary; - } - - const void* GetDataBuffer() override - { - return m_values.data(); - } - - const NDShape& GetSampleShape() override - { - return m_frameShape; - } -}; - -// Base class for chunks in frame and sequence mode. -// The lifetime is always less than the lifetime of the parent deserializer. -class MLFDeserializer::ChunkBase : public Chunk -{ -protected: - vector m_buffer; // Buffer for the whole chunk - vector m_valid; // Bit mask whether the parsed sequence is valid. - MLFUtteranceParser m_parser; - - const MLFDeserializer& m_deserializer; - const ChunkDescriptor& m_descriptor; // Current chunk descriptor. - - ChunkBase(const MLFDeserializer& deserializer, const ChunkDescriptor& descriptor, const wstring& fileName, const StateTablePtr& states) - : m_parser(states), - m_descriptor(descriptor), - m_deserializer(deserializer) - { - if (descriptor.NumberOfSequences() == 0 || descriptor.SizeInBytes() == 0) - LogicError("Empty chunks are not supported."); - - auto f = FileWrapper::OpenOrDie(fileName, L"rbS"); - size_t sizeInBytes = descriptor.SizeInBytes(); - - // Make sure we always have 0 at the end for buffer overrun. - m_buffer.resize(sizeInBytes + 1); - m_buffer[sizeInBytes] = 0; - - // Seek and read chunk into memory. - f.SeekOrDie(descriptor.StartOffset(), SEEK_SET); - - f.ReadOrDie(m_buffer.data(), sizeInBytes, 1); - - // all sequences are valid by default. - m_valid.resize(m_descriptor.NumberOfSequences(), true); - } - - string KeyOf(const SequenceDescriptor& s) - { - return m_deserializer.m_corpus->IdToKey(s.m_key); - } - - void CleanBuffer() - { - // Make sure we do not keep unnecessary memory after sequences have been parsed. - vector tmp; - m_buffer.swap(tmp); - } -}; - -// MLF chunk when operating in sequence mode. -class MLFDeserializer::SequenceChunk : public MLFDeserializer::ChunkBase -{ - vector> m_sequences; // Each sequence is a vector of sequential frame ranges. - -public: - SequenceChunk(const MLFDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states) - : ChunkBase(parent, descriptor, fileName, states) - { - m_sequences.resize(m_descriptor.Sequences().size()); - -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < descriptor.Sequences().size(); ++i) - CacheSequence(descriptor.Sequences()[i], i); - - CleanBuffer(); - } - - void CacheSequence(const SequenceDescriptor& sequence, size_t index) - { - auto start = m_buffer.data() + sequence.OffsetInChunk(); - auto end = start + sequence.SizeInBytes(); - - vector utterance; - auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk(); - bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset); - if (!parsed) // cannot parse - { - fprintf(stderr, "WARNING: Cannot parse the utterance '%s'\n", KeyOf(sequence).c_str()); - m_valid[index] = false; - return; - } - - m_sequences[index] = move(utterance); - } - - void GetSequence(size_t sequenceIndex, vector& result) override - { - if (m_deserializer.m_elementType == DataType::Float) - return GetSequence(sequenceIndex, result); - else - { - assert(m_deserializer.m_elementType == DataType::Double); - return GetSequence(sequenceIndex, result); - } - } - - template - void GetSequence(size_t sequenceIndex, vector& result) - { - if (!m_valid[sequenceIndex]) - { - SparseSequenceDataPtr s = make_shared>(0, m_deserializer.m_streams.front().m_sampleLayout); - s->m_isValid = false; - result.push_back(s); - return; - } - - const auto& utterance = m_sequences[sequenceIndex]; - const auto& sequence = m_descriptor.Sequences()[sequenceIndex]; - - // Packing labels for the utterance into sparse sequence. - vector sequencePhoneBoundaries(m_deserializer.m_withPhoneBoundaries ? utterance.size() : 0); - if (m_deserializer.m_withPhoneBoundaries) - { - for (size_t i = 0; i < utterance.size(); ++i) - sequencePhoneBoundaries[i] = utterance[i].FirstFrame(); - } - - auto s = make_shared>(sequence.m_numberOfSamples, sequencePhoneBoundaries, m_deserializer.m_streams.front().m_sampleLayout); - auto* startRange = s->m_indices; - for (const auto& range : utterance) - { - if (range.ClassId() >= m_deserializer.m_dimension) - // TODO: Possibly set m_valid to false, but currently preserving the old behavior. - RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension); - - // Filling all range of frames with the corresponding class id. - fill(startRange, startRange + range.NumFrames(), static_cast(range.ClassId())); - startRange += range.NumFrames(); - } - - result.push_back(s); - } -}; - -// MLF chunk when operating in frame mode. -// Implementation is different because frames of the same sequence can be accessed -// in parallel by the randomizer, so all parsing/preprocessing should be done during -// sequence caching, so that GetSequence only works with read only data structures. -class MLFDeserializer::FrameChunk : public MLFDeserializer::ChunkBase -{ - // Actual values of frames. - vector m_classIds; - - //For each sequence this vector contains the sequence offset in samples from the beginning of the chunk. - std::vector m_sequenceOffsetInChunkInSamples; - -public: - FrameChunk(const MLFDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states) - : ChunkBase(parent, descriptor, fileName, states) - { - uint32_t numSamples = static_cast(m_descriptor.NumberOfSamples()); - - // The current assumption is that the number of samples in a chunk fits in uint32, - // therefore we can save 4 bytes per sequence, storing offsets in samples as uint32. - if (numSamples != m_descriptor.NumberOfSamples()) - RuntimeError("Exceeded maximum number of samples in a chunk"); - - // Preallocate a big array for filling in class ids for the whole chunk. - m_classIds.resize(numSamples); - m_sequenceOffsetInChunkInSamples.resize(m_descriptor.NumberOfSequences()); - - - uint32_t offset = 0; - for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i) - { - m_sequenceOffsetInChunkInSamples[i] = offset; - offset += descriptor[i].m_numberOfSamples; - } - - if (numSamples != offset) - RuntimeError("Unexpected number of samples in a FrameChunk."); - - // Parse the data on different threads to avoid locking during GetSequence calls. -#pragma omp parallel for schedule(dynamic) - for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i) - CacheSequence(descriptor[i], i); - - - CleanBuffer(); - } - - // Get utterance by the absolute frame index in chunk. - // Uses the upper bound to do the binary search among sequences of the chunk. - size_t GetUtteranceForChunkFrameIndex(size_t frameIndex) const - { - auto result = upper_bound( - m_sequenceOffsetInChunkInSamples.begin(), - m_sequenceOffsetInChunkInSamples.end(), - frameIndex, - [](size_t fi, const size_t& a) { return fi < a; }); - return result - 1 - m_sequenceOffsetInChunkInSamples.begin(); - } - - void GetSequence(size_t sequenceIndex, vector& result) override - { - size_t utteranceId = GetUtteranceForChunkFrameIndex(sequenceIndex); - if (!m_valid[utteranceId]) - { - SparseSequenceDataPtr s = make_shared>(0, m_deserializer.m_streams.front().m_sampleLayout); - s->m_isValid = false; - result.push_back(s); - return; - } - - size_t label = m_classIds[sequenceIndex]; - assert(label < m_deserializer.m_categories.size()); - result.push_back(m_deserializer.m_categories[label]); - } - - // Parses and caches sequence in the buffer for GetSequence fast retrieval. - void CacheSequence(const SequenceDescriptor& sequence, size_t index) - { - auto start = m_buffer.data() + sequence.OffsetInChunk(); - auto end = start + sequence.SizeInBytes(); - - vector utterance; - auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk(); - bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset); - if (!parsed) - { - m_valid[index] = false; - fprintf(stderr, "WARNING: Cannot parse the utterance %s\n", KeyOf(sequence).c_str()); - return; - } - - auto startRange = m_classIds.begin() + m_sequenceOffsetInChunkInSamples[index]; - for(size_t i = 0; i < utterance.size(); ++i) - { - const auto& range = utterance[i]; - if (range.ClassId() >= m_deserializer.m_dimension) - // TODO: Possibly set m_valid to false, but currently preserving the old behavior. - RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension); - - fill(startRange, startRange + range.NumFrames(), range.ClassId()); - startRange += range.NumFrames(); - } - } -}; - MLFDeserializer::MLFDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& cfg, bool primary) : DataDeserializerBase(primary), m_corpus(corpus) @@ -379,11 +89,11 @@ MLFDeserializer::MLFDeserializer(CorpusDescriptorPtr corpus, const ConfigParamet InitializeChunkInfos(corpus, config, labelMappingFile); } -static inline bool LessByFirstItem(const std::tuple& a, const std::tuple& b) +MLFDeserializer::MLFDeserializer(CorpusDescriptorPtr corpus, bool primary) + : DataDeserializerBase(primary), + m_corpus(corpus) { - return std::get<0>(a) < std::get<0>(b); } - void MLFDeserializer::InitializeChunkInfos(CorpusDescriptorPtr corpus, const ConfigHelper& config, const wstring& stateListPath) { // Similarly to the old reader, currently we assume all Mlfs will have same root name (key) @@ -512,9 +222,9 @@ ChunkPtr MLFDeserializer::GetChunk(ChunkIdType chunkId) auto& fileName = m_mlfFiles[m_chunkToFileIndex[chunk]]; if (m_frameMode) - result = make_shared(*this, *chunk, fileName, m_stateTable); + result = make_shared(*this, *chunk, fileName, m_stateTable); else - result = make_shared(*this, *chunk, fileName, m_stateTable); + result = make_shared(*this, *chunk, fileName, m_stateTable); }); return result; diff --git a/Source/Readers/HTKDeserializers/MLFDeserializer.h b/Source/Readers/HTKDeserializers/MLFDeserializer.h index 84f6ff86d..1a1a4819f 100644 --- a/Source/Readers/HTKDeserializers/MLFDeserializer.h +++ b/Source/Readers/HTKDeserializers/MLFDeserializer.h @@ -9,10 +9,61 @@ #include "HTKDeserializer.h" #include "CorpusDescriptor.h" #include "MLFUtils.h" +#include "FileWrapper.h" #include "Index.h" namespace CNTK { +static float s_oneFloat = 1.0; +static double s_oneDouble = 1.0; + +// A constant used in 1-hot vectors to identify the first frame of a phone. +// Used only in CTC-type training. +static float s_phoneBoundary = 2.0f; + +// Sparse labels for an utterance. +template +struct MLFSequenceData : SparseSequenceData +{ + vector m_values; + vector m_indexBuffer; + const NDShape& m_frameShape; + + MLFSequenceData(size_t numberOfSamples, const NDShape& frameShape) : + m_values(numberOfSamples, 1), m_frameShape(frameShape) + { + if (numberOfSamples > numeric_limits::max()) + { + RuntimeError("Number of samples in an MLFSequenceData (%zu) " + "exceeds the maximum allowed value (%zu)\n", + numberOfSamples, (size_t)numeric_limits::max()); + } + + m_indexBuffer.resize(numberOfSamples); + m_nnzCounts.resize(numberOfSamples, static_cast(1)); + m_numberOfSamples = (uint32_t)numberOfSamples; + m_totalNnzCount = static_cast(numberOfSamples); + m_indices = &m_indexBuffer[0]; + } + + MLFSequenceData(size_t numberOfSamples, const vector& phoneBoundaries, const NDShape& frameShape) : + MLFSequenceData(numberOfSamples, frameShape) + { + for (auto boundary : phoneBoundaries) + m_values[boundary] = s_phoneBoundary; + } + + const void* GetDataBuffer() override + { + return m_values.data(); + } + + const NDShape& GetSampleShape() override + { + return m_frameShape; + } +}; + // Class represents an MLF deserializer. // Provides a set of chunks/sequences to the upper layers. class MLFDeserializer : public DataDeserializerBase, boost::noncopyable @@ -24,6 +75,8 @@ public: // TODO: Should be removed, when all readers go away, expects configuration in a legacy mode. MLFDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, const std::wstring& streamName); + MLFDeserializer(CorpusDescriptorPtr corpus, bool primary); + // Retrieves sequence description by its key. Used for deserializers that are not in "primary"/"driving" mode. bool GetSequenceInfoByKey(const SequenceKey& key, SequenceInfo& s) override; @@ -36,10 +89,250 @@ public: // Retrieves a chunk with data. virtual ChunkPtr GetChunk(ChunkIdType) override; -private: - class ChunkBase; - class SequenceChunk; - class FrameChunk; + static inline bool LessByFirstItem(const std::tuple& a, const std::tuple& b) + { + return std::get<0>(a) < std::get<0>(b); + } + + // Base class for chunks in frame and sequence mode. + // The lifetime is always less than the lifetime of the parent deserializer. + class ChunkBase : public Chunk + { + public: + ChunkBase(const MLFDeserializer& deserializer, const ChunkDescriptor& descriptor, const wstring& fileName, const StateTablePtr& states) + : m_parser(states), + m_descriptor(descriptor), + m_deserializer(deserializer) + { + if (descriptor.NumberOfSequences() == 0 || descriptor.SizeInBytes() == 0) + LogicError("Empty chunks are not supported."); + + auto f = FileWrapper::OpenOrDie(fileName, L"rbS"); + size_t sizeInBytes = descriptor.SizeInBytes(); + + // Make sure we always have 0 at the end for buffer overrun. + m_buffer.resize(sizeInBytes + 1); + m_buffer[sizeInBytes] = 0; + + // Seek and read chunk into memory. + f.SeekOrDie(descriptor.StartOffset(), SEEK_SET); + + f.ReadOrDie(m_buffer.data(), sizeInBytes, 1); + + // all sequences are valid by default. + m_valid.resize(m_descriptor.NumberOfSequences(), true); + } + + string KeyOf(const SequenceDescriptor& s) + { + return m_deserializer.m_corpus->IdToKey(s.m_key); + } + + void CleanBuffer() + { + // Make sure we do not keep unnecessary memory after sequences have been parsed. + vector tmp; + m_buffer.swap(tmp); + } + + vector m_buffer; // Buffer for the whole chunk + vector m_valid; // Bit mask whether the parsed sequence is valid. + MLFUtteranceParser m_parser; + + const MLFDeserializer& m_deserializer; + const ChunkDescriptor& m_descriptor; // Current chunk descriptor. + }; + + // MLF chunk when operating in sequence mode. + class SequenceChunk : public ChunkBase + { + public: + vector> m_sequences; // Each sequence is a vector of sequential frame ranges. + + SequenceChunk(const MLFDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states) + : ChunkBase(parent, descriptor, fileName, states) + { + m_sequences.resize(m_descriptor.Sequences().size()); + +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < descriptor.Sequences().size(); ++i) + CacheSequence(descriptor.Sequences()[i], i); + + CleanBuffer(); + } + + void CacheSequence(const SequenceDescriptor& sequence, size_t index) + { + auto start = m_buffer.data() + sequence.OffsetInChunk(); + auto end = start + sequence.SizeInBytes(); + + vector utterance; + auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk(); + bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset); + if (!parsed) // cannot parse + { + fprintf(stderr, "WARNING: Cannot parse the utterance '%s'\n", KeyOf(sequence).c_str()); + m_valid[index] = false; + return; + } + + m_sequences[index] = move(utterance); + } + + void GetSequence(size_t sequenceIndex, vector& result) override + { + if (m_deserializer.m_elementType == DataType::Float) + return GetSequence(sequenceIndex, result); + else + { + assert(m_deserializer.m_elementType == DataType::Double); + return GetSequence(sequenceIndex, result); + } + } + + template + void GetSequence(size_t sequenceIndex, vector& result) + { + if (!m_valid[sequenceIndex]) + { + SparseSequenceDataPtr s = make_shared>(0, m_deserializer.m_streams.front().m_sampleLayout); + s->m_isValid = false; + result.push_back(s); + return; + } + + const auto& utterance = m_sequences[sequenceIndex]; + const auto& sequence = m_descriptor.Sequences()[sequenceIndex]; + + // Packing labels for the utterance into sparse sequence. + vector sequencePhoneBoundaries(m_deserializer.m_withPhoneBoundaries ? utterance.size() : 0); + if (m_deserializer.m_withPhoneBoundaries) + { + for (size_t i = 0; i < utterance.size(); ++i) + sequencePhoneBoundaries[i] = utterance[i].FirstFrame(); + } + + auto s = make_shared>(sequence.m_numberOfSamples, sequencePhoneBoundaries, m_deserializer.m_streams.front().m_sampleLayout); + auto* startRange = s->m_indices; + for (const auto& range : utterance) + { + if (range.ClassId() >= m_deserializer.m_dimension) + // TODO: Possibly set m_valid to false, but currently preserving the old behavior. + RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension); + + // Filling all range of frames with the corresponding class id. + fill(startRange, startRange + range.NumFrames(), static_cast(range.ClassId())); + startRange += range.NumFrames(); + } + + result.push_back(s); + } + }; + + // MLF chunk when operating in frame mode. + // Implementation is different because frames of the same sequence can be accessed + // in parallel by the randomizer, so all parsing/preprocessing should be done during + // sequence caching, so that GetSequence only works with read only data structures. + class FrameChunk : public ChunkBase + { + // Actual values of frames. + vector m_classIds; + + //For each sequence this vector contains the sequence offset in samples from the beginning of the chunk. + std::vector m_sequenceOffsetInChunkInSamples; + + public: + FrameChunk(const MLFDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, StateTablePtr states) + : ChunkBase(parent, descriptor, fileName, states) + { + uint32_t numSamples = static_cast(m_descriptor.NumberOfSamples()); + + // The current assumption is that the number of samples in a chunk fits in uint32, + // therefore we can save 4 bytes per sequence, storing offsets in samples as uint32. + if (numSamples != m_descriptor.NumberOfSamples()) + RuntimeError("Exceeded maximum number of samples in a chunk"); + + // Preallocate a big array for filling in class ids for the whole chunk. + m_classIds.resize(numSamples); + m_sequenceOffsetInChunkInSamples.resize(m_descriptor.NumberOfSequences()); + + + uint32_t offset = 0; + for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i) + { + m_sequenceOffsetInChunkInSamples[i] = offset; + offset += descriptor[i].m_numberOfSamples; + } + + if (numSamples != offset) + RuntimeError("Unexpected number of samples in a FrameChunk."); + + // Parse the data on different threads to avoid locking during GetSequence calls. +#pragma omp parallel for schedule(dynamic) + for (auto i = 0; i < m_descriptor.NumberOfSequences(); ++i) + CacheSequence(descriptor[i], i); + + + CleanBuffer(); + } + + // Get utterance by the absolute frame index in chunk. + // Uses the upper bound to do the binary search among sequences of the chunk. + size_t GetUtteranceForChunkFrameIndex(size_t frameIndex) const + { + auto result = upper_bound( + m_sequenceOffsetInChunkInSamples.begin(), + m_sequenceOffsetInChunkInSamples.end(), + frameIndex, + [](size_t fi, const size_t& a) { return fi < a; }); + return result - 1 - m_sequenceOffsetInChunkInSamples.begin(); + } + + void GetSequence(size_t sequenceIndex, vector& result) override + { + size_t utteranceId = GetUtteranceForChunkFrameIndex(sequenceIndex); + if (!m_valid[utteranceId]) + { + SparseSequenceDataPtr s = make_shared>(0, m_deserializer.m_streams.front().m_sampleLayout); + s->m_isValid = false; + result.push_back(s); + return; + } + + size_t label = m_classIds[sequenceIndex]; + assert(label < m_deserializer.m_categories.size()); + result.push_back(m_deserializer.m_categories[label]); + } + + // Parses and caches sequence in the buffer for GetSequence fast retrieval. + void CacheSequence(const SequenceDescriptor& sequence, size_t index) + { + auto start = m_buffer.data() + sequence.OffsetInChunk(); + auto end = start + sequence.SizeInBytes(); + + vector utterance; + auto absoluteOffset = m_descriptor.StartOffset() + sequence.OffsetInChunk(); + bool parsed = m_parser.Parse(boost::make_iterator_range(start, end), utterance, absoluteOffset); + if (!parsed) + { + m_valid[index] = false; + fprintf(stderr, "WARNING: Cannot parse the utterance %s\n", KeyOf(sequence).c_str()); + return; + } + + auto startRange = m_classIds.begin() + m_sequenceOffsetInChunkInSamples[index]; + for (size_t i = 0; i < utterance.size(); ++i) + { + const auto& range = utterance[i]; + if (range.ClassId() >= m_deserializer.m_dimension) + // TODO: Possibly set m_valid to false, but currently preserving the old behavior. + RuntimeError("Class id '%ud' exceeds the model output dimension '%d'.", range.ClassId(), (int)m_deserializer.m_dimension); + + fill(startRange, startRange + range.NumFrames(), range.ClassId()); + startRange += range.NumFrames(); + } + } + }; // Initializes chunk descriptions. void InitializeChunkInfos(CorpusDescriptorPtr corpus, const ConfigHelper& config, const std::wstring& stateListPath);