Integrate mahilleb/MlfDataDeserializerWeakPtr into master

This commit is contained in:
Project Philly 2016-04-04 05:39:55 -07:00
Родитель 9873f48392 35528cc720
Коммит 0cdec2c2c9
5 изменённых файлов: 25 добавлений и 24 удалений

Просмотреть файл

@ -125,11 +125,6 @@ void HTKDataDeserializer::InitializeChunkDescriptions(ConfigHelper& config)
currentChunk.Add(move(utterances[i])); currentChunk.Add(move(utterances[i]));
} }
// Creating a table of weak pointers to chunks,
// so that if randomizer asks the same chunk twice
// we do not need to recreated the chunk if we already uploaded in memory.
m_weakChunks.resize(m_chunks.size());
fprintf(stderr, fprintf(stderr,
"HTKDataDeserializer::HTKDataDeserializer: %d utterances grouped into %d chunks, av. chunk size: %.1f utterances, %.1f frames\n", "HTKDataDeserializer::HTKDataDeserializer: %d utterances grouped into %d chunks, av. chunk size: %.1f utterances, %.1f frames\n",
(int)utterances.size(), (int)utterances.size(),
@ -289,14 +284,7 @@ private:
// Gets a data chunk with the specified chunk id. // Gets a data chunk with the specified chunk id.
ChunkPtr HTKDataDeserializer::GetChunk(size_t chunkId) ChunkPtr HTKDataDeserializer::GetChunk(size_t chunkId)
{ {
if (!m_weakChunks[chunkId].expired()) return make_shared<HTKChunk>(this, chunkId);
{
return m_weakChunks[chunkId].lock();
}
auto chunk = make_shared<HTKChunk>(this, chunkId);
m_weakChunks[chunkId] = chunk;
return chunk;
}; };
// A matrix that stores all samples of a sequence without padding (differently from ssematrix). // A matrix that stores all samples of a sequence without padding (differently from ssematrix).

Просмотреть файл

@ -51,11 +51,6 @@ private:
// Chunk descriptions. // Chunk descriptions.
std::vector<HTKChunkDescription> m_chunks; std::vector<HTKChunkDescription> m_chunks;
// Weak pointers on existing chunks.
// If randomizer asks the same chunk twice we do not need to recreate
// the chunk if we already uploaded it in memory.
std::vector<std::weak_ptr<Chunk>> m_weakChunks;
// Augmentation window. // Augmentation window.
std::pair<size_t, size_t> m_augmentationWindow; std::pair<size_t, size_t> m_augmentationWindow;

Просмотреть файл

@ -222,8 +222,8 @@ ChunkPtr MLFDataDeserializer::GetChunk(size_t chunkId)
{ {
UNUSED(chunkId); UNUSED(chunkId);
assert(chunkId == 0); assert(chunkId == 0);
return std::make_shared<MLFChunk>(this); return make_shared<MLFChunk>(this);
} };
// Sparse labels for an utterance. // Sparse labels for an utterance.
template <class ElemType> template <class ElemType>

Просмотреть файл

@ -5,6 +5,7 @@
#define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS
#include "Bundler.h" #include "Bundler.h"
#include <set>
namespace Microsoft { namespace MSR { namespace CNTK { namespace Microsoft { namespace MSR { namespace CNTK {
@ -150,16 +151,19 @@ void Bundler::GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescripti
std::swap(sequences, result); std::swap(sequences, result);
} }
// Represents a chunk that has pointers to the underlying deserialzer chunks. // Represents a chunk that has pointers to the underlying deserializer chunks.
class Bundler::BundlingChunk : public Chunk class Bundler::BundlingChunk : public Chunk
{ {
size_t m_numberOfInputs; size_t m_numberOfInputs;
Bundler* m_parent; Bundler* m_parent;
size_t m_chunkId; size_t m_chunkId;
// A mapping between exposed sequence id and inner chunk for each deserialzier. // A mapping between exposed sequence id and inner chunk for each deserializer.
// Index i of the vector maps to the chunk of inner sequence (i / m_numberOfInputs) of
// deserializer (i % m_numberOfInputs).
std::vector<ChunkPtr> m_innerChunks; std::vector<ChunkPtr> m_innerChunks;
// A mapping between exposed sequence id and inner sequence id for each deserializer. // A mapping between exposed sequence id and inner sequence id for each deserializer.
// Indices as above.
std::vector<size_t> m_sequenceToSequence; std::vector<size_t> m_sequenceToSequence;
DISABLE_COPY_AND_MOVE(BundlingChunk); DISABLE_COPY_AND_MOVE(BundlingChunk);
@ -197,6 +201,8 @@ public:
SequenceDescription s; SequenceDescription s;
for (size_t deserializerIndex = 1; deserializerIndex < m_parent->m_deserializers.size(); ++deserializerIndex) for (size_t deserializerIndex = 1; deserializerIndex < m_parent->m_deserializers.size(); ++deserializerIndex)
{ {
std::map<size_t, ChunkPtr> secondaryChunks;
for (size_t sequenceIndex = 0; sequenceIndex < sequences.size(); ++sequenceIndex) for (size_t sequenceIndex = 0; sequenceIndex < sequences.size(); ++sequenceIndex)
{ {
if (chunk->m_invalid.find(sequenceIndex) != chunk->m_invalid.end()) if (chunk->m_invalid.find(sequenceIndex) != chunk->m_invalid.end())
@ -207,7 +213,20 @@ public:
size_t currentIndex = sequenceIndex * m_numberOfInputs + deserializerIndex; size_t currentIndex = sequenceIndex * m_numberOfInputs + deserializerIndex;
deserializers[deserializerIndex]->GetSequenceDescriptionByKey(sequences[sequenceIndex].m_key, s); deserializers[deserializerIndex]->GetSequenceDescriptionByKey(sequences[sequenceIndex].m_key, s);
m_sequenceToSequence[currentIndex] = s.m_id; m_sequenceToSequence[currentIndex] = s.m_id;
m_innerChunks[currentIndex] = deserializers[deserializerIndex]->GetChunk(s.m_chunkId);
ChunkPtr secondaryChunk;
auto it = secondaryChunks.find(s.m_chunkId);
if (it == secondaryChunks.end())
{
secondaryChunk = deserializers[deserializerIndex]->GetChunk(s.m_chunkId);
secondaryChunks.insert(make_pair(s.m_chunkId, secondaryChunk));
}
else
{
secondaryChunk = it->second;
}
m_innerChunks[currentIndex] = secondaryChunk;
} }
} }
} }

Просмотреть файл

@ -8,7 +8,6 @@
#include "DataDeserializer.h" #include "DataDeserializer.h"
#include "DataDeserializerBase.h" #include "DataDeserializerBase.h"
#include "Config.h" #include "Config.h"
#include <set>
namespace Microsoft { namespace MSR { namespace CNTK { namespace Microsoft { namespace MSR { namespace CNTK {