Integrate muditj/FixNoRandomizerBug3 into master

This commit is contained in:
Project Philly 2018-03-21 21:19:55 +00:00 коммит произвёл CNTK Team
Родитель aab2567d17 c8c02bfc11
Коммит 82142f0aeb
3 изменённых файлов: 2712 добавлений и 653 удалений

Просмотреть файл

@ -45,7 +45,7 @@ void LTNoRandomizer::RefillSequenceWindow(SequenceWindow& window)
std::swap(window.m_sequences[workerSequencePosition++], window.m_sequences[i]);
}
window.m_sequences.erase(window.m_sequences.begin() + workerSequencePosition);
window.m_sequences.erase(window.m_sequences.begin() + workerSequencePosition, window.m_sequences.end());
}
// If last chunk, add the sweep marker.

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -8,6 +8,7 @@
#include <random>
#include <set>
#include "NoRandomizer.h"
#include "LTNoRandomizer.h"
#include "DataDeserializer.h"
#include "BlockRandomizer.h"
#include "CorpusDescriptor.h"
@ -38,6 +39,7 @@ BOOST_AUTO_TEST_SUITE(ReaderLibTests)
class MockChunk : public Chunk
{
private:
ChunkIdType m_chunkId;
size_t m_chunkBegin;
size_t m_chunkEnd;
NDShape m_sampleShape;
@ -45,12 +47,13 @@ private:
vector<vector<float>>& m_sequenceData;
public:
MockChunk(size_t chunkBegin, size_t chunkEnd, vector<vector<float>>& sequenceData, uint32_t sequenceLength)
: m_chunkBegin(chunkBegin),
m_chunkEnd(chunkEnd),
m_sampleShape(NDShape({ 1 })),
m_sequenceLength(sequenceLength),
m_sequenceData(sequenceData)
MockChunk(ChunkIdType chunkdId, size_t chunkBegin, size_t chunkEnd, vector<vector<float>>& sequenceData, uint32_t sequenceLength)
: m_chunkId(chunkdId),
m_chunkBegin(chunkBegin),
m_chunkEnd(chunkEnd),
m_sampleShape(NDShape({ 1 })),
m_sequenceLength(sequenceLength),
m_sequenceData(sequenceData)
{
assert(chunkBegin <= chunkEnd);
assert(chunkEnd <= sequenceData.size());
@ -65,9 +68,20 @@ public:
data->m_data = &m_sequenceData[sequenceId][0];
data->m_numberOfSamples = m_sequenceLength;
data->m_sampleShape = m_sampleShape;
data->m_key.m_sequence = sequenceId;
result.push_back(data);
}
// Appends one SequenceInfo to sequenceToFill for every global sequence index
// in [m_chunkBegin, m_chunkEnd). Existing entries of sequenceToFill are kept.
virtual void SequenceInfos(std::vector<SequenceInfo>& sequenceToFill) override
{
    // Each mock sequence is described with a sample count of one.
    const unsigned int samplesPerSequence = 1;

    size_t globalIndex = m_chunkBegin;
    while (globalIndex < m_chunkEnd)
    {
        // The sequence key is built from the global index, with 0 as its second argument.
        sequenceToFill.push_back(
            SequenceInfo{ globalIndex, samplesPerSequence, m_chunkId, SequenceKey(globalIndex, 0) });
        ++globalIndex;
    }
}
// Nothing to release explicitly; the compiler-generated destructor is sufficient.
~MockChunk() override = default;
};
@ -138,7 +152,7 @@ public:
assert(chunkId < m_numChunks);
size_t chunkBegin = chunkId * m_numSequencesPerChunk;
size_t chunkEnd = chunkBegin + m_numSequencesPerChunk;
shared_ptr<Chunk> chunk = make_shared<MockChunk>(chunkBegin, chunkEnd, m_sequenceData, m_sequenceLength);
shared_ptr<Chunk> chunk = make_shared<MockChunk>(chunkId, chunkBegin, chunkEnd, m_sequenceData, m_sequenceLength);
return chunk;
}
@ -974,6 +988,44 @@ BOOST_AUTO_TEST_CASE(CheckGetCurrentCursorForRandomizers)
test(noRandomizer, epochSize);
}
// Check that each worker reads unique sequences. A bug was causing duplicate sequences in workers.
// For every worker rank a fresh deserializer/randomizer pair is created, one epoch is started,
// and the first batch of sequences is compared against the expected interleaved pattern.
BOOST_AUTO_TEST_CASE(LTNoRandomizerCheckNoDuplicateSequence)
{
    auto num_chunks = 2;
    auto num_sequences = 10;
    size_t num_workers = 2;

    // Sequence payloads are 0, 1, 2, ... so a value identifies its sequence.
    vector<float> input(num_sequences * num_chunks);
    iota(input.begin(), input.end(), 0.0f);

    // The rank is unsigned to match num_workers and avoid a signed/unsigned comparison.
    for (size_t i = 0; i < num_workers; ++i)
    {
        auto mockDeserializer = make_shared<MockDeserializer>(num_chunks, num_sequences, input);
        auto randomizer = make_shared<LTNoRandomizer>(mockDeserializer, false);

        EpochConfiguration config;
        config.m_allowMinibatchesToCrossSweepBoundaries = true;
        config.m_numberOfWorkers = num_workers;
        config.m_minibatchSizeInSamples = 10;
        config.m_truncationSize = 1;
        config.m_totalEpochSizeInSweeps = 1;
        config.m_epochIndex = 0;
        config.m_workerRank = i;
        randomizer->StartEpoch(config);

        Sequences sequences = randomizer->GetNextSequences(1, 10);

        // REQUIRE (not CHECK): m_data[0] is indexed below, so the test must stop
        // here if no stream was returned instead of reading out of bounds.
        BOOST_REQUIRE_EQUAL(sequences.m_data.size(), 1u);

        for (size_t j = 0; j < sequences.m_data[0].size(); j++)
        {
            shared_ptr<MockDenseSequenceData> sequence = std::dynamic_pointer_cast<MockDenseSequenceData>(sequences.m_data[0][j]);

            // A failed cast yields a null pointer; abort the test rather than dereference it.
            BOOST_REQUIRE(sequence != nullptr);

            // Worker 0 should have data 0,2,4 ..,10,12 .. 18
            // Worker 1 should have data 1,3,5 ..,11,13 .. 19
            // Sequence keys should follow the same pattern.
            BOOST_CHECK_EQUAL(*(static_cast<float*>(sequence->m_data)), (float)i + j * 2);
            BOOST_CHECK_EQUAL(sequence->m_key.m_sequence, i + j * 2);
        }
    }
}
BOOST_AUTO_TEST_CASE(DefaultCorpusDescriptor)
{
const int seed = 13;