Introducing chunks
This commit is contained in:
Родитель
55da016d5a
Коммит
1edf828f54
|
@ -11,6 +11,7 @@
|
|||
#ifndef UNREFERENCED_PARAMETER
|
||||
#define UNREFERENCED_PARAMETER(P) (P)
|
||||
#endif
|
||||
#include "ImageSequence.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
|
@ -79,6 +80,68 @@ ImageDataDeserializer::ImageDataDeserializer(const ConfigParameters& config)
|
|||
CreateSequenceDescriptions(configHelper.GetMapPath(), labelDimension);
|
||||
}
|
||||
|
||||
class ImageDataDeserializer::ImageChunk : public Chunk, public std::enable_shared_from_this<ImageChunk>
|
||||
{
|
||||
ImageSequenceDescription m_description;
|
||||
ImageDataDeserializer& m_parent;
|
||||
|
||||
public:
|
||||
ImageChunk(ImageSequenceDescription& description, ImageDataDeserializer& parent)
|
||||
: m_description(description), m_parent(parent)
|
||||
{
|
||||
}
|
||||
|
||||
virtual std::vector<SequenceDataPtr> GetSequence(const size_t& sequenceId) override
|
||||
{
|
||||
assert(sequenceId == m_description.m_id);
|
||||
UNREFERENCED_PARAMETER(sequenceId);
|
||||
const auto& imageSequence = m_description;
|
||||
|
||||
auto image = std::make_shared<ImageSequenceData>();
|
||||
image->m_image = std::move(cv::imread(imageSequence.m_path, cv::IMREAD_COLOR));
|
||||
auto& cvImage = image->m_image;
|
||||
|
||||
if (!cvImage.data)
|
||||
{
|
||||
RuntimeError("Cannot open file '%s'", imageSequence.m_path.c_str());
|
||||
}
|
||||
|
||||
// Convert element type.
|
||||
int dataType = m_parent.m_featureElementType == ElementType::tfloat ? CV_32F : CV_64F;
|
||||
if (cvImage.type() != CV_MAKETYPE(dataType, cvImage.channels()))
|
||||
{
|
||||
cvImage.convertTo(cvImage, dataType);
|
||||
}
|
||||
|
||||
if (!cvImage.isContinuous())
|
||||
{
|
||||
cvImage = cvImage.clone();
|
||||
}
|
||||
assert(cvImage.isContinuous());
|
||||
|
||||
image->m_data = image->m_image.data;
|
||||
ImageDimensions dimensions(cvImage.cols, cvImage.rows, cvImage.channels());
|
||||
image->m_sampleLayout = std::make_shared<TensorShape>(dimensions.AsTensorShape(HWC));
|
||||
image->m_numberOfSamples = 1;
|
||||
image->m_chunk = shared_from_this();
|
||||
|
||||
SparseSequenceDataPtr label = std::make_shared<SparseSequenceData>();
|
||||
label->m_chunk = shared_from_this();
|
||||
m_parent.m_labelGenerator->CreateLabelFor(imageSequence.m_classId, *label);
|
||||
return std::vector<SequenceDataPtr> { image, label };
|
||||
}
|
||||
|
||||
~ImageChunk()
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
ChunkPtr ImageDataDeserializer::GetChunk(size_t chunkId)
|
||||
{
|
||||
auto sequenceDescription = m_imageSequences[chunkId];
|
||||
return std::make_shared<ImageChunk>(sequenceDescription, *this);
|
||||
}
|
||||
|
||||
void ImageDataDeserializer::CreateSequenceDescriptions(std::string mapPath, size_t labelDimension)
|
||||
{
|
||||
UNREFERENCED_PARAMETER(labelDimension);
|
||||
|
@ -127,72 +190,6 @@ std::vector<StreamDescriptionPtr> ImageDataDeserializer::GetStreamDescriptions()
|
|||
return m_streams;
|
||||
}
|
||||
|
||||
std::vector<std::vector<SequenceDataPtr>> ImageDataDeserializer::GetSequencesById(const std::vector<size_t>& ids)
|
||||
{
|
||||
if (ids.empty())
|
||||
{
|
||||
RuntimeError("Number of requested sequences cannot be zero.");
|
||||
}
|
||||
|
||||
m_currentImages.resize(ids.size());
|
||||
m_labels.resize(ids.size());
|
||||
|
||||
std::vector<std::vector<SequenceDataPtr>> result;
|
||||
result.resize(ids.size());
|
||||
|
||||
#pragma omp parallel for ordered schedule(dynamic)
|
||||
for (int i = 0; i < ids.size(); ++i)
|
||||
{
|
||||
if (ids[i] >= m_imageSequences.size())
|
||||
{
|
||||
RuntimeError("Invalid sequence id is provided '%d', expected range [0..%d].",
|
||||
static_cast<int>(ids[i]),
|
||||
static_cast<int>(m_imageSequences.size()) - 1);
|
||||
}
|
||||
|
||||
const auto& imageSequence = m_imageSequences[ids[i]];
|
||||
|
||||
// Construct image
|
||||
m_currentImages[i] = std::move(cv::imread(imageSequence.m_path, cv::IMREAD_COLOR));
|
||||
cv::Mat& cvImage = m_currentImages[i];
|
||||
|
||||
if (!cvImage.data)
|
||||
{
|
||||
RuntimeError("Cannot open file '%s'", imageSequence.m_path.c_str());
|
||||
}
|
||||
|
||||
// Convert element type.
|
||||
// TODO We should all native CV element types to be able to match the behavior of the old reader.
|
||||
int dataType = m_featureElementType == ElementType::tfloat ? CV_32F : CV_64F;
|
||||
if (cvImage.type() != CV_MAKETYPE(dataType, cvImage.channels()))
|
||||
{
|
||||
cvImage.convertTo(cvImage, dataType);
|
||||
}
|
||||
|
||||
if (!cvImage.isContinuous())
|
||||
{
|
||||
cvImage = cvImage.clone();
|
||||
}
|
||||
assert(cvImage.isContinuous());
|
||||
|
||||
ImageDimensions dimensions(cvImage.cols, cvImage.rows, cvImage.channels());
|
||||
auto image = std::make_shared<DenseSequenceData>();
|
||||
image->m_data = cvImage.data;
|
||||
image->m_sampleLayout = std::make_shared<TensorShape>(dimensions.AsTensorShape(HWC));
|
||||
image->m_numberOfSamples = 1;
|
||||
|
||||
if (m_labels[i] == nullptr)
|
||||
{
|
||||
m_labels[i] = std::make_shared<SparseSequenceData>();
|
||||
}
|
||||
|
||||
m_labelGenerator->CreateLabelFor(imageSequence.m_classId, *m_labels[i]);
|
||||
result[i] = std::move(std::vector<SequenceDataPtr>{image, m_labels[i]});
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void ImageDataDeserializer::FillSequenceDescriptions(SequenceDescriptions& timeline) const
|
||||
{
|
||||
timeline.resize(m_imageSequences.size());
|
||||
|
|
|
@ -24,7 +24,7 @@ public:
|
|||
std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override;
|
||||
|
||||
// Get sequences by specified ids. Order of returned sequences corresponds to the order of provided ids.
|
||||
std::vector<std::vector<SequenceDataPtr>> GetSequencesById(const std::vector<size_t>& ids) override;
|
||||
virtual ChunkPtr GetChunk(size_t chunkId) override;
|
||||
|
||||
protected:
|
||||
void FillSequenceDescriptions(SequenceDescriptions& timeline) const override;
|
||||
|
@ -40,6 +40,8 @@ private:
|
|||
size_t m_classId;
|
||||
};
|
||||
|
||||
class ImageChunk;
|
||||
|
||||
// A helper class for generation of type specific labels (currently float/double only).
|
||||
class LabelGenerator;
|
||||
typedef std::shared_ptr<LabelGenerator> LabelGeneratorPtr;
|
||||
|
@ -48,12 +50,6 @@ private:
|
|||
// Sequence descriptions for all input data.
|
||||
std::vector<ImageSequenceDescription> m_imageSequences;
|
||||
|
||||
// Buffer to store label data.
|
||||
std::vector<SparseSequenceDataPtr> m_labels;
|
||||
|
||||
// Buffer to store feature data.
|
||||
std::vector<cv::Mat> m_currentImages;
|
||||
|
||||
// Element type of the feature/label stream (currently float/double only).
|
||||
ElementType m_featureElementType;
|
||||
};
|
||||
|
|
|
@ -113,6 +113,7 @@
|
|||
<ClInclude Include="ImageConfigHelper.h" />
|
||||
<ClInclude Include="ImageDataDeserializer.h" />
|
||||
<ClInclude Include="ImageReader.h" />
|
||||
<ClInclude Include="ImageSequence.h" />
|
||||
<ClInclude Include="ImageTransformers.h" />
|
||||
<ClInclude Include="stdafx.h" />
|
||||
<ClInclude Include="targetver.h" />
|
||||
|
@ -150,4 +151,4 @@
|
|||
<Target Name="CheckDependencies">
|
||||
<Warning Condition="!$(HasOpenCV)" Text="ImageReader requires the OpenCV library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#opencv for installation instructions." />
|
||||
</Target>
|
||||
</Project>
|
||||
</Project>
|
|
@ -43,6 +43,7 @@
|
|||
<ClInclude Include="ImageDataDeserializer.h" />
|
||||
<ClInclude Include="ImageReader.h" />
|
||||
<ClInclude Include="ImageConfigHelper.h" />
|
||||
<ClInclude Include="ImageSequence.h" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Filter Include="Common">
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
#pragma once
|
||||
|
||||
#include <opencv2/core/mat.hpp>
|
||||
#include "DataDeserializer.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// Used to keep track of the image. Used in the implementations of open cv transformers
|
||||
// and deserializer. Accessed only using DenseSequenceData interface.
|
||||
struct ImageSequenceData : DenseSequenceData
|
||||
{
|
||||
cv::Mat m_image;
|
||||
|
||||
// In case we do not copy data - we preserve original sequence.
|
||||
SequenceDataPtr m_original;
|
||||
};
|
||||
}}}
|
|
@ -14,6 +14,7 @@
|
|||
#include "ImageConfigHelper.h"
|
||||
#include "StringUtil.h"
|
||||
#include "ElementTypeUtils.h"
|
||||
#include "ImageSequence.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK
|
||||
{
|
||||
|
@ -34,10 +35,11 @@ void ImageTransformerBase::Initialize(TransformerPtr next,
|
|||
}
|
||||
|
||||
SequenceDataPtr
|
||||
ImageTransformerBase::Apply(const DenseSequenceData &inputSequence,
|
||||
const StreamDescription &inputStream, cv::Mat &buffer,
|
||||
ImageTransformerBase::Apply(SequenceDataPtr sequence,
|
||||
const StreamDescription &inputStream,
|
||||
const StreamDescription & /*outputStream*/)
|
||||
{
|
||||
auto inputSequence = reinterpret_cast<const DenseSequenceData&>(*sequence.get());
|
||||
ImageDimensions dimensions(*inputSequence.m_sampleLayout, HWC);
|
||||
int columns = static_cast<int>(dimensions.m_width);
|
||||
int rows = static_cast<int>(dimensions.m_height);
|
||||
|
@ -57,20 +59,25 @@ ImageTransformerBase::Apply(const DenseSequenceData &inputSequence,
|
|||
RuntimeError("Unsupported type");
|
||||
}
|
||||
|
||||
auto result = std::make_shared<ImageSequenceData>();
|
||||
int type = CV_MAKETYPE(typeId, channels);
|
||||
buffer = cv::Mat(rows, columns, type, inputSequence.m_data);
|
||||
cv::Mat buffer = cv::Mat(rows, columns, type, inputSequence.m_data);
|
||||
this->Apply(buffer);
|
||||
if (!buffer.isContinuous())
|
||||
{
|
||||
buffer = buffer.clone();
|
||||
}
|
||||
else
|
||||
{
|
||||
result->m_original = sequence;
|
||||
}
|
||||
assert(buffer.isContinuous());
|
||||
result->m_image = buffer;
|
||||
result->m_data = buffer.ptr();
|
||||
result->m_numberOfSamples = inputSequence.m_numberOfSamples;
|
||||
|
||||
auto result = std::make_shared<DenseSequenceData>();
|
||||
ImageDimensions outputDimensions(buffer.cols, buffer.rows, buffer.channels());
|
||||
result->m_sampleLayout = std::make_shared<TensorShape>(outputDimensions.AsTensorShape(HWC));
|
||||
result->m_numberOfSamples = inputSequence.m_numberOfSamples;
|
||||
result->m_data = buffer.ptr();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -367,7 +374,7 @@ void MeanTransformer::Apply(cv::Mat &mat)
|
|||
void TransposeTransformer::Initialize(TransformerPtr next,
|
||||
const ConfigParameters &readerConfig)
|
||||
{
|
||||
Base::Initialize(next, readerConfig);
|
||||
TransformerBase::Initialize(next, readerConfig);
|
||||
|
||||
// Currently we only support a single stream.
|
||||
ImageConfigHelper config(readerConfig);
|
||||
|
@ -392,39 +399,46 @@ void TransposeTransformer::Initialize(TransformerPtr next,
|
|||
}
|
||||
|
||||
SequenceDataPtr
|
||||
TransposeTransformer::Apply(const DenseSequenceData &inputSequence,
|
||||
TransposeTransformer::Apply(SequenceDataPtr inputSequence,
|
||||
const StreamDescription &inputStream,
|
||||
vector<char> &buffer,
|
||||
const StreamDescription &outputStream)
|
||||
{
|
||||
if (inputStream.m_elementType == ElementType::tdouble)
|
||||
{
|
||||
return TypedApply<double>(inputSequence, inputStream, buffer, outputStream);
|
||||
return TypedApply<double>(inputSequence, inputStream, outputStream);
|
||||
}
|
||||
|
||||
if (inputStream.m_elementType == ElementType::tfloat)
|
||||
{
|
||||
return TypedApply<float>(inputSequence, inputStream, buffer, outputStream);
|
||||
return TypedApply<float>(inputSequence, inputStream, outputStream);
|
||||
}
|
||||
|
||||
RuntimeError("Unsupported type");
|
||||
}
|
||||
|
||||
struct OwnedDenseSequence : DenseSequenceData
|
||||
{
|
||||
std::vector<char> m_buffer;
|
||||
};
|
||||
|
||||
template <class TElement>
|
||||
SequenceDataPtr
|
||||
TransposeTransformer::TypedApply(const DenseSequenceData &inputSequence,
|
||||
TransposeTransformer::TypedApply(SequenceDataPtr sequence,
|
||||
const StreamDescription &inputStream,
|
||||
vector<char> &buffer,
|
||||
const StreamDescription &outputStream)
|
||||
{
|
||||
assert(inputStream.m_storageType == StorageType::dense);
|
||||
auto inputSequence = reinterpret_cast<DenseSequenceData&>(*sequence.get());
|
||||
assert(inputSequence.m_numberOfSamples == 1);
|
||||
assert(inputStream.m_sampleLayout->GetNumElements() ==
|
||||
outputStream.m_sampleLayout->GetNumElements());
|
||||
|
||||
size_t count = inputStream.m_sampleLayout->GetNumElements() * GetSizeByType(inputStream.m_elementType);
|
||||
buffer.resize(count);
|
||||
|
||||
TElement* typedBuffer = reinterpret_cast<TElement*>(&buffer[0]);
|
||||
auto result = std::make_shared<OwnedDenseSequence>();
|
||||
result->m_buffer.resize(count);
|
||||
|
||||
TElement* typedBuffer = reinterpret_cast<TElement*>(&result->m_buffer[0]);
|
||||
ImageDimensions dimensions(*inputStream.m_sampleLayout, ImageLayoutKind::HWC);
|
||||
|
||||
size_t rowCount = dimensions.m_height * dimensions.m_width;
|
||||
|
@ -441,9 +455,8 @@ TransposeTransformer::TypedApply(const DenseSequenceData &inputSequence,
|
|||
}
|
||||
}
|
||||
|
||||
auto result = std::make_shared<DenseSequenceData>();
|
||||
result->m_sampleLayout = outputStream.m_sampleLayout;
|
||||
result->m_data = &buffer[0];
|
||||
result->m_data = &result->m_buffer[0];
|
||||
result->m_numberOfSamples = inputSequence.m_numberOfSamples;
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -20,7 +20,7 @@ class ConfigParameters;
|
|||
|
||||
// Base class for image transformations based on OpenCV
|
||||
// that helps to wrap the sequences into OpenCV::Mat class.
|
||||
class ImageTransformerBase : public TransformerBase<cv::Mat>
|
||||
class ImageTransformerBase : public TransformerBase
|
||||
{
|
||||
public:
|
||||
// Initializes the transformer.
|
||||
|
@ -44,13 +44,13 @@ protected:
|
|||
return m_seed;
|
||||
}
|
||||
|
||||
using Base = TransformerBase<cv::Mat>;
|
||||
using Base = TransformerBase;
|
||||
using UniRealT = std::uniform_real_distribution<double>;
|
||||
using UniIntT = std::uniform_int_distribution<int>;
|
||||
|
||||
// Applies transformation to the sequence.
|
||||
SequenceDataPtr Apply(const DenseSequenceData &inputSequence,
|
||||
const StreamDescription &inputStream, cv::Mat &buffer,
|
||||
SequenceDataPtr Apply(SequenceDataPtr inputSequence,
|
||||
const StreamDescription &inputStream,
|
||||
const StreamDescription &outputStream) override;
|
||||
|
||||
// The only function that should be redefined by the inherited classes.
|
||||
|
@ -139,7 +139,7 @@ private:
|
|||
};
|
||||
|
||||
// Transpose transformation from HWC to CHW.
|
||||
class TransposeTransformer : public TransformerBase<vector<char>>
|
||||
class TransposeTransformer : public TransformerBase
|
||||
{
|
||||
public:
|
||||
virtual void Initialize(TransformerPtr next,
|
||||
|
@ -156,18 +156,14 @@ protected:
|
|||
return m_outputStreams;
|
||||
}
|
||||
|
||||
SequenceDataPtr Apply(const DenseSequenceData &inputSequence,
|
||||
SequenceDataPtr Apply(SequenceDataPtr inputSequence,
|
||||
const StreamDescription &inputStream,
|
||||
vector<char> &buffer,
|
||||
const StreamDescription &outputStream) override;
|
||||
|
||||
private:
|
||||
using Base = TransformerBase<vector<char>>;
|
||||
|
||||
template <class TElement>
|
||||
SequenceDataPtr TypedApply(const DenseSequenceData &inputSequence,
|
||||
SequenceDataPtr TypedApply(SequenceDataPtr inputSequence,
|
||||
const StreamDescription &inputStream,
|
||||
vector<char> &buffer,
|
||||
const StreamDescription &outputStream);
|
||||
|
||||
std::vector<StreamDescriptionPtr> m_outputStreams;
|
||||
|
|
|
@ -296,8 +296,6 @@ void BlockRandomizer::Initialize(TransformerPtr next, const ConfigParameters& re
|
|||
|
||||
void BlockRandomizer::StartEpoch(const EpochConfiguration& config)
|
||||
{
|
||||
m_deserializer->StartEpoch(config);
|
||||
|
||||
m_workerRank = config.m_workerRank;
|
||||
m_numberOfWorkers = config.m_numberOfWorkers;
|
||||
|
||||
|
@ -319,10 +317,10 @@ void BlockRandomizer::StartEpoch(const EpochConfiguration& config)
|
|||
RandomizeForGlobalSamplePosition(timeframe);
|
||||
};
|
||||
|
||||
bool BlockRandomizer::GetNextSequenceIds(size_t sampleCount, std::vector<size_t>& originalIds)
|
||||
bool BlockRandomizer::GetNextSequenceIds(size_t sampleCount, SequenceDescriptions& sequenceDescriptions)
|
||||
{
|
||||
assert(m_frameMode); // TODO !m_frameMode not implemented yet
|
||||
assert(originalIds.size() == 0);
|
||||
assert(sequenceDescriptions.size() == 0);
|
||||
assert(sampleCount < m_numSamples);
|
||||
|
||||
if (m_samplePositionInEpoch < m_epochSize)
|
||||
|
@ -332,7 +330,7 @@ bool BlockRandomizer::GetNextSequenceIds(size_t sampleCount, std::vector<size_t>
|
|||
assert(m_numberOfWorkers == 1); // TODO needs implementation
|
||||
|
||||
while ((m_samplePositionInEpoch < m_epochSize) &&
|
||||
(originalIds.size() < sampleCount))
|
||||
(sequenceDescriptions.size() < sampleCount))
|
||||
{
|
||||
RandomizeIfNewSweepIsEntered();
|
||||
|
||||
|
@ -340,7 +338,7 @@ bool BlockRandomizer::GetNextSequenceIds(size_t sampleCount, std::vector<size_t>
|
|||
if ((seqDesc.m_chunkId % m_numberOfWorkers) == m_workerRank)
|
||||
{
|
||||
// Got one, collect it
|
||||
originalIds.push_back(seqDesc.m_id);
|
||||
sequenceDescriptions.push_back(m_deserializer->GetSequenceDescriptions()[seqDesc.m_id]);
|
||||
}
|
||||
|
||||
m_samplePositionInEpoch += seqDesc.m_numberOfSamples;
|
||||
|
@ -362,7 +360,7 @@ bool BlockRandomizer::GetNextSequenceIds(size_t sampleCount, std::vector<size_t>
|
|||
if (strideBegin <= i && i < strideEnd)
|
||||
{
|
||||
const auto& seqDesc = m_randomTimeline[m_sequencePositionInSweep];
|
||||
originalIds.push_back(seqDesc.m_id);
|
||||
sequenceDescriptions.push_back(m_deserializer->GetSequenceDescriptions()[seqDesc.m_id]);
|
||||
}
|
||||
}
|
||||
assert(m_samplePositionInEpoch == nextSamplePositionInEpoch);
|
||||
|
@ -376,14 +374,13 @@ Sequences BlockRandomizer::GetNextSequences(size_t sampleCount)
|
|||
{
|
||||
assert(m_samplePositionInEpoch != SIZE_MAX); // SetEpochConfiguration() must be called first
|
||||
|
||||
std::vector<size_t> originalIds;
|
||||
Sequences result;
|
||||
|
||||
assert(m_frameMode); // TODO sequence mode not implemented yet
|
||||
|
||||
result.m_endOfEpoch = GetNextSequenceIds(sampleCount, originalIds);
|
||||
SequenceDescriptions sequenceDescriptions;
|
||||
result.m_endOfEpoch = GetNextSequenceIds(sampleCount, sequenceDescriptions);
|
||||
|
||||
if (originalIds.size() == 0)
|
||||
if (sequenceDescriptions.size() == 0)
|
||||
{
|
||||
return result;
|
||||
}
|
||||
|
@ -391,7 +388,15 @@ Sequences BlockRandomizer::GetNextSequences(size_t sampleCount)
|
|||
// TODO implement require and release chunks from the data deserializer, but only for this worker
|
||||
// (probably in GetNextSequenceIds())
|
||||
|
||||
result.m_data = m_deserializer->GetSequencesById(originalIds);
|
||||
result.m_data.resize(sequenceDescriptions.size());
|
||||
|
||||
#pragma omp parallel for ordered schedule(dynamic)
|
||||
for (int i = 0; i < sequenceDescriptions.size(); ++i)
|
||||
{
|
||||
ChunkPtr chunk = m_deserializer->GetChunk(sequenceDescriptions[i]->m_chunkId);
|
||||
result.m_data[i] = chunk->GetSequence(sequenceDescriptions[i]->m_id);
|
||||
}
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
|
|
|
@ -99,6 +99,6 @@ private:
|
|||
|
||||
void RandomizeIfNewSweepIsEntered();
|
||||
|
||||
bool GetNextSequenceIds(size_t sampleCount, std::vector<size_t>& ids);
|
||||
bool GetNextSequenceIds(size_t sampleCount, SequenceDescriptions& originalIds);
|
||||
};
|
||||
} } }
|
||||
|
|
|
@ -10,6 +10,25 @@
|
|||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
struct SequenceDataBase;
|
||||
typedef std::shared_ptr<SequenceDataBase> SequenceDataPtr;
|
||||
|
||||
class Chunk
|
||||
{
|
||||
public:
|
||||
// Gets sequences by id.
|
||||
virtual std::vector<SequenceDataPtr> GetSequence(const size_t& sequenceId) = 0;
|
||||
virtual ~Chunk() {};
|
||||
|
||||
protected:
|
||||
Chunk() {}
|
||||
|
||||
private:
|
||||
Chunk(const Chunk&) = delete;
|
||||
Chunk& operator=(const Chunk&) = delete;
|
||||
};
|
||||
typedef std::shared_ptr<Chunk> ChunkPtr;
|
||||
|
||||
// Defines main properties of a sequence.
|
||||
// Sequence descriptions are used by the randomizer to establish a global timeline for complete input.
|
||||
// A sequence is defined as an ordered set of samples (size == 1 is used for sample training).
|
||||
|
@ -31,7 +50,12 @@ typedef std::vector<const SequenceDescription*> SequenceDescriptions;
|
|||
struct SequenceDataBase
|
||||
{
|
||||
SequenceDataBase() : m_data(nullptr) { }
|
||||
virtual ~SequenceDataBase()
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
ChunkPtr m_chunk;
|
||||
// A non-owned pointer. The actual size is provided for particular sequences,
|
||||
// i.e. see DenseSequenceData, or SparseSequenceData.
|
||||
void* m_data;
|
||||
|
@ -79,25 +103,8 @@ public:
|
|||
// Retrieves description of all sequences this data deserializer can produce.
|
||||
virtual const SequenceDescriptions& GetSequenceDescriptions() const = 0;
|
||||
|
||||
// Sets epoch configuration.
|
||||
virtual void StartEpoch(const EpochConfiguration& config) = 0;
|
||||
|
||||
// Gets sequences by id.
|
||||
// The return value can be used until the next call to GetSequencesById.
|
||||
// All non-owned pointers returned are valid till the next call to this method.
|
||||
virtual std::vector<std::vector<SequenceDataPtr>> GetSequencesById(const std::vector<size_t>& ids) = 0;
|
||||
|
||||
// Requires the chunk. Each sequence is assigned to the IO chunk by the data deserializer.
|
||||
// This information is communicated thru GetSequenceDescriptions method.
|
||||
// The randomizer guarantees that it accesses sequences only from a limited number of chunks.
|
||||
// When randomizer requires a sequence from a particular chunk it notifies about this the data deserializer,
|
||||
// so that the data deserializer can load/cache sequences more efficiently (loading complete chunks in memory).
|
||||
virtual void RequireChunk(size_t chunkIndex) = 0;
|
||||
|
||||
// Releases the chunk.
|
||||
// When randomizer read all sequences from a particular chunk it notifies the data deserializer
|
||||
// that the chunk can be freed.
|
||||
virtual void ReleaseChunk(size_t chunkIndex) = 0;
|
||||
// Gets a chunk.
|
||||
virtual ChunkPtr GetChunk(size_t chunkId) = 0;
|
||||
|
||||
virtual ~DataDeserializer() {};
|
||||
};
|
||||
|
|
|
@ -17,9 +17,6 @@ public:
|
|||
DataDeserializerBase() : m_sequencesInitialized(false)
|
||||
{}
|
||||
|
||||
// Sets configuration for the current epoch.
|
||||
void StartEpoch(const EpochConfiguration& /*config*/) override {};
|
||||
|
||||
// Provides description of all sequences the deserializer can produce.
|
||||
const SequenceDescriptions& GetSequenceDescriptions() const override
|
||||
{
|
||||
|
@ -31,14 +28,6 @@ public:
|
|||
return m_sequences;
|
||||
}
|
||||
|
||||
// To be called by the randomizer for prefetching the next chunk.
|
||||
// By default IO read-ahead is not implemented.
|
||||
void RequireChunk(size_t /*chunkIndex*/) override{};
|
||||
|
||||
// To be called by the randomizer for releasing a prefetched chunk.
|
||||
// By default IO read-ahead is not implemented.
|
||||
void ReleaseChunk(size_t /*chunkIndex*/) override{};
|
||||
|
||||
protected:
|
||||
// Fills the timeline with sequence descriptions.
|
||||
// Inherited classes should provide the complete Sequence descriptions for all input data.
|
||||
|
|
|
@ -35,7 +35,6 @@ void NoRandomizer::Initialize(TransformerPtr, const ConfigParameters&)
|
|||
|
||||
void NoRandomizer::StartEpoch(const EpochConfiguration& config)
|
||||
{
|
||||
m_deserializer->StartEpoch(config);
|
||||
m_config = config;
|
||||
|
||||
if (m_config.m_totalEpochSizeInSamples == requestDataSize)
|
||||
|
@ -62,24 +61,54 @@ Sequences NoRandomizer::GetNextSequences(size_t sampleCount)
|
|||
size_t end = maxSampleCount * (m_config.m_workerRank + 1) / m_config.m_numberOfWorkers;
|
||||
size_t subsetSize = end - start;
|
||||
|
||||
std::vector<size_t> originalIds;
|
||||
originalIds.reserve(subsetSize);
|
||||
std::vector<size_t> chunkIds;
|
||||
SequenceDescriptions sequences;
|
||||
sequences.reserve(subsetSize);
|
||||
size_t previousChunk = std::numeric_limits<size_t>::max();
|
||||
for (size_t i = start; i < end; ++i)
|
||||
{
|
||||
const auto& sequence = m_timeline[(m_sequencePosition + i) % m_timeline.size()];
|
||||
assert(sequence->m_numberOfSamples == 1);
|
||||
originalIds.push_back(sequence->m_id);
|
||||
sequences.push_back(sequence);
|
||||
|
||||
if (previousChunk != sequence->m_chunkId)
|
||||
{
|
||||
chunkIds.push_back(sequence->m_chunkId);
|
||||
previousChunk = sequence->m_chunkId;
|
||||
}
|
||||
}
|
||||
|
||||
m_samplePositionInEpoch += maxSampleCount;
|
||||
m_sequencePosition = (m_sequencePosition + maxSampleCount) % m_timeline.size();
|
||||
|
||||
if (originalIds.size() == 0)
|
||||
if (sequences.size() == 0)
|
||||
{
|
||||
return result;
|
||||
}
|
||||
|
||||
result.m_data = m_deserializer->GetSequencesById(originalIds);
|
||||
std::map<size_t, ChunkPtr> chunks;
|
||||
for (size_t id : chunkIds)
|
||||
{
|
||||
auto chunk = m_chunks.find(id);
|
||||
if (chunk == m_chunks.end())
|
||||
{
|
||||
chunks[id] = m_deserializer->GetChunk(id);
|
||||
}
|
||||
else
|
||||
{
|
||||
chunks[id] = chunk->second;
|
||||
}
|
||||
}
|
||||
|
||||
m_chunks.swap(chunks);
|
||||
|
||||
result.m_data.resize(sequences.size());
|
||||
|
||||
#pragma omp parallel for ordered schedule(dynamic)
|
||||
for (int i = 0; i < sequences.size(); ++i)
|
||||
{
|
||||
result.m_data[i] = m_chunks[sequences[i]->m_chunkId]->GetSequence(sequences[i]->m_id);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <set>
|
||||
#include "Transformer.h"
|
||||
#include "DataDeserializer.h"
|
||||
|
||||
|
@ -41,6 +41,8 @@ private:
|
|||
EpochConfiguration m_config;
|
||||
size_t m_samplePositionInEpoch;
|
||||
size_t m_sequencePosition;
|
||||
|
||||
std::map<size_t, ChunkPtr> m_chunks;
|
||||
};
|
||||
|
||||
}}}
|
||||
|
|
|
@ -12,8 +12,6 @@
|
|||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// Currently supports only dense data format.
|
||||
template <class TBufferElement>
|
||||
class TransformerBase : public Transformer
|
||||
{
|
||||
public:
|
||||
|
@ -48,7 +46,6 @@ public:
|
|||
const auto &appliedStreamIds = GetAppliedStreamIds();
|
||||
const auto &outputStreams = GetOutputStreams();
|
||||
assert(m_inputStreams.size() == outputStreams.size());
|
||||
m_buffer.resize(samples.m_data.size());
|
||||
|
||||
#pragma omp parallel for ordered schedule(dynamic)
|
||||
for (int i = 0; i < samples.m_data.size(); ++i)
|
||||
|
@ -56,15 +53,10 @@ public:
|
|||
auto &sample = samples.m_data[i];
|
||||
assert(sample.size() == m_inputStreams.size());
|
||||
|
||||
m_buffer[i].resize(appliedStreamIds.size());
|
||||
for (int j = 0; j < appliedStreamIds.size(); ++j)
|
||||
{
|
||||
size_t id = appliedStreamIds[j];
|
||||
assert(m_inputStreams[id]->m_storageType == StorageType::dense);
|
||||
const DenseSequenceData &sequence =
|
||||
reinterpret_cast<DenseSequenceData &>(*sample[id]);
|
||||
sample[id] = Apply(sequence, *m_inputStreams[id], m_buffer[i][j],
|
||||
*outputStreams[id]);
|
||||
sample[id] = Apply(sample[id], *m_inputStreams[id], *outputStreams[id]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -85,14 +77,12 @@ protected:
|
|||
|
||||
private:
|
||||
// Applies transformation to the sequence.
|
||||
virtual SequenceDataPtr Apply(const DenseSequenceData &inputSequence,
|
||||
virtual SequenceDataPtr Apply(SequenceDataPtr inputSequence,
|
||||
const StreamDescription &inputStream,
|
||||
TBufferElement &buffer,
|
||||
const StreamDescription &outputStream) = 0;
|
||||
|
||||
TransformerPtr m_next;
|
||||
std::vector<StreamId> m_featureStreamIds;
|
||||
std::vector<std::vector<TBufferElement>> m_buffer;
|
||||
std::vector<StreamDescriptionPtr> m_inputStreams;
|
||||
};
|
||||
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
dataDir: ../Data
|
||||
tags:
|
||||
# running on every BVT job in 'I' (Image) leg:
|
||||
- bvt-i (build_sku == 'gpu') and (device=='gpu') and (flavor=='release') and (os=='linux')
|
||||
- bvt-i (build_sku == 'gpu') and (device=='gpu') and (flavor=='release')
|
||||
# running every Nightly job in 'I' leg
|
||||
- nightly-i (build_sku == 'gpu') and (device=='gpu') and (flavor=='release') and (os=='linux')
|
||||
- nightly-i (build_sku == 'gpu') and (device=='gpu') and (flavor=='release')
|
||||
|
||||
testCases:
|
||||
CNTK Run must be completed:
|
||||
|
|
Загрузка…
Ссылка в новой задаче