First version of base64 deserializer

This commit is contained in:
Eldar Akchurin 2016-09-30 16:27:53 +02:00 коммит произвёл REDMOND\sayanpa
Родитель d0f5250e43
Коммит 27ee43a782
33 изменённых файлов: 1449 добавлений и 957 удалений

Просмотреть файл

@ -318,6 +318,7 @@ READER_SRC =\
$(SOURCEDIR)/Readers/ReaderLib/PackerBase.cpp \
$(SOURCEDIR)/Readers/ReaderLib/FramePacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ReaderBase.cpp \
$(SOURCEDIR)/Readers/ReaderLib/Indexer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ChunkCache.cpp \
COMMON_SRC =\
@ -851,7 +852,6 @@ $(CNTKBINARYREADER): $(CNTKBINARYREADER_OBJ) | $(CNTKMATH_LIB)
CNTKTEXTFORMATREADER_SRC =\
$(SOURCEDIR)/Readers/CNTKTextFormatReader/Exports.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/Indexer.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextParser.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/CNTKTextFormatReader.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextConfigHelper.cpp \
@ -915,6 +915,8 @@ endif
IMAGEREADER_LIBS:= $(addprefix -l,$(IMAGEREADER_LIBS_LIST))
IMAGEREADER_SRC =\
$(SOURCEDIR)/Readers/ImageReader/Base64ImageDeserializer.cpp \
$(SOURCEDIR)/Readers/ImageReader/ImageDeserializerBase.cpp \
$(SOURCEDIR)/Readers/ImageReader/Exports.cpp \
$(SOURCEDIR)/Readers/ImageReader/ImageConfigHelper.cpp \
$(SOURCEDIR)/Readers/ImageReader/ImageDataDeserializer.cpp \
@ -1107,7 +1109,6 @@ UNITTEST_READER_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ReaderLibTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/stdafx.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/Indexer.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextParser.cpp \
UNITTEST_READER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_READER_SRC))

Просмотреть файл

@ -95,7 +95,6 @@
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="TextReaderConstants.h" />
<ClInclude Include="Indexer.h" />
<ClInclude Include="TextConfigHelper.h" />
<ClInclude Include="TextParser.h" />
<ClInclude Include="Descriptors.h" />
@ -104,7 +103,6 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="Indexer.cpp" />
<ClCompile Include="TextConfigHelper.cpp" />
<ClCompile Include="TextParser.cpp" />
<ClCompile Include="dllmain.cpp" />

Просмотреть файл

@ -5,7 +5,6 @@
<ClCompile Include="stdafx.cpp" />
<ClCompile Include="dllmain.cpp" />
<ClCompile Include="TextConfigHelper.cpp" />
<ClCompile Include="Indexer.cpp" />
<ClCompile Include="TextParser.cpp" />
<ClCompile Include="CNTKTextFormatReader.cpp" />
</ItemGroup>
@ -23,7 +22,6 @@
</ClInclude>
<ClInclude Include="TextConfigHelper.h" />
<ClInclude Include="Descriptors.h" />
<ClInclude Include="Indexer.h" />
<ClInclude Include="TextReaderConstants.h" />
<ClInclude Include="TextParser.h" />
<ClInclude Include="CNTKTextFormatReader.h" />
@ -36,4 +34,4 @@
<UniqueIdentifier>{C6F55578-121A-4D7C-8F57-4172BC5C463B}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
</Project>

Просмотреть файл

@ -5,8 +5,6 @@
#pragma once
#include <stdint.h>
#include <vector>
#include "DataDeserializer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -21,105 +19,4 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// (can be omitted for sparse input)
};
// Sequence metadata. This text-reader specific descriptor adds two additional
// fields: file offset and size in bytes. Both are required to efficiently
// locate and retrieve a sequence from file, given a sequence descriptor.
struct SequenceDescriptor : SequenceDescription
{
// Initializes the inherited SequenceDescription part from an empty
// initializer and sets both reader-specific fields to zero.
SequenceDescriptor() : SequenceDescription({}), m_fileOffsetBytes(0),
m_byteSize(0)
{
}
// size_t m_numberOfSamples -- number of samples in the sequence (largest count among all inputs)
// in case of text data this value == number of rows this sequence spans over.
int64_t m_fileOffsetBytes; // sequence offset in the input file (in bytes)
size_t m_byteSize; // size in bytes
};
// Chunk metadata, similar to the sequence descriptor above,
// but used to facilitate indexing and retrieval of blobs of input data of
// some user-specified size.
struct ChunkDescriptor : ChunkDescription
{
// Initializes the inherited ChunkDescription part from an empty initializer
// and starts with a byte size of zero and no sequences.
ChunkDescriptor() : ChunkDescription({}), m_byteSize(0) {}
// TODO: if we don't want to keep the whole index
// (metadata for all sequences in memory), we should not
// leave this empty when building a chunk index, and only
// fill it out when the chunk needs to be loaded
// (the indexer will have to do a second pass for this chunk).
std::vector<SequenceDescriptor> m_sequences; // descriptors of the sequences that belong to this chunk
size_t m_byteSize; // size in bytes
};
typedef shared_ptr<ChunkDescriptor> ChunkDescriptorPtr;
// A collection of chunk descriptors, each containing
// a collection of sequence descriptors for the corresponding
// chunk of the input data.
// It also stores a mapping of keys into sequence descriptors.
// Usage contract visible here: Reserve() must be called before the first
// AddSequence(), because AddSequence() asserts that a chunk already exists.
struct Index
{
std::vector<ChunkDescriptor> m_chunks; // chunks
std::map<size_t, std::pair<size_t, size_t>> m_keyToSequenceInChunk; // sequence key -> sequence location in chunk
const size_t m_maxChunkSize; // maximum chunk size in bytes
bool m_isPrimary; // index for primary deserializer
// chunkSize: maximum chunk size in bytes.
// isPrimary: when false, AddSequence() additionally records a key -> location
// mapping for every sequence.
Index(size_t chunkSize, bool isPrimary) : m_maxChunkSize(chunkSize), m_isPrimary(isPrimary)
{}
// Adds sequence (metadata) to the index. Additionally, it
// assigns an appropriate chunk id to the sequence descriptor,
// ensures that chunks do not exceed the maximum allowed size
// (except when a sequence size is greater than the maximum chunk size)
// The descriptor is taken by non-const reference because its m_chunkId and
// m_id fields are overwritten before a copy is stored in the chunk.
void AddSequence(SequenceDescriptor& sd)
{
assert(!m_chunks.empty());
ChunkDescriptor* chunk = &m_chunks.back();
// A non-empty chunk that would overflow is closed; an empty chunk always
// accepts the sequence, even if the sequence alone exceeds the limit.
if (chunk->m_byteSize > 0 && (chunk->m_byteSize + sd.m_byteSize) > m_maxChunkSize)
{
// Creating a new chunk if the size is exceeded.
chunk->m_sequences.shrink_to_fit();
m_chunks.push_back({});
// push_back may reallocate, so the pointer has to be re-taken.
chunk = &m_chunks.back();
chunk->m_id = (ChunkIdType) (m_chunks.size() - 1);
if (CHUNKID_MAX < m_chunks.size())
{
RuntimeError("Maximum number of chunks exceeded");
}
}
chunk->m_byteSize += sd.m_byteSize;
chunk->m_numberOfSequences++;
chunk->m_numberOfSamples += sd.m_numberOfSamples;
sd.m_chunkId = chunk->m_id;
// The sequence id is its position inside the chunk; it must be taken
// before the descriptor is appended below.
sd.m_id = chunk->m_sequences.size();
if (!m_isPrimary)
{
// insert() leaves an existing entry untouched, so for a repeated key
// the first recorded location is kept.
auto location = std::make_pair(chunk->m_id, sd.m_id);
auto sequenceId = sd.m_key.m_sequence;
m_keyToSequenceInChunk.insert(std::make_pair(sequenceId, location));
}
chunk->m_sequences.push_back(sd);
}
// Reserves inner structures for the specified number of bytes.
// Also appends the first (empty) chunk, which AddSequence() relies on.
void Reserve(size_t sizeInBytes)
{
if (m_maxChunkSize > 0)
{
// Rounded-up number of chunks needed to hold sizeInBytes.
m_chunks.reserve((sizeInBytes + m_maxChunkSize - 1) / m_maxChunkSize);
}
m_chunks.push_back({});
}
// Checks if the index is empty.
// NOTE(review): this tests for the absence of chunks, so it returns false
// once Reserve() has run, even if no sequence was added.
bool IsEmpty() const
{
return m_chunks.empty();
}
DISABLE_COPY_AND_MOVE(Index);
};
}}}

Просмотреть файл

@ -1,84 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <stdint.h>
#include <vector>
#include "Descriptors.h"
#include "CorpusDescriptor.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// A helper class that does a pass over the input file building up
// an index consisting of sequence and chunk descriptors (which among
// others specify size and file offset of the respective structure).
// As opposed to the data deserializer, indexer performs almost no parsing
// and therefore is several magnitudes faster.
class Indexer
{
public:
// file: input file to index; stored as a raw pointer in m_file.
// isPrimary: whether the index is built for the primary deserializer.
// skipSequenceIds: when true, the sequence id column is ignored during indexing.
// chunkSize: chunk size in bytes (32 MB by default).
Indexer(FILE* file, bool isPrimary, bool skipSequenceIds = false, size_t chunkSize = 32 * 1024 * 1024);
// Reads the input file, building an index of chunks and corresponding
// sequences.
void Build(CorpusDescriptorPtr corpus);
// Returns input data index (chunk and sequence metadata)
const Index& GetIndex() const { return m_index; }
// True, when input does not have the sequence id column
// or when sequence id column was ignored during indexing
// (by passing skipSequenceIds = true to the constructor).
// NOTE(review): the wording above looks inverted relative to the function
// name; Base64ImageDeserializer's ImageChunk::GetSequence treats a true
// result as "the line starts with a sequence key". Confirm against the
// implementation.
bool HasSequenceIds() const { return m_hasSequenceIds; }
private:
FILE* m_file;
int64_t m_fileOffsetStart;
int64_t m_fileOffsetEnd;
unique_ptr<char[]> m_buffer;
const char* m_bufferStart;
const char* m_bufferEnd;
const char* m_pos; // buffer index
bool m_done; // true, when all input was processed
bool m_hasSequenceIds; // true, when input contains one sequence per line
// or when sequence id column was ignored during indexing.
// a collection of chunk descriptors and sequence keys.
Index m_index;
// Same function as above but with check that the sequence is included in the corpus descriptor.
void AddSequenceIfIncluded(CorpusDescriptorPtr corpus, size_t sequenceId, SequenceDescriptor& sd);
// fills up the buffer with data from file, all previously buffered data
// will be overwritten.
void RefillBuffer();
// Moves the buffer position to the beginning of the next line.
void SkipLine();
// Reads the line until the next pipe character, parsing numerical characters into a sequence id.
// Throws an exception if a non-numerical is read until the pipe character or
// EOF is reached without hitting the pipe character.
// Returns false if no numerical characters are found preceding the pipe.
// Otherwise, writes sequence id value to the provided reference, returns true.
bool TryGetSequenceId(size_t& id);
// Build a chunk/sequence index, treating each line as an individual sequence.
// Does not do any sequence parsing, instead uses line number as
// the corresponding sequence id.
void BuildFromLines(CorpusDescriptorPtr corpus);
// Returns current offset in the input file (in bytes).
// Computed as the file offset of the buffer start plus the position inside the buffer.
int64_t GetFileOffset() const { return m_fileOffsetStart + (m_pos - m_bufferStart); }
DISABLE_COPY_AND_MOVE(Indexer);
};
}}}

Просмотреть файл

@ -168,7 +168,7 @@ void TextParser<ElemType>::Initialize()
"UTF-16 encoding is currently not supported.", m_filename.c_str());
}
m_indexer = make_unique<Indexer>(m_file, m_isPrimary, m_skipSequenceIds, m_chunkSizeBytes);
m_indexer = make_unique<Indexer>(m_file, m_isPrimary, m_skipSequenceIds, NAME_PREFIX, m_chunkSizeBytes);
m_indexer->Build(m_corpus);
});

Просмотреть файл

@ -0,0 +1,211 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <opencv2/opencv.hpp>
#include "Base64ImageDeserializer.h"
#include "ImageTransformers.h"
#include "ReaderUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// A chunk of the base64 image file kept in memory as raw text.
// The constructor reads the bytes of the chunk from the data file of the owning
// deserializer; GetSequence() parses a single line of that text on demand.
// Each line is expected to look like "[key \t] label \t base64-image \n".
class Base64ImageDeserializer::ImageChunk : public Chunk, public std::enable_shared_from_this<ImageChunk>
{
    ChunkDescriptor m_descriptor;
    size_t m_chunkOffset;
    Base64ImageDeserializer& m_deserializer;
    // TODO: Could probably be a memory mapped region.
    std::vector<char> m_buffer;

    // Returns the first position in [begin, end) holding 'c', or 'end' if there is none.
    static char* FindChar(char* begin, char* end, char c)
    {
        while (begin != end && *begin != c)
            ++begin;
        return begin;
    }

    // Returns the first position in [begin, end) NOT holding 'c', or 'end' if there is none.
    static char* SkipChar(char* begin, char* end, char c)
    {
        while (begin != end && *begin == c)
            ++begin;
        return begin;
    }

public:
    // Loads the bytes described by 'descriptor' from the data file of 'parent'.
    // Raises LogicError for an empty chunk and RuntimeError if seeking fails.
    ImageChunk(const ChunkDescriptor& descriptor, Base64ImageDeserializer& parent)
        : m_descriptor(descriptor), m_deserializer(parent)
    {
        // Let's see if the open descriptor has problems.
        if (ferror(m_deserializer.m_dataFile.get()) != 0)
            m_deserializer.m_dataFile.reset(fopenOrDie(m_deserializer.m_fileName.c_str(), L"rbS"), [](FILE* f) { if (f) fclose(f); });

        if (descriptor.m_sequences.empty() || !descriptor.m_byteSize)
            LogicError("Empty chunks are not supported.");

        m_buffer.resize(descriptor.m_byteSize + 1);

        // Make sure we always have 0 at the end for buffer overrun.
        m_buffer[descriptor.m_byteSize] = 0;
        m_chunkOffset = descriptor.m_sequences.front().m_fileOffsetBytes;

        // Read chunk into memory.
        int rc = _fseeki64(m_deserializer.m_dataFile.get(), m_chunkOffset, SEEK_SET);
        if (rc)
            RuntimeError("Error seeking to position %" PRId64 " in the input file (%ls), error %d", m_chunkOffset, m_deserializer.m_fileName.c_str(), rc);

        freadOrDie(m_buffer.data(), descriptor.m_byteSize, 1, m_deserializer.m_dataFile.get());
    }

    // Returns the textual key of the sequence, used in error messages.
    std::string KeyOf(const SequenceDescriptor& s) const
    {
        return m_deserializer.m_corpus->IdToKey(s.m_key.m_sequence);
    }

    // Decodes the image and the label of the given sequence and appends them to 'result'.
    // The chunk buffer is never modified here: with multi-view cropping the same line
    // is requested NumMultiViewCopies times, so a destructive tokenizer (strtok)
    // would corrupt the line for every request after the first one.
    // Raises RuntimeError when the label or the image is missing or cannot be parsed.
    void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override
    {
        size_t innerSequenceId = m_deserializer.m_multiViewCrop ? sequenceId / ImageDeserializerBase::NumMultiViewCopies : sequenceId;
        const auto& sequence = m_descriptor.m_sequences[innerSequenceId];
        size_t offset = sequence.m_fileOffsetBytes - m_chunkOffset;

        // The line of this sequence is [lineBegin, lineEnd): it stops at the first
        // line break or at the terminating 0 the constructor put at the end of the buffer.
        char* const bufferEnd = m_buffer.data() + m_descriptor.m_byteSize;
        char* const lineBegin = m_buffer.data() + offset;
        char* lineEnd = lineBegin;
        while (lineEnd != bufferEnd && *lineEnd != '\n' && *lineEnd != 0)
            ++lineEnd;

        // First tab separated field of the line.
        char* fieldBegin = SkipChar(lineBegin, lineEnd, '\t');
        char* fieldEnd = FindChar(fieldBegin, lineEnd, '\t');

        if (m_deserializer.m_indexer->HasSequenceIds()) // Skip sequence key.
        {
            fieldBegin = SkipChar(fieldEnd, lineEnd, '\t');
            fieldEnd = FindChar(fieldBegin, lineEnd, '\t');
        }

        // Let's get the label.
        if (fieldBegin == fieldEnd)
            RuntimeError("Empty label value for sequence '%s'", KeyOf(sequence).c_str());

        // Copy the field so that strtoull sees a 0-terminated string
        // and cannot run into the image data.
        const std::string labelText(fieldBegin, fieldEnd);
        char* eptr = nullptr;
        errno = 0;
        size_t classId = strtoull(labelText.c_str(), &eptr, 10);
        if (labelText.c_str() == eptr || errno == ERANGE)
            RuntimeError("Cannot parse label value for sequence '%s'", KeyOf(sequence).c_str());

        size_t labelDimension = m_deserializer.m_labelGenerator->LabelDimension();
        if (classId >= labelDimension)
            RuntimeError(
                "Image with id '%s' has invalid class id '%" PRIu64 "'. It is exceeding the label dimension of '%" PRIu64,
                KeyOf(sequence).c_str(), classId, labelDimension);

        // Let's get the image: everything after the tab that ends the label field.
        char* imageBegin = (fieldEnd == lineEnd) ? lineEnd : fieldEnd + 1;
        char* imageEnd = lineEnd;

        // Remove non base64 characters around the image (tabs/spaces/carriage return).
        while (imageEnd > imageBegin && !IsBase64Char(*(imageEnd - 1)))
            imageEnd--;
        while (imageBegin < imageEnd && !IsBase64Char(*imageBegin))
            imageBegin++;

        if (imageBegin == imageEnd)
            RuntimeError("Empty image for sequence '%s'", KeyOf(sequence).c_str());

        std::vector<char> decodedImage = DecodeBase64(imageBegin, imageEnd);
        cv::Mat image = cv::imdecode(decodedImage, m_deserializer.m_grayscale ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
        if (!image.data)
            RuntimeError("Cannot decode sequence '%s'", KeyOf(sequence).c_str());

        m_deserializer.PopulateSequenceData(image, classId, sequenceId, result);
    }
};
// Looks at the first line of the given file and reports whether it can be split
// into three tab separated fields (sequence key, label, image).
// Raises RuntimeError if the file cannot be opened or its first line cannot be read.
static bool HasSequenceKeys(const std::string& mapPath)
{
    std::ifstream input(mapPath);
    if (!input)
        RuntimeError("Could not open '%s' for reading.", mapPath.c_str());

    std::string firstLine;
    if (!std::getline(input, firstLine))
        RuntimeError("Could not read the file '%s'.", mapPath.c_str());

    // Extract the three expected fields one after another;
    // the first failed extraction means there is no sequence key column.
    std::stringstream fields(firstLine);
    std::string field;
    for (int fieldIndex = 0; fieldIndex < 3; ++fieldIndex)
    {
        if (!std::getline(fields, field, '\t'))
            return false;
    }

    return true;
}
// Creates the deserializer for the file named by the "file" entry of the config.
// corpus and config are forwarded to the ImageDeserializerBase constructor;
// isPrimary is passed through to the Indexer.
Base64ImageDeserializer::Base64ImageDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, bool isPrimary) : ImageDeserializerBase(corpus, config)
{
auto mapFile = config(L"file");
// Look at the first line to find out whether lines start with a sequence key column.
bool hasSequenceKeys = HasSequenceKeys(mapFile);
// Keep the path as a wide string; characters are converted one by one.
m_fileName.assign(mapFile.begin(), mapFile.end());
// NOTE(review): attempt(5, ...) is presumably a retry helper that runs the
// lambda up to five times -- confirm against its definition.
attempt(5, [this, hasSequenceKeys, corpus, isPrimary]()
{
// (Re)open the data file if it is not open yet or its stream is in an error state.
if (!m_dataFile || ferror(m_dataFile.get()) != 0)
m_dataFile.reset(fopenOrDie(m_fileName, L"rbS"), [](FILE* f) { if (f) fclose(f); });
// The third argument tells the indexer to skip sequence ids when the file has no key column.
m_indexer = make_unique<Indexer>(m_dataFile.get(), isPrimary, !hasSequenceKeys);
m_indexer->Build(corpus);
});
}
// Returns one description per chunk of the index.
// With multi-view cropping every indexed sequence is exposed NumMultiViewCopies times,
// so the sequence and sample counts of each chunk are multiplied accordingly.
ChunkDescriptions Base64ImageDeserializer::GetChunkDescriptions()
{
    const auto& index = m_indexer->GetIndex();

    // In case of multi crop the deserializer provides the same sequence NumMultiViewCopies times.
    size_t sequencesPerInitialSequence = m_multiViewCrop ? ImageDeserializerBase::NumMultiViewCopies : 1;

    ChunkDescriptions result;
    // Exactly one description is produced per chunk: multi-view cropping changes
    // the number of sequences inside a chunk, not the number of chunks.
    result.reserve(index.m_chunks.size());
    for (auto const& chunk : index.m_chunks)
    {
        auto c = std::make_shared<ChunkDescription>();
        c->m_id = chunk.m_id;
        // Every image is a single-sample sequence.
        assert(chunk.m_numberOfSamples == chunk.m_numberOfSequences);
        c->m_numberOfSamples = c->m_numberOfSequences = chunk.m_numberOfSequences * sequencesPerInitialSequence;
        result.push_back(c);
    }
    return result;
}
// Appends the descriptions of all sequences of the given chunk to 'result'.
// With multi-view cropping every indexed sequence is reported NumMultiViewCopies times,
// each copy under its own consecutive id but with the key of the original sequence.
void Base64ImageDeserializer::GetSequencesForChunk(ChunkIdType chunkId, std::vector<SequenceDescription>& result)
{
    const auto& index = m_indexer->GetIndex();
    const auto& chunk = index.m_chunks[chunkId];

    // Use the shared constant so that the ids produced here always match the
    // division performed when a sequence is looked up inside the chunk.
    size_t sequencesPerInitialSequence = m_multiViewCrop ? ImageDeserializerBase::NumMultiViewCopies : 1;
    result.reserve(sequencesPerInitialSequence * chunk.m_sequences.size());

    size_t currentId = 0;
    for (auto const& s : chunk.m_sequences)
    {
        // The id exposed to the caller maps back to the indexed sequence by integer division.
        assert(currentId / sequencesPerInitialSequence == s.m_id);
        for (size_t i = 0; i < sequencesPerInitialSequence; ++i)
        {
            result.push_back(
            {
                currentId,
                s.m_numberOfSamples,
                s.m_chunkId,
                s.m_key
            });
            currentId++;
        }
    }
}
// Creates the in-memory chunk for the given chunk id;
// the chunk data is loaded by the ImageChunk constructor.
ChunkPtr Base64ImageDeserializer::GetChunk(ChunkIdType chunkId)
{
    const auto& index = m_indexer->GetIndex();
    return make_shared<ImageChunk>(index.m_chunks[chunkId], *this);
}
// Looks up a sequence by its key in the key -> (chunk, sequence) map of the index.
// On success copies the description into 'result' and returns true;
// returns false and leaves 'result' untouched when the key is unknown.
bool Base64ImageDeserializer::GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& result)
{
    const auto& index = m_indexer->GetIndex();

    auto found = index.m_keyToSequenceInChunk.find(key.m_sequence);
    if (found != index.m_keyToSequenceInChunk.end())
    {
        // first: position of the chunk, second: position of the sequence inside it.
        const auto& location = found->second;
        result = index.m_chunks[location.first].m_sequences[location.second];
        return true;
    }

    return false;
}
}}}

Просмотреть файл

@ -0,0 +1,44 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "ImageDeserializerBase.h"
#include "Config.h"
#include "CorpusDescriptor.h"
#include "Indexer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Base 64 Image deserializer.
// Reads images stored as base64 text from a single file; the file is indexed
// with an Indexer and exposed chunk by chunk.
class Base64ImageDeserializer : public ImageDeserializerBase
{
public:
// corpus and config are forwarded to ImageDeserializerBase;
// isPrimary is handed to the Indexer.
Base64ImageDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, bool isPrimary);
// Get a chunk by id.
ChunkPtr GetChunk(ChunkIdType chunkId) override;
// Get chunk descriptions.
ChunkDescriptions GetChunkDescriptions() override;
// Gets sequence descriptions for the chunk.
void GetSequencesForChunk(ChunkIdType, std::vector<SequenceDescription>&) override;
// Gets sequence description by key.
bool GetSequenceDescriptionByKey(const KeyType&, SequenceDescription&) override;
private:
// Creates a set of sequence descriptions.
// NOTE(review): no definition of this function is visible in the accompanying
// implementation file; it may be a leftover declaration -- confirm.
void CreateSequenceDescriptions(CorpusDescriptorPtr corpus, std::string mapPath);
// In-memory chunk of the input file, defined in the implementation file.
class ImageChunk;
// Index of the input file (chunk and sequence metadata).
std::unique_ptr<Indexer> m_indexer;
// Handle of the input file; the implementation installs a deleter that closes it.
std::shared_ptr<FILE> m_dataFile;
// Path of the input file.
std::wstring m_fileName;
};
}}}

Просмотреть файл

@ -14,6 +14,7 @@
#include "ImageDataDeserializer.h"
#include "ImageTransformers.h"
#include "CorpusDescriptor.h"
#include "Base64ImageDeserializer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -38,10 +39,12 @@ extern "C" DATAREADER_API void GetReaderD(IDataReader** preader)
// TODO: Not safe from the ABI perspective. Will be uglified to make the interface ABI.
// A factory method for creating image deserializers.
extern "C" DATAREADER_API bool CreateDeserializer(IDataDeserializer** deserializer, const std::wstring& type, const ConfigParameters& deserializerConfig, CorpusDescriptorPtr corpus, bool)
extern "C" DATAREADER_API bool CreateDeserializer(IDataDeserializer** deserializer, const std::wstring& type, const ConfigParameters& deserializerConfig, CorpusDescriptorPtr corpus, bool isPrimary)
{
if (type == L"ImageDeserializer")
*deserializer = new ImageDataDeserializer(corpus, deserializerConfig);
else if (type == L"Base64ImageDeserializer")
*deserializer = new Base64ImageDeserializer(corpus, deserializerConfig, isPrimary);
else
// Unknown type.
return false;

Просмотреть файл

@ -7,68 +7,25 @@
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <opencv2/opencv.hpp>
#include <numeric>
#include <limits>
#include "ImageDataDeserializer.h"
#include "ImageConfigHelper.h"
#include "StringUtil.h"
#include "ConfigUtil.h"
#include "TimerUtility.h"
#include "ImageTransformers.h"
#include "SequenceData.h"
#include "ImageUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Interface of the helpers that fill in the label of an image.
class ImageDataDeserializer::LabelGenerator
{
public:
// Fills 'data' with the label that represents the given class id.
virtual void CreateLabelFor(size_t classId, CategorySequenceData& data) = 0;
// Virtual destructor, so that implementations can be destroyed through this interface.
virtual ~LabelGenerator() { }
};
// Generates sparse one-hot labels of a concrete element type.
// A label is the category/class of an image: conceptually an array indexed by category
// that holds a single one at the category of the image and zeros everywhere else,
// [ 0, .. 0.. 1 .. 0 ]. The class is a template because the representation of the
// value 1 depends on the element type.
template <class TElement>
class TypedLabelGenerator : public ImageDataDeserializer::LabelGenerator
{
public:
    // Prepares the table of category indices 0 .. labelDimension - 1.
    // Raises RuntimeError if labelDimension does not fit into IndexType.
    TypedLabelGenerator(size_t labelDimension) : m_value(1), m_indices(labelDimension)
    {
        if (labelDimension > numeric_limits<IndexType>::max())
        {
            RuntimeError("Label dimension (%" PRIu64 ") exceeds the maximum allowed "
                "value (%" PRIu64 ")\n", labelDimension, (size_t)numeric_limits<IndexType>::max());
        }

        iota(m_indices.begin(), m_indices.end(), 0);
    }

    // Makes 'data' describe a single non-zero entry: the shared value 1
    // located at the index of the given class.
    // The pointers stored in 'data' refer to members of this generator.
    virtual void CreateLabelFor(size_t classId, CategorySequenceData& data) override
    {
        data.m_data = &m_value;
        data.m_indices = &(m_indices[classId]);
        data.m_totalNnzCount = 1;
        data.m_nnzCounts.resize(1);
        data.m_nnzCounts.front() = 1;
    }

private:
    // The value 1 in the element type of the label.
    TElement m_value;
    // m_indices[i] == i; a label points at the entry of its class.
    vector<IndexType> m_indices;
};
// For image, chunks correspond to a single image.
class ImageDataDeserializer::ImageChunk : public Chunk, public std::enable_shared_from_this<ImageChunk>
class ImageDataDeserializer::ImageChunk : public Chunk
{
ImageSequenceDescription m_description;
ImageDataDeserializer& m_parent;
ImageDataDeserializer& m_deserializer;
public:
ImageChunk(ImageSequenceDescription& description, ImageDataDeserializer& parent)
: m_description(description), m_parent(parent)
: m_description(description), m_deserializer(parent)
{
}
@ -78,30 +35,12 @@ public:
const auto& imageSequence = m_description;
auto image = std::make_shared<ImageSequenceData>();
image->m_image = std::move(m_parent.ReadImage(m_description.m_id, imageSequence.m_path, m_parent.m_grayscale));
image->m_image = std::move(m_deserializer.ReadImage(m_description.m_id, imageSequence.m_path, m_deserializer.m_grayscale));
auto& cvImage = image->m_image;
if (!cvImage.data)
RuntimeError("Cannot open file '%s'", imageSequence.m_path.c_str());
// Convert element type.
ElementType dataType = ConvertImageToSupportedDataType(cvImage);
if (!cvImage.isContinuous())
cvImage = cvImage.clone();
assert(cvImage.isContinuous());
ImageDimensions dimensions(cvImage.cols, cvImage.rows, cvImage.channels());
image->m_sampleLayout = std::make_shared<TensorShape>(dimensions.AsTensorShape(HWC));
image->m_id = sequenceId;
image->m_numberOfSamples = 1;
image->m_chunk = shared_from_this();
image->m_elementType = dataType;
result.push_back(image);
auto label = std::make_shared<CategorySequenceData>();
label->m_chunk = shared_from_this();
m_parent.m_labelGenerator->CreateLabelFor(imageSequence.m_classId, *label);
label->m_numberOfSamples = 1;
result.push_back(label);
m_deserializer.PopulateSequenceData(cvImage, imageSequence.m_classId, sequenceId, result);
}
private:
@ -112,9 +51,9 @@ private:
{
// Could not identify element type.
// Natively unsupported image type. Let's convert it to required precision.
int requiredType = m_parent.m_precision == ElementType::tfloat ? CV_32F : CV_64F;
int requiredType = m_deserializer.m_precision == ElementType::tfloat ? CV_32F : CV_64F;
image.convertTo(image, requiredType);
resultType = m_parent.m_precision;
resultType = m_deserializer.m_precision;
}
return resultType;
}
@ -122,58 +61,9 @@ private:
// A new constructor to support new compositional configuration,
// that allows composition of deserializers and transforms on inputs.
ImageDataDeserializer::ImageDataDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config)
ImageDataDeserializer::ImageDataDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config) : ImageDeserializerBase(corpus, config)
{
ConfigParameters inputs = config("input");
std::vector<std::string> featureNames = GetSectionsWithParameter("ImageDataDeserializer", inputs, "transforms");
std::vector<std::string> labelNames = GetSectionsWithParameter("ImageDataDeserializer", inputs, "labelDim");
// TODO: currently support only one feature and label section.
if (featureNames.size() != 1 || labelNames.size() != 1)
{
RuntimeError(
"ImageReader currently supports a single feature and label stream. '%d' features , '%d' labels found.",
static_cast<int>(featureNames.size()),
static_cast<int>(labelNames.size()));
}
string precision = (ConfigValue)config("precision", "float");
m_precision = AreEqualIgnoreCase(precision, "float") ? ElementType::tfloat : ElementType::tdouble;
m_verbosity = config(L"verbosity", 0);
// Feature stream.
ConfigParameters featureSection = inputs(featureNames[0]);
auto features = std::make_shared<StreamDescription>();
features->m_id = 0;
features->m_name = msra::strfun::utf16(featureSection.ConfigName());
features->m_storageType = StorageType::dense;
// Due to performance, now we support images of different types.
features->m_elementType = ElementType::tvariant;
m_streams.push_back(features);
// Label stream.
ConfigParameters label = inputs(labelNames[0]);
size_t labelDimension = label("labelDim");
auto labels = std::make_shared<StreamDescription>();
labels->m_id = 1;
labels->m_name = msra::strfun::utf16(label.ConfigName());
labels->m_sampleLayout = std::make_shared<TensorShape>(labelDimension);
labels->m_storageType = StorageType::sparse_csc;
labels->m_elementType = m_precision;
m_streams.push_back(labels);
m_labelGenerator = labels->m_elementType == ElementType::tfloat ?
(LabelGeneratorPtr)std::make_shared<TypedLabelGenerator<float>>(labelDimension) :
std::make_shared<TypedLabelGenerator<double>>(labelDimension);
m_grayscale = config(L"grayscale", false);
// TODO: multiview should be done on the level of randomizer/transformers - it is responsiblity of the
// TODO: randomizer to collect how many copies each transform needs and request same sequence several times.
bool multiViewCrop = config(L"multiViewCrop", false);
CreateSequenceDescriptions(corpus, config(L"file"), labelDimension, multiViewCrop);
CreateSequenceDescriptions(corpus, config(L"file"), m_labelGenerator->LabelDimension(), m_multiViewCrop);
}
// TODO: Should be removed at some point.
@ -255,7 +145,7 @@ void ImageDataDeserializer::CreateSequenceDescriptions(CorpusDescriptorPtr corpu
auto mapFileDirectory = ExtractDirectory(mapPath);
m_defaultReader = make_unique<FileByteReader>(mapFileDirectory);
size_t itemsPerLine = isMultiCrop ? 10 : 1;
size_t itemsPerLine = isMultiCrop ? ImageDeserializerBase::NumMultiViewCopies : 1;
size_t curId = 0;
std::string line;
PathReaderMap knownReaders;

Просмотреть файл

@ -5,7 +5,7 @@
#pragma once
#include <opencv2/core/mat.hpp>
#include "DataDeserializerBase.h"
#include "ImageDeserializerBase.h"
#include "Config.h"
#include "ByteReader.h"
#include <unordered_map>
@ -18,7 +18,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// All sequences consist only of a single sample (image/label).
// For features it uses dense storage format with different layout (dimensions) per sequence.
// For labels it uses the csc sparse storage format.
class ImageDataDeserializer : public DataDeserializerBase
class ImageDataDeserializer : public ImageDeserializerBase
{
public:
// A new constructor to support new compositional configuration,
@ -40,10 +40,6 @@ public:
// Gets sequence description by key.
bool GetSequenceDescriptionByKey(const KeyType&, SequenceDescription&) override;
// A helper class for generation of type specific labels (currently float/double only).
class LabelGenerator;
typedef std::shared_ptr<LabelGenerator> LabelGeneratorPtr;
private:
// Creates a set of sequence descriptions.
void CreateSequenceDescriptions(CorpusDescriptorPtr corpus, std::string mapPath, size_t labelDimension, bool isMultiCrop);
@ -57,20 +53,9 @@ private:
class ImageChunk;
LabelGeneratorPtr m_labelGenerator;
// Sequence descriptions for all input data.
std::vector<ImageSequenceDescription> m_imageSequences;
// Mapping of logical sequence key into sequence description.
std::map<size_t, size_t> m_keyToSequence;
// Precision required by the network.
ElementType m_precision;
// whether images shall be loaded in grayscale
bool m_grayscale;
// Not using nocase_compare here as it's not correct on Linux.
using PathReaderMap = std::unordered_map<std::string, std::shared_ptr<ByteReader>>;
using ReaderSequenceMap = std::map<std::string, std::map<std::string, size_t>>;
@ -82,7 +67,6 @@ private:
SeqReaderMap m_readers;
std::unique_ptr<FileByteReader> m_defaultReader;
int m_verbosity;
};
}}}

Просмотреть файл

@ -0,0 +1,93 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#define __STDC_FORMAT_MACROS
#include <opencv2/opencv.hpp>
#include "ImageDeserializerBase.h"
#include "StringUtil.h"
#include "ConfigUtil.h"
#include "ImageTransformers.h"
#include "SequenceData.h"
#include "ImageUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Default state: float precision, color images, no verbosity, no multi-view cropping.
ImageDeserializerBase::ImageDeserializerBase()
    : m_precision(ElementType::tfloat),
      m_grayscale(false),
      m_verbosity(0),
      m_multiViewCrop(false)
{
}
// Reads the configuration shared by all image deserializers and sets up the two streams:
// stream 0 with dense image features and stream 1 with sparse labels.
// Exactly one input section with "transforms" (features) and one with "labelDim" (labels)
// is required, otherwise RuntimeError is raised.
ImageDeserializerBase::ImageDeserializerBase(CorpusDescriptorPtr corpus, const ConfigParameters& config) : m_corpus(corpus)
{
    assert(m_corpus);

    ConfigParameters inputSection = config("input");
    std::vector<std::string> featureSections = GetSectionsWithParameter("ImageDeserializerBase", inputSection, "transforms");
    std::vector<std::string> labelSections = GetSectionsWithParameter("ImageDeserializerBase", inputSection, "labelDim");

    const bool singleFeatureAndLabel = featureSections.size() == 1 && labelSections.size() == 1;
    if (!singleFeatureAndLabel)
    {
        RuntimeError(
            "Please specify a single feature and label stream. '%d' features , '%d' labels found.",
            static_cast<int>(featureSections.size()),
            static_cast<int>(labelSections.size()));
    }

    // Anything that is not "float" (ignoring case) selects double precision.
    string precisionName = config("precision", "float");
    if (AreEqualIgnoreCase(precisionName, "float"))
        m_precision = ElementType::tfloat;
    else
        m_precision = ElementType::tdouble;

    m_verbosity = config(L"verbosity", 0);

    // Feature stream.
    ConfigParameters featureConfig = inputSection(featureSections[0]);
    auto featureStream = std::make_shared<StreamDescription>();
    featureStream->m_id = 0;
    featureStream->m_name = msra::strfun::utf16(featureConfig.ConfigName());
    featureStream->m_storageType = StorageType::dense;
    // Due to performance, now we support images of different types.
    featureStream->m_elementType = ElementType::tvariant;
    m_streams.push_back(featureStream);

    // Label stream.
    ConfigParameters labelConfig = inputSection(labelSections[0]);
    size_t labelDimension = labelConfig("labelDim");
    auto labelStream = std::make_shared<StreamDescription>();
    labelStream->m_id = 1;
    labelStream->m_name = msra::strfun::utf16(labelConfig.ConfigName());
    labelStream->m_sampleLayout = std::make_shared<TensorShape>(labelDimension);
    labelStream->m_storageType = StorageType::sparse_csc;
    labelStream->m_elementType = m_precision;
    m_streams.push_back(labelStream);

    // The generator has to produce labels in the element type of the label stream.
    if (labelStream->m_elementType == ElementType::tfloat)
        m_labelGenerator = std::make_shared<TypedLabelGenerator<float>>(labelDimension);
    else
        m_labelGenerator = std::make_shared<TypedLabelGenerator<double>>(labelDimension);

    m_grayscale = config(L"grayscale", false);

    // TODO: multiview should be done on the level of randomizer/transformers - it is responsibility of the
    // TODO: randomizer to collect how many copies each transform needs and request same sequence several times.
    m_multiViewCrop = config(L"multiViewCrop", false);
}
// Appends two entries to 'result' for one decoded image: first the image data,
// then the label generated for 'classId'.
// The image is converted to a supported element type and made continuous beforehand.
void ImageDeserializerBase::PopulateSequenceData(cv::Mat image, size_t classId, size_t sequenceId, std::vector<SequenceDataPtr>& result)
{
    // The conversion has to run first: everything below describes the converted image.
    const ElementType elementType = ConvertImageToSupportedDataType(image, m_precision);

    if (!image.isContinuous())
        image = image.clone();
    assert(image.isContinuous());

    // Image entry.
    auto features = make_shared<ImageSequenceData>();
    features->m_id = sequenceId;
    features->m_numberOfSamples = 1;
    features->m_elementType = elementType;
    features->m_image = image;
    ImageDimensions imageDimensions(image.cols, image.rows, image.channels());
    features->m_sampleLayout = std::make_shared<TensorShape>(imageDimensions.AsTensorShape(HWC));
    result.push_back(features);

    // Label entry.
    auto labelData = std::make_shared<CategorySequenceData>();
    m_labelGenerator->CreateLabelFor(classId, *labelData);
    labelData->m_numberOfSamples = 1;
    result.push_back(labelData);
}
}}}

Просмотреть файл

@ -0,0 +1,55 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "DataDeserializerBase.h"
#include "Config.h"
#include "CorpusDescriptor.h"
#include "ImageUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Base class of image deserializers.
// Holds the configuration and helpers that are shared between image deserializers
// and provides PopulateSequenceData() to turn a decoded image plus its class id
// into an image sequence and a label sequence.
class ImageDeserializerBase : public DataDeserializerBase
{
public:
// A new constructor to support new compositional configuration,
// that allows composition of deserializers and transforms on inputs.
ImageDeserializerBase(CorpusDescriptorPtr corpus, const ConfigParameters& config);
// Currently for backward compat with the old reader.
ImageDeserializerBase();
protected:
// Appends the image sequence and the label sequence for a single image to 'result'.
// image      - decoded image (converted to a supported element type if required),
// classId    - category of the image, passed to the label generator,
// sequenceId - id assigned to the produced image sequence.
void PopulateSequenceData(cv::Mat image, size_t classId, size_t sequenceId, std::vector<SequenceDataPtr>& result);
// A helper class for generation of type specific labels (currently float/double only).
LabelGeneratorPtr m_labelGenerator;
// Mapping of logical sequence key into sequence description.
std::map<size_t, size_t> m_keyToSequence;
// Precision required by the network.
ElementType m_precision;
// Flag whether images shall be loaded in grayscale.
bool m_grayscale;
// Verbosity.
int m_verbosity;
// Flag indicating whether to generate images for multi crop.
bool m_multiViewCrop;
// Number of multi-crop versions to produce.
// Currently the default value of 10 is used as in the AlexNet paper;
// possibly we should make this configurable.
const static size_t NumMultiViewCopies = 10;
// Corpus descriptor.
CorpusDescriptorPtr m_corpus;
};
}}}

Просмотреть файл

@ -108,9 +108,11 @@
<ClInclude Include="..\..\Common\Include\DataReader.h" />
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="Base64ImageDeserializer.h" />
<ClInclude Include="ByteReader.h" />
<ClInclude Include="ImageConfigHelper.h" />
<ClInclude Include="ImageDataDeserializer.h" />
<ClInclude Include="ImageDeserializerBase.h" />
<ClInclude Include="ImageReader.h" />
<ClInclude Include="ImageTransformers.h" />
<ClInclude Include="ImageUtil.h" />
@ -118,12 +120,14 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="Base64ImageDeserializer.cpp" />
<ClCompile Include="ImageConfigHelper.cpp" />
<ClCompile Include="ImageDataDeserializer.cpp" />
<ClCompile Include="dllmain.cpp" />
<ClCompile Include="Exports.cpp">
<ExcludedFromBuild Condition="!$(HasOpenCv)">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="ImageDeserializerBase.cpp" />
<ClCompile Include="ImageReader.cpp" />
<ClCompile Include="ImageTransformers.cpp" />
<ClCompile Include="stdafx.cpp">

Просмотреть файл

@ -9,6 +9,8 @@
<ClCompile Include="ImageReader.cpp" />
<ClCompile Include="ImageConfigHelper.cpp" />
<ClCompile Include="ZipByteReader.cpp" />
<ClCompile Include="Base64ImageDeserializer.cpp" />
<ClCompile Include="ImageDeserializerBase.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
@ -28,6 +30,8 @@
<ClInclude Include="ImageConfigHelper.h" />
<ClInclude Include="ByteReader.h" />
<ClInclude Include="ImageUtil.h" />
<ClInclude Include="Base64ImageDeserializer.h" />
<ClInclude Include="ImageDeserializerBase.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="Common">

Просмотреть файл

@ -6,7 +6,8 @@
#pragma once
#include <opencv2/opencv.hpp>
#include "Transformer.h"
#include "SequenceData.h"
#include <numeric>
namespace Microsoft { namespace MSR { namespace CNTK {
@ -37,4 +38,64 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return result;
}
// Makes sure the image uses an element type the reader can work with.
// If the OpenCV depth of the image maps to a known element type, the image is left
// untouched and that type is returned. Otherwise the image is converted in place to
// the precision given by defaultElementType, which is then returned.
inline ElementType ConvertImageToSupportedDataType(cv::Mat& image, ElementType defaultElementType)
{
    ElementType detectedType;
    if (IdentifyElementTypeFromOpenCVType(image.depth(), detectedType))
        return detectedType;

    // Natively unsupported image type: fall back to the requested precision.
    const int targetDepth = (defaultElementType == ElementType::tfloat) ? CV_32F : CV_64F;
    image.convertTo(image, targetDepth);
    return defaultElementType;
}
// A helper interface to generate a typed label in a sparse format for categories.
// It is represented as an array indexed by the category, containing zero values for all categories the sequence does not belong to,
// and a single one for a category it belongs to: [ 0 .. 0.. 1 .. 0 ]
class LabelGenerator
{
public:
// Fills 'data' with the sparse representation of the label for the category 'classId'.
virtual void CreateLabelFor(size_t classId, CategorySequenceData& data) = 0;
// Returns the dimension of the label, i.e. the number of categories.
virtual size_t LabelDimension() const = 0;
// Virtual destructor: implementations are handled through LabelGenerator pointers.
virtual ~LabelGenerator() { }
};
typedef std::shared_ptr<LabelGenerator> LabelGeneratorPtr;
// Simple implementation of the LabelGenerator.
// The class is parameterized because the representation of 1 is type specific.
// The generator owns a single value (1) and the list of all category indices
// [0, labelDimension); the labels it produces only point into these members,
// so they must not outlive the generator.
template <class TElement>
class TypedLabelGenerator : public LabelGenerator
{
public:
    // labelDimension - number of categories; has to fit into IndexType.
    TypedLabelGenerator(size_t labelDimension) : m_value(1), m_indices(labelDimension)
    {
        if (labelDimension > numeric_limits<IndexType>::max())
        {
            RuntimeError("Label dimension (%d) exceeds the maximum allowed "
                "value (%d)\n", (int)labelDimension, (int)numeric_limits<IndexType>::max());
        }
        // m_indices[i] == i, so that &m_indices[classId] points at the index of the category.
        iota(m_indices.begin(), m_indices.end(), 0);
    }

    // Fills 'data' with a one-hot sparse label for the category 'classId':
    // a single non-zero value (1) located at index 'classId'.
    // Fails with a runtime error if 'classId' is not smaller than the label dimension.
    void CreateLabelFor(size_t classId, CategorySequenceData& data) override
    {
        // Guard against reading past the end of m_indices for an invalid class id.
        if (classId >= m_indices.size())
        {
            RuntimeError("Class id (%d) is out of range for the label dimension (%d)\n",
                (int)classId, (int)m_indices.size());
        }

        data.m_nnzCounts.resize(1);
        data.m_nnzCounts[0] = 1;
        data.m_totalNnzCount = 1;
        data.m_data = &m_value;
        data.m_indices = &(m_indices[classId]);
    }

    // Returns the number of categories this generator was created for.
    size_t LabelDimension() const override
    {
        return m_indices.size();
    }

private:
    TElement m_value;            // The value 1 in the element type of the label.
    vector<IndexType> m_indices; // Category indices 0 .. labelDimension - 1.
};
}}}

Просмотреть файл

@ -67,7 +67,6 @@ struct SequenceDataBase
size_t m_id;
uint32_t m_numberOfSamples; // Number of samples in the sequence
ChunkPtr m_chunk;
// Returns a pointer to the data buffer.
// The actual size is provided for particular sequences,i.e. see DenseSequenceData, or SparseSequenceData.
virtual const void* GetDataBuffer() = 0;

Просмотреть файл

@ -1,26 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Reader.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Returns the size in bytes of a single element of the given element type.
// Only float and double are supported; any other type results in a runtime error.
inline size_t GetSizeByType(ElementType type)
{
    if (type == ElementType::tfloat)
        return sizeof(float);

    if (type == ElementType::tdouble)
        return sizeof(double);

    RuntimeError("Unsupported type '%d'", static_cast<int>(type));
}
}}}

Просмотреть файл

@ -3,21 +3,24 @@
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#define __STDC_FORMAT_MACROS
#define _CRT_SECURE_NO_WARNINGS
#include <inttypes.h>
#include "Indexer.h"
#include "TextReaderConstants.h"
using std::string;
const static char ROW_DELIMITER = '\n';
namespace Microsoft { namespace MSR { namespace CNTK {
Indexer::Indexer(FILE* file, bool isPrimary, bool skipSequenceIds, size_t chunkSize) :
Indexer::Indexer(FILE* file, bool isPrimary, bool skipSequenceIds, char streamPrefix, size_t chunkSize, size_t bufferSize) :
m_streamPrefix(streamPrefix),
m_bufferSize(bufferSize),
m_file(file),
m_fileOffsetStart(0),
m_fileOffsetEnd(0),
m_buffer(new char[BUFFER_SIZE + 1]),
m_buffer(new char[bufferSize + 1]),
m_bufferStart(nullptr),
m_bufferEnd(nullptr),
m_pos(nullptr),
@ -35,7 +38,7 @@ void Indexer::RefillBuffer()
{
if (!m_done)
{
size_t bytesRead = fread(m_buffer.get(), 1, BUFFER_SIZE, m_file);
size_t bytesRead = fread(m_buffer.get(), 1, m_bufferSize, m_file);
if (bytesRead == (size_t)-1)
RuntimeError("Could not read from the input file.");
if (bytesRead == 0)
@ -116,7 +119,7 @@ void Indexer::Build(CorpusDescriptorPtr corpus)
}
// check the first byte and decide what to do next
if (!m_hasSequenceIds || m_bufferStart[0] == NAME_PREFIX)
if (!m_hasSequenceIds || m_bufferStart[0] == m_streamPrefix)
{
// skip sequence id parsing, treat lines as individual sequences
BuildFromLines(corpus);

Просмотреть файл

@ -0,0 +1,189 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <stdint.h>
#include <vector>
#include "DataDeserializer.h"
#include "CorpusDescriptor.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Sequence metadata. This text-reader specific descriptor adds two additional
// fields: file offset and size in bytes. Both are required to efficiently
// locate and retrieve a sequence from file, given a sequence descriptor.
struct SequenceDescriptor : SequenceDescription
{
// Creates a descriptor with a value-initialized base description,
// zero file offset and zero size.
SequenceDescriptor() : SequenceDescription({}), m_fileOffsetBytes(0),
m_byteSize(0)
{
}
// size_t m_numberOfSamples -- number of samples in the sequence (largest count among all inputs)
// in case of text data this value == number of rows this sequence spans over.
int64_t m_fileOffsetBytes; // sequence offset in the input file (in bytes)
size_t m_byteSize; // size in bytes
};
// Chunk metadata, similar to the sequence descriptor above,
// but used to facilitate indexing and retrieval of blobs of input data of
// some user-specified size.
struct ChunkDescriptor : ChunkDescription
{
// Creates an empty chunk descriptor: value-initialized base description, zero size, no sequences.
ChunkDescriptor() : ChunkDescription({}), m_byteSize(0) {}
// TODO: if we don't want to keep the whole index
// (metadata for all sequences in memory), we should not
// leave this empty when building a chunk index, and only
// fill it out when the chunk needs to be loaded
// (the indexer will have to do a second pass for this chunk).
std::vector<SequenceDescriptor> m_sequences; // descriptors of the sequences that belong to this chunk
size_t m_byteSize; // size in bytes
};
// Shared pointer to a chunk descriptor.
typedef shared_ptr<ChunkDescriptor> ChunkDescriptorPtr;
// A collection of chunk descriptors, each containing
// a collection of sequence descriptors for the corresponding
// chunk of the input data.
// It also stores a mapping of keys into sequence descriptors.
struct Index
{
std::vector<ChunkDescriptor> m_chunks; // chunks
std::map<size_t, std::pair<size_t, size_t>> m_keyToSequenceInChunk; // sequence key -> sequence location in chunk
const size_t m_maxChunkSize; // maximum chunk size in bytes
bool m_isPrimary; // index for primary deserializer
Index(size_t chunkSize, bool isPrimary) : m_maxChunkSize(chunkSize), m_isPrimary(isPrimary)
{}
// Adds sequence (metadata) to the index. Additionally, it
// assigns an appropriate chunk id to the sequence descriptor,
// ensures that chunks do not exceed the maximum allowed size
// (except when a sequence size is greater than the maximum chunk size)
void AddSequence(SequenceDescriptor& sd)
{
assert(!m_chunks.empty());
ChunkDescriptor* chunk = &m_chunks.back();
if (chunk->m_byteSize > 0 && (chunk->m_byteSize + sd.m_byteSize) > m_maxChunkSize)
{
// Creating a new chunk if the size is exceeded.
chunk->m_sequences.shrink_to_fit();
m_chunks.push_back({});
chunk = &m_chunks.back();
chunk->m_id = (ChunkIdType)(m_chunks.size() - 1);
if (CHUNKID_MAX < m_chunks.size())
{
RuntimeError("Maximum number of chunks exceeded");
}
}
chunk->m_byteSize += sd.m_byteSize;
chunk->m_numberOfSequences++;
chunk->m_numberOfSamples += sd.m_numberOfSamples;
sd.m_chunkId = chunk->m_id;
sd.m_id = chunk->m_sequences.size();
if (!m_isPrimary)
{
auto location = std::make_pair(chunk->m_id, sd.m_id);
auto sequenceId = sd.m_key.m_sequence;
m_keyToSequenceInChunk.insert(std::make_pair(sequenceId, location));
}
chunk->m_sequences.push_back(sd);
}
// Reserves inner structures for the specified number of bytes.
void Reserve(size_t sizeInBytes)
{
if (m_maxChunkSize > 0)
{
m_chunks.reserve((sizeInBytes + m_maxChunkSize - 1) / m_maxChunkSize);
}
m_chunks.push_back({});
}
// Checks if the index is empty.
bool IsEmpty() const
{
return m_chunks.empty();
}
DISABLE_COPY_AND_MOVE(Index);
};
// A helper class that does a pass over the input file building up
// an index consisting of sequence and chunk descriptors (which among
// others specify size and file offset of the respective structure).
// As opposed to the data deserializer, indexer performs almost no parsing
// and therefore is several magnitudes faster.
class Indexer
{
public:
// file            - input file to index,
// isPrimary       - whether the index is built for the primary deserializer,
// skipSequenceIds - when true, the sequence id column is ignored during indexing,
// streamPrefix    - character that starts a stream name ('|' by default),
// chunkSize       - chunk size in bytes (32 MB by default),
// bufferSize      - size in bytes of the read buffer (2 MB by default).
Indexer(FILE* file, bool isPrimary, bool skipSequenceIds = false, char streamPrefix = '|', size_t chunkSize = 32 * 1024 * 1024, size_t bufferSize = 2 * 1024 * 1024);
// Reads the input file, building an index of chunks and corresponding
// sequences.
void Build(CorpusDescriptorPtr corpus);
// Returns input data index (chunk and sequence metadata)
const Index& GetIndex() const { return m_index; }
// True, when input does not have the sequence id column
// or when sequence id column was ignored during indexing
// (by passing skipSequenceIds = true to the constructor).
// NOTE(review): the wording above looks inverted with respect to the name - Build()
// treats lines as individual sequences when m_hasSequenceIds is false; please confirm.
bool HasSequenceIds() const { return m_hasSequenceIds; }
private:
// NOTE(review): members are initialized in the order they are declared here,
// not in the order used in the constructor's initializer list.
FILE* m_file; // input file
int64_t m_fileOffsetStart;
int64_t m_fileOffsetEnd;
std::unique_ptr<char[]> m_buffer; // read buffer
const size_t m_bufferSize; // number of bytes requested from the file per refill
const char* m_bufferStart;
const char* m_bufferEnd;
const char* m_pos; // buffer index
bool m_done; // true, when all input was processed
bool m_hasSequenceIds; // true, when input contains one sequence per line
// or when sequence id column was ignored during indexing.
// a collection of chunk descriptors and sequence keys.
Index m_index;
const char m_streamPrefix; // character that starts a stream name
// Adds the sequence to the index, with a check that the sequence is included in the corpus descriptor.
void AddSequenceIfIncluded(CorpusDescriptorPtr corpus, size_t sequenceKey, SequenceDescriptor& sd);
// fills up the buffer with data from file, all previously buffered data
// will be overwritten.
void RefillBuffer();
// Moves the buffer position to the beginning of the next line.
void SkipLine();
// Reads the line until the next pipe character, parsing numerical characters into a sequence id.
// Throws an exception if a non-numerical is read until the pipe character or
// EOF is reached without hitting the pipe character.
// Returns false if no numerical characters are found preceding the pipe.
// Otherwise, writes sequence id value to the provided reference, returns true.
bool TryGetSequenceId(size_t& id);
// Build a chunk/sequence index, treating each line as an individual sequence.
// Does not do any sequence parsing, instead uses line number as
// the corresponding sequence id.
void BuildFromLines(CorpusDescriptorPtr corpus);
// Returns current offset in the input file (in bytes).
int64_t GetFileOffset() const { return m_fileOffsetStart + (m_pos - m_bufferStart); }
DISABLE_COPY_AND_MOVE(Indexer);
};
}}}

Просмотреть файл

@ -7,7 +7,7 @@
#define _SCL_SECURE_NO_WARNINGS
#include "PackerBase.h"
#include "ElementTypeUtils.h"
#include "ReaderUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {

Просмотреть файл

@ -50,6 +50,7 @@
<ClInclude Include="ChunkCache.h" />
<ClInclude Include="ChunkRandomizer.h" />
<ClInclude Include="ExceptionCapture.h" />
<ClInclude Include="Indexer.h" />
<ClInclude Include="ReaderBase.h" />
<ClInclude Include="SequenceData.h" />
<ClInclude Include="TransformBase.h" />
@ -65,7 +66,7 @@
<ClInclude Include="NoRandomizer.h" />
<ClInclude Include="CudaMemoryProvider.h" />
<ClInclude Include="DataDeserializer.h" />
<ClInclude Include="ElementTypeUtils.h" />
<ClInclude Include="ReaderUtil.h" />
<ClInclude Include="FramePacker.h" />
<ClInclude Include="HeapMemoryProvider.h" />
<ClInclude Include="MemoryProvider.h" />
@ -78,6 +79,7 @@
<ClCompile Include="Bundler.cpp" />
<ClCompile Include="ChunkCache.cpp" />
<ClCompile Include="ChunkRandomizer.cpp" />
<ClCompile Include="Indexer.cpp" />
<ClCompile Include="NoRandomizer.cpp" />
<ClCompile Include="BlockRandomizer.cpp" />
<ClCompile Include="PackerBase.cpp" />

Просмотреть файл

@ -25,9 +25,6 @@
<ClInclude Include="Transformer.h">
<Filter>Interfaces</Filter>
</ClInclude>
<ClInclude Include="ElementTypeUtils.h">
<Filter>Utils</Filter>
</ClInclude>
<ClInclude Include="DataDeserializerBase.h">
<Filter>Deserializers</Filter>
</ClInclude>
@ -88,6 +85,12 @@
<ClInclude Include="TransformBase.h">
<Filter>Transformers</Filter>
</ClInclude>
<ClInclude Include="Indexer.h">
<Filter>Utils</Filter>
</ClInclude>
<ClInclude Include="ReaderUtil.h">
<Filter>Utils</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="NoRandomizer.cpp">
@ -126,6 +129,9 @@
<ClCompile Include="ReaderBase.cpp">
<Filter>Utils</Filter>
</ClCompile>
<ClCompile Include="Indexer.cpp">
<Filter>Utils</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Interfaces">

Просмотреть файл

@ -0,0 +1,80 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Reader.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Returns the size in bytes of a single element of the given element type.
// Only float and double are supported; any other type results in a runtime error.
inline size_t GetSizeByType(ElementType type)
{
    if (type == ElementType::tfloat)
        return sizeof(float);

    if (type == ElementType::tdouble)
        return sizeof(double);

    RuntimeError("Unsupported type '%d'", static_cast<int>(type));
}
// Builds the base64 decode table: the table is indexed by a character (as unsigned char)
// and yields the 6-bit value of that character in the base64 alphabet
// ('A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, '/' -> 63).
// All other entries, including the one for the padding character '=', are zero.
static std::vector<unsigned char> FillIndexTable()
{
    // One entry per possible unsigned char value (0..255), so that any byte is a valid index.
    std::vector<unsigned char> indexTable(
        static_cast<size_t>(std::numeric_limits<unsigned char>::max()) + 1, 0);

    unsigned char value = 0;
    for (unsigned char i = 'A'; i <= 'Z'; i++)
        indexTable[i] = value++;
    assert(value == 26);

    for (unsigned char i = 'a'; i <= 'z'; i++)
        indexTable[i] = value++;
    assert(value == 52);

    for (unsigned char i = '0'; i <= '9'; i++)
        indexTable[i] = value++;
    assert(value == 62);

    indexTable['+'] = value++;
    indexTable['/'] = value++;
    assert(value == 64);

    return indexTable;
}
// The base64 alphabet: the character at position i encodes the 6-bit value i.
// NOTE(review): not referenced anywhere in the visible part of this header.
const static char* base64IndexTable = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// Decode table built by FillIndexTable(): maps a base64 character to its 6-bit value.
// Being 'static' in a header, every translation unit that includes it gets its own copy.
static std::vector<unsigned char> base64DecodeTable = FillIndexTable();
// Returns true if the character can appear in base64 encoded data:
// an alphanumeric character, '/', '+' or the padding character '='.
inline bool IsBase64Char(char c)
{
    // isalnum expects a value representable as unsigned char (or EOF);
    // passing a negative char directly is undefined behavior.
    return isalnum(static_cast<unsigned char>(c)) || c == '/' || c == '+' || c == '=';
}
// Decodes base64 encoded data located in [begin, end) and returns the decoded bytes.
// The input is expected to contain base64 characters only (checked by an assert)
// and its length has to be a multiple of 4, otherwise a runtime error is raised.
// Up to two trailing '=' padding characters are taken into account when
// computing the length of the result. Empty input yields an empty result.
inline std::vector<char> DecodeBase64(const char* begin, const char* end)
{
    assert(std::find_if(begin, end, [](char c) { return !IsBase64Char(c); }) == end);

    const size_t length = end - begin;
    if (length % 4 != 0)
        RuntimeError("Invalid base64 data, length '%d' is not divisible by 4.", (int)length);

    std::vector<char> result;
    if (length == 0)
        return result; // Nothing to decode; also keeps the padding check below inside of the input.

    // Looks up the 6-bit value of a character. The character is converted to unsigned char
    // first, so that a negative char cannot produce an out-of-range index.
    const auto valueOf = [](char c) -> unsigned int
    {
        const size_t index = static_cast<unsigned char>(c);
        return index < base64DecodeTable.size() ? base64DecodeTable[index] : 0u;
    };

    result.resize((length * 3) / 4); // Upper bound on the max number of decoded symbols.
    size_t currentDecodedIndex = 0;
    for (const char* quad = begin; quad < end; quad += 4)
    {
        // Each group of 4 characters carries 4 * 6 = 24 bits, i.e. 3 decoded bytes.
        const unsigned int v0 = valueOf(quad[0]);
        const unsigned int v1 = valueOf(quad[1]);
        const unsigned int v2 = valueOf(quad[2]);
        const unsigned int v3 = valueOf(quad[3]);

        result[currentDecodedIndex++] = static_cast<char>((v0 << 2 | v1 >> 4) & 0xFF);
        result[currentDecodedIndex++] = static_cast<char>((v1 << 4 | v2 >> 2) & 0xFF);
        result[currentDecodedIndex++] = static_cast<char>((v2 << 6 | v3) & 0xFF);
    }

    // In base64 each 3 bytes are encoded with 4 characters. The last group may be padded
    // with one or two '=' characters, which do not contribute decoded bytes.
    // length >= 4 here, so end[-1] and end[-2] are inside of the input.
    const size_t padding = end[-2] == '=' ? 2 : (end[-1] == '=' ? 1 : 0);
    result.resize((length * 3) / 4 - padding);
    return result;
}
}}}

Просмотреть файл

@ -10,7 +10,7 @@
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include "SequencePacker.h"
#include "ElementTypeUtils.h"
#include "ReaderUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {

Просмотреть файл

@ -9,7 +9,7 @@
#include <cmath>
#include <deque>
#include "TruncatedBpttPacker.h"
#include "ElementTypeUtils.h"
#include "ReaderUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -38,3 +38,33 @@ Simple_Test = [
]
]
]
DeserializerType = "ImageDeserializer"
MapFile="$RootDir$/ImageReaderSimple_map.txt"
Composite_Test= {
reader = {
verbosity = 0 ; randomize = false
deserializers = ({
type = $DeserializerType$
module = "ImageReader"
file = "$MapFile$"
input = {
features = {
transforms = (
{ type = "Crop" ; cropType = "center" ; cropRatio = 1.0 ; jitterType = "uniRatio" }:
{ type = "Scale" ; width = 4 ; height = 8 ; channels = 3 ; interpolations = "linear" }:
{ type = "Mean" ; }:
{ type = "Transpose" }
)
}
labels = {
labelDim = 4
}
}
})
}
}

Просмотреть файл

@ -0,0 +1,8 @@
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254 254
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1

Просмотреть файл

@ -0,0 +1,4 @@
0 /9j/4AAQSkZJRgABAQEAYABgAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAEAAAAAAAD/2wBDAAIBAQIBAQICAgICAgICAwUDAwMDAwYEBAMFBwYHBwcGBwcICQsJCAgKCAcHCg0KCgsMDAwMBwkODw0MDgsMDAz/2wBDAQICAgMDAwYDAwYMCAcIDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAz/wAARCAAIAAQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD+f+iiigD/2Q==
1 /9j/4AAQSkZJRgABAQEAYABgAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAEAAAAAAAD/2wBDAAIBAQIBAQICAgICAgICAwUDAwMDAwYEBAMFBwYHBwcGBwcICQsJCAgKCAcHCg0KCgsMDAwMBwkODw0MDgsMDAz/2wBDAQICAgMDAwYDAwYMCAcIDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAz/wAARCAAIAAQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD8c6KKK/38Pys//9k=
2 /9j/4AAQSkZJRgABAQEAYABgAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAEAAAAAAAD/2wBDAAIBAQIBAQICAgICAgICAwUDAwMDAwYEBAMFBwYHBwcGBwcICQsJCAgKCAcHCg0KCgsMDAwMBwkODw0MDgsMDAz/2wBDAQICAgMDAwYDAwYMCAcIDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAz/wAARCAAIAAQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD1iiiiv8rz/FM//9k=
3 /9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAIBAQIBAQICAgICAgICAwUDAwMDAwYEBAMFBwYHBwcGBwcICQsJCAgKCAcHCg0KCgsMDAwMBwkODw0MDgsMDAz/2wBDAQICAgMDAwYDAwYMCAcIDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAz/wAARCAAIAAQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD4vooor+Uz/fw//9k=

Просмотреть файл

@ -53,6 +53,40 @@ BOOST_AUTO_TEST_CASE(ImageAndTextReaderSimple)
1);
}
// Runs the "Composite_Test" reader configuration twice - with the default parameters
// and with the map file and deserializer type overridden for the base64 deserializer -
// and compares the output of both runs against the same control file.
BOOST_AUTO_TEST_CASE(ImageSimpleCompositeAndBase64)
{
// Runs the reader test with the given additional configuration parameters.
auto test = [this](std::vector<std::wstring> additionalParameters)
{
HelperRunReaderTest<float>(
testDataPath() + "/Config/ImageReaderSimple_Config.cntk",
testDataPath() + "/Control/ImageSimpleCompositeAndBase64_Control.txt",
testDataPath() + "/Control/ImageSimpleCompositeAndBase64_Output.txt",
"Composite_Test",
"reader",
4,
4,
1,
1,
1,
0,
1,
false,
false,
true,
additionalParameters);
};
// Image deserializer.
test({});
// Base64 deserializer.
// NOTE(review): the trailing "]]" in the DeserializerType override looks unintended - please confirm.
test(
{
L"MapFile=\"$RootDir$/Base64ImageReaderSimple_map.txt\"",
L"DeserializerType=\"Base64ImageDeserializer\"]]"
});
};
BOOST_AUTO_TEST_CASE(ImageAndImageReaderSimple)
{
HelperRunReaderTest<float>(

Просмотреть файл

@ -118,7 +118,6 @@
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\..\Source\Readers\CNTKTextFormatReader\Indexer.cpp" />
<ClCompile Include="..\..\..\Source\Readers\CNTKTextFormatReader\TextParser.cpp" />
</ItemGroup>
<ItemGroup>

Просмотреть файл

@ -19,9 +19,6 @@
<ClCompile Include="..\..\..\Source\Readers\CNTKTextFormatReader\TextParser.cpp">
<Filter>Linked Source</Filter>
</ClCompile>
<ClCompile Include="..\..\..\Source\Readers\CNTKTextFormatReader\Indexer.cpp">
<Filter>Linked Source</Filter>
</ClCompile>
<ClCompile Include="CNTKBinaryReaderTests.cpp" />
</ItemGroup>
<ItemGroup>