Final update for CNTK Binary Reader.

This commit is contained in:
thhoens 2016-11-30 17:29:29 -08:00
Родитель 1a5b835058
Коммит ef6aa8584e
12 изменённых файлов: 676 добавлений и 640 удалений

1
.gitattributes поставляемый
Просмотреть файл

@ -139,3 +139,4 @@ Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b* text
*.dnn binary
Examples/Image/Detection/FastRCNN/fastRCNN/*/*.pyd binary
Tests/UnitTests/V2LibraryTests/data/*.bin binary
Tests/UnitTests/ReaderTests/Data/CNTKBinaryReader/*.bin binary

Просмотреть файл

@ -593,6 +593,12 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "LSTM", "LSTM", "{9F1F9C7C-2
Tests\EndToEndTests\Examples\Speech\AN4\LSTM\testcases.yml = Tests\EndToEndTests\Examples\Speech\AN4\LSTM\testcases.yml
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKBinaryReader", "Source\Readers\CNTKBinaryReader\CNTKBinaryReader.vcxproj", "{7FE16CBE-B717-45C9-97FB-FA3191039568}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKTextFormatReader", "Source\Readers\CNTKTextFormatReader\CNTKTextFormatReader.vcxproj", "{91973E60-A7BE-4C86-8FDB-59C88A0B3715}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
@ -1417,10 +1423,6 @@ EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Python", "Python", "{CB4566F1-6C8F-4270-83EE-F6AED84EBB2B}"
ProjectSection(SolutionItems) = preProject
Examples\Video\GettingStarted\Python\Conv3D_UCF11.py = Examples\Video\GettingStarted\Python\Conv3D_UCF11.py
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKBinaryReader", "Source\Readers\CNTKBinaryReader\CNTKBinaryReader.vcxproj", "{7FE16CBE-B717-45C9-97FB-FA3191039568}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
EndProjectSection
EndProject
Global
@ -1951,6 +1953,7 @@ Global
{181664AC-4C95-4798-A923-09B879215B33} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
{86883653-8A61-4038-81A0-2379FAE4200A} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{7B7A563D-AA8E-4660-A805-D50235A02120} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{7FE16CBE-B717-45C9-97FB-FA3191039568} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{1FB54750-B668-4AC3-966F-ED504020AC06} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
{3E9BD61F-1F0A-4966-BE17-803AEFD1DFA4} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{5560DDD4-1E6E-4F41-B9BD-F52A19DF0B31} = {6994C86D-A672-4254-824A-51F4DFEB807F}
@ -2052,6 +2055,5 @@ Global
{5EDBCD1A-4F07-4618-84C9-FC6905A438B4} = {FB604F98-008F-45CD-B06E-42C30E121F13}
{39C3C8CA-9A8A-4733-ADBB-3E19D0F52528} = {2A95B23C-D91E-4DF9-B8F0-5E997608AB65}
{CB4566F1-6C8F-4270-83EE-F6AED84EBB2B} = {39C3C8CA-9A8A-4733-ADBB-3E19D0F52528}
{7FE16CBE-B717-45C9-97FB-FA3191039568} = {33EBFE78-A1A8-4961-8938-92A271941F94}
EndGlobalSection
EndGlobal

Просмотреть файл

@ -792,19 +792,24 @@ $(SPARSEPCREADER): $(SPARSEPCREADER_OBJ) | $(CNTKMATH_LIB)
########################################
# CNTKBinaryReader plugin
########################################
CNTKBINARYREADER_SRC =\
$(SOURCEDIR)/Readers/CNTKBinaryReader/Exports.cpp \
$(SOURCEDIR)/Readers/CNTKBinaryReader/BinaryChunkDeserializer.cpp \
$(SOURCEDIR)/Readers/CNTKBinaryReader/BinaryConfigHelper.cpp \
$(SOURCEDIR)/Readers/CNTKBinaryReader/CNTKBinaryReader.cpp \
CNTKBINARYREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKBINARYREADER_SRC))
CNTKBINARYREADER:=$(LIBDIR)/CNTKBinaryReader.so
ALL += $(CNTKBINARYREADER)
SRC+=$(CNTKBINARYREADER_SRC)
$(CNTKBINARYREADER): $(CNTKBINARYREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
########################################
# CNTKTextFormatReader plugin
########################################
@ -1061,6 +1066,7 @@ $(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader
UNITTEST_READER_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKBinaryReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKTextFormatReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \

Просмотреть файл

@ -1,4 +1,21 @@
#!/usr/bin/env python
# This script takes a CNTK text format file and a header file, and converts it
# to a CNTK binary format file.
#
# The header file must list all of the streams in the input file in the
# following format:
# <desired stream name> TAB <stream alias> TAB <matrix type> TAB <sample dimension>
#
# Where:
# <desired stream name> is the desired name for the input in CNTK.
# <stream alias> is the alias for the stream in the input file.
# <matrix type> is the matrix type, i.e., dense or sparse
# <sample dimension> is the dimension of each sample for the input
#
import sys
import argparse
import re
import struct
import tempfile
@ -26,7 +43,7 @@ class Converter(object):
def appendSample(self, sample):
if( len(sample) != self.sampleDim ):
print "Invalid sample dimension for input {0}".format( self.name )
print( "Invalid sample dimension for input {0}" ).format( self.name )
sys.exit()
if( len(self.vals) == 0 ):
self.vals.append( list() )
@ -63,7 +80,7 @@ class DenseConverter(Converter):
output = ""
for sequence in self.vals:
if( len(sequence) != 1 ):
print "Converter does not support dense sequences."
print( "Converter does not support dense sequences." )
sys.exit()
for sample in sequence[0]:
output += struct.pack( "f", float(sample) )
@ -79,7 +96,7 @@ class SparseConverter(Converter):
def appendSample(self, sample):
for samp in sample:
if( int(samp.split(":")[0]) >= self.sampleDim ):
print "Invalid sample dimension for input {0}. Max {1}, given {2}".format( self.name, self.sampleDim, sample.split( ":" )[0] )
print( "Invalid sample dimension for input {0}. Max {1}, given {2}" ).format( self.name, self.sampleDim, sample.split( ":" )[0] )
sys.exit()
if( len(self.vals) == 0 ):
self.vals.append( list() )
@ -157,7 +174,7 @@ def GetConverter( inputtype, name, sampleDim ):
elif( inputtype.lower() == 'sparse' ):
converter = SparseConverter( name, sampleDim )
else:
print 'Invalid input format {0}'.format( inputtype )
print( 'Invalid input format {0}' ).format( inputtype )
sys.exit()
return converter
@ -195,23 +212,26 @@ def OutputOffset( headerFile, numBytes, numSeqs, numSamples ):
headerFile.write( struct.pack( "i", numSamples ) )
if __name__ == '__main__':
if( len(sys.argv) != 5 ):
print 'Invalid usage. Expected: ', sys.argv[0], ' <ctf file> <header file> <num sequences per chunk> <bin file>'
sys.exit()
parser = argparse.ArgumentParser(description="Transforms a CNTK Text Format file into CNTK binary format given a header.")
parser.add_argument('--input', help="CNTK Text Format file to convert to binary.", default="", required=True)
parser.add_argument('--header', help="Header file describing each stream in the input.", default="", required=True)
parser.add_argument('--seqsPerChunk', type=int, help='Number of sequences in each chunk.', default="", required=True)
parser.add_argument('--output', help='Name of the output file, stdout if not given', default="", required=True)
args = parser.parse_args()
# Since we don't know how many chunks we're going to write until we're done,
# grow the header/offsets table and the data portion separately. Then at the
# end concatenate the data portion onto the end of the header/offsets
# portion.
headerFile = open( sys.argv[4], "wb+" )
dataFile = tempfile.NamedTemporaryFile(mode="rb+", delete=False)
dataPath = dataFile.name
binaryHeaderFile = open( args.output, "wb+" )
binaryDataFile = tempfile.NamedTemporaryFile(mode="rb+", delete=False)
dataPath = binaryDataFile.name
# parse the header to get the converters for this file
# <name> <alias> <input format> <sample size>
converters = []
aliasToId = dict()
with open( sys.argv[2], "r" ) as headerfile:
with open( args.header, "r" ) as headerfile:
id = 0
for line in headerfile:
split = re.split(r'\t+', line.strip())
@ -219,15 +239,15 @@ if __name__ == '__main__':
aliasToId[ split[ 1 ] ] = id
id += 1
OutputHeader( headerFile, converters )
OutputHeader( binaryHeaderFile, converters )
numChunks = 0
with open( sys.argv[1], "r" ) as datafile:
with open( args.input, "r" ) as inputFile:
curSequence = list()
numSeqs = 0
numSamps = 0
prevId = None
for line in datafile:
for line in inputFile:
split = line.rstrip().split('|')
# if the sequence id is empty or not equal to the previous sequence id,
# we are at a new sequence.
@ -236,10 +256,10 @@ if __name__ == '__main__':
numSamps += ParseSequence( aliasToId, curSequence, converters )
curSequence = list()
numSeqs += 1
if( numSeqs % int( sys.argv[3] ) == 0 ):
numBytes = OutputChunk( dataFile, converters )
if( numSeqs % int( args.seqsPerChunk ) == 0 ):
numBytes = OutputChunk( binaryDataFile, converters )
numChunks += 1
OutputOffset( headerFile, numBytes, numSeqs, numSamps )
OutputOffset( binaryHeaderFile, numBytes, numSeqs, numSamps )
numSeqs = 0
numSamps = 0
prevId = split[ 0 ]
@ -251,16 +271,16 @@ if __name__ == '__main__':
numSeqs += 1
numChunks += 1
numBytes = OutputChunk( dataFile, converters )
OutputOffset( headerFile, numBytes, numSeqs, numSamps )
numBytes = OutputChunk( binaryDataFile, converters )
OutputOffset( binaryHeaderFile, numBytes, numSeqs, numSamps )
UpdateHeader( headerFile, numChunks )
headerFile.flush()
dataFile.flush()
headerFile.close()
dataFile.close()
UpdateHeader( binaryHeaderFile, numChunks )
binaryHeaderFile.flush()
binaryDataFile.flush()
binaryHeaderFile.close()
binaryDataFile.close()
destination = open( sys.argv[4], 'awb+' )
destination = open( args.output, 'awb+' )
shutil.copyfileobj( open( dataPath, "rb" ), destination )
destination.flush()

Просмотреть файл

@ -124,14 +124,14 @@ void BinaryChunkDeserializer::Initialize(const std::map<std::wstring, std::wstri
// Read the matrix type. Then instantiate the appropriate BinaryDataDeserializer, and have it read in its parameters
// Note: Is there a better way to do this?
DeserializerType matType;
CNTKBinaryFileHelper::readOrDie(&matType, sizeof(matType), 1, m_file);
if (matType == DeserializerType::DenseBinaryDataDeserializer)
DeserializerType desType;
CNTKBinaryFileHelper::readOrDie(&desType, sizeof(desType), 1, m_file);
if (desType == DeserializerType::DenseBinaryDataDeserializer)
m_deserializers[c] = make_shared<DenseBinaryDataDeserializer>(m_file);
else if (matType == DeserializerType::SparseBinaryDataDeserializer)
else if (desType == DeserializerType::SparseBinaryDataDeserializer)
m_deserializers[c] = make_shared<SparseBinaryDataDeserializer>(m_file);
else
RuntimeError("Unknown matrix type %d requested.", matType);
RuntimeError("Unknown deserializer type %d requested.", desType);
streamDescription->m_id = c;
streamDescription->m_elementType = m_deserializers[c]->GetElementType();
@ -181,6 +181,8 @@ void BinaryChunkDeserializer::GetSequencesForChunk(ChunkIdType chunkId, std::vec
// We don't store every piece of sequence information, so we have to read the chunk in, parse it, and then
// find the information.
// BUGBUG: Note this requires reading each chunk twice. This might not be hugely disadvantageous due to OS
// caching, but should be avoided nonetheless.
ChunkPtr chunk = GetChunk(chunkId);
size_t startId = m_offsetsTable->GetStartIndex(chunkId);

Просмотреть файл

@ -27,7 +27,7 @@ struct DiskOffsetsTable
class OffsetsTable {
public:
OffsetsTable(size_t numChunks, DiskOffsetsTable* offsetsTable) : m_numChunks(numChunks)//, m_diskOffsetsTable(offsetsTable)
OffsetsTable(size_t numChunks, DiskOffsetsTable* offsetsTable) : m_numChunks(numChunks)
{
m_diskOffsetsTable = make_unique<DiskOffsetsTable*>(offsetsTable);
Initialize();

Просмотреть файл

@ -34,12 +34,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ConfigParameters input = section.second;
wstring name = msra::strfun::utf16(section.first);
// If there is an option for "original", we will rename the stream with the "original"
// If there is an option for "alias", we will rename the stream with the "alias"
// name to the target name.
if (input.ExistsCurrent(L"original"))
if (input.ExistsCurrent(L"alias"))
{
wstring original = msra::strfun::utf16(input(L"original"));
m_streams[original] = name;
wstring alias = msra::strfun::utf16(input(L"alias"));
m_streams[alias] = name;
}
else
m_streams[name] = name;

Просмотреть файл

@ -42,11 +42,6 @@ protected:
return m_data;
}
void SetDataBuffer(void* data)
{
m_data = data;
}
void* m_data;
};
@ -65,11 +60,6 @@ protected:
return m_data;
}
void SetDataBuffer(void* data)
{
m_data = data;
}
std::vector<IndexType> m_indicesBuffer;
void* m_data;
};
@ -115,7 +105,7 @@ public:
for (size_t c = 0; c < numSequences; c++)
{
shared_ptr<DenseInputStreamBuffer> sequence = make_shared<DenseInputStreamBuffer>();
sequence->SetDataBuffer( (char*)data + c*m_numCols*elemSize );
sequence->m_data = (char*)data + c*m_numCols*elemSize;
sequence->m_id = startIndex + c;
sequence->m_numberOfSamples = 1;
sequence->m_sampleLayout = std::make_shared<TensorShape>(m_numCols);
@ -204,7 +194,7 @@ public:
sequence->m_totalNnzCount = colOffsets[colIndex + 1] - colOffsets[colIndex];
// The values array is already properly packed, so just use it.
sequence->SetDataBuffer(values);
sequence->m_data = values;
// The indices are correct (note they MUST BE IN INCREASING ORDER), but we will have to fix them up a
// little bit, for now just use them

Просмотреть файл

@ -65,7 +65,7 @@ CNTKBinaryReader::CNTKBinaryReader(const ConfigParameters& config)
RuntimeError("CNTKBinaryReader: While reading '%ls': %s", configHelper.GetFilePath().c_str(), e.what());
}
if (configHelper.GetTraceLevel() > 2)
fprintf(stderr, "%s\n", log);
fprintf(stderr, "%s\n", log.c_str());
}
} } }

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -4,35 +4,13 @@
//
#include "stdafx.h"
#include <algorithm>
#include <io.h>
#include <cstdio>
#include <boost/scope_exit.hpp>
#include "Common/ReaderTestHelper.h"
#include "BinaryChunkDeserializer.h"
using namespace Microsoft::MSR::CNTK;
namespace Microsoft { namespace MSR { namespace CNTK {
// A thin wrapper around CNTK text format reader
class CNTKBinaryReaderTestRunner
{
BinaryChunkDeserializer m_deserializer;
public:
ChunkPtr m_chunk;
CNTKBinaryReaderTestRunner(const string& filename) :
m_deserializer(wstring(filename.begin(), filename.end()))
{
}
// Retrieves a chunk of data.
void LoadChunk()
{
m_chunk = m_deserializer.GetChunk(0);
}
};
namespace Test {
struct CNTKBinaryReaderFixture : ReaderFixture

Просмотреть файл

@ -8,10 +8,10 @@ SparseSeq = [
file = "sparseseqoutput.bin"
input = [
features1 = [ original="a" ]
features2 = [ original="b" ]
labels1 = [ original="c" ]
labels2 = [ original="d" ]
features1 = [ alias="a" ]
features2 = [ alias="b" ]
labels1 = [ alias="c" ]
labels2 = [ alias="d" ]
]
randomize = false
]
@ -24,10 +24,10 @@ Sparse = [
file = "sparseoutput.bin"
input = [
features1 = [ original="a" ]
features2 = [ original="b" ]
labels1 = [ original="c" ]
labels2 = [ original="d" ]
features1 = [ alias="a" ]
features2 = [ alias="b" ]
labels1 = [ alias="c" ]
labels2 = [ alias="d" ]
]
randomize = false
]
@ -40,10 +40,10 @@ Simple = [
file = "simple.bin"
input = [
features1 = [ original="a" ]
features2 = [ original="b" ]
features3 = [ original="c" ]
features4 = [ original="d" ]
features1 = [ alias="a" ]
features2 = [ alias="b" ]
features3 = [ alias="c" ]
features4 = [ alias="d" ]
]
randomize = false
]