Final update for CNTK Binary Reader.

2016-11-30 17:29:29 -08:00 · 2016-11-30 17:29:29 -08:00 · ef6aa8584e
--- a/.gitattributes
+++ b/.gitattributes
@ -139,3 +139,4 @@ Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b* text
 *.dnn binary
 Examples/Image/Detection/FastRCNN/fastRCNN/*/*.pyd binary
 Tests/UnitTests/V2LibraryTests/data/*.bin binary
+Tests/UnitTests/ReaderTests/Data/CNTKBinaryReader/*.bin binary
--- a/CNTK.sln
+++ b/CNTK.sln
@ -593,6 +593,12 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "LSTM", "LSTM", "{9F1F9C7C-2
 		Tests\EndToEndTests\Examples\Speech\AN4\LSTM\testcases.yml = Tests\EndToEndTests\Examples\Speech\AN4\LSTM\testcases.yml
 	EndProjectSection
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKBinaryReader", "Source\Readers\CNTKBinaryReader\CNTKBinaryReader.vcxproj", "{7FE16CBE-B717-45C9-97FB-FA3191039568}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
+	EndProjectSection
+EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKTextFormatReader", "Source\Readers\CNTKTextFormatReader\CNTKTextFormatReader.vcxproj", "{91973E60-A7BE-4C86-8FDB-59C88A0B3715}"
 	ProjectSection(ProjectDependencies) = postProject
 		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
@ -1417,10 +1423,6 @@ EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Python", "Python", "{CB4566F1-6C8F-4270-83EE-F6AED84EBB2B}"
 	ProjectSection(SolutionItems) = preProject
 		Examples\Video\GettingStarted\Python\Conv3D_UCF11.py = Examples\Video\GettingStarted\Python\Conv3D_UCF11.py
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKBinaryReader", "Source\Readers\CNTKBinaryReader\CNTKBinaryReader.vcxproj", "{7FE16CBE-B717-45C9-97FB-FA3191039568}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
 	EndProjectSection
 EndProject
 Global
@ -1951,6 +1953,7 @@ Global
 		{181664AC-4C95-4798-A923-09B879215B33} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
 		{86883653-8A61-4038-81A0-2379FAE4200A} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
 		{7B7A563D-AA8E-4660-A805-D50235A02120} = {33EBFE78-A1A8-4961-8938-92A271941F94}
+		{7FE16CBE-B717-45C9-97FB-FA3191039568} = {33EBFE78-A1A8-4961-8938-92A271941F94}
 		{1FB54750-B668-4AC3-966F-ED504020AC06} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
 		{3E9BD61F-1F0A-4966-BE17-803AEFD1DFA4} = {6994C86D-A672-4254-824A-51F4DFEB807F}
 		{5560DDD4-1E6E-4F41-B9BD-F52A19DF0B31} = {6994C86D-A672-4254-824A-51F4DFEB807F}
@ -2052,6 +2055,5 @@ Global
 		{5EDBCD1A-4F07-4618-84C9-FC6905A438B4} = {FB604F98-008F-45CD-B06E-42C30E121F13}
 		{39C3C8CA-9A8A-4733-ADBB-3E19D0F52528} = {2A95B23C-D91E-4DF9-B8F0-5E997608AB65}
 		{CB4566F1-6C8F-4270-83EE-F6AED84EBB2B} = {39C3C8CA-9A8A-4733-ADBB-3E19D0F52528}
-		{7FE16CBE-B717-45C9-97FB-FA3191039568} = {33EBFE78-A1A8-4961-8938-92A271941F94}
 	EndGlobalSection
 EndGlobal
--- a/6
+++ b/6
@ -792,19 +792,24 @@ $(SPARSEPCREADER): $(SPARSEPCREADER_OBJ) | $(CNTKMATH_LIB)
 ########################################
 # CNTKBinaryReader plugin
 ########################################
+
 CNTKBINARYREADER_SRC =\
 	$(SOURCEDIR)/Readers/CNTKBinaryReader/Exports.cpp \
 	$(SOURCEDIR)/Readers/CNTKBinaryReader/BinaryChunkDeserializer.cpp \
 	$(SOURCEDIR)/Readers/CNTKBinaryReader/BinaryConfigHelper.cpp \
 	$(SOURCEDIR)/Readers/CNTKBinaryReader/CNTKBinaryReader.cpp \
+
 CNTKBINARYREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKBINARYREADER_SRC))
+
 CNTKBINARYREADER:=$(LIBDIR)/CNTKBinaryReader.so
 ALL += $(CNTKBINARYREADER)
 SRC+=$(CNTKBINARYREADER_SRC)
+
 $(CNTKBINARYREADER): $(CNTKBINARYREADER_OBJ) | $(CNTKMATH_LIB)
 	@echo $(SEPARATOR)
 	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)

+
 ########################################
 # CNTKTextFormatReader plugin
 ########################################
@ -1061,6 +1066,7 @@ $(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
 INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader

 UNITTEST_READER_SRC = \
+	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKBinaryReaderTests.cpp \
 	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKTextFormatReaderTests.cpp \
 	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp \
 	$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \
--- a/Scripts/ctf2bin.py
+++ b/Scripts/ctf2bin.py
@ -1,4 +1,21 @@
+#!/usr/bin/env python
+
+# This script takes a CNTK text format file and a header file, and converts it
+# to a CNTK binary format file.
+#
+# The header file must list all of the streams in the input file in the
+# following format:
+#   <desired stream name> TAB <stream alias> TAB <matrix type> TAB <sample dimension>
+#
+# Where:
+#   <desired stream name> is the desired name for the input in CNTK.
+#   <stream alias> is the alias for the stream in the input file.
+#   <matrix type> is the matrix type, i.e., dense or sparse
+#   <sample dimension> is the dimension of each sample for the input
+#
+
 import sys
+import argparse
 import re
 import struct
 import tempfile
@ -26,7 +43,7 @@ class Converter(object):

    def appendSample(self, sample):
        if( len(sample) != self.sampleDim ):
-            print "Invalid sample dimension for input {0}".format( self.name )
+            print( "Invalid sample dimension for input {0}" ).format( self.name )
            sys.exit()
        if( len(self.vals) == 0 ):
            self.vals.append( list() )
@ -63,7 +80,7 @@ class DenseConverter(Converter):
        output = ""
        for sequence in self.vals:
            if( len(sequence) != 1 ):
-                print "Converter does not support dense sequences."
+                print( "Converter does not support dense sequences." )
                sys.exit()
            for sample in sequence[0]:
                output += struct.pack( "f", float(sample) )
@ -79,7 +96,7 @@ class SparseConverter(Converter):
    def appendSample(self, sample):
        for samp in sample:
            if( int(samp.split(":")[0]) >= self.sampleDim ):
-                print "Invalid sample dimension for input {0}. Max {1}, given {2}".format( self.name, self.sampleDim, sample.split( ":" )[0] )
+                print( "Invalid sample dimension for input {0}. Max {1}, given {2}" ).format( self.name, self.sampleDim, sample.split( ":" )[0] )
                sys.exit()
        if( len(self.vals) == 0 ):
            self.vals.append( list() )
@ -157,7 +174,7 @@ def GetConverter( inputtype, name, sampleDim ):
    elif( inputtype.lower() == 'sparse' ):
        converter = SparseConverter( name, sampleDim )
    else:
-        print 'Invalid input format {0}'.format( inputtype )
+        print( 'Invalid input format {0}' ).format( inputtype )
        sys.exit()

    return converter 
@ -195,23 +212,26 @@ def OutputOffset( headerFile, numBytes, numSeqs, numSamples ):
    headerFile.write( struct.pack( "i", numSamples ) )

 if __name__ == '__main__':
-    if( len(sys.argv) != 5 ):
-        print 'Invalid usage. Expected: ', sys.argv[0], ' <ctf file> <header file> <num sequences per chunk> <bin file>'
-        sys.exit()
+    parser = argparse.ArgumentParser(description="Transforms a CNTK Text Format file into CNTK binary format given a header.")
+    parser.add_argument('--input', help="CNTK Text Format file to convert to binary.", default="", required=True)
+    parser.add_argument('--header',  help="Header file describing each stream in the input.", default="", required=True)
+    parser.add_argument('--seqsPerChunk', type=int, help='Number of sequences in each chunk.', default="", required=True)
+    parser.add_argument('--output', help='Name of the output file, stdout if not given', default="", required=True)
+    args = parser.parse_args()

    # Since we don't know how many chunks we're going to write until we're done,
    # grow the header/offsets table and the data portion separately. then at the
    # end concatenate the data portion onto the end of the header/offsets
    # portion.
-    headerFile = open( sys.argv[4], "wb+" )
-    dataFile = tempfile.NamedTemporaryFile(mode="rb+", delete=False)
-    dataPath = dataFile.name
+    binaryHeaderFile = open( args.output, "wb+" )
+    binaryDataFile = tempfile.NamedTemporaryFile(mode="rb+", delete=False)
+    dataPath = binaryDataFile.name

    # parse the header to get the converters for this file
    # <name>    <alias>  <input format>  <sample size>
    converters = []
    aliasToId = dict()
-    with open( sys.argv[2], "r" ) as headerfile:
+    with open( args.header, "r" ) as headerfile:
        id = 0
        for line in headerfile:
            split = re.split(r'\t+', line.strip())
@ -219,15 +239,15 @@ if __name__ == '__main__':
            aliasToId[ split[ 1 ] ] = id
            id += 1

-    OutputHeader( headerFile, converters )
+    OutputHeader( binaryHeaderFile, converters )

    numChunks = 0
-    with open( sys.argv[1], "r" ) as datafile:
+    with open( args.input, "r" ) as inputFile:
        curSequence = list()
        numSeqs = 0
        numSamps = 0
        prevId = None
-        for line in datafile:
+        for line in inputFile:
            split = line.rstrip().split('|')
            # if the sequence id is empty or not equal to the previous sequence id,
            # we are at a new sequence.
@ -236,10 +256,10 @@ if __name__ == '__main__':
                    numSamps += ParseSequence( aliasToId, curSequence, converters )
                    curSequence = list()
                    numSeqs += 1
-                    if( numSeqs % int( sys.argv[3] ) == 0 ):
-                        numBytes = OutputChunk( dataFile, converters )
+                    if( numSeqs % int( args.seqsPerChunk ) == 0 ):
+                        numBytes = OutputChunk( binaryDataFile, converters )
                        numChunks += 1
-                        OutputOffset( headerFile, numBytes, numSeqs, numSamps )
+                        OutputOffset( binaryHeaderFile, numBytes, numSeqs, numSamps )
                        numSeqs = 0
                        numSamps = 0
                prevId = split[ 0 ]
@ -251,16 +271,16 @@ if __name__ == '__main__':
            numSeqs += 1
            numChunks += 1

-        numBytes = OutputChunk( dataFile, converters )
-        OutputOffset( headerFile, numBytes, numSeqs, numSamps )
+        numBytes = OutputChunk( binaryDataFile, converters )
+        OutputOffset( binaryHeaderFile, numBytes, numSeqs, numSamps )

-        UpdateHeader( headerFile, numChunks )
-        headerFile.flush()
-        dataFile.flush()
-        headerFile.close()
-        dataFile.close()
+        UpdateHeader( binaryHeaderFile, numChunks )
+        binaryHeaderFile.flush()
+        binaryDataFile.flush()
+        binaryHeaderFile.close()
+        binaryDataFile.close()

-        destination = open( sys.argv[4], 'awb+' )
+        destination = open( args.output, 'awb+' )
        shutil.copyfileobj( open( dataPath, "rb" ), destination )
        
        destination.flush()
--- a/Source/Readers/CNTKBinaryReader/BinaryChunkDeserializer.cpp
+++ b/Source/Readers/CNTKBinaryReader/BinaryChunkDeserializer.cpp
@ -124,14 +124,14 @@ void BinaryChunkDeserializer::Initialize(const std::map<std::wstring, std::wstri

        // Read the matrix type. Then instantiate the appropriate BinaryDataDeserializer, and have it read in its parameters
        // Note: Is there a better way to do this?
-        DeserializerType matType;
-        CNTKBinaryFileHelper::readOrDie(&matType, sizeof(matType), 1, m_file);
-        if (matType == DeserializerType::DenseBinaryDataDeserializer)
+        DeserializerType desType;
+        CNTKBinaryFileHelper::readOrDie(&desType, sizeof(desType), 1, m_file);
+        if (desType == DeserializerType::DenseBinaryDataDeserializer)
            m_deserializers[c] = make_shared<DenseBinaryDataDeserializer>(m_file);
-        else if (matType == DeserializerType::SparseBinaryDataDeserializer)
+        else if (desType == DeserializerType::SparseBinaryDataDeserializer)
            m_deserializers[c] = make_shared<SparseBinaryDataDeserializer>(m_file);
        else
-            RuntimeError("Unknown matrix type %d requested.", matType);
+            RuntimeError("Unknown deserializer type %d requested.", desType);

        streamDescription->m_id           = c;
        streamDescription->m_elementType  = m_deserializers[c]->GetElementType();
@ -181,6 +181,8 @@ void BinaryChunkDeserializer::GetSequencesForChunk(ChunkIdType chunkId, std::vec

    // We don't store every piece of sequence information, so we have to read the chunk in, parse it, and then
    // find the information.
+    // BUGBUG: Note this requires reading each chunk twice. This might not be hugely disadvantageous due to OS
+    // caching, but should be avoided nonetheless.
    ChunkPtr chunk = GetChunk(chunkId);

    size_t startId = m_offsetsTable->GetStartIndex(chunkId);
--- a/Source/Readers/CNTKBinaryReader/BinaryChunkDeserializer.h
+++ b/Source/Readers/CNTKBinaryReader/BinaryChunkDeserializer.h
@ -27,7 +27,7 @@ struct DiskOffsetsTable
 class OffsetsTable {
 public:

-    OffsetsTable(size_t numChunks, DiskOffsetsTable* offsetsTable) : m_numChunks(numChunks)//, m_diskOffsetsTable(offsetsTable)
+    OffsetsTable(size_t numChunks, DiskOffsetsTable* offsetsTable) : m_numChunks(numChunks)
    {
        m_diskOffsetsTable = make_unique<DiskOffsetsTable*>(offsetsTable);
        Initialize();
--- a/Source/Readers/CNTKBinaryReader/BinaryConfigHelper.cpp
+++ b/Source/Readers/CNTKBinaryReader/BinaryConfigHelper.cpp
@ -34,12 +34,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                ConfigParameters input = section.second;
                wstring name = msra::strfun::utf16(section.first);

-                // If there is an option for "original", we will rename the stream with the "original"
+                // If there is an option for "alias", we will rename the stream with the "alias"
                // name to the target name.
-                if (input.ExistsCurrent(L"original"))
+                if (input.ExistsCurrent(L"alias"))
                {
-                    wstring original = msra::strfun::utf16(input(L"original"));
-                    m_streams[original] = name;
+                    wstring alias = msra::strfun::utf16(input(L"alias"));
+                    m_streams[alias] = name;
                }
                else
                    m_streams[name] = name;
--- a/Source/Readers/CNTKBinaryReader/BinaryDataDeserializer.h
+++ b/Source/Readers/CNTKBinaryReader/BinaryDataDeserializer.h
@ -42,11 +42,6 @@ protected:
            return m_data;
        }

-        void SetDataBuffer(void* data)
-        {
-            m_data = data;
-        }
-
        void* m_data;
    };

@ -65,11 +60,6 @@ protected:
            return m_data;
        }
        
-        void SetDataBuffer(void* data)
-        {
-            m_data = data;
-        }
-
        std::vector<IndexType> m_indicesBuffer;
        void* m_data;
    };
@ -115,7 +105,7 @@ public:
        for (size_t c = 0; c < numSequences; c++)
        {
            shared_ptr<DenseInputStreamBuffer> sequence = make_shared<DenseInputStreamBuffer>();
-            sequence->SetDataBuffer( (char*)data + c*m_numCols*elemSize );
+            sequence->m_data            = (char*)data + c*m_numCols*elemSize;
            sequence->m_id              = startIndex + c;
            sequence->m_numberOfSamples = 1;
            sequence->m_sampleLayout    = std::make_shared<TensorShape>(m_numCols);
@ -204,7 +194,7 @@ public:
            sequence->m_totalNnzCount = colOffsets[colIndex + 1] - colOffsets[colIndex];

            // The values array is already properly packed, so just use it.
-            sequence->SetDataBuffer(values);
+            sequence->m_data = values;
            
            // The indices are correct (note they MUST BE IN INCREASING ORDER), but we will have to fix them up a 
            // little bit, for now just use them
--- a/Source/Readers/CNTKBinaryReader/CNTKBinaryReader.cpp
+++ b/Source/Readers/CNTKBinaryReader/CNTKBinaryReader.cpp
@ -65,7 +65,7 @@ CNTKBinaryReader::CNTKBinaryReader(const ConfigParameters& config)
        RuntimeError("CNTKBinaryReader: While reading '%ls': %s", configHelper.GetFilePath().c_str(), e.what());
    }
    if (configHelper.GetTraceLevel() > 2)
-        fprintf(stderr, "%s\n", log);
+        fprintf(stderr, "%s\n", log.c_str());
 }

 } } }
--- a/Tests/EndToEndTests/UnitTests/ReaderTests/baseline.txt
+++ b/Tests/EndToEndTests/UnitTests/ReaderTests/baseline.txt
--- a/Tests/UnitTests/ReaderTests/CNTKBinaryReaderTests.cpp
+++ b/Tests/UnitTests/ReaderTests/CNTKBinaryReaderTests.cpp
@ -4,35 +4,13 @@
 //
 #include "stdafx.h"
 #include <algorithm>
-#include <io.h>
-#include <cstdio>
 #include <boost/scope_exit.hpp>
 #include "Common/ReaderTestHelper.h"
-#include "BinaryChunkDeserializer.h"

 using namespace Microsoft::MSR::CNTK;

 namespace Microsoft { namespace MSR { namespace CNTK {

-// A thin wrapper around CNTK text format reader
-class CNTKBinaryReaderTestRunner
-{
-    BinaryChunkDeserializer m_deserializer;
-
-public:
-    ChunkPtr m_chunk;
-
-    CNTKBinaryReaderTestRunner(const string& filename) :
-        m_deserializer(wstring(filename.begin(), filename.end()))
-    {
-    }
-    // Retrieves a chunk of data.
-    void LoadChunk()
-    {
-        m_chunk = m_deserializer.GetChunk(0);
-    }
-};
-
 namespace Test {

 struct CNTKBinaryReaderFixture : ReaderFixture
--- a/Tests/UnitTests/ReaderTests/Config/CNTKBinaryReader/test.cntk
+++ b/Tests/UnitTests/ReaderTests/Config/CNTKBinaryReader/test.cntk
@ -8,10 +8,10 @@ SparseSeq = [
        file = "sparseseqoutput.bin"

        input = [
-            features1 = [ original="a" ]
-            features2 = [ original="b" ]
-            labels1 = [ original="c" ]
-            labels2 = [ original="d" ]
+            features1 = [ alias="a" ]
+            features2 = [ alias="b" ]
+            labels1 = [ alias="c" ]
+            labels2 = [ alias="d" ]
        ]
        randomize = false
    ]
@ -24,10 +24,10 @@ Sparse = [
        file = "sparseoutput.bin"

        input = [
-            features1 = [ original="a" ]
-            features2 = [ original="b" ]
-            labels1 = [ original="c" ]
-            labels2 = [ original="d" ]
+            features1 = [ alias="a" ]
+            features2 = [ alias="b" ]
+            labels1 = [ alias="c" ]
+            labels2 = [ alias="d" ]
        ]
        randomize = false
    ]
@ -40,10 +40,10 @@ Simple = [
        file = "simple.bin"

        input = [
-            features1 = [ original="a" ]
-            features2 = [ original="b" ]
-            features3 = [ original="c" ]
-            features4 = [ original="d" ]
+            features1 = [ alias="a" ]
+            features2 = [ alias="b" ]
+            features3 = [ alias="c" ]
+            features4 = [ alias="d" ]
        ]
        randomize = false
    ]