Redesign the binary format, refactor and update the reader
  implementation, as well as the ctf2bin.py converter.
This commit is contained in:
Alexey Reznichenko 2017-03-01 13:33:08 +01:00
Parent b55eaa126a
Commit ff49f1e3c6
29 changed files with 862 additions and 16459 deletions

View file

@@ -5,7 +5,7 @@
#
# The header file must list all of the streams in the input file in the
# following format:
# <desired stream name> TAB <stream alias> TAB <matrix type> TAB <sample dimension>
# <desired stream name> <stream alias> <matrix type> <sample dimension>
#
# Where:
# <desired stream name> is the desired name for the input in CNTK.
@@ -16,275 +16,285 @@
import sys
import argparse
import re
import struct
import tempfile
import shutil
import os
from collections import OrderedDict
# This will convert data in the ctf format into binary format
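# Note: 0x636e746b5f62696e is the ASCII encoding of 'cntk_bin'.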
MAGIC_NUMBER = 0x636e746b5f62696e;
CBF_VERSION = 1;
class ElementType:
FLOAT = 0
DOUBLE = 1
class MatrixEncodingType:
DENSE = 0
SPARSE_CSC = 1
#COMPRESSED_DENSE = 2
#COMPRESSED_SPARSE_CSC = 3
# This will convert data in the ctf format into the binary format
class Converter(object):
def __init__(self, name, sampleDim):
def __init__(self, name, sample_dim, element_type):
self.name = name
self.sampleDim = sampleDim
self.vals = list()
self.sample_dim = sample_dim
# contains length (in samples) for each sequence in the chunk
self.sequences = []
self.element_type = element_type
def getName(self):
return self.name
def write_header(self, output):
# First is the matrix type.
output.write(struct.pack('B', self.get_matrix_type()))
# Next comes the stream name.
output.write(struct.pack('I', len(self.name)))
output.write(self.name.encode('ascii'))
# Next is the elem type
output.write(struct.pack('B', self.element_type))
# Finally, the sample dimension.
output.write(struct.pack('I', self.sample_dim))
def getSampleDim(self):
return self.sampleDim
def write_signed_ints(self, output, ints):
output.write(b''.join([struct.pack('i', x) for x in ints]))
def clear(self):
self.vals = list()
def write_floats(self, output, floats):
format = 'f' if self.is_float() else 'd'
output.write(b''.join([struct.pack(format, x) for x in floats]))
def addSequence(self):
self.vals.append(list())
def is_float(self):
return self.element_type == ElementType.FLOAT
def appendSample(self, sample):
if( len(sample) != self.sampleDim ):
raise ValueError(
"Invalid sample dimension for input {0}".format(self.name))
def get_matrix_type(self):
raise NotImplementedError()
if( len(self.vals) == 0 ):
self.vals.append( list() )
def reset(self):
self.sequences = []
self.vals[-1].append( sample )
def toString(self):
output = ""
for seq in self.vals:
for samp in seq:
output += "\t" + " ".join(samp )
output += "\n"
return output
def start_sequence(self):
self.sequences.append([])
def add_sample(self, sample):
raise NotImplementedError()
# Specialization for dense inputs
class DenseConverter(Converter):
def __init__(self, name, sampleDim):
Converter.__init__(self, name, sampleDim)
def headerBytes(self):
output = bytearray()
# First is the matrix type. Dense is type 0
output += struct.pack( "i", 0 )
# Next is the elem type, currently float only
output += struct.pack( "i", 0 )
# Finally is whether or not this is a sequence
output += struct.pack( "i", self.sampleDim )
return output
def get_matrix_type(self):
return MatrixEncodingType.DENSE;
def toBytes(self):
output = bytearray()
for sequence in self.vals:
if( len(sequence) != 1 ):
raise ValueError("Dense sequences currently not supported.")
def add_sample(self, sample):
if(len(sample) != self.sample_dim):
raise ValueError(
"Invalid sample dimension for input {0}".format(self.name))
for sample in sequence[0]:
output += struct.pack( "f", float(sample) )
byte_size = len(sample) * (4 if self.is_float() else 8)
return output
if(len(self.sequences) == 0):
self.sequences.append([])
byte_size += 4;
self.sequences[-1].append([float(x) for x in sample])
return byte_size
def write_data(self, output):
for sequence in self.sequences:
output.write(struct.pack('I', len(sequence)))
for sample in sequence:
self.write_floats(output, sample)
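To make the on-disk layout concrete, here is a minimal standalone sketch of one dense sequence record, matching write_data above (made-up sample values, float precision assumed; not part of the converter):
import struct
# A dense sequence record: uint32 sample count, then sample_dim floats per sample.
dim = 3
samples = [[0.5, 1.0, 1.5], [2.0, 2.5, 3.0]]
record = struct.pack('I', len(samples))
for s in samples:
    record += b''.join(struct.pack('f', v) for v in s)
# Reading it back: the count first, then count * dim float values.
(count,) = struct.unpack_from('I', record, 0)
values = struct.unpack_from('%df' % (count * dim), record, 4)
assert count == 2 and abs(values[3] - 2.0) < 1e-6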
# Specialization for sparse inputs
class SparseConverter(Converter):
def __init__(self, name, sampleDim):
Converter.__init__(self, name, sampleDim)
def appendSample(self, sample):
for pair in sample:
index = int(pair.split(":")[0])
if (index >= self.sampleDim):
def add_sample(self, sample):
pairs = map(lambda x: (int(x[0]),float(x[1])),
[pair.split(':', 1) for pair in sample])
for pair in pairs:
index = pair[0]
if (index >= self.sample_dim):
raise ValueError("Invalid sample dimension for input {0}. Max {1}, given {2}"
.format(self.name, self.sampleDim, index))
if( len(self.vals) == 0 ):
self.vals.append( list() )
.format(self.name, self.sample_dim, index))
self.vals[-1].append( sample )
byte_size = len(pairs) * (8 if self.is_float() else 12) + 4
def headerBytes(self):
output = bytearray()
# First is the matrix type. Sparse is type 1
output += struct.pack( "i", 1 )
# Next is the storage type, currently sparse csc only
output += struct.pack( "i", 0 )
# Next is the elem type, currently float only
output += struct.pack( "i", 0 )
# Next is whether or not this is a sequence
# Note this is currently ignored
output += struct.pack( "i", 1 )
# Finally is the sample dimension
output += struct.pack( "i", self.sampleDim )
if(len(self.sequences) == 0):
self.sequences.append([])
byte_size += 8;
return output
self.sequences[-1].append(pairs)
def toBytes(self):
output = bytearray()
values = list()
rowInd = list()
colInd = [0]
nnz = 0
for sequence in self.vals:
i = 0
return byte_size
def get_matrix_type(self):
return MatrixEncodingType.SPARSE_CSC;
def write_data(self, output):
format = 'f' if self.is_float() else 'd'
for sequence in self.sequences:
# write out each sequence in csc format
values = []
indices = []
sizes = []
for sample in sequence:
# sort the indices least to greatest
sample.sort(key=lambda x: int(x.split(":")[0]))
for ele in sample:
nnz += 1
ind, val = ele.split(":")
rowInd.append( int(ind) + i * self.sampleDim )
values.append( val )
i += 1
colInd.append( nnz )
sizes.append(len(sample))
sample.sort(key=lambda x: x[0])
for (index, value) in sample:
indices.append(index)
values.append(value)
output += struct.pack( "i", nnz )
output += b''.join( [ struct.pack( "f", float(val) ) for val in values ] )
output += b''.join( [ struct.pack( "i", int(ind) ) for ind in rowInd ] )
output += b''.join( [ struct.pack( "i", int(ind) ) for ind in colInd ] )
output.write(struct.pack('I', len(sequence))) # number of samples in this sequence
# nnz and indices have to be written out as signed ints, since
# this is the index type of the CNTK sparse matrix
output.write(struct.pack('i', len(values))) # total nnz count for this sequence
self.write_floats(output, values)
self.write_signed_ints(output, indices)
self.write_signed_ints(output, sizes)
return output
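For the sparse case, each sequence becomes one CSC-style record. A minimal standalone sketch of that record's layout (made-up values, float precision assumed):
import struct
# One sparse sequence: two samples given as (index, value) pairs.
sequence = [[(1, 0.5), (4, 1.5)], [(0, 2.0)]]
nnz = sum(len(s) for s in sequence)
record = struct.pack('I', len(sequence))   # number of samples
record += struct.pack('i', nnz)            # total nnz (signed int)
record += b''.join(struct.pack('f', v) for s in sequence for (_, v) in s)
record += b''.join(struct.pack('i', i) for s in sequence for (i, _) in s)
record += b''.join(struct.pack('i', len(s)) for s in sequence)  # nnz per sample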
# Parse an entire sequence given an aliasToId map, and the converters
def ParseSequence( aliasToId, curSequence, converters ):
for des in converters:
des.addSequence()
for line in curSequence:
for input in line.split( "|" )[1:]:
vals = input.split()
# Process the entire sequence
def process_sequence(data, converters, chunk):
byte_size = 0;
for converter in converters.values():
converter.start_sequence()
for line in data:
for input_stream in line.split("|")[1:]:
split = input_stream.split(None, 1)
if (len(split) < 2):
continue
(alias, values) = split
# We need to ignore comments
if( vals[0] != "#" ):
converters[aliasToId[vals[0]]].appendSample( vals[1:] )
return max( [ len(des.vals[ -1 ]) for des in converters ] )
if(len(alias) > 0 and alias[0] != '#'):
byte_size += converters[alias].add_sample(values.split())
sequence_length_samples = max([len(x.sequences[-1]) for x in converters.values()])
chunk.add_sequence(sequence_length_samples)
return byte_size
# Output a binary chunk
def OutputChunk( binfile, converters ):
startPos = binfile.tell()
for des in converters:
binfile.write( des.toBytes() )
des.clear()
return startPos
def write_chunk(binfile, converters, chunk):
binfile.flush()
chunk.offset = binfile.tell()
# write out the number of samples for each sequence in the chunk
binfile.write(b''.join([struct.pack('I', x) for x in chunk.sequences]))
# Get a converter from a type
def GetConverter( inputtype, name, sampleDim ):
converter = None
if( inputtype.lower() == 'dense' ):
converter = DenseConverter( name, sampleDim )
elif( inputtype.lower() == 'sparse' ):
converter = SparseConverter( name, sampleDim )
else:
print('Invalid input format {0}'.format( inputtype ))
sys.exit()
for converter in converters.values():
converter.write_data(binfile)
converter.reset()
# TODO: add a hash of the chunk
return converter
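Together with write_chunk above, a chunk on disk is a table of per-sequence sample counts followed by each stream's payload. A hedged sketch of reading that prefix back (the offset and sequence count come from the chunk table; the function name is illustrative):
import struct
def read_sequence_lengths(f, chunk_offset, num_sequences):
    # A chunk starts with one uint32 sample count per sequence;
    # the per-stream payloads follow immediately after this table.
    f.seek(chunk_offset)
    return struct.unpack('%dI' % num_sequences, f.read(4 * num_sequences))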
def get_converter(input_type, name, sample_dim, element_type):
if(input_type.lower() == 'dense'):
return DenseConverter(name, sample_dim, element_type)
if(input_type.lower() == 'sparse'):
return SparseConverter(name, sample_dim, element_type)
# Output the binary format header.
def OutputHeader( headerFile, converters ):
# First the version number
headerFile.write( struct.pack( "q", 1 ) )
# Next is the number of chunks, but we don't know what this is, so write a
# placeholder
headerFile.write( struct.pack( "q", 0 ) )
# Finally the number of inputs
headerFile.write( struct.pack( "i", len(converters) ) )
for conv in converters:
# first comes the name. This is common so write it first
headerFile.write( struct.pack( "i", len( conv.getName() ) ) )
headerFile.write( conv.getName().encode('ascii') )
headerFile.write( conv.headerBytes() )
# At the end we know how many chunks there are. Update the header as needed.
def UpdateHeader( headerFile, numChunks ):
# seek after the first Int64
headerFile.seek( 8 )
# Write the number of chunks
headerFile.write( struct.pack( "q", numChunks ) )
raise ValueError('Invalid input format {0}'.format(input_type))
# parse the header to get the converters for this file
# <name> <alias> <input format> <sample size>
def build_converters(header_file, element_type):
converters = OrderedDict();
with open(header_file, 'r') as inputs:
for line in inputs:
(name, alias, input_type, sample_dim) = line.strip().split()
converters[alias] = get_converter(input_type, name, int(sample_dim), element_type)
return converters
class Chunk:
def __init__(self):
self.offset = 0
self.sequences = []
def num_sequences(self):
return len(self.sequences)
def num_samples(self):
return sum(self.sequences)
def add_sequence(self, num_samples):
return self.sequences.append(num_samples)
class Header:
def __init__(self, converters):
self.converters = converters
self.chunks = []
def add_chunk(self, chunk):
assert(isinstance(chunk, Chunk))
self.chunks.append(chunk)
# Output the binary format header.
def write(self, output_file):
output_file.flush()
header_offset = output_file.tell()
# First, write the magic number (uint64, 8 bytes)
output_file.write(struct.pack('Q', MAGIC_NUMBER));
# Next is the number of chunks (uint32, 4 bytes)
output_file.write(struct.pack('I', len(self.chunks)))
# Finally the number of input streams (uint32, 4 bytes)
output_file.write(struct.pack('I', len(self.converters)))
for converter in self.converters.values():
converter.write_header(output_file)
# write the chunk table
for chunk in self.chunks:
# uint64: start offset for chunk
output_file.write(struct.pack('q', chunk.offset))
# uint32: number of sequences in the chunk
output_file.write(struct.pack('I', chunk.num_sequences()))
# uint32: number of samples in the chunk
output_file.write(struct.pack('I', chunk.num_samples()))
output_file.write(struct.pack('q', header_offset));
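Overall, the produced file is: magic + version, the chunk payloads, then this header (magic, chunk count, stream count, per-stream headers, chunk table), and finally an int64 holding the header's offset, which is how the reader locates it. A minimal sketch for locating and sanity-checking the header, reusing the script's MAGIC_NUMBER and CBF_VERSION constants (illustrative only):
import struct
def read_cbf_preamble(path):
    with open(path, 'rb') as f:
        # Leading magic number and format version.
        magic, version = struct.unpack('QI', f.read(12))
        assert magic == MAGIC_NUMBER and version == CBF_VERSION
        # The last 8 bytes hold the header offset; the header repeats the magic.
        f.seek(-8, 2)  # 2 == SEEK_END
        (header_offset,) = struct.unpack('q', f.read(8))
        f.seek(header_offset)
        (header_magic,) = struct.unpack('Q', f.read(8))
        assert header_magic == MAGIC_NUMBER
        num_chunks, num_streams = struct.unpack('II', f.read(8))
        return num_chunks, num_streams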
# Output a single row of the offsets table
def OutputOffset( headerFile, numBytes, numSeqs, numSamples ):
# Int64 start offset for chunk
headerFile.write( struct.pack( "q", numBytes ) )
# Int32 Num sequences in the chunk
headerFile.write( struct.pack( "i", numSeqs ) )
# Int32 Num samples in the chunk
headerFile.write( struct.pack( "i", numSamples ) )
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Transforms a CNTK Text Format file into CNTK binary format given a header.")
parser.add_argument('--input', help="CNTK Text Format file to convert to binary.", default="", required=True)
parser.add_argument('--header', help="Header file describing each stream in the input.", default="", required=True)
parser.add_argument('--seqsPerChunk', type=int, help='Number of sequences in each chunk.', default="", required=True)
parser.add_argument('--output', help='Name of the output file, stdout if not given', default="", required=True)
parser.add_argument('--input', help="CNTK Text Format file to convert to binary.", required=True)
parser.add_argument('--header', help="Header file describing each stream in the input.", required=True)
parser.add_argument('--chunk_size', type=int, help='Chunk size in bytes.', required=True)
parser.add_argument('--output', help='Name of the output file.', required=True)
parser.add_argument('--precision', help='Floating point precision (double or float). Default is float',
choices=["float", "double"], default="float", required=False)
args = parser.parse_args()
# Since we don't know how many chunks we're going to write until we're done,
# grow the header/offsets table and the data portion separately. then at the
# end concatenate the data portion onto the end of the header/offsets
# portion.
binaryHeaderFile = open( args.output, "wb+" )
binaryDataFile = tempfile.NamedTemporaryFile(mode="rb+", delete=False)
dataPath = binaryDataFile.name
output = open(args.output, "wb")
# The very first 8 bytes of the file is the CBF magic number.
output.write(struct.pack('Q', MAGIC_NUMBER));
# Next 4 bytes is the CBF version.
output.write(struct.pack('I', CBF_VERSION));
# parse the header to get the converters for this file
# <name> <alias> <input format> <sample size>
converters = []
aliasToId = dict()
with open( args.header, "r" ) as headerfile:
id = 0
for line in headerfile:
split = re.split(r'\t+', line.strip())
converters.append( GetConverter( split[ 2 ], split[ 0 ], int(split[3]) ) )
aliasToId[ split[ 1 ] ] = id
id += 1
converters = build_converters(args.header,
ElementType.FLOAT if args.precision == 'float' else ElementType.DOUBLE)
OutputHeader( binaryHeaderFile, converters )
numChunks = 0
with open( args.input, "r" ) as inputFile:
curSequence = list()
numSeqs = 0
numSamps = 0
prevId = None
for line in inputFile:
split = line.rstrip().split('|')
header = Header(converters)
chunk = Chunk()
with open(args.input, "r") as input_file:
sequence = []
seq_id = None
estimated_chunk_size = 0
for line in input_file:
(prefix, _) = line.rstrip().split('|',1)
# if the sequence id is empty or not equal to the previous sequence id,
# we are at a new sequence.
if( not split[0] or prevId != split[ 0 ] ):
if(len(curSequence) > 0):
numSamps += ParseSequence( aliasToId, curSequence, converters )
curSequence = list()
numSeqs += 1
if( numSeqs % int( args.seqsPerChunk ) == 0 ):
numBytes = OutputChunk( binaryDataFile, converters )
numChunks += 1
OutputOffset( binaryHeaderFile, numBytes, numSeqs, numSamps )
numSeqs = 0
numSamps = 0
prevId = split[ 0 ]
if((not seq_id and not prefix) or (len(prefix) > 0 and seq_id != prefix)):
if(len(sequence) > 0):
estimated_chunk_size += process_sequence(sequence, converters, chunk)
sequence = []
if(estimated_chunk_size >= int(args.chunk_size)):
write_chunk(output, converters, chunk)
header.add_chunk(chunk)
chunk = Chunk()
seq_id = prefix
curSequence.append( line )
sequence.append(line)
# we must process the last buffered sequence
if( len(curSequence) > 0 ):
numSamps += ParseSequence( aliasToId, curSequence, converters )
numSeqs += 1
numChunks += 1
if(len(sequence) > 0):
process_sequence(sequence, converters, chunk)
numBytes = OutputChunk( binaryDataFile, converters )
OutputOffset( binaryHeaderFile, numBytes, numSeqs, numSamps )
write_chunk(output, converters, chunk)
header.add_chunk(chunk)
UpdateHeader( binaryHeaderFile, numChunks )
binaryHeaderFile.flush()
binaryDataFile.flush()
binaryHeaderFile.close()
binaryDataFile.close()
header.write(output)
destination = open( args.output, 'ab+' )
shutil.copyfileobj( open( dataPath, "rb" ), destination )
destination.flush()
destination.close()
os.unlink(dataPath)
assert not os.path.exists(dataPath)
output.flush()
output.close()
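For reference, a hypothetical invocation of the converter (file names and the 32 MB chunk size are placeholders):
python ctf2bin.py --input train.ctf --header streams.txt --chunk_size 33554432 --output train.bin --precision float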

View file

@@ -4,6 +4,8 @@
//
#include "stdafx.h"
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include "BinaryChunkDeserializer.h"
#include "BinaryDataChunk.h"
#include "FileHelper.h"
@@ -11,43 +13,52 @@
namespace Microsoft { namespace MSR { namespace CNTK {
enum class DeserializerType : int32_t
enum class MatrixEncodingType : unsigned char
{
DenseBinaryDataDeserializer = 0,
SparseBinaryDataDeserializer = 1
dense = 0,
sparse_csc = 1,
// TODO: compressed_sparse_csc = 2, // indices are encoded as var-ints
};
void BinaryChunkDeserializer::ReadOffsetsTable(FILE* infile)
void BinaryChunkDeserializer::ReadChunkTable(FILE* infile)
{
ReadOffsetsTable(infile, 0, m_numChunks);
ReadChunkTable(infile, 0, m_numChunks);
}
void BinaryChunkDeserializer::ReadOffsetsTable(FILE* infile, size_t startOffset, size_t numChunks)
void BinaryChunkDeserializer::ReadChunkTable(FILE* infile, uint32_t firstChunkIdx, uint32_t numChunks)
{
assert((int64_t)(startOffset + numChunks) <= m_numChunks);
size_t startPos = startOffset * sizeof(DiskOffsetsTable) + m_offsetStart;
if (firstChunkIdx + numChunks > m_numChunks)
{
RuntimeError("Requested chunks (from %" PRIu32 " to %" PRIu32 ") are out of bounds "
"(the total number of chunks in the dataset is %" PRIu32 ").",
firstChunkIdx, (firstChunkIdx + numChunks - 1), m_numChunks);
}
// Seek to the offsets table start
CNTKBinaryFileHelper::seekOrDie(infile, startPos, SEEK_SET);
uint64_t firstChunkOffset = firstChunkIdx * sizeof(ChunkInfo) + m_chunkTableOffset;
// Seek to the start of the offset info for the first requested chunk
CNTKBinaryFileHelper::SeekOrDie(infile, firstChunkOffset, SEEK_SET);
// Note we create numChunks + 1 since we want to be consistent with determining the size of each chunk.
DiskOffsetsTable* offsetsTable = new DiskOffsetsTable[numChunks + 1];
ChunkInfo* chunks = new ChunkInfo[numChunks + 1];
// Read in all of the offsets for the chunks of interest
CNTKBinaryFileHelper::readOrDie(offsetsTable, sizeof(DiskOffsetsTable), numChunks, infile);
CNTKBinaryFileHelper::ReadOrDie(chunks, sizeof(ChunkInfo), numChunks, infile);
// Now read the final entry. It is either the next offset entry (if we're reading a subset and the
// entry exists), or we just fill it with the correct information based on file size if it doesn't
if ((int64_t)(startOffset + numChunks) == m_numChunks)
if (firstChunkIdx + numChunks == m_numChunks)
{
CNTKBinaryFileHelper::seekOrDie(infile, 0, SEEK_END);
offsetsTable[numChunks].offset = CNTKBinaryFileHelper::tellOrDie(infile) - m_dataStart;
offsetsTable[numChunks].numSamples = 0;
offsetsTable[numChunks].numSequences = 0;
auto position = CNTKBinaryFileHelper::TellOrDie(infile);
chunks[numChunks].offset = position;
chunks[numChunks].numSamples = 0;
chunks[numChunks].numSequences = 0;
}
else
CNTKBinaryFileHelper::readOrDie(offsetsTable + numChunks, sizeof(DiskOffsetsTable), 1, infile);
CNTKBinaryFileHelper::ReadOrDie(chunks + numChunks, sizeof(ChunkInfo), 1, infile);
m_offsetsTable = make_unique<OffsetsTable>(numChunks, offsetsTable);
m_chunkTable = make_unique<ChunkTable>(numChunks, chunks);
}
@@ -56,7 +67,7 @@ BinaryChunkDeserializer::BinaryChunkDeserializer(const BinaryConfigHelper& helpe
{
SetTraceLevel(helper.GetTraceLevel());
Initialize(helper.GetRename());
Initialize(helper.GetRename(), helper.GetElementType());
}
@@ -64,8 +75,8 @@ BinaryChunkDeserializer::BinaryChunkDeserializer(const std::wstring& filename) :
DataDeserializerBase(true),
m_filename(filename),
m_file(nullptr),
m_offsetStart(0),
m_dataStart(0),
m_headerOffset(0),
m_chunkTableOffset(0),
m_traceLevel(0)
{
}
@@ -73,103 +84,85 @@ BinaryChunkDeserializer::BinaryChunkDeserializer(const std::wstring& filename) :
BinaryChunkDeserializer::~BinaryChunkDeserializer()
{
if (m_file)
CNTKBinaryFileHelper::closeOrDie(m_file);
CNTKBinaryFileHelper::CloseOrDie(m_file);
}
void BinaryChunkDeserializer::Initialize(const std::map<std::wstring, std::wstring>& rename)
void BinaryChunkDeserializer::Initialize(const std::map<std::wstring, std::wstring>& rename, ElementType precision)
{
if (m_file)
CNTKBinaryFileHelper::closeOrDie(m_file);
CNTKBinaryFileHelper::CloseOrDie(m_file);
m_file = CNTKBinaryFileHelper::openOrDie(m_filename, L"rb");
m_file = CNTKBinaryFileHelper::OpenOrDie(m_filename, L"rb");
// We are now parsing the header. Seek to the head of the header to start.
CNTKBinaryFileHelper::seekOrDie(m_file, 0, SEEK_SET);
// First, verify the magic number.
CNTKBinaryFileHelper::FindMagicOrDie(m_file, m_filename);
// Second, read the version number of the data file, and (for now) make sure the reader version is the same.
uint32_t versionNumber = CNTKBinaryFileHelper::GetVersionNumber(m_file);
if (versionNumber != s_currentVersion)
LogicError("The reader version is %" PRIu32 ", but the data file was created for version %" PRIu32 ".",
s_currentVersion, versionNumber);
// First read the version number of the data file, and make sure the reader version is the same.
int64_t versionNumber;
CNTKBinaryFileHelper::readOrDie(&versionNumber, sizeof(versionNumber), 1, m_file);
if (versionNumber != m_versionNumber)
LogicError("The reader version is %d, but the data file was created for version %d.", (int)m_versionNumber, (int)versionNumber);
// Now, find where the header is.
m_headerOffset = CNTKBinaryFileHelper::GetHeaderOffset(m_file);
CNTKBinaryFileHelper::SeekOrDie(m_file, m_headerOffset, SEEK_SET);
// Once again, make sure that the header is well-formed and starts with a magic number.
CNTKBinaryFileHelper::FindMagicOrDie(m_file, m_filename);
// Next is the number of chunks in the input file.
CNTKBinaryFileHelper::readOrDie(&m_numChunks, sizeof(m_numChunks), 1, m_file);
CNTKBinaryFileHelper::ReadOrDie(&m_numChunks, sizeof(m_numChunks), 1, m_file);
// Next is the number of inputs
CNTKBinaryFileHelper::readOrDie(&m_numInputs, sizeof(m_numInputs), 1, m_file);
CNTKBinaryFileHelper::ReadOrDie(&m_numInputs, sizeof(m_numInputs), 1, m_file);
// Reserve space for all of the inputs, and then read them in.
m_streams.resize(m_numInputs);
m_deserializers.resize(m_numInputs);
int32_t len;
// 100 characters should be plenty by default, but grow if necessary.
vector<char> tempName(100);
for (int32_t c = 0; c < m_numInputs; c++)
for (decltype(m_numInputs) i = 0; i < m_numInputs; i++)
{
// Create our streamDescription for this input
auto streamDescription = std::make_shared<StreamDescription>();
MatrixEncodingType type;
CNTKBinaryFileHelper::ReadOrDie(&type, sizeof(type), 1, m_file);
if (type == MatrixEncodingType::dense)
m_deserializers[i] = make_shared<DenseBinaryDataDeserializer>(m_file, precision);
else if (type == MatrixEncodingType::sparse_csc)
m_deserializers[i] = make_shared<SparseBinaryDataDeserializer>(m_file, precision);
else
RuntimeError("Unknown encoding type %u requested.", type);
// read the name
CNTKBinaryFileHelper::readOrDie(&len, sizeof(len), 1, m_file);
// Need 1 extra char for the null.
tempName.resize(len+1);
CNTKBinaryFileHelper::readOrDie(tempName.data(), sizeof(char), len, m_file);
tempName[len] = '\0';
wstring wname = msra::strfun::utf16(tempName.data());
auto description = m_deserializers[i]->GetStreamDescription();
description->m_id = i;
// Check if we should rename this input based on the config
if (rename.find(wname) == rename.end())
streamDescription->m_name = wname;
else
streamDescription->m_name = rename.at(wname);
auto it = rename.find(description->m_name);
if (it != rename.end())
{
description->m_name = it->second;
}
// Read the matrix type. Then instantiate the appropriate BinaryDataDeserializer, and have it read in its parameters
// Note: Is there a better way to do this?
DeserializerType desType;
CNTKBinaryFileHelper::readOrDie(&desType, sizeof(desType), 1, m_file);
if (desType == DeserializerType::DenseBinaryDataDeserializer)
m_deserializers[c] = make_shared<DenseBinaryDataDeserializer>(m_file);
else if (desType == DeserializerType::SparseBinaryDataDeserializer)
m_deserializers[c] = make_shared<SparseBinaryDataDeserializer>(m_file);
else
RuntimeError("Unknown deserializer type %d requested.", (int)desType);
streamDescription->m_id = c;
streamDescription->m_elementType = m_deserializers[c]->GetElementType();
streamDescription->m_storageType = m_deserializers[c]->GetStorageType();
streamDescription->m_sampleLayout = m_deserializers[c]->GetSampleLayout();
m_streams[c] = streamDescription;
m_streams[i] = description;
}
// We just finished the header. So we're now at the offsets table.
m_offsetStart = CNTKBinaryFileHelper::tellOrDie(m_file);
// After the header is the data start. Compute that now.
m_dataStart = m_offsetStart + m_numChunks * sizeof(DiskOffsetsTable);
// We just finished the header. So we're now at the chunk table.
m_chunkTableOffset = CNTKBinaryFileHelper::TellOrDie(m_file);
// We only have to read in the offsets table once, so do that now.
// Note it's possible in distributed reading mode to only want to read
// a subset of the offsets table.
ReadOffsetsTable(m_file);
ReadChunkTable(m_file);
}
ChunkDescriptions BinaryChunkDeserializer::GetChunkDescriptions()
{
assert(m_offsetsTable);
assert(m_chunkTable);
ChunkDescriptions result;
result.reserve(m_numChunks);
if (m_numChunks > CHUNKID_MAX)
RuntimeError("Currently CNTK does not support %d chunks. The maximum number of chunks allowed is %d.", (int)m_numChunks, (int)CHUNKID_MAX);
for (ChunkIdType c = 0; c < (ChunkIdType)m_numChunks; c++ )
for (ChunkIdType i = 0; i < m_numChunks; i++ )
{
result.push_back(shared_ptr<ChunkDescription>(
new ChunkDescription {
c,
(size_t)m_offsetsTable->GetNumSamples(c),
(size_t)m_offsetsTable->GetNumSequences(c)
}));
new ChunkDescription{ i, m_chunkTable->GetNumSamples(i), m_chunkTable->GetNumSequences(i) }));
}
return result;
@@ -178,33 +171,25 @@ ChunkDescriptions BinaryChunkDeserializer::GetChunkDescriptions()
void BinaryChunkDeserializer::GetSequencesForChunk(ChunkIdType chunkId, std::vector<SequenceDescription>& result)
{
// Reserve space for each sequence
result.reserve(m_offsetsTable->GetNumSequences(chunkId));
result.reserve(m_chunkTable->GetNumSequences(chunkId));
// We don't store every piece of sequence information, so we have to read the chunk in, parse it, and then
// find the information.
// BUGBUG: Note this requires reading each chunk twice. This might not be hugely disadvantageous due to OS
// caching, but should be avoided none the less.
ChunkPtr chunk = GetChunk(chunkId);
auto offset = m_chunkTable->GetOffset(chunkId);
auto numberOfSequences = m_chunkTable->GetNumSequences(chunkId);
unique_ptr<uint32_t[]> numSamplesPerSequence(new uint32_t[numberOfSequences]);
size_t startId = m_offsetsTable->GetStartIndex(chunkId);
std::vector<SequenceDataPtr> temp;
for (size_t c = 0; c < m_offsetsTable->GetNumSequences(chunkId); c++)
// Seek to the start of the chunk
CNTKBinaryFileHelper::SeekOrDie(m_file, offset, SEEK_SET);
// read 'numberOfSequences' unsigned ints
CNTKBinaryFileHelper::ReadOrDie(numSamplesPerSequence.get(), sizeof(uint32_t), numberOfSequences, m_file);
auto startId = m_chunkTable->GetStartIndex(chunkId);
for (decltype(numberOfSequences) i = 0; i < numberOfSequences; i++)
{
// BUGBUG: This is inefficient, but we don't have a choice. Why do we need this at all? Why can't
// this information just be gotten from the chunks? It's not clear.
// Note numSamples is 1 if there are no sequences.
uint32_t numSamples = 1;
temp.clear();
chunk->GetSequence(m_offsetsTable->GetStartIndex(chunkId) + c, temp);
// Only take the max over streams that are actually in use.
for (size_t i = 0; i < temp.size(); i++)
numSamples = max(numSamples, temp[i]->m_numberOfSamples);
SequenceDescription sd = {};
sd.m_indexInChunk = startId + c;
sd.m_numberOfSamples = numSamples;
sd.m_indexInChunk = i;
sd.m_numberOfSamples = numSamplesPerSequence[i];
sd.m_chunkId = chunkId;
sd.m_key.m_sequence = startId + c;
sd.m_key.m_sequence = startId + i;
sd.m_key.m_sample = 0;
result.push_back(sd);
@@ -213,17 +198,18 @@ void BinaryChunkDeserializer::GetSequencesForChunk(ChunkIdType chunkId, std::vec
unique_ptr<byte[]> BinaryChunkDeserializer::ReadChunk(ChunkIdType chunkId)
{
// Seek to the start of the chunk
CNTKBinaryFileHelper::seekOrDie(m_file, m_dataStart + m_offsetsTable->GetOffset(chunkId), SEEK_SET);
// Seek to the start of the data portion in the chunk
CNTKBinaryFileHelper::SeekOrDie(m_file, m_chunkTable->GetDataStartOffset(chunkId), SEEK_SET);
// Determine how big the chunk is.
size_t chunkSize = m_offsetsTable->GetChunkSize(chunkId);
size_t chunkSize = m_chunkTable->GetChunkSize(chunkId);
// Create buffer
// TODO: use a pool of buffers instead of allocating a new one, each time a chunk is read.
unique_ptr<byte[]> buffer(new byte[chunkSize]);
// Read the chunk from disk
CNTKBinaryFileHelper::readOrDie(buffer.get(), sizeof(byte), chunkSize, m_file);
CNTKBinaryFileHelper::ReadOrDie(buffer.get(), sizeof(byte), chunkSize, m_file);
return buffer;
}
@@ -232,9 +218,9 @@ unique_ptr<byte[]> BinaryChunkDeserializer::ReadChunk(ChunkIdType chunkId)
ChunkPtr BinaryChunkDeserializer::GetChunk(ChunkIdType chunkId)
{
// Read the chunk into memory
unique_ptr<byte[]> chunkBuffer = ReadChunk(chunkId);
unique_ptr<byte[]> buffer = ReadChunk(chunkId);
return make_shared<BinaryDataChunk>(chunkId, m_offsetsTable->GetStartIndex(chunkId), m_offsetsTable->GetNumSequences(chunkId), std::move(chunkBuffer), m_deserializers);
return make_shared<BinaryDataChunk>(chunkId, m_chunkTable->GetNumSequences(chunkId), std::move(buffer), m_deserializers);
}
void BinaryChunkDeserializer::SetTraceLevel(unsigned int traceLevel)

View file

@@ -13,55 +13,78 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// Offsets table used to find the chunks in the binary file. Added some helper methods around the core data.
#pragma pack(push, 1)
struct DiskOffsetsTable
// Chunk meta-info: byte offset in the input file, number of sequences and samples in the chunk.
struct ChunkInfo
{
int64_t offset;
int32_t numSequences;
int32_t numSamples;
uint32_t numSequences;
uint32_t numSamples;
};
#pragma pack(pop)
// Offsets table used to find the chunks in the binary file. Added some helper methods around the core data.
class OffsetsTable {
// Chunk table used to find the chunks in the binary file. Added some helper methods around the core data.
class ChunkTable {
public:
OffsetsTable(size_t numChunks, DiskOffsetsTable* offsetsTable) : m_numChunks(numChunks)
ChunkTable(uint32_t numChunks, ChunkInfo* offsetsTable) :
m_numChunks(numChunks),
m_diskOffsetsTable(offsetsTable),
m_startIndex(numChunks)
{
m_diskOffsetsTable = make_unique<DiskOffsetsTable*>(offsetsTable);
Initialize();
uint64_t numSequences = 0;
for (decltype(m_numChunks) i = 0; i < m_numChunks; i++)
{
m_startIndex[i] = numSequences;
numSequences += m_diskOffsetsTable[i].numSequences;
}
}
int64_t GetOffset(size_t index) { return (*m_diskOffsetsTable)[index].offset; }
int32_t GetNumSequences(size_t index) { return (*m_diskOffsetsTable)[index].numSequences; }
int32_t GetNumSamples(size_t index) { return (*m_diskOffsetsTable)[index].numSamples; }
int64_t GetStartIndex(size_t index) { return m_startIndex[index]; }
size_t GetChunkSize(size_t index) { return (*m_diskOffsetsTable)[index + 1].offset - (*m_diskOffsetsTable)[index].offset; }
int64_t GetOffset(uint32_t index)
{
return m_diskOffsetsTable[index].offset;
}
private:
void Initialize()
int64_t GetDataStartOffset(uint32_t index)
{
m_startIndex.resize(m_numChunks);
m_startIndex[0] = 0;
for (int64_t c = 1; c < m_numChunks; c++)
m_startIndex[c] = m_startIndex[c-1] + (*m_diskOffsetsTable)[c].numSequences;
auto sequenceLengthPrefix = GetNumSequences(index) * sizeof(uint32_t);
return GetOffset(index) + sequenceLengthPrefix;
}
uint32_t GetNumSequences(uint32_t index)
{
return m_diskOffsetsTable[index].numSequences;
}
uint32_t GetNumSamples(uint32_t index)
{
return m_diskOffsetsTable[index].numSamples;
}
int64_t GetStartIndex(uint32_t index)
{
return m_startIndex.at(index);
}
uint64_t GetChunkSize(uint32_t index)
{
auto dataStartOffset = GetDataStartOffset(index);
auto dataEndOffset = GetOffset(index + 1);
return dataEndOffset - dataStartOffset;
}
private:
int64_t m_numChunks;
unique_ptr<DiskOffsetsTable*> m_diskOffsetsTable;
vector<size_t> m_startIndex;
uint32_t m_numChunks;
unique_ptr<ChunkInfo[]> m_diskOffsetsTable;
vector<uint64_t> m_startIndex;
};
typedef unique_ptr<OffsetsTable> OffsetsTablePtr;
typedef unique_ptr<ChunkTable> ChunkTablePtr;
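Since ChunkInfo is packed to 1-byte alignment, every chunk table entry is exactly 16 bytes: an int64 offset plus two uint32 counts. A sketch of decoding the table, sticking to the converter's language (Python; little-endian layout assumed, illustrative only):
import struct
CHUNK_INFO = struct.Struct('<qII')  # offset, numSequences, numSamples: 16 bytes
def parse_chunk_table(buf, num_chunks):
    return [CHUNK_INFO.unpack_from(buf, i * CHUNK_INFO.size)
            for i in range(num_chunks)]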
// TODO: more details when tracing warnings
class BinaryChunkDeserializer : public DataDeserializerBase {
public:
explicit BinaryChunkDeserializer(const BinaryConfigHelper& helper);
BinaryChunkDeserializer(CorpusDescriptorPtr corpus, const BinaryConfigHelper& helper);
BinaryChunkDeserializer(CorpusDescriptorPtr corpus, const BinaryConfigHelper& helper) = delete;
~BinaryChunkDeserializer();
@@ -74,16 +97,13 @@ public:
// Get information about particular chunk.
void GetSequencesForChunk(ChunkIdType chunkId, vector<SequenceDescription>& result) override;
// Parses buffer into a BinaryChunkPtr
void ParseChunk(ChunkIdType chunkId, unique_ptr<byte[]> const& buffer, std::vector<std::vector<SequenceDataPtr>>& data);
private:
// Builds an index of the input data.
void Initialize(const std::map<std::wstring, std::wstring>& rename);
void Initialize(const std::map<std::wstring, std::wstring>& rename, ElementType precision);
// Reads the offsets table from disk into memory
void ReadOffsetsTable(FILE* infile, size_t startOffset, size_t numChunks);
void ReadOffsetsTable(FILE* infile);
// Reads the chunk table from disk into memory
void ReadChunkTable(FILE* infile, uint32_t firstChunkIdx, uint32_t numChunks);
void ReadChunkTable(FILE* infile);
// Reads a chunk from disk into buffer
unique_ptr<byte[]> ReadChunk(ChunkIdType chunkId);
@@ -96,22 +116,23 @@ private:
const wstring m_filename;
FILE* m_file;
int64_t m_offsetStart;
int64_t m_dataStart;
int64_t m_headerOffset, m_chunkTableOffset;
std::vector<BinaryDataDeserializerPtr> m_deserializers;
OffsetsTablePtr m_offsetsTable;
ChunkTablePtr m_chunkTable;
void* m_chunkBuffer;
int64_t m_versionNumber = 1;
int64_t m_numChunks;
int32_t m_numInputs;
uint32_t m_numChunks;
uint32_t m_numInputs;
unsigned int m_traceLevel;
static const uint32_t s_currentVersion = 1;
friend class CNTKBinaryReaderTestRunner;
DISABLE_COPY_AND_MOVE(BinaryChunkDeserializer);
};
}}}

View file

@@ -48,6 +48,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
string precision = config.Find("precision", "float");
if (AreEqualIgnoreCase(precision, "double"))
{
m_elementType = ElementType::tdouble;
}
else if (AreEqualIgnoreCase(precision, "float"))
{
m_elementType = ElementType::tfloat;
}
else
{
RuntimeError("Not supported precision '%s'. Expected 'double' or 'float'.", precision.c_str());
}
m_filepath = msra::strfun::utf16(config(L"file"));
m_keepDataInMemory = config(L"keepDataInMemory", false);

View file

@@ -34,11 +34,14 @@ public:
bool ShouldKeepDataInMemory() const { return m_keepDataInMemory; }
ElementType GetElementType() const { return m_elementType; }
DISABLE_COPY_AND_MOVE(BinaryConfigHelper);
private:
std::wstring m_filepath;
std::map<std::wstring, std::wstring> m_streams;
ElementType m_elementType;
size_t m_randomizationWindow;
// Specifies how to interpret randomization window, if true randomization window == number of samples, else
// randomization window = number of chunks (default).

View file

@@ -15,30 +15,37 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class BinaryDataChunk : public Chunk, public std::enable_shared_from_this<Chunk>
{
public:
explicit BinaryDataChunk(ChunkIdType chunkId, size_t startSequence, size_t numSequences, unique_ptr<byte[]> buffer, std::vector<BinaryDataDeserializerPtr> deserializer)
: m_chunkId(chunkId), m_startSequence(startSequence), m_numSequences(numSequences), m_buffer(std::move(buffer)), m_deserializers(deserializer)
{
}
explicit BinaryDataChunk(ChunkIdType chunkId,
size_t numSequences,
unique_ptr<byte[]> buffer,
std::vector<BinaryDataDeserializerPtr> deserializer)
: m_chunkId(chunkId),
m_numSequences(numSequences),
m_buffer(std::move(buffer)),
m_deserializers(deserializer)
{ }
// Gets sequences by id.
void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override
// Gets a sequence using its index inside the chunk.
void GetSequence(size_t sequenceIdx, std::vector<SequenceDataPtr>& result) override
{
// Check if we've already parsed the chunk. If not, parse it.
if (m_data.size() == 0)
ParseChunk();
assert(m_data.size() != 0);
// resize the output to have the same dimensionality
result.resize(m_data.size());
// now copy the decoded sequences
for (size_t c = 0; c < m_data.size(); c++)
result[c] = m_data[c].at(sequenceId - m_startSequence);
for (size_t i = 0; i < m_data.size(); i++)
result[i] = m_data[i].at(sequenceIdx);
}
uint32_t GetNumSamples(size_t sequenceId)
uint32_t GetNumSamples(size_t sequenceIdx)
{
uint32_t numSamples = 0;
for (size_t c = 0; c < m_data.size(); c++)
numSamples = max(numSamples, m_data[c].at(sequenceId)->m_numberOfSamples);
for (size_t i = 0; i < m_data.size(); i++)
numSamples = max(numSamples, m_data[i].at(sequenceIdx)->m_numberOfSamples);
return numSamples;
}
@@ -50,15 +57,13 @@ protected:
// the number of bytes of buffer that have been processed by the deserializer so far
size_t bytesProcessed = 0;
// Now call all of the deserializers on the chunk, in order
for (size_t c = 0; c < m_deserializers.size(); c++)
bytesProcessed += m_deserializers[c]->GetSequenceDataForChunk(m_numSequences, (byte*)m_buffer.get() + bytesProcessed, m_data[c]);
for (size_t i = 0; i < m_deserializers.size(); i++)
bytesProcessed += m_deserializers[i]->GetSequenceDataForChunk(m_numSequences, m_buffer.get() + bytesProcessed, m_data[i]);
}
// chunk id (copied from the descriptor)
ChunkIdType m_chunkId;
// start id for sequences in this chunk.
size_t m_startSequence;
// num sequences in this chunk. Note this should be in the chunk, but for simplicity it is in the offsets table
// so we must tell the chunk where it starts.
size_t m_numSequences;

View file

@@ -14,40 +14,102 @@
namespace Microsoft { namespace MSR { namespace CNTK {
class BinaryDataDeserialzer {
class BinaryDataDeserialzer
{
public:
BinaryDataDeserialzer(FILE* file, ElementType precision = ElementType::tfloat)
{
ReadName(file);
ReadDataType(file);
ReadSampleSize(file);
if (precision != ElementType::tfloat && precision != ElementType::tdouble)
LogicError("Unsupported precision type %u.", precision);
if ((m_dataType == DataType::tfloat && precision != ElementType::tfloat) ||
(m_dataType == DataType::tdouble && precision != ElementType::tdouble))
LogicError("Unsupported combination of the input data type %u and precision %u. "
"At the moment, both have to match.", m_dataType, precision);
m_precision = precision;
}
virtual size_t GetSequenceDataForChunk(size_t numSequences, void* data, std::vector<SequenceDataPtr>& result) = 0;
StorageType GetStorageType() { return m_storageType; }
ElementType GetElementType() { return m_elemType; }
TensorShapePtr GetSampleLayout() { return make_shared<TensorShape>(m_numCols); }
virtual bool IsSequence() { return false; }
virtual StorageType GetStorageType() = 0;
size_t GetElemSizeBytes()
StreamDescriptionPtr GetStreamDescription()
{
if (m_elemType == ElementType::tfloat)
auto streamDescription = std::make_shared<StreamDescription>();
streamDescription->m_elementType = m_precision;
streamDescription->m_storageType = GetStorageType();
streamDescription->m_sampleLayout = GetSampleLayout();
streamDescription->m_name = m_name;
return streamDescription;
}
TensorShapePtr GetSampleLayout()
{
return make_shared<TensorShape>(m_sampleDimension);
}
size_t SizeOfDataType()
{
if (m_dataType == DataType::tfloat)
return sizeof(float);
else if (m_elemType == ElementType::tdouble)
if (m_dataType == DataType::tdouble)
return sizeof(double);
else
LogicError("Error, elemtype is not defined for BinaryDataDeserializer.");
LogicError("Unsupported input data type %u.", m_dataType);
}
protected:
enum class DataType : unsigned char
{
tfloat = 0,
tdouble = 1,
// TODO:
// tbool = 2, 1 bit per value (one-hot data)
// tbyte = 3, 1 byte per value
};
virtual ~BinaryDataDeserialzer() = default;
void ReadName(FILE* file)
{
uint32_t len;
// read the name
CNTKBinaryFileHelper::ReadOrDie(&len, sizeof(len), 1, file);
vector<char> temp(len + 1, '\0');
CNTKBinaryFileHelper::ReadOrDie(temp.data(), sizeof(char), len, file);
m_name = msra::strfun::utf16(temp.data());
}
void ReadDataType(FILE* file)
{
CNTKBinaryFileHelper::ReadOrDie(&m_dataType, sizeof(m_dataType), 1, file);
if (m_dataType > DataType::tdouble)
RuntimeError("Unsupported input data type %u.", m_dataType);
}
void ReadSampleSize(FILE* file)
{
CNTKBinaryFileHelper::ReadOrDie(&m_sampleDimension, sizeof(m_sampleDimension), 1, file);
}
struct DenseInputStreamBuffer : DenseSequenceData
{
// capacity = expected number of samples * sample size
const void* GetDataBuffer() override
{
return m_data;
}
void* m_data;
DataType m_dataType;
};
// In case of sparse input, we also need a vector of
// indices (one index for each input value) and a vector
// of NNZ counts (one for each sample).
struct SparseInputStreamBuffer : SparseSequenceData
{
SparseInputStreamBuffer()
@@ -60,16 +122,13 @@ protected:
return m_data;
}
std::vector<IndexType> m_indicesBuffer;
void* m_data;
};
protected:
StorageType m_storageType;
ElementType m_elemType;
size_t m_numCols;
ElementType m_precision;
DataType m_dataType;
uint32_t m_sampleDimension;
wstring m_name;
};
typedef shared_ptr<BinaryDataDeserialzer> BinaryDataDeserializerPtr;
@@ -77,156 +136,108 @@ typedef shared_ptr<BinaryDataDeserialzer> BinaryDataDeserializerPtr;
class DenseBinaryDataDeserializer : public BinaryDataDeserialzer
{
public:
DenseBinaryDataDeserializer(FILE* infile)
{
// We don't have to read the storage type. We know we're dense
m_storageType = StorageType::dense;
using BinaryDataDeserialzer::BinaryDataDeserialzer;
// Read the element type, note it's stored as an int32
int32_t elemType;
CNTKBinaryFileHelper::readOrDie(&elemType, sizeof(elemType), 1, infile);
if (elemType == 0)
m_elemType = ElementType::tfloat;
else if (elemType == 1)
m_elemType = ElementType::tdouble;
else
RuntimeError("Unsupported element type %d.", elemType);
// Read the number of columns
int32_t numCols;
CNTKBinaryFileHelper::readOrDie(&numCols, sizeof(numCols), 1, infile);
m_numCols = numCols;
}
virtual StorageType GetStorageType() override { return StorageType::dense; }
size_t GetSequenceDataForChunk(size_t numSequences, void* data, std::vector<SequenceDataPtr>& result)
{
size_t elemSize = GetElemSizeBytes();
size_t valueSize = SizeOfDataType();
result.resize(numSequences);
for (size_t c = 0; c < numSequences; c++)
size_t offset = 0;
for (size_t i = 0; i < numSequences; i++)
{
shared_ptr<DenseInputStreamBuffer> sequence = make_shared<DenseInputStreamBuffer>();
sequence->m_data = (char*)data + c*m_numCols*elemSize;
sequence->m_numberOfSamples = 1;
sequence->m_sampleLayout = std::make_shared<TensorShape>(m_numCols);
result[c] = sequence;
shared_ptr<DenseInputStreamBuffer> sequenceDataPtr = make_shared<DenseInputStreamBuffer>();
sequenceDataPtr->m_numberOfSamples = *(uint32_t*)((char*)data + offset);
offset += sizeof(uint32_t);
sequenceDataPtr->m_data = (char*)data + offset;
sequenceDataPtr->m_sampleLayout = GetSampleLayout();
sequenceDataPtr->m_elementType = m_precision;
result[i] = sequenceDataPtr;
offset += m_sampleDimension * valueSize * sequenceDataPtr->m_numberOfSamples;
}
// For dense, the number of bytes processed is just numRows * numCols * elemSize;
return numSequences * m_numCols * elemSize;
return offset;
}
};
class SparseBinaryDataDeserializer : public BinaryDataDeserialzer
{
public:
SparseBinaryDataDeserializer(FILE* infile)
SparseBinaryDataDeserializer(FILE* file, ElementType precision = ElementType::tfloat)
:BinaryDataDeserialzer(file, precision)
{
// Read the storage type. Currently we only support sparse_csc,
// but for future compatibility allow it to be a parameter.
int32_t storageType;
CNTKBinaryFileHelper::readOrDie(&storageType, sizeof(storageType), 1, infile);
if (storageType == 0)
m_storageType = StorageType::sparse_csc;
else
RuntimeError("Unsupported storage type %d.", storageType);
// Read the element type, note it's stored as an int32
int32_t elemType;
CNTKBinaryFileHelper::readOrDie(&elemType, sizeof(elemType), 1, infile);
if (elemType== 0)
m_elemType = ElementType::tfloat;
else if (elemType == 1)
m_elemType = ElementType::tdouble;
else
RuntimeError("Unsupported element type %d.", elemType);
int32_t isSequence;
CNTKBinaryFileHelper::readOrDie(&isSequence, sizeof(isSequence), 1, infile);
if (isSequence == 0)
m_isSequence = false;
else if (isSequence == 1)
m_isSequence = true;
else
RuntimeError("Unsupported sequence type %d.", isSequence);
// Read the number of columns
int32_t numCols;
CNTKBinaryFileHelper::readOrDie(&numCols, sizeof(numCols), 1, infile);
m_numCols = numCols;
if (IndexType(m_sampleDimension) < 0)
{
RuntimeError("Sample dimension is too large for an IndexType value.");
}
}
bool IsSequence() override { return m_isSequence; }
virtual StorageType GetStorageType() override { return StorageType::sparse_csc; }
// The format of data is:
// int32_t: nnz for the entire chunk
// ElemType[nnz]: the values for the sparse sequences
// int32_t[nnz]: the row offsets for the sparse sequences
// int32_t[numSequences]: the column offsets for the sparse sequences
// sequence[numSequences], where each sequence consists of:
// uint32_t: numSamples
// uint32_t: nnz for the sequence
// ElemType[nnz]: the values for the sparse sequences
// int32_t[nnz]: the row offsets for the sparse sequences
// int32_t[numSamples]: sizes (nnz counts) for each sample in the sequence
size_t GetSequenceDataForChunk(size_t numSequences, void* data, std::vector<SequenceDataPtr>& result)
{
size_t elemSize = GetElemSizeBytes();
size_t offset = 0;
result.resize(numSequences);
// For sparse, the first int32_t is the number of nnz values in the entire set of sequences
int32_t totalNNz = *(int32_t*)data;
// the rest of this chunk
// Since we're not templating on ElemType, we use void for the values. Note that this is the only place
// this deserializer uses ElemType, the rest are int32_t for this deserializer.
void* values = (char*)data + sizeof(int32_t);
// Now the row offsets
int32_t* rowOffsets = (int32_t*)((char*)values + elemSize * totalNNz);
// Now the col offsets
int32_t* colOffsets = rowOffsets + totalNNz;
// Now we setup some helper members to process the chunk
for (size_t colIndex = 0; colIndex < numSequences; colIndex++)
for (size_t i = 0; i < numSequences; i++)
{
shared_ptr<SparseInputStreamBuffer> sequence = make_shared<SparseInputStreamBuffer>();
// We can't populate sequence->m_chunk here, so delay that for later
// We know the number of elements in all of the samples, it's just this:
sequence->m_totalNnzCount = colOffsets[colIndex + 1] - colOffsets[colIndex];
// The values array is already properly packed, so just use it.
sequence->m_data = values;
// The indices are correct (note they MUST BE IN INCREASING ORDER), but we will have to fix them up a
// little bit, for now just use them
sequence->m_indices = rowOffsets;
for (int32_t curRow = 0; curRow < sequence->m_totalNnzCount; curRow++)
{
// Get the sample for the current index
size_t sampleNum = rowOffsets[curRow] / m_numCols;
// The current sample might be OOB, if so, fill in the the missing ones.
while(sequence->m_nnzCounts.size() < sampleNum+1)
sequence->m_nnzCounts.push_back(0);
// Now that we have enough samples, increment the nnz for the sample
sequence->m_nnzCounts[sampleNum] += 1;
// Now that we've found it's sample, fix up the index.
rowOffsets[curRow] %= m_numCols;
}
sequence->m_numberOfSamples = (uint32_t)sequence->m_nnzCounts.size();
// update values, rowOffsets pointers
values = (char*)values + sequence->m_totalNnzCount * elemSize;
rowOffsets += sequence->m_totalNnzCount;
result[colIndex] = sequence;
shared_ptr<SparseInputStreamBuffer> sequenceDataPtr = make_shared<SparseInputStreamBuffer>();
offset += GetSequenceData((char*)data + offset, sequenceDataPtr);
sequenceDataPtr->m_sampleLayout = GetSampleLayout();
sequenceDataPtr->m_elementType = m_precision;
result[i] = sequenceDataPtr;
}
// For sparse, we compute how many bytes we processed
// From the header to this function, we see that is:
// sizeof(int32_t) + totalNNz * sizeof(ElemType) + totalNNz * sizeof(int32_t) + numSequences * sizeof(int32_t)
return sizeof(int32_t) + totalNNz * (elemSize + sizeof(int32_t)) + (numSequences + 1) * sizeof(int32_t);
return offset;
}
private:
bool m_isSequence;
size_t GetSequenceData(void* data, shared_ptr<SparseInputStreamBuffer>& sequence)
{
size_t valueSize = SizeOfDataType();
size_t offset = 0;
// The very first value in the buffer is the number of samples in this sequence.
sequence->m_numberOfSamples = *(uint32_t*)data;
offset += sizeof(uint32_t);
// Next is the total number of elements in all of the samples.
uint32_t nnz = *(uint32_t*)((char*)data + offset);
if (IndexType(nnz) < 0)
{
RuntimeError("NNZ count is too large for an IndexType value.");
}
sequence->m_totalNnzCount = nnz;
offset += sizeof(uint32_t);
// the rest of this sequence
// Since we're not templating on ElemType, we use void for the values. Note that this is the only place
// this deserializer uses ElemType, the rest are int32_t for this deserializer.
// The data is already properly packed, so just use it.
sequence->m_data = (char*)data + offset;
offset += valueSize * sequence->m_totalNnzCount;
// The indices are supposed to be correctly packed (i.e., in increasing order)
sequence->m_indices = (int32_t*)((char*)data + offset);
offset += sizeof(int32_t) * sequence->m_totalNnzCount;
int32_t* begin = (int32_t*)((char*)data + offset);
offset += sizeof(int32_t) * sequence->m_numberOfSamples;
int32_t* end = (int32_t*)((char*)data + offset);
sequence->m_nnzCounts.reserve(sequence->m_numberOfSamples);
sequence->m_nnzCounts.assign(begin, end);
return offset;
}
};
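Mirroring the format comment above, a standalone Python sketch of decoding one sparse sequence record (float precision assumed; like GetSequenceData, it returns the advanced offset):
import struct
def parse_sparse_sequence(buf, offset=0):
    # uint32 sample count, then int32 total nnz for the sequence.
    num_samples, nnz = struct.unpack_from('Ii', buf, offset)
    offset += 8
    values = struct.unpack_from('%df' % nnz, buf, offset)
    offset += 4 * nnz
    indices = struct.unpack_from('%di' % nnz, buf, offset)
    offset += 4 * nnz
    sizes = struct.unpack_from('%di' % num_samples, buf, offset)
    offset += 4 * num_samples
    return (num_samples, values, indices, sizes), offset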

View file

@@ -23,8 +23,8 @@ CNTKBinaryReader::CNTKBinaryReader(const ConfigParameters& config)
{
BinaryConfigHelper configHelper(config);
string log;
log += "Initializing CNTKBinaryReader";
std::stringstream log;
log << "Initializing CNTKBinaryReader";
try
{
m_deserializer = shared_ptr<IDataDeserializer>(new BinaryChunkDeserializer(configHelper));
@@ -32,14 +32,16 @@ CNTKBinaryReader::CNTKBinaryReader(const ConfigParameters& config)
if (configHelper.ShouldKeepDataInMemory())
{
m_deserializer = shared_ptr<IDataDeserializer>(new ChunkCache(m_deserializer));
log += " | keeping data in memory";
log << " | keeping data in memory";
}
size_t window = configHelper.GetRandomizationWindow();
if (window > 0)
{
// Verbosity is a general config parameter, not specific to the binary format reader.
log += " | randomizing with window: " + (int)window;
log << " | randomizing with window: "
<< window
<< (configHelper.UseSampleBasedRandomizationWindow() ? " samples" : " chunks");
int verbosity = config(L"verbosity", 0);
m_sequenceEnumerator = make_shared<BlockRandomizer>(
verbosity, /* verbosity */
@@ -52,7 +54,7 @@ CNTKBinaryReader::CNTKBinaryReader(const ConfigParameters& config)
}
else
{
log += " | without randomization";
log << " | without randomization";
m_sequenceEnumerator = std::make_shared<NoRandomizer>(m_deserializer);
}
@@ -64,7 +66,7 @@ CNTKBinaryReader::CNTKBinaryReader(const ConfigParameters& config)
RuntimeError("CNTKBinaryReader: While reading '%ls': %s", configHelper.GetFilePath().c_str(), e.what());
}
if (configHelper.GetTraceLevel() > 2)
fprintf(stderr, "%s\n", log.c_str());
fprintf(stderr, "%s\n", log.str().c_str());
}
} } }

View file

@@ -34,6 +34,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class CNTKBinaryFileHelper
{
public:
static const uint64_t MAGIC_NUMBER = 0x636e746b5f62696eU;
static void FindMagicOrDie(FILE* f, wstring name) {
// Read the magic number and make sure we're given a proper CBF file.
uint64_t number;
ReadOrDie(&number, sizeof(number), 1, f);
if (number != MAGIC_NUMBER)
RuntimeError("The input (%S) is not a valid CNTK binary format file.",
name.c_str());
}
static uint32_t GetVersionNumber(FILE* f) {
uint32_t versionNumber;
ReadOrDie(&versionNumber, sizeof(versionNumber), 1, f);
return versionNumber;
}
static int64_t GetHeaderOffset(FILE* f) {
// Seek to the end of file -8 bytes to find the offset of the header.
SeekOrDie(f, -int64_t(sizeof(int64_t)), SEEK_END);
int64_t headerOffset;
ReadOrDie(&headerOffset, sizeof(headerOffset), 1, f);
return headerOffset;
}
static FILE* openOrDie(const string& pathname, const char* mode)
{
FILE* f = fopen(pathname.c_str(), mode);
@@ -42,7 +67,7 @@ public:
return f;
}
static FILE* openOrDie(const wstring& pathname, const wchar_t* mode)
static FILE* OpenOrDie(const wstring& pathname, const wchar_t* mode)
{
FILE* f = _wfopen(pathname.c_str(), mode);
if (!f)
@@ -50,14 +75,14 @@ public:
return f;
}
static void closeOrDie(FILE* f)
static void CloseOrDie(FILE* f)
{
int rc = fclose(f);
if (rc != 0)
RuntimeError("Error closing: %s.", strerror(errno));
}
static void seekOrDie(FILE* f, int64_t offset, int mode)
static void SeekOrDie(FILE* f, int64_t offset, int mode)
{
int rc;
#ifdef __WINDOWS__
@@ -69,7 +94,7 @@ public:
RuntimeError("Error seeking: %s.", strerror(errno));
}
static int64_t tellOrDie(FILE* f)
static int64_t TellOrDie(FILE* f)
{
size_t rc;
#ifdef __WINDOWS__
@@ -82,7 +107,7 @@ public:
return rc;
}
static void readOrDie(void* ptr, size_t size, size_t count, FILE* f)
static void ReadOrDie(void* ptr, size_t size, size_t count, FILE* f)
{
size_t rc;
rc = fread(ptr, size, count, f);

View file

@@ -1431,16 +1431,22 @@ Test module "ReaderTests" has passed with:
95 test cases out of 95 passed
16971350 assertions out of 16971350 passed
Test case "ReaderTestSuite/CNTKBinaryReader_sparse_seq" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKBinaryReader_Simple_sparse" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKBinaryReader_Simple_dense" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKBinaryReader_Simple_dense2" has passed with:
Test case "ReaderTestSuite/CNTKBinaryReader_MNIST_dense" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKBinaryReader_10x10_dense" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKBinaryReader_50x20_jagged_sequences_dense" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKBinaryReader_10x10_sparse" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKBinaryReader_50x20_jagged_sequences_sparse" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKTextFormatReader_Simple_dense" has passed with:

View file

@@ -9,9 +9,7 @@
using namespace Microsoft::MSR::CNTK;
namespace Microsoft { namespace MSR { namespace CNTK {
namespace Test {
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
struct CNTKBinaryReaderFixture : ReaderFixture
{
@@ -23,75 +21,116 @@ BOOST_FIXTURE_TEST_SUITE(ReaderTestSuite, CNTKBinaryReaderFixture)
BOOST_FIXTURE_TEST_SUITE(ReaderTestSuite, CNTKBinaryReaderFixture)
BOOST_AUTO_TEST_CASE(CNTKBinaryReader_sparse_seq)
{
HelperRunReaderTest<float>(
testDataPath() + "/Config/CNTKBinaryReader/test.cntk",
testDataPath() + "/Control/CNTKBinaryReader/Simple_sparse_seq.txt",
testDataPath() + "/Control/CNTKBinaryReader/Simple_sparse_seq_Output.txt",
"SparseSeq",
"reader",
1500, // epoch size
250, // mb size
1, // num epochs
2,
2,
0,
1, true, false, false);
};
BOOST_AUTO_TEST_CASE(CNTKBinaryReader_Simple_sparse)
{
HelperRunReaderTest<float>(
testDataPath() + "/Config/CNTKBinaryReader/test.cntk",
testDataPath() + "/Control/CNTKBinaryReader/Simple_sparse.txt",
testDataPath() + "/Control/CNTKBinaryReader/Simple_sparse_Output.txt",
"Sparse",
"reader",
1600, // epoch size
250, // mb size
1, // num epochs
2,
2,
0,
1, true, false, false);
};
BOOST_AUTO_TEST_CASE(CNTKBinaryReader_Simple_dense)
{
HelperRunReaderTest<float>(
testDataPath() + "/Config/CNTKBinaryReader/test.cntk",
testDataPath() + "/Control/CNTKBinaryReader/Simple_dense.txt",
testDataPath() + "/Control/CNTKTextFormatReader/Simple_dense.txt",
testDataPath() + "/Control/CNTKBinaryReader/Simple_dense_Output.txt",
"Simple",
"reader",
1600, // epoch size
1000, // epoch size
250, // mb size
1, // num epochs
4,
10, // num epochs
1,
1,
0,
0,
1, false, false, false);
1);
};
BOOST_AUTO_TEST_CASE(CNTKBinaryReader_Simple_dense2)
BOOST_AUTO_TEST_CASE(CNTKBinaryReader_MNIST_dense)
{
HelperRunReaderTest<double>(
testDataPath() + "/Config/CNTKBinaryReader/test.cntk",
testDataPath() + "/Control/CNTKTextFormatReader/MNIST_dense.txt",
testDataPath() + "/Control/CNTKBinaryReader/MNIST_dense_Output.txt",
"MNIST",
"reader",
1000, // epoch size
1000, // mb size
1, // num epochs
1,
1,
0,
1);
};
// 10 sequences with 10 samples each (no randomization)
BOOST_AUTO_TEST_CASE(CNTKBinaryReader_10x10_dense)
{
HelperRunReaderTest<float>(
testDataPath() + "/Config/CNTKBinaryReader/test.cntk",
testDataPath() + "/Control/CNTKBinaryReader/Simple_dense_312.txt",
testDataPath() + "/Control/CNTKBinaryReader/Simple_dense_312_Output.txt",
"Simple",
testDataPath() + "/Control/CNTKTextFormatReader/10x10_dense.txt",
testDataPath() + "/Control/CNTKBinaryReader/10x10_dense_Output.txt",
"10x10_dense",
"reader",
1600, // epoch size
312, // mb size
1, // num epochs
4,
100, // epoch size
100, // mb size
1, // num epochs
1,
0, // no labels
0,
0,
1, false, false, false);
1);
};
// 50 sequences with up to 20 samples each (508 samples in total)
BOOST_AUTO_TEST_CASE(CNTKBinaryReader_50x20_jagged_sequences_dense)
{
HelperRunReaderTest<double>(
testDataPath() + "/Config/CNTKBinaryReader/test.cntk",
testDataPath() + "/Control/CNTKTextFormatReader/50x20_jagged_sequences_dense.txt",
testDataPath() + "/Control/CNTKBinaryReader/50x20_jagged_sequences_dense_Output.txt",
"50x20_jagged_sequences_dense",
"reader",
508, // epoch size
508, // mb size
1, // num epochs
1,
0,
0,
1);
};
// 10 sequences with 10 samples each (no randomization)
BOOST_AUTO_TEST_CASE(CNTKBinaryReader_10x10_sparse)
{
HelperRunReaderTest<double>(
testDataPath() + "/Config/CNTKBinaryReader/test.cntk",
testDataPath() + "/Control/CNTKTextFormatReader/10x10_sparse.txt",
testDataPath() + "/Control/CNTKBinaryReader/10x10_sparse_Output.txt",
"10x10_sparse",
"reader",
100, // epoch size
100, // mb size
1, // num epochs
1,
0, // no labels
0,
1,
true);
};
// 50 sequences with up to 20 samples each (536 samples in total)
BOOST_AUTO_TEST_CASE(CNTKBinaryReader_50x20_jagged_sequences_sparse)
{
HelperRunReaderTest<float>(
testDataPath() + "/Config/CNTKBinaryReader/test.cntk",
testDataPath() + "/Control/CNTKTextFormatReader/50x20_jagged_sequences_sparse.txt",
testDataPath() + "/Control/CNTKBinaryReader/50x20_jagged_sequences_sparse_Output.txt",
"50x20_jagged_sequences_sparse",
"reader",
564, // epoch size
564, // mb size
1, // num epochs
1,
0,
0,
1,
true);
};
BOOST_AUTO_TEST_SUITE_END()
} } } }

View file

@@ -51,31 +51,6 @@ public:
namespace Test {
// identical to 'sort -o filename filename'
void SortLinesInFile(string filename, size_t expectedNumLines = 1)
{
vector<string> content;
content.reserve(expectedNumLines);
ifstream ifstream(filename);
string line;
while (getline(ifstream, line))
{
content.push_back(line);
}
ifstream.close();
sort(content.begin(), content.end());
ofstream ofstream(filename);
copy(content.begin(), content.end(), ostream_iterator<string>(ofstream, "\n"));
ofstream.close();
}
struct CNTKTextFormatReaderFixture : ReaderFixture
{
CNTKTextFormatReaderFixture()

View file

@@ -13,6 +13,30 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
const double relError = 1e-5f;
// identical to 'sort -o filename filename'
inline void SortLinesInFile(string filename, size_t expectedNumLines = 1)
{
vector<string> content;
content.reserve(expectedNumLines);
ifstream ifstream(filename);
string line;
while (getline(ifstream, line))
{
content.push_back(line);
}
ifstream.close();
sort(content.begin(), content.end());
ofstream ofstream(filename);
copy(content.begin(), content.end(), ostream_iterator<string>(ofstream, "\n"));
ofstream.close();
}
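
Because randomization can permute sequence order, tests that compare a generated output file against a control file normalize both with this helper before diffing them. A hypothetical call, with an illustrative path not taken from this commit:

// Hypothetical usage inside a test fixture: sort the generated output in
// place before a line-wise comparison against an equally sorted control file.
SortLinesInFile(testDataPath() + "/Control/CNTKTextFormatReader/MyTest_Output.txt");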
struct ReaderFixture
{
// This fixture sets up paths so the tests can assume the right location for finding the configuration

View file

@@ -1,50 +1,91 @@
# deviceId = -1 for CPU, >= 0 for GPU devices
deviceId = -1
SparseSeq = [
precision = "float"
reader = [
readerType = "CNTKBinaryReader"
file = "sparseseqoutput.bin"
input = [
features1 = [ alias="a" ]
features2 = [ alias="b" ]
labels1 = [ alias="c" ]
labels2 = [ alias="d" ]
]
randomize = false
]
]
Sparse = [
precision = "float"
reader = [
readerType = "CNTKBinaryReader"
file = "sparseoutput.bin"
input = [
features1 = [ alias="a" ]
features2 = [ alias="b" ]
labels1 = [ alias="c" ]
labels2 = [ alias="d" ]
]
randomize = false
]
]
Simple = [
precision = "float"
reader = [
readerType = "CNTKBinaryReader"
file = "simple.bin"
file = "Simple_dense.bin"
randomize = false
]
]
MNIST = [
precision = "double"
reader = [
readerType = "CNTKBinaryReader"
file = "MNIST_dense.bin" # contains half a dozen chunks with ca. 400 KB in each
randomize = false
keepDataInMemory = true
]
]
10x10_dense = [
precision = "float"
reader = [
readerType = "CNTKBinaryReader"
# Training file contains ten sequences with ten samples each
file = "10x10_dense.bin"
randomize = false
]
]
50x20_jagged_sequences_dense = [
precision = "double"
reader = [
readerType = "CNTKBinaryReader"
# Training file contains 50 sequences with *up to* 20 samples each
file = "50x20_jagged_sequences_dense.bin"
randomize = false
]
]
10x10_sparse = [
precision = "double"
reader = [
readerType = "CNTKBinaryReader"
# Training file contains ten sequences with ten samples each
file = "10x10_sparse.bin"
randomize = false
]
]
50x20_jagged_sequences_sparse = [
precision = "float"
reader = [
readerType = "CNTKBinaryReader"
# Training file contains 50 sequences with *up to* 20 samples each
file = "50x20_jagged_sequences_sparse.bin"
randomize = false
]
]
100x100x3_randomize_auto = [
precision = "double"
reader = [
readerType = "CNTKBinaryReader"
# Training file contains 100 sequences with *up to* 100 samples
# in each of 3 inputs.
file = "100x100x3_jagged_sequences_dense.bin"
randomize = true
]
]
5_inputs_100x10_jagged_mixed = [
precision = "float"
reader = [
readerType = "CNTKBinaryReader"
# Training file contains 100 sequences with *up to* 10 samples
# in each of 5 inputs.
file = "5_inputs_100x10_jagged_mixed.bin"
input = [
features1 = [ alias="a" ]
features2 = [ alias="b" ]
features3 = [ alias="c" ]
features4 = [ alias="d" ]
features5 = [ alias="e" ]
]
randomize = false
]
]
]
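
Taken together, the sections above exercise every CNTKBinaryReader option this commit touches. As a sketch only, with illustrative stream and file names, one section combining them might look like:

ExampleBinary = [
    precision = "float"
    reader = [
        readerType = "CNTKBinaryReader"
        file = "example.bin"
        # Optional: map the stream aliases stored in the binary file
        # to the input names CNTK should expose.
        input = [
            features = [ alias="a" ]
            labels = [ alias="b" ]
        ]
        randomize = false
        # Presumably caches all chunks once read (an assumption based on
        # the option name and its use in the MNIST section above).
        keepDataInMemory = true
    ]
]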

Diff not shown because the file is too large.

Diff not shown because the file is too large.

Diff not shown because the file is too large.

Binary file not shown.

Binary data
Tests/UnitTests/ReaderTests/Data/CNTKBinaryReader/10x10_dense.bin (new file)

Binary file not shown.

Binary data
Tests/UnitTests/ReaderTests/Data/CNTKBinaryReader/10x10_sparse.bin (new file)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary data
Tests/UnitTests/ReaderTests/Data/CNTKBinaryReader/MNIST_dense.bin (new file)

Binary file not shown.

Binary data
Tests/UnitTests/ReaderTests/Data/CNTKBinaryReader/Simple_dense.bin (new file)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@@ -143,6 +143,10 @@
<Text Include="Config\HTKMLFReaderSimpleDataLoop9_Config.cntk" />
<Text Include="Config\ImageReaderSimple_Config.cntk" />
<Text Include="Config\UCIFastReaderSimpleDataLoop_Config.cntk" />
<Text Include="Control\CNTKBinaryReader\Simple_dense.txt" />
<Text Include="Control\CNTKBinaryReader\Simple_dense_312.txt" />
<Text Include="Control\CNTKBinaryReader\Simple_sparse.txt" />
<Text Include="Control\CNTKBinaryReader\Simple_sparse_seq.txt" />
<Text Include="Control\CNTKTextFormatReader\100x100x3_jagged_sequences_dense_sorted.txt" />
<Text Include="Control\CNTKTextFormatReader\100x1_1_dense.txt" />
<Text Include="Control\CNTKTextFormatReader\100x1_2_dense.txt" />
@@ -218,6 +222,7 @@
<Image Include="Data\images\red.jpg" />
</ItemGroup>
<ItemGroup>
<None Include="Config\CNTKBinaryReader\test.cntk" />
<None Include="Config\CNTKTextFormatReader\dense.cntk" />
<None Include="Config\CNTKTextFormatReader\edge_cases.cntk" />
<None Include="Config\CNTKTextFormatReader\sparse.cntk" />
@@ -244,6 +249,9 @@
<None Include="Config\ImageReaderLabelOutOfRange_Config.cntk" />
<None Include="Config\ImageReaderMultiView_Config.cntk" />
<None Include="Config\ImageReaderZip_Config.cntk" />
<None Include="Data\CNTKBinaryReader\simple.bin" />
<None Include="Data\CNTKBinaryReader\sparseoutput.bin" />
<None Include="Data\CNTKBinaryReader\sparseseqoutput.bin" />
<None Include="Data\images\chunk0.zip" />
<None Include="Data\images\chunk1.zip" />
<None Include="Data\images\simple.zip" />

View file

@@ -52,6 +52,15 @@
<Filter Include="Config\HTKDeserializers">
<UniqueIdentifier>{b2a5f515-1b00-4f19-bd19-8e3fd7eb6872}</UniqueIdentifier>
</Filter>
<Filter Include="Config\CNTKBinaryReader">
<UniqueIdentifier>{46df705e-a35d-4bb8-9272-92c7cb293079}</UniqueIdentifier>
</Filter>
<Filter Include="Control\CNTKBinaryReader">
<UniqueIdentifier>{4194f906-450f-4112-841d-c80b3f90081e}</UniqueIdentifier>
</Filter>
<Filter Include="Data\CNTKBinaryReader">
<UniqueIdentifier>{4637ff35-9ba4-4850-bfe5-d534c5920388}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<Text Include="Data\ImageReaderSimple_map.txt">
@@ -324,6 +333,18 @@
<Text Include="Data\ImageAndTextReaderSimple_map.txt">
<Filter>Data</Filter>
</Text>
<Text Include="Control\CNTKBinaryReader\Simple_dense.txt">
<Filter>Control\CNTKBinaryReader</Filter>
</Text>
<Text Include="Control\CNTKBinaryReader\Simple_dense_312.txt">
<Filter>Control\CNTKBinaryReader</Filter>
</Text>
<Text Include="Control\CNTKBinaryReader\Simple_sparse.txt">
<Filter>Control\CNTKBinaryReader</Filter>
</Text>
<Text Include="Control\CNTKBinaryReader\Simple_sparse_seq.txt">
<Filter>Control\CNTKBinaryReader</Filter>
</Text>
</ItemGroup>
<ItemGroup>
<Image Include="Data\images\black.jpg">
@@ -430,6 +451,18 @@
<None Include="Config\ImageDeserializers.cntk">
<Filter>Config</Filter>
</None>
<None Include="Config\CNTKBinaryReader\test.cntk">
<Filter>Config\CNTKBinaryReader</Filter>
</None>
<None Include="Data\CNTKBinaryReader\simple.bin">
<Filter>Data\CNTKBinaryReader</Filter>
</None>
<None Include="Data\CNTKBinaryReader\sparseoutput.bin">
<Filter>Data\CNTKBinaryReader</Filter>
</None>
<None Include="Data\CNTKBinaryReader\sparseseqoutput.bin">
<Filter>Data\CNTKBinaryReader</Filter>
</None>
</ItemGroup>
<ItemGroup>
<Xml Include="Data\ImageNet1K_intensity.xml">