//
// <copyright file="UCIFastReader.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// UCIFastReader.cpp : Defines the exported functions for the DLL application.
//

#include "stdafx.h"
#define DATAREADER_EXPORTS // creating the exports here
#include "DataReader.h"
#include "UCIFastReader.h"
#ifdef LEAKDETECT
#include <vld.h> // leak detection
#endif
#include "fileutil.h" // for fexists()

namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
size_t UCIFastReader<ElemType>::RandomizeSweep(size_t mbStartSample)
{
    //size_t randomRangePerEpoch = (m_epochSize+m_randomizeRange-1)/m_randomizeRange;
    //return m_epoch*randomRangePerEpoch + epochSample/m_randomizeRange;
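    // e.g. with m_randomizeRange == 1000, samples [0,1000) fall in sweep 0, [1000,2000) in sweep 1, and so on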
    return mbStartSample / m_randomizeRange;
}

// ReadRecord - Read a single record (not used by this reader; records are read in bulk by the parser)
// readSample - sample to read in global sample space
// returns - true if we successfully read a record, otherwise false
template <class ElemType>
bool UCIFastReader<ElemType>::ReadRecord(size_t /*readSample*/)
{
    return false; // not used
}

// RecordsToRead - Determine number of records to read to populate record buffers
// mbStartSample - the starting sample from which to read
// tail - we are checking for possible remainder records to read (default false)
// returns - the number of records to read
template <class ElemType>
size_t UCIFastReader<ElemType>::RecordsToRead(size_t mbStartSample, bool tail)
{
    assert(mbStartSample >= m_epochStartSample);
    // determine how far ahead we need to read
    bool randomize = Randomize();
    // need to read to the end of the next minibatch
    size_t epochSample = mbStartSample;
    epochSample %= m_epochSize;

    // determine number left to read for this epoch
    size_t numberToEpoch = m_epochSize - epochSample;
    // we will take either a minibatch or the number left in the epoch
    size_t numberToRead = min(numberToEpoch, m_mbSize);
    if (numberToRead == 0 && !tail)
        numberToRead = m_mbSize;

    if (randomize)
    {
        size_t randomizeSweep = RandomizeSweep(mbStartSample);
        // if first read or read takes us to another randomization range
        // we need to read at least randomization range records
        if (randomizeSweep != m_randomordering.CurrentSeed()) // the range has changed since last time
        {
            numberToRead = RoundUp(epochSample, m_randomizeRange) - epochSample;
            if (numberToRead == 0 && !tail)
                numberToRead = m_randomizeRange;
        }
    }
    return numberToRead;
}

// EnsureDataAvailable - Read enough lines so we can request a minibatch starting as requested
// mbStartSample - the starting sample we are ensuring is good
// endOfDataCheck - check if we are at the end of the dataset (no wraparound)
// returns - true if we have more to read, false if we hit the end of the dataset
template <class ElemType>
bool UCIFastReader<ElemType>::EnsureDataAvailable(size_t mbStartSample, bool endOfDataCheck)
{
    assert(mbStartSample >= m_epochStartSample);
    // determine how far ahead we need to read
    Randomize();
    // need to read to the end of the next minibatch
    size_t epochSample = mbStartSample;
    epochSample %= m_epochSize;

    bool moreToRead = true;
    size_t numberToRead = RecordsToRead(mbStartSample);

    // check to see if we have the proper records read already
    if (m_readNextSample >= mbStartSample + numberToRead && mbStartSample >= m_epochStartSample)
        return true;

    // truncate the present arrays to the location we are reading from; the parser appends to these arrays
    if (m_featureData.size() > epochSample * m_featureCount) // should be this size, if not, truncate
        m_featureData.resize(epochSample * m_featureCount);
    if (m_labelType != labelNone && m_labelData.size() > epochSample)
    {
        // make sure the labelId array is also the correct size
        if (m_labelType == labelCategory)
            m_labelIdData.resize(epochSample);
        m_labelData.resize(epochSample);
    }
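    // read in chunks until numberToRead records are available; when the parser runs out of data we wrap
    // back to the start of the file and keep reading, unless an end-of-data check or a partial minibatch stops us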
    int recordsRead = 0;
    do
    {
        int numRead = m_parser.Parse(numberToRead - recordsRead, &m_featureData, &m_labelData);
        recordsRead += numRead;
        if (!m_endReached)
            m_totalSamples += numRead; // total number of records in the dataset

        // we should only get fewer records than requested when we hit the end of the dataset
        if (recordsRead < numberToRead)
        {
            // update dataset variables
            size_t additionalToRead = UpdateDataVariables(mbStartSample + recordsRead);
            m_parser.SetFilePosition(0); // make another pass of the dataset

            // if doing an end-of-data check and we are at the end,
            // or a partial minibatch was found, exit now
            if ((endOfDataCheck && recordsRead == 0) ||
                (m_partialMinibatch && recordsRead > 0))
            {
                moreToRead = false;
                break;
            }

            // get the additional number to read
            numberToRead = recordsRead + additionalToRead;
        }
    } while (recordsRead < numberToRead);
    m_readNextSample += recordsRead;

    // for category labels, we need to build up a list of IDs and a mapping table
    if (m_labelType == labelCategory)
    {
        // loop through all the newly read records
        for (int numberRead = 0; numberRead < recordsRead; numberRead++)
        {
            LabelType& label = m_labelData[epochSample + numberRead];
            // check to see if we have seen this label before
            auto value = m_mapLabelToId.find(label);
            LabelIdType labelId;
            if (value == m_mapLabelToId.end())
            {
                if (m_labelFileToWrite.empty())
                    RuntimeError("label found in data not specified in label mapping file: %s", label.c_str());
                // new label, so add it to the mapping tables
                m_mapLabelToId[label] = m_labelIdMax;
                m_mapIdToLabel[m_labelIdMax] = label;
                labelId = m_labelIdMax++;

                // if our label dimension is lower than the current labelId then increase it
                if (m_labelDim < m_labelIdMax)
                    m_labelDim = m_labelIdMax;
            }
            else
            {
                labelId = value->second;
            }

            // now add the label id to the label data array
            m_labelIdData.push_back(labelId);
        }
    }

    // if there is more to read (always true, unless we want partial minibatches)
    return moreToRead;
}

// UpdateDataVariables - Update variables that depend on the dataset being completely read
template <class ElemType>
size_t UCIFastReader<ElemType>::UpdateDataVariables(size_t mbStartSample)
{
    // if we haven't been all the way through the file yet
    if (!m_endReached)
    {
        // get the size of the dataset
        assert(m_totalSamples * m_featureCount >= m_featureData.size());

        // if they want us to determine epoch size based on dataset size, do that
        if (m_epochSize == requestDataSize)
        {
            // set the epoch size to be a multiple of mbSize or the randomization range
            if (m_partialMinibatch)
                m_epochSize = m_totalSamples;
            else
            {
                size_t roundUpTo = m_mbSize;
                if (m_randomizeRange != randomizeAuto && m_randomizeRange != randomizeNone)
                    roundUpTo = m_randomizeRange;
                m_epochSize = RoundUp(m_totalSamples, roundUpTo);
            }
        }

        // make sure the randomization range is within the sample bounds
        if (m_randomizeRange > m_epochSize)
        {
            m_randomizeRange = m_epochSize;
            m_randomordering.resize(m_randomizeRange, m_randomizeRange);
        }

        // write the label file if we hit the end of the file
        WriteLabelFile();

        // we got to the end of the dataset
        m_endReached = true;
    }

    // update the label dimension if it is not big enough; needed here because m_labelIdMax gets updated in the processing loop (after a read)
    if (m_labelType == labelCategory && m_labelIdMax > m_labelDim)
        m_labelDim = m_labelIdMax; // update the label dimensions if different

    bool recordsToRead = mbStartSample < m_epochStartSample + m_epochSize; // still some to read after potential epochSize change?
    return recordsToRead ? RecordsToRead(mbStartSample) : 0;
}

template <class ElemType>
void UCIFastReader<ElemType>::WriteLabelFile()
{
    // write out the label file if they don't have one
    if (!m_labelFileToWrite.empty())
    {
        if (m_mapIdToLabel.size() > 0)
        {
            File labelFile(m_labelFileToWrite, fileOptionsWrite | fileOptionsText);
            for (int i = 0; i < m_mapIdToLabel.size(); ++i)
            {
                labelFile << m_mapIdToLabel[i] << '\n';
            }
            fprintf(stderr, "label file %ls written to disk\n", m_labelFileToWrite.c_str());
            m_labelFileToWrite.clear();
        }
        else if (!m_cachingWriter)
        {
            fprintf(stderr, "WARNING: file %ls NOT written to disk yet, will be written the first time the end of the entire dataset is found.\n", m_labelFileToWrite.c_str());
        }
    }
}

// Destroy - cleanup and remove this class
// NOTE: this destroys the object, and it can't be used past this point
template <class ElemType>
void UCIFastReader<ElemType>::Destroy()
{
    delete this;
}
// Init - Reader Initialize for multiple data sets
// config - [in] configuration parameters for the datareader
// Sample format below:
//# Parameter values for the reader
//reader=[
// # reader to use
// readerType=UCIFastReader
// miniBatchMode=Partial
// randomize=None
// features=[
// dim=784
// start=1
// file=c:\speech\mnist\mnist_test.txt
// ]
// labels=[
// dim=1
// start=0
// file=c:\speech\mnist\mnist_test.txt
// labelMappingFile=c:\speech\mnist\labels.txt
// labelDim=10
// labelType=Category
// ]
//]
template <class ElemType>
void UCIFastReader<ElemType>::Init(const ConfigParameters& readerConfig)
{
    // See if the user wants caching
    m_cachingReader = NULL;
    m_cachingWriter = NULL;

    // initialize the cache
    InitCache(readerConfig);
    readerConfig.CopyTo(m_readerConfig);

    // if we have a cache, no need to parse the text files...
    if (m_cachingReader)
        return;

    std::vector<std::wstring> features;
    std::vector<std::wstring> labels;
    GetFileConfigNames(readerConfig, features, labels);
    if (features.size() > 0)
    {
        m_featuresName = features[0];
    }
    if (labels.size() > 0)
    {
        m_labelsName = labels[0];
    }
    ConfigParameters configFeatures = readerConfig(m_featuresName, "");
    ConfigParameters configLabels = readerConfig(m_labelsName, "");

    if (configFeatures.size() == 0)
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");
    if (configLabels.size() == 0)
        fprintf(stderr, "Warning: labels are not specified.");
    else if (configFeatures("file", "") != configLabels("file", ""))
        RuntimeError("features and label files must be the same file, use separate readers to define single use files");

    size_t vdim = configFeatures("dim");
    string name = configFeatures.Name();
    size_t udim = configLabels("labelDim", "0");

    // initialize all the variables
    m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = 0;
    m_labelIdMax = m_labelDim = 0;
    m_partialMinibatch = m_endReached = false;
    m_labelType = labelCategory;
    m_featureCount = vdim;
    m_readNextSample = 0;
    m_traceLevel = readerConfig("traceLevel", "0");
    m_parser.SetTraceLevel(m_traceLevel);

    // we had better have at least one feature
    assert(m_featureCount != 0);

    if (readerConfig.Exists("randomize"))
    {
        string randomizeString = readerConfig("randomize");
        if (randomizeString == "None")
        {
            m_randomizeRange = randomizeNone;
        }
        else if (randomizeString == "Auto")
        {
            m_randomizeRange = randomizeAuto;
        }
        else
        {
            m_randomizeRange = readerConfig("randomize");
        }
    }
    else
    {
        m_randomizeRange = randomizeAuto;
    }

    // determine if partial minibatches are desired
    std::string minibatchMode(readerConfig("minibatchMode", "Partial"));
    m_partialMinibatch = !_stricmp(minibatchMode.c_str(), "Partial");
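    // note: _stricmp returns 0 on a match, so m_partialMinibatch is true only when minibatchMode is "Partial" (case-insensitive)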

    // get start and dimensions for labels and features
    size_t startLabels = configLabels("start", "0");
    size_t dimLabels = configLabels("dim", "0");

    size_t startFeatures = configFeatures("start", "0");
    size_t dimFeatures = configFeatures("dim", "0");

    // determine label type desired
    std::string labelType;
    if (configLabels.size() == 0)
        labelType = "None";
    else
        labelType = configLabels("labelType", "Category");

    // convert to lower case for case-insensitive comparison
    msra::strfun::tolower_ascii(labelType);
    if (labelType == "category")
    {
        m_labelType = labelCategory;
    }
    else if (labelType == "regression")
    {
        m_labelType = labelRegression;
    }
    else if (labelType == "none")
    {
        m_labelType = labelNone;
        dimLabels = 0; // override for no labels
    }

    std::wstring file = configFeatures("file");
    if (m_traceLevel > 0)
        fprintf(stderr, "reading uci file %ls\n", file.c_str());

    m_parser.ParseInit(file.c_str(), startFeatures, dimFeatures, startLabels, dimLabels);

    // if we have labels, we need a label mapping file; it will be a file with one label per line
    if (m_labelType != labelNone)
    {
        ConfigArray arrayLabels;
        std::wstring labelPath = configLabels("labelMappingFile");
        if (fexists(labelPath))
        {
            arrayLabels.LoadConfigFile(labelPath);
            for (int i = 0; i < arrayLabels.size(); ++i)
            {
                LabelType label = arrayLabels[i];
                m_mapIdToLabel[i] = label;
                m_mapLabelToId[label] = i;
            }
            m_labelIdMax = (LabelIdType)arrayLabels.size();
        }
        else
        {
            // only do label creation if we have the allow flag; it should only be done as a separate command
            // to ensure that the label file will exist for the verification step in training
            bool allowLabelCreation = readerConfig("allowMapCreation", "false");
            if (allowLabelCreation)
                m_labelFileToWrite = labelPath;
            else
                RuntimeError("label mapping file %ls not found, can be created with a 'createLabelMap' command/action\n", labelPath.c_str());
        }
    }

    // if we know the size of the randomization now, resize; otherwise wait until we know the epochSize in StartMinibatchLoop()
    if (Randomize() && m_randomizeRange != randomizeAuto)
        m_randomordering.resize(m_randomizeRange, m_randomizeRange);

    // if the value they passed in as udim is not big enough, add something on
    if (udim < m_labelIdMax)
        udim = m_labelIdMax;
    m_labelDim = (LabelIdType)udim;

    mOneLinePerFile = false;
    mOneLinePerFile = readerConfig("onelineperfile", "false");
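    // when onelineperfile is true, each input file contributes exactly one observation, so the minibatch size
    // is driven by the number of parallel sequences (see SetNumParallelSequences and StartMinibatchLoop below)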
}

// InitCache - Initialize the caching reader if cache files exist, otherwise the writer
// readerConfig - reader configuration
template <class ElemType>
void UCIFastReader<ElemType>::InitCache(const ConfigParameters& readerConfig)
{
    // check for a writer tag first (lets us know we are caching)
    if (!readerConfig.Exists("writerType"))
        return;

    // first try to open the binary cache
    bool found = false;
    try
    {
        // TODO: need to go down to all levels, maybe search for sectionType
        ConfigArray filesList(',');
        vector<std::wstring> names;
        if (readerConfig.Exists("wfile"))
        {
            filesList.push_back(readerConfig("wfile"));
            if (fexists(readerConfig("wfile")))
                found = true;
        }
        FindConfigNames(readerConfig, "wfile", names);
        for (const auto& name : names)
        {
            ConfigParameters config = readerConfig(name);
            filesList.push_back(config("wfile"));
            if (fexists(config("wfile")))
                found = true;
        }

        // if we have a file already, we are going to read the cached files
        if (found)
        {
            ConfigParameters config;
            readerConfig.CopyTo(config);
            // modify the config so the reader types look correct
            config["readerType"] = config("writerType");
            config["file"] = filesList;
            m_cachingReader = new DataReader<ElemType>(config);
        }
        else
        {
            m_cachingWriter = new DataWriter<ElemType>(readerConfig);
            // now get the section names for map and category types
            std::map<std::wstring, SectionType, nocase_compare> sections;
            m_cachingWriter->GetSections(sections);
            for (const auto& pair : sections)
            {
                if (pair.second == sectionTypeCategoryLabel)
                {
                    m_labelsCategoryName = pair.first;
                }
                else if (pair.second == sectionTypeLabelMapping)
                {
                    m_labelsMapName = pair.first;
                }
            }
        }
    }
    catch (const runtime_error& err)
    {
        fprintf(stderr, "Error attempting to create Binary%s\n%s\n", found ? "Reader" : "Writer", err.what());
        delete m_cachingReader;
        m_cachingReader = NULL;
        delete m_cachingWriter;
        m_cachingWriter = NULL;
    }
    catch (...)
    {
        // if there is any error, just get rid of the object
        fprintf(stderr, "Error attempting to create Binary%s\n", found ? "Reader" : "Writer");
        delete m_cachingReader;
        m_cachingReader = NULL;
        delete m_cachingWriter;
        m_cachingWriter = NULL;
    }
}

// destructor - virtual so it gets called properly
template <class ElemType>
UCIFastReader<ElemType>::~UCIFastReader()
{
    ReleaseMemory();
    delete m_cachingReader;
    delete m_cachingWriter;
}

// ReleaseMemory - release the memory footprint of UCIFastReader
// used when the caching reader is taking over
template <class ElemType>
void UCIFastReader<ElemType>::ReleaseMemory()
{
    if (m_featuresBuffer != NULL)
        delete[] m_featuresBuffer;
    m_featuresBuffer = NULL;
    if (m_labelsBuffer != NULL)
        delete[] m_labelsBuffer;
    m_labelsBuffer = NULL;
    if (m_labelsIdBuffer != NULL)
        delete[] m_labelsIdBuffer;
    m_labelsIdBuffer = NULL;
    m_featureData.clear();
    m_labelIdData.clear();
    m_labelData.clear();
}

// SetupEpoch - Setup the proper position in the file, and other variable settings to start a particular epoch
template <class ElemType>
void UCIFastReader<ElemType>::SetupEpoch()
{
    // if we are starting fresh (epoch zero and no data read), init everything
    // however if we are using the cachingWriter, we need to know the record count, so do that first
    if (m_epoch == 0 && m_totalSamples == 0 && m_cachingWriter != NULL)
    {
        m_readNextSample = m_epochStartSample = m_mbStartSample = 0;
        m_parser.SetFilePosition(0);
    }
    else // otherwise, position the read to start at the right location
    {
        // don't know the total number of samples yet, so count them
        if (m_totalSamples == 0)
        {
            if (m_traceLevel > 0)
                fprintf(stderr, "starting at epoch %lu counting lines to determine record count\n", (unsigned long)m_epoch);
            m_parser.SetParseMode(ParseLineCount);
            m_totalSamples = m_parser.Parse(size_t(-1), NULL, NULL);
            m_parser.SetParseMode(ParseNormal);
            m_parser.SetFilePosition(0);
            m_mbStartSample = 0;
            UpdateDataVariables(0); // update all the variables since we read to the end...
            if (m_traceLevel > 0)
                fprintf(stderr, "\n%lu records found\n", (unsigned long)m_totalSamples);
        }

        // make sure we are in the correct location for mid-dataset epochs
        size_t mbStartSample = m_epoch * m_epochSize;
        size_t fileRecord = m_totalSamples ? mbStartSample % m_totalSamples : 0;
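        // e.g. with epochSize == 5000 and totalSamples == 12000, epoch 3 starts at global sample 15000, which is file record 3000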
        fprintf(stderr, "starting epoch %lu at record count %lu, and file position %lu\n", (unsigned long)m_epoch, (unsigned long)mbStartSample, (unsigned long)fileRecord);
        size_t currentFileRecord = m_mbStartSample % m_totalSamples;

        // reset the next read sample
        m_readNextSample = mbStartSample;
        if (currentFileRecord == fileRecord)
        {
            fprintf(stderr, "already there from last epoch\n");

            // we have a slight dilemma here: if we haven't determined the end of the file yet
            // and the user told us to find how many records are in the file, we can't distinguish "almost done"
            // with a file (a character away) from the middle of the file. So read ahead a record to see if it's there.
            bool endReached = m_endReached;
            if (!endReached)
            {
                if (!m_parser.HasMoreData())
                {
                    endReached = true;
                    UpdateDataVariables(mbStartSample);
                    assert(m_endReached);
                }
            }

            // move the read pointer to the end since we have everything already in memory.
            if (endReached && m_epochStartSample % m_totalSamples == fileRecord
                && m_featureData.size() >= m_epochSize * m_featureCount)
            {
                m_readNextSample = mbStartSample + m_epochSize;
                // write the label file here to make sure we do it somewhere. We know the entire dataset has been read at this point
                WriteLabelFile();
            }
        }
        // not the right position, need to get there
        else
        {
            // if we are already past the desired record, start at the beginning again
            if (currentFileRecord > fileRecord)
            {
                m_parser.SetFilePosition(0);
                currentFileRecord = 0;
            }
            fprintf(stderr, "reading from record %lu to %lu to be positioned properly for epoch\n", (unsigned long)currentFileRecord, (unsigned long)fileRecord);
            m_parser.SetParseMode(ParseLineCount);
            m_parser.Parse(fileRecord - currentFileRecord, NULL, NULL);
            m_parser.SetParseMode(ParseNormal);
            if (!m_labelFileToWrite.empty())
            {
                fprintf(stderr, "WARNING: file %ls NOT written to disk, label file will only be written when starting epochs at the beginning of the dataset\n", m_labelFileToWrite.c_str());
                m_labelFileToWrite.clear();
                RuntimeError("LabelMappingFile not provided in config, must be provided if not starting from epoch Zero (0)");
            }
        }
        m_epochStartSample = m_mbStartSample = mbStartSample;
    }
}
// utility function to round an integer up to a multiple of size
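// e.g. RoundUp(250, 100) == 300, RoundUp(300, 100) == 300, RoundUp(0, 100) == 0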
size_t RoundUp(size_t value, size_t size)
{
    return ((value + size - 1) / size) * size;
}

template <class ElemType>
void UCIFastReader<ElemType>::SetNumParallelSequences(const size_t sz)
{
    mBlgSize = sz;
    if (mOneLinePerFile)
        m_mbSize = mBlgSize;
}

// StartMinibatchLoop - Startup a minibatch loop
// mbSize - [in] size of the minibatch (number of samples, etc.)
// epoch - [in] epoch number for this loop, if > 0 the requestedEpochSamples must be specified (unless epoch zero was completed this run)
// requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset
//   this value must be a multiple of mbSize; if it is not, it will be rounded up to the next multiple of mbSize.
template <class ElemType>
void UCIFastReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
{
    if (mOneLinePerFile)
        mbSize = mBlgSize; // each file has only one observation, so the number of records to read equals the number of files

    // if we aren't currently caching, see if we can use a cache
    if (!m_cachingReader && !m_cachingWriter)
    {
        InitCache(m_readerConfig);
        if (m_cachingReader)
            ReleaseMemory(); // free the memory used by the UCIFastReader
    }

    // if we are reading from the cache, do so now and return
    if (m_cachingReader)
    {
        m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples);
        return;
    }

    if (m_featuresBuffer == NULL || mbSize > m_mbSize)
    {
        // if we are reallocating bigger, release the original
        if (m_featuresBuffer != NULL)
            delete[] m_featuresBuffer;
        m_featuresBuffer = new ElemType[mbSize * m_featureCount];
        memset(m_featuresBuffer, 0, sizeof(ElemType) * mbSize * m_featureCount);
    }
    if (m_labelsBuffer == NULL || mbSize > m_mbSize)
    {
        // if we are reallocating bigger, release the original
        if (m_labelsBuffer != NULL)
            delete[] m_labelsBuffer;
        if (m_labelType == labelCategory)
        {
            m_labelsBuffer = new ElemType[m_labelDim * mbSize];
            memset(m_labelsBuffer, 0, sizeof(ElemType) * m_labelDim * mbSize);
            m_labelsIdBuffer = new LabelIdType[mbSize];
            memset(m_labelsIdBuffer, 0, sizeof(LabelIdType) * mbSize);
        }
        else if (m_labelType != labelNone)
        {
            m_labelsBuffer = new ElemType[mbSize];
            memset(m_labelsBuffer, 0, sizeof(ElemType) * mbSize);
            m_labelsIdBuffer = NULL;
        }
    }
    m_mbSize = mbSize;
    if (requestedEpochSamples == requestDataSize)
    {
        if (!m_endReached)
        {
            m_epochSize = requestDataSize;
        }
    }
    else
    {
        m_epochSize = requestedEpochSamples;
        if (!m_partialMinibatch)
            m_epochSize = RoundUp(requestedEpochSamples, mbSize);
        if (m_epochSize != requestedEpochSamples)
            fprintf(stderr, "epochSize rounded up to %d to fit an integral number of minibatches\n", (int)m_epochSize);
    }

    // set the randomization range for randomizeAuto,
    // or if it is invalid (less than the minibatch size), make it at least the minibatch size
    if (m_randomizeRange != randomizeNone)
    {
        if (m_epochSize != requestDataSize && m_randomizeRange == randomizeAuto)
        {
            m_randomizeRange = m_epochSize;
        }
        m_randomizeRange = max(m_randomizeRange, m_mbSize);
        if (m_randomizeRange != randomizeAuto)
        {
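            // e.g. with mbSize == 25 and epochSize == 1000, randomizeRange == 200 satisfies both checks below,
            // while randomizeRange == 300 would fail because 1000 is not a multiple of 300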
            if ((m_epochSize != requestDataSize && m_epochSize % m_randomizeRange != 0) || (m_randomizeRange % m_mbSize != 0))
                RuntimeError("randomizeRange must be an even multiple of mbSize and an integral factor of epochSize");
            m_randomordering.resize(m_randomizeRange, m_randomizeRange);
        }
    }

    // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set
    size_t epochSize = m_epochSize == requestDataSize ? 1000 : m_epochSize;
    m_epoch = epoch;
    m_mbStartSample = epoch * m_epochSize;

    // allocate room for the data
    m_featureData.reserve(m_featureCount * epochSize);
    if (m_labelType == labelCategory)
        m_labelIdData.reserve(epochSize);
    else if (m_labelType != labelNone)
        m_labelData.reserve(epochSize);

    SetupEpoch();
}

// function to store the LabelType in an ElemType
// required for string labels, which can't be stored in ElemType arrays
template <class ElemType>
void UCIFastReader<ElemType>::StoreLabel(ElemType& labelStore, const LabelType& labelValue)
{
    labelStore = (ElemType)m_mapLabelToId[labelValue];
}

// GetMinibatch - Get the next minibatch (features and labels)
// matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponding matrix,
//            [out] each matrix resized if necessary containing data.
// returns - true if there are more minibatches, false if no more minibatches remain
template <class ElemType>
bool UCIFastReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
    if (m_cachingReader)
    {
        return m_cachingReader->GetMinibatch(matrices);
    }

    // get the features array
    if (matrices.find(m_featuresName) == matrices.end())
        RuntimeError("Features matrix not found in config file, there should be a section '%ls=[...]' in the configuration file.", m_featuresName.c_str());
    Matrix<ElemType>& features = *matrices[m_featuresName];

    // get out if they didn't call StartMinibatchLoop() first
    if (m_mbSize == 0)
        return false;

    // check to see if we have changed epochs; if so, we are done with this one.
    if (m_mbStartSample / m_epochSize != m_epoch)
        return false;

    bool randomize = Randomize();
    bool moreData = EnsureDataAvailable(m_mbStartSample);

    // figure out which sweep of the randomization we are on
    size_t epochSample = m_mbStartSample % m_epochSize; // where the minibatch starts in this epoch
    //size_t samplesExtra = m_totalSamples % m_epochSize; // extra samples at the end of an epoch
    //size_t epochsDS = (m_totalSamples+m_epochSize-1)/m_epochSize; // how many epochs per dataset
    size_t randomizeSet = randomize ? RandomizeSweep(m_mbStartSample) : 0;
    const auto& tmap = m_randomordering(randomizeSet);
    size_t epochEnd = m_epochSize;
    size_t recordStart = m_totalSamples ? m_mbStartSample % m_totalSamples : m_mbStartSample;

    // actual size is either what was requested, or the total number of samples read so far
    size_t actualmbsize = min(m_totalSamples, m_mbSize); // it may still return less if at the end of a sweep

    // check for an odd-sized last minibatch
    if (epochSample + actualmbsize > epochEnd)
    {
        actualmbsize = epochEnd - epochSample;
    }

    // hit the end of the dataset; we should only get here in "one-pass mode"
    if (!moreData)
    {
        // make sure we take into account hitting the end of the dataset (not wrapping around)
        actualmbsize = min(m_totalSamples - recordStart, actualmbsize);
    }

    if (m_labelType == labelCategory)
    {
        memset(m_labelsBuffer, 0, sizeof(ElemType) * m_labelDim * actualmbsize);
        memset(m_labelsIdBuffer, 0, sizeof(LabelIdType) * actualmbsize);
    }
    else if (m_labelType != labelNone)
    {
        memset(m_labelsBuffer, 0, sizeof(ElemType) * 1 * actualmbsize);
    }

    if (actualmbsize > 0)
    {
        // loop through and copy data to the matrix
        int j = 0; // column index into the minibatch

        // determine the randomization base index
        size_t randBase = 0; // (keep compiler happy)
        if (randomize)
            randBase = epochSample - epochSample % m_randomizeRange;
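        // randBase is the start of the current randomization window within the epoch; the precomputed ordering
        // tmap permutes offsets within that window, so samples are only shuffled inside their own window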
        // loop through all the samples
        for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample)
        {
            // pick the right sample, with randomization if desired
            size_t jRand = randomize ? (randBase + tmap[jSample % m_randomizeRange]) : jSample;
            jRand %= m_epochSize;

            // vector of feature data goes into a matrix column
            memcpy(&m_featuresBuffer[j * m_featureCount], &m_featureData[jRand * m_featureCount], sizeof(ElemType) * m_featureCount);
            if (m_labelType == labelCategory)
            {
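                // expand the category label into a one-hot column: element m_labelIdData[jRand] of column j is set to 1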
                m_labelsBuffer[j * m_labelDim + m_labelIdData[jRand]] = (ElemType)1;
                m_labelsIdBuffer[j] = m_labelIdData[jRand];
            }
            else if (m_labelType != labelNone)
            {
                if (m_labelType == labelRegression)
                {
                    m_labelsBuffer[j] = (ElemType)atof(m_labelData[jRand].c_str());
                }
                else
                {
                    StoreLabel(m_labelsBuffer[j], m_labelData[jRand]);
                }
            }
        }
    }

    // create the respective MBLayout
    // Every sample is returned as a sequence of 1 frame.
    m_pMBLayout->Init(actualmbsize, 1, false /*means it is not sequential*/);

    // if we are writing out to the caching writer, do it now
    if (m_cachingWriter)
    {
        map<std::wstring, void*, nocase_compare> writeBuffer;
        writeBuffer[m_featuresName] = m_featuresBuffer;
        if (m_labelType == labelCategory)
        {
            writeBuffer[m_labelsName] = m_labelsIdBuffer;
            if (!m_labelsCategoryName.empty())
                writeBuffer[m_labelsCategoryName] = m_labelsBuffer;
        }
        else if (m_labelType != labelNone)
        {
            writeBuffer[m_labelsName] = m_labelsBuffer;
        }

        // write out the data; on a second pass compute statistics as needed
        bool moreToWrite = m_cachingWriter->SaveData(m_mbStartSample, writeBuffer, actualmbsize, m_totalSamples, 0);

        // done writing
        if (!moreToWrite)
        {
            // write out the mapping table as necessary
            if (m_labelType == labelCategory && !m_labelsMapName.empty())
            {
                m_cachingWriter->SaveMapping(m_labelsMapName, m_mapIdToLabel);
            }
            WriteLabelFile();

            // now close the cache writer
            delete m_cachingWriter;
            m_cachingWriter = NULL;
        }
    }

    // advance to the next minibatch
    m_mbStartSample += actualmbsize;

    // if they don't want partial minibatches, skip the data transfer and return
    if ((actualmbsize < m_mbSize && !m_partialMinibatch)
        || actualmbsize == 0) // no records found (end of minibatch)
    {
        return false;
    }

    // now transfer to the GPU as needed
    features.SetValue(m_featureCount, actualmbsize, features.GetDeviceId(), m_featuresBuffer, matrixFlagNormal);
    if (m_labelType == labelCategory)
    {
        auto labelEntry = matrices.find(m_labelsName);
        if (labelEntry != matrices.end())
        {
            Matrix<ElemType>* labels = labelEntry->second;
            if (labels != nullptr)
                labels->SetValue(m_labelDim, actualmbsize, labels->GetDeviceId(), m_labelsBuffer, matrixFlagNormal);
        }
    }
    else if (m_labelType != labelNone)
    {
        auto labelEntry = matrices.find(m_labelsName);
        if (labelEntry != matrices.end())
        {
            Matrix<ElemType>* labels = labelEntry->second;
            if (labels != nullptr)
                labels->SetValue(1, actualmbsize, labels->GetDeviceId(), m_labelsBuffer, matrixFlagNormal);
        }
    }

    // we read some records, so process them
    return true;
}

// GetLabelMapping - Gets the label mapping from integer index to label type
// returns - a map from numeric datatype to native label type
template <class ElemType>
const std::map<typename IDataReader<ElemType>::LabelIdType, typename IDataReader<ElemType>::LabelType>& UCIFastReader<ElemType>::GetLabelMapping(const std::wstring& sectionName)
{
    if (m_cachingReader)
    {
        return m_cachingReader->GetLabelMapping(sectionName);
    }
    return m_mapIdToLabel;
}

// SetLabelMapping - Sets the label mapping from integer index to label
// labelMapping - mapping table from label values to IDs (must be 0-n)
// note: for tasks with labels, the mapping table must be the same between a training run and a testing run
template <class ElemType>
void UCIFastReader<ElemType>::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map<typename IDataReader<ElemType>::LabelIdType, LabelType>& labelMapping)
{
    if (m_cachingReader)
    {
        RuntimeError("Cannot set mapping table when the caching reader is being used");
    }
    m_mapIdToLabel = labelMapping;
    m_mapLabelToId.clear();
    for (std::pair<unsigned, LabelType> var : labelMapping)
    {
        m_mapLabelToId[var.second] = var.first;
    }
}

// GetData - Gets metadata from the specified section (into CPU memory)
// sectionName - section name to retrieve data from
// numRecords - number of records to read
// data - pointer to data buffer; if NULL, dataBufferSize will be set to the size of the buffer required to accommodate the request
// dataBufferSize - [in] size of the databuffer in bytes
//                  [out] size of buffer filled with data
// recordStart - record to start reading from, defaults to zero (start of data)
// returns: true if data remains to be read, false if the end of data was reached
template <class ElemType>
bool UCIFastReader<ElemType>::GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart)
{
    if (m_cachingReader)
    {
        return m_cachingReader->GetData(sectionName, numRecords, data, dataBufferSize, recordStart);
    }
    RuntimeError("GetData not supported in UCIFastReader");
}

template <class ElemType>
bool UCIFastReader<ElemType>::DataEnd(EndDataType endDataType)
{
    if (m_cachingReader)
    {
        return m_cachingReader->DataEnd(endDataType);
    }
    bool ret = false;
    switch (endDataType)
    {
    case endDataNull:
        assert(false);
        break;
    case endDataEpoch:
        ret = (m_mbStartSample / m_epochSize != m_epoch);
        break;
    case endDataSet:
        ret = !EnsureDataAvailable(m_mbStartSample, true); // EnsureDataAvailable returns true while more data remains, so negate to report end of dataset
        break;
    case endDataSentence: // for the fast reader each minibatch is considered a "sentence", so always true
        ret = true;
        break;
    }
    return ret;
}

// instantiate all the combinations we expect to be used
template class UCIFastReader<double>;
template class UCIFastReader<float>;
}}}