// CNTK/DataReader/UCIFastReader/UCIFastReader.cpp
// (1008 lines, 37 KiB, C++)
//
// <copyright file="UCIFastReader.cpp" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// UCIFastReader.cpp : Defines the exported functions for the DLL application.
//
#include "stdafx.h"
#define DATAREADER_EXPORTS // creating the exports here
#include "DataReader.h"
#include "UCIFastReader.h"
#ifdef LEAKDETECT
#include <vld.h> // leak detection
#endif
#include "fileutil.h" // for fexists()
// all definitions below live in the Microsoft::MSR::CNTK namespace
namespace Microsoft { namespace MSR { namespace CNTK {
// RandomizeSweep - Determine which randomization window a sample position falls into
// mbStartSample - sample position to classify
// returns - mbStartSample divided (integer division) by the randomization range
template<class ElemType>
size_t UCIFastReader<ElemType>::RandomizeSweep(size_t mbStartSample)
{
    const size_t sweepIndex = mbStartSample / m_randomizeRange;
    return sweepIndex;
}
// ReadRecord - Read a single record (not used by this reader)
// readSample - sample to read in global sample space (ignored)
// returns - always false; this implementation never reads anything
template<class ElemType>
bool UCIFastReader<ElemType>::ReadRecord(size_t /*readSample*/)
{
    return false; // not used
}
// RecordsToRead - Determine number of records to read to populate record buffers
// mbStartSample - the starting sample from which to read
// tail - we are checking for possible remainder records to read (default false);
//        when set, a computed count of zero is returned as zero instead of being bumped up
// returns - the number of records that should be read next
template<class ElemType>
size_t UCIFastReader<ElemType>::RecordsToRead(size_t mbStartSample, bool tail)
{
    assert(mbStartSample >= m_epochStartSample);

    const bool shuffling = Randomize();

    // where we are inside the epoch and how much of the epoch is left
    const size_t posInEpoch = mbStartSample % m_epochSize;
    const size_t leftInEpoch = m_epochSize - posInEpoch;

    // take either a full minibatch or whatever remains of the epoch
    size_t count = min(leftInEpoch, m_mbSize);
    if (!tail && count == 0)
        count = m_mbSize;

    if (!shuffling)
        return count;

    // on the first read, or when the read moves into another randomization range,
    // read up to the next randomization-range boundary
    if (RandomizeSweep(mbStartSample) != m_randomordering.CurrentSeed())
    {
        count = RoundUp(posInEpoch, m_randomizeRange) - posInEpoch;
        if (!tail && count == 0)
            count = m_randomizeRange;
    }
    return count;
}
// EnsureDataAvailable - Read enough lines so we can request a minibatch starting as requested
// mbStartSample - the starting sample we are ensuring is available
// endOfDataCheck - check if we are at the end of the dataset (no wraparound)
// returns - true if we have more to read, false if we hit the end of the dataset
template<class ElemType>
bool UCIFastReader<ElemType>::EnsureDataAvailable(size_t mbStartSample, bool endOfDataCheck)
{
    assert(mbStartSample >= m_epochStartSample);
    // NOTE(review): the result of Randomize() is discarded; the call is kept only to
    // preserve the original statement sequence
    Randomize();

    // position of the requested sample inside the current epoch
    size_t epochSample = mbStartSample;
    epochSample %= m_epochSize;

    bool moreToRead = true;
    size_t numberToRead = RecordsToRead(mbStartSample);

    // check to see if we have the proper records read already
    if (m_readNextSample >= mbStartSample+numberToRead && mbStartSample >= m_epochStartSample)
        return true;

    // truncate the present arrays to the location we are reading from, parser appends on these arrays
    if (m_featureData.size() > epochSample*m_featureCount) // should be this size, if not, truncate
        m_featureData.resize(epochSample*m_featureCount);
    if (m_labelType != labelNone && m_labelData.size() > epochSample)
    {
        // make sure the labelId array is also the correct size
        if (m_labelType == labelCategory)
            m_labelIdData.resize(epochSample);
        m_labelData.resize(epochSample);
    }

    // counters are size_t so they compare cleanly against numberToRead
    size_t recordsRead = 0;
    do
    {
        const size_t numRead = static_cast<size_t>(m_parser.Parse(numberToRead-recordsRead, &m_featureData, &m_labelData));
        recordsRead += numRead;
        if (!m_endReached)
            m_totalSamples += numRead; // total number of records in the dataset

        // we should only get fewer records than requested when we hit the end of the dataset
        if (recordsRead < numberToRead)
        {
            // update dataset variables
            size_t additionalToRead = UpdateDataVariables(mbStartSample+recordsRead);
            m_parser.SetFilePosition(0); // make another pass of the dataset

            // if doing an end of data check, and we are at the end
            // or a partial minibatch was found exit now
            if ((endOfDataCheck && recordsRead == 0) ||
                (m_partialMinibatch && recordsRead > 0))
            {
                moreToRead = false;
                break;
            }
            // get the additional number to read
            numberToRead = recordsRead + additionalToRead;
        }
    }
    while (recordsRead < numberToRead);
    m_readNextSample += recordsRead;

    // for category labels, we need to build up a list of IDs and a mapping table
    if (m_labelType == labelCategory)
    {
        // loop through all the newly read records
        for (size_t numberRead = 0; numberRead < recordsRead; numberRead++)
        {
            LabelType& label = m_labelData[epochSample+numberRead];
            // check to see if we have seen this label before
            auto value = m_mapLabelToId.find(label);
            LabelIdType labelId;
            if (value == m_mapLabelToId.end())
            {
                // unknown labels are only acceptable while a label mapping file is being created
                if (m_labelFileToWrite.empty())
                    RuntimeError("label found in data not specified in label mapping file: %s", label.c_str());
                // new label so add it to the mapping tables
                m_mapLabelToId[label] = m_labelIdMax;
                m_mapIdToLabel[m_labelIdMax] = label;
                labelId = m_labelIdMax++;
                // if our label dimension is lower than the current labelId then increase it
                if (m_labelDim < m_labelIdMax)
                    m_labelDim = m_labelIdMax;
            }
            else
            {
                labelId = value->second;
            }
            // now add the label id to the label data array
            m_labelIdData.push_back(labelId);
        }
    }
    // there is more to read (always is, unless we want partial minibatches)
    return moreToRead;
}
// UpdateDataVariables - Update variables that depend on the dataset being completely read
// mbStartSample - sample position at which reading would continue
// returns - number of additional records to read (via RecordsToRead), or 0 when
//           mbStartSample is at or beyond the end of the current epoch
template<class ElemType>
size_t UCIFastReader<ElemType>::UpdateDataVariables(size_t mbStartSample)
{
    // first time the end of the file is seen: settle everything that depends on the dataset size
    if (!m_endReached)
    {
        assert(m_totalSamples*m_featureCount >= m_featureData.size());

        // the caller asked us to derive the epoch size from the dataset size
        if (m_epochSize == requestDataSize)
        {
            if (m_partialMinibatch)
            {
                m_epochSize = m_totalSamples;
            }
            else
            {
                // round up to the randomization range when one was given explicitly, else to the minibatch size
                const bool explicitRange = (m_randomizeRange != randomizeAuto) && (m_randomizeRange != randomizeNone);
                const size_t granularity = explicitRange ? m_randomizeRange : m_mbSize;
                m_epochSize = RoundUp(m_totalSamples, granularity);
            }
        }

        // keep the randomization range within the epoch
        if (m_randomizeRange > m_epochSize)
        {
            m_randomizeRange = m_epochSize;
            m_randomordering.resize(m_randomizeRange, m_randomizeRange);
        }

        // the end of the file was hit, so the label file can be written
        WriteLabelFile();
        m_endReached = true;
    }

    // m_labelIdMax is updated in the processing loop after a read, so catch up the label dimension here
    if (m_labelType == labelCategory && m_labelDim < m_labelIdMax)
        m_labelDim = m_labelIdMax;

    // anything left to read after a potential epochSize change?
    const size_t epochEnd = m_epochStartSample + m_epochSize;
    if (mbStartSample < epochEnd)
        return RecordsToRead(mbStartSample);
    return 0;
}
// WriteLabelFile - Write the label mapping file if one was requested and has not been written yet
// Writes one label per line, indexed by label id starting at zero, then clears
// m_labelFileToWrite so the file is only written once. When no labels have been collected
// yet (and no caching writer is active) only a warning is printed.
template<class ElemType>
void UCIFastReader<ElemType>::WriteLabelFile()
{
    // nothing to do unless a label file was requested
    if (!m_labelFileToWrite.empty())
    {
        if (m_mapIdToLabel.size() > 0)
        {
            File labelFile(m_labelFileToWrite, fileOptionsWrite | fileOptionsText);
            // index by id (not by map iteration) so line number i always holds the label for id i
            for (size_t i = 0; i < m_mapIdToLabel.size(); ++i)
            {
                labelFile << m_mapIdToLabel[(LabelIdType)i] << '\n';
            }
            fprintf(stderr, "label file %ls written to disk\n", m_labelFileToWrite.c_str());
            m_labelFileToWrite.clear();
        }
        else if (!m_cachingWriter)
        {
            fprintf(stderr, "WARNING: file %ls NOT written to disk yet, will be written the first time the end of the entire dataset is found.\n", m_labelFileToWrite.c_str());
        }
    }
}
// Destroy - delete this reader instance
// NOTE: the object is gone once this returns; it must not be touched afterwards
template<class ElemType>
void UCIFastReader<ElemType>::Destroy()
{
    delete this; // self-destruction: runs the destructor and frees the object
}
// Init - Reader Initialize for multiple data sets
// config - [in] configuration parameters for the datareader
// Sample format below:
//# Parameter values for the reader
//reader=[
//  # reader to use
//  readerType=UCIFastReader
//  miniBatchMode=Partial
//  randomize=None
//  features=[
//    dim=784
//    start=1
//    file=c:\speech\mnist\mnist_test.txt
//  ]
//  labels=[
//    dim=1
//    start=0
//    file=c:\speech\mnist\mnist_test.txt
//    labelMappingFile=c:\speech\mnist\labels.txt
//    labelDim=10
//    labelType=Category
//  ]
//]
// Raises a RuntimeError when the features section is missing, when features and labels
// name different files, or when the label mapping file is missing and map creation is not allowed.
template<class ElemType>
void UCIFastReader<ElemType>::Init(const ConfigParameters& readerConfig)
{
    // See if the user wants caching
    m_cachingReader = NULL;
    m_cachingWriter = NULL;

    // initialize the cache
    InitCache(readerConfig);
    readerConfig.CopyTo(m_readerConfig);

    // if we have a cache, no need to parse the text files...
    if (m_cachingReader)
        return;

    // use the first features/labels section names found in the configuration
    std::vector<std::wstring> features;
    std::vector<std::wstring> labels;
    GetFileConfigNames(readerConfig, features, labels);
    if (features.size() > 0)
    {
        m_featuresName = features[0];
    }
    if (labels.size() > 0)
    {
        m_labelsName = labels[0];
    }
    ConfigParameters configFeatures = readerConfig(m_featuresName,"");
    ConfigParameters configLabels = readerConfig(m_labelsName,"");
    if (configFeatures.size() == 0)
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");
    if (configLabels.size() == 0)
        fprintf(stderr, "Warning: labels are not specified.\n");
    else if (configFeatures("file","") != configLabels("file",""))
        RuntimeError("features and label files must be the same file, use separate readers to define single use files");

    size_t vdim = configFeatures("dim");
    size_t udim = configLabels("labelDim","0");

    // initialize all the variables
    m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = 0;
    m_labelIdMax = m_labelDim = 0;
    m_partialMinibatch = m_endReached = false;
    m_labelType = labelCategory;
    m_featureCount = vdim;
    m_readNextSample = 0;
    m_traceLevel = readerConfig("traceLevel","0");
    m_parser.SetTraceLevel(m_traceLevel);

    // we better have at least one feature...
    assert (m_featureCount != 0);

    // randomize=None|Auto|<number>; defaults to Auto when not given
    if (readerConfig.Exists("randomize"))
    {
        string randomizeString = readerConfig("randomize");
        if (randomizeString == "None")
        {
            m_randomizeRange = randomizeNone;
        }
        else if (randomizeString == "Auto")
        {
            m_randomizeRange = randomizeAuto;
        }
        else
        {
            m_randomizeRange = readerConfig("randomize");
        }
    }
    else
    {
        m_randomizeRange = randomizeAuto;
    }

    // determine if partial minibatches are desired
    std::string minibatchMode(readerConfig("minibatchMode","Partial"));
    m_partialMinibatch = !_stricmp(minibatchMode.c_str(),"Partial");

    // get start and dimensions for labels and features
    size_t startLabels = configLabels("start", "0");
    size_t dimLabels = configLabels("dim", "0");
    size_t startFeatures = configFeatures("start", "0");
    size_t dimFeatures = configFeatures("dim", "0");

    // determine label type desired
    std::string labelType;
    if (configLabels.size() == 0)
        labelType = "None";
    else
        labelType = configLabels("labelType","Category");

    //convert to lower case for case insensitive comparison
    msra::strfun::tolower_ascii(labelType);
    // NOTE: an unrecognized labelType leaves m_labelType at labelCategory (set above)
    if (labelType == "category")
    {
        m_labelType = labelCategory;
    }
    else if (labelType == "regression")
    {
        m_labelType = labelRegression;
    }
    else if (labelType == "none")
    {
        m_labelType = labelNone;
        dimLabels = 0; // override for no labels
    }

    std::wstring file = configFeatures("file");
    if (m_traceLevel > 0)
        fprintf(stderr, "reading uci file %ls\n", file.c_str());
    m_parser.ParseInit(file.c_str(), startFeatures, dimFeatures, startLabels, dimLabels);

    // if we have labels, we need a label Mapping file, it will be a file with one label per line
    if (m_labelType != labelNone)
    {
        ConfigArray arrayLabels;
        std::wstring labelPath = configLabels("labelMappingFile");
        if (fexists(labelPath))
        {
            // line i of the mapping file defines the label for id i
            arrayLabels.LoadConfigFile(labelPath);
            for (size_t i = 0; i < arrayLabels.size(); ++i)
            {
                LabelType label = arrayLabels[i];
                const LabelIdType labelId = (LabelIdType)i;
                m_mapIdToLabel[labelId] = label;
                m_mapLabelToId[label] = labelId;
            }
            m_labelIdMax = (LabelIdType)arrayLabels.size();
        }
        else
        {
            // only do label creation if we have the allow flag, should only be done as a separate command
            // to ensure that the label file will exist for verification step in training
            bool allowLabelCreation = readerConfig("allowMapCreation","false");
            if (allowLabelCreation)
                m_labelFileToWrite = labelPath;
            else
                RuntimeError("label mapping file %ls not found, can be created with a 'createLabelMap' command/action\n", labelPath.c_str());
        }
    }

    // if we know the size of the randomization now, resize, otherwise wait until we know the epochSize in StartMinibatchLoop()
    if (Randomize() && m_randomizeRange != randomizeAuto)
        m_randomordering.resize(m_randomizeRange, m_randomizeRange);

    // if the value they passed in as udim is not big enough, add something on
    if (udim < m_labelIdMax)
        udim = m_labelIdMax;
    m_labelDim = (LabelIdType)udim;

    mOneLinePerFile = readerConfig("onelineperfile", "false");
}
// InitCache - Initialize the caching reader if cache files exist, otherwise the caching writer
// readerConfig - reader configuration; caching is only considered when it contains "writerType"
// On any exception both m_cachingReader and m_cachingWriter are deleted and reset to NULL,
// a message is printed to stderr, and the exception is not propagated.
template<class ElemType>
void UCIFastReader<ElemType>::InitCache(const ConfigParameters& readerConfig)
{
    // check for a writer tag first (lets us know we are caching)
    if (!readerConfig.Exists("writerType"))
        return;

    // first try to open the binary cache
    bool found = false;
    try
    {
        // TODO: need to go down to all levels, maybe search for sectionType
        ConfigArray filesList(',');
        vector<std::wstring> names;
        if (readerConfig.Exists("wfile"))
        {
            filesList.push_back(readerConfig("wfile"));
            if (fexists(readerConfig("wfile")))
                found = true;
        }
        // collect the "wfile" entries of the sub-sections as well
        FindConfigNames(readerConfig, "wfile", names);
        for (const auto & name : names)
        {
            ConfigParameters config = readerConfig(name);
            filesList.push_back(config("wfile"));
            if (fexists(config("wfile")))
                found = true;
        }

        // if we have a file already, we are going to read the cached files
        if (found)
        {
            ConfigParameters config;
            readerConfig.CopyTo(config);
            // modify the config so the reader types look correct
            config["readerType"] = config("writerType");
            config["file"] = filesList;
            m_cachingReader = new DataReader<ElemType>(config);
        }
        else
        {
            m_cachingWriter = new DataWriter<ElemType>(readerConfig);

            // now get the section names for map and category types
            std::map<std::wstring, SectionType, nocase_compare> sections;
            m_cachingWriter->GetSections(sections);
            for (const auto & pair : sections)
            {
                if (pair.second == sectionTypeCategoryLabel)
                {
                    m_labelsCategoryName = pair.first;
                }
                else if (pair.second == sectionTypeLabelMapping)
                {
                    m_labelsMapName = pair.first;
                }
            }
        }
    }
    catch (const runtime_error& err) // by const reference: no copy, no slicing
    {
        fprintf(stderr,"Error attemping to create Binary%s\n%s\n",found?"Reader":"Writer",err.what());
        delete m_cachingReader;
        m_cachingReader = NULL;
        delete m_cachingWriter;
        m_cachingWriter = NULL;
    }
    catch (...)
    {
        // if there is any error, just get rid of the object
        fprintf(stderr,"Error attemping to create Binary%s\n",found?"Reader":"Writer");
        delete m_cachingReader;
        m_cachingReader = NULL;
        delete m_cachingWriter;
        m_cachingWriter = NULL;
    }
}
// destructor - frees the reader's own buffers first, then the caching reader and writer
template<class ElemType>
UCIFastReader<ElemType>::~UCIFastReader()
{
    // buffers and data vectors owned by this reader
    ReleaseMemory();

    // caching helpers (deleting a null pointer is a no-op)
    delete m_cachingReader;
    delete m_cachingWriter;
}
// ReleaseMemory - release the memory footprint of UCIFastReader
// used when the caching reader is taking over
// Frees the three minibatch buffers (resetting the pointers to NULL) and clears the data vectors.
template<class ElemType>
void UCIFastReader<ElemType>::ReleaseMemory()
{
    // delete[] on a null pointer is a no-op, so no explicit null checks are needed
    delete[] m_featuresBuffer;
    m_featuresBuffer = NULL;

    delete[] m_labelsBuffer;
    m_labelsBuffer = NULL;

    delete[] m_labelsIdBuffer;
    m_labelsIdBuffer = NULL;

    m_featureData.clear();
    m_labelIdData.clear();
    m_labelData.clear();
}
// SetupEpoch - Setup the proper position in the file, and other variable settings to start a particular epoch
// Uses m_epoch and m_epochSize to compute the first sample of the epoch and positions the
// parser at the corresponding record of the file, counting the records first when the
// dataset size is not yet known.
template<class ElemType>
void UCIFastReader<ElemType>::SetupEpoch()
{
    // if we are starting fresh (epoch zero and no data read), init everything
    // however if we are using cachingWriter, we need to know record count, so do that first
    // (i.e. the fast path is only taken when there is NO caching writer)
    if (m_epoch == 0 && m_totalSamples == 0 && m_cachingWriter == NULL)
    {
        m_readNextSample = m_epochStartSample = m_mbStartSample = 0;
        m_parser.SetFilePosition(0);
    }
    else // otherwise, position the read to start at the right location
    {
        // don't know the total number of samples yet, so count them
        if (m_totalSamples == 0)
        {
            if (m_traceLevel > 0)
                fprintf(stderr, "starting at epoch %lu counting lines to determine record count\n", (unsigned long)m_epoch);
            m_parser.SetParseMode(ParseLineCount);
            m_totalSamples = m_parser.Parse(size_t(-1), NULL, NULL);
            m_parser.SetParseMode(ParseNormal);
            m_parser.SetFilePosition(0);
            m_mbStartSample = 0;
            UpdateDataVariables(0); // update all the variables since we read to the end...
            if (m_traceLevel > 0)
                fprintf(stderr, "\n %lu records found\n", (unsigned long)m_totalSamples);
        }

        // make sure we are in the correct location for mid-dataset epochs
        // (every modulo by m_totalSamples is guarded: an empty dataset maps to record 0)
        size_t mbStartSample = m_epoch * m_epochSize;
        size_t fileRecord = m_totalSamples?mbStartSample % m_totalSamples:0;
        fprintf(stderr, "starting epoch %lu at record count %lu, and file position %lu\n", (unsigned long)m_epoch, (unsigned long)mbStartSample, (unsigned long)fileRecord);
        size_t currentFileRecord = m_totalSamples?m_mbStartSample % m_totalSamples:0;

        // reset the next read sample
        m_readNextSample = mbStartSample;
        if (currentFileRecord == fileRecord)
        {
            fprintf(stderr, "already there from last epoch\n");

            // we have a slight dilemma here, if we haven't determined the end of the file yet
            // and the user told us to find how many records are in the file, we can't distinguish "almost done"
            // with a file (a character away) and the middle of the file. So read ahead a record to see if it's there.
            bool endReached = m_endReached;
            if (!endReached)
            {
                if (!m_parser.HasMoreData())
                {
                    endReached = true;
                    UpdateDataVariables(mbStartSample);
                    assert(m_endReached);
                }
            }

            // move the read pointer to the end since we have everything already in memory.
            const size_t epochStartRecord = m_totalSamples?m_epochStartSample % m_totalSamples:0;
            if (endReached && epochStartRecord == fileRecord
                && m_featureData.size() >= m_epochSize*m_featureCount)
            {
                m_readNextSample = mbStartSample + m_epochSize;
                // write the label file here to make sure we do it somewhere. We know the entire dataset has been read at this point
                WriteLabelFile();
            }
        }
        // not the right position, need to get there
        else
        {
            // if we are already past the desired record, start at the beginning again
            if (currentFileRecord > fileRecord)
            {
                m_parser.SetFilePosition(0);
                currentFileRecord = 0;
            }
            fprintf(stderr, "reading from record %lu to %lu to be positioned properly for epoch\n", (unsigned long)currentFileRecord, (unsigned long)fileRecord);
            // skip forward by counting lines only
            m_parser.SetParseMode(ParseLineCount);
            m_parser.Parse(fileRecord-currentFileRecord, NULL, NULL);
            m_parser.SetParseMode(ParseNormal);
            if (!m_labelFileToWrite.empty())
            {
                fprintf(stderr, "WARNING: file %ls NOT written to disk, label file will only be written when starting epochs at the beginning of the dataset\n", m_labelFileToWrite.c_str());
                m_labelFileToWrite.clear();
                RuntimeError("LabelMappingFile not provided in config, must be provided if not starting from epoch Zero (0)");
            }
        }
        m_epochStartSample = m_mbStartSample = mbStartSample;
    }
}
// RoundUp - utility function to round an integer up to a multiple of size
// value - the value to round
// size - the granularity to round to; a size of zero leaves value unchanged
// returns - the smallest multiple of size that is greater than or equal to value
size_t RoundUp(size_t value, size_t size)
{
    if (size == 0)
        return value; // nothing to align to; also avoids a division by zero

    const size_t remainder = value % size;
    // add only the shortfall: avoids the intermediate overflow of (value + size - 1)
    return remainder == 0 ? value : value + (size - remainder);
}
// SetNumParallelSequences - record the number of parallel sequences
// sz - the value stored in mBlgSize; in one-line-per-file mode it also becomes the minibatch size
template<class ElemType>
void UCIFastReader<ElemType>::SetNumParallelSequences(const size_t sz)
{
    mBlgSize = sz;
    if (mOneLinePerFile)
        m_mbSize = mBlgSize;
}
// StartMinibatchLoop - Startup a minibatch loop
// mbSize - [in] size of the minibatch (number of Samples, etc.)
// epoch - [in] epoch number for this loop, if > 0 the requestedEpochSamples must be specified (unless epoch zero was completed this run)
// requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset
//   this value must be a multiple of mbSize, if it is not, it will be rounded up to one.
// Raises a RuntimeError when an explicit randomization range does not fit mbSize/epochSize.
template<class ElemType>
void UCIFastReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
{
    if (mOneLinePerFile)
        mbSize = mBlgSize; /// each file has only one observation, therefore the number of data to read is the number of files

    // if we aren't currently caching, see if we can use a cache
    if (!m_cachingReader && !m_cachingWriter)
    {
        InitCache(m_readerConfig);
        if (m_cachingReader)
            ReleaseMemory(); // free the memory used by the UCIFastReader
    }

    // if we are reading from the cache, do so now and return
    if (m_cachingReader)
    {
        m_cachingReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples);
        return;
    }

    // (re)allocate the feature buffer when missing or too small
    if (m_featuresBuffer==NULL || mbSize > m_mbSize)
    {
        // release the original; null the pointer so a throwing 'new' cannot leave it dangling
        delete[] m_featuresBuffer;
        m_featuresBuffer = NULL;
        m_featuresBuffer = new ElemType[mbSize*m_featureCount];
        memset(m_featuresBuffer,0,sizeof(ElemType)*mbSize*m_featureCount);
    }

    // (re)allocate the label buffers when missing or too small
    if (m_labelsBuffer == NULL || mbSize > m_mbSize)
    {
        // release BOTH label buffers: the id buffer is reallocated below for category
        // labels and would otherwise leak every time the buffers grow
        delete[] m_labelsBuffer;
        m_labelsBuffer = NULL;
        delete[] m_labelsIdBuffer;
        m_labelsIdBuffer = NULL;

        if (m_labelType == labelCategory)
        {
            m_labelsBuffer = new ElemType[m_labelDim*mbSize];
            memset(m_labelsBuffer,0,sizeof(ElemType)*m_labelDim*mbSize);
            m_labelsIdBuffer = new LabelIdType[mbSize];
            memset(m_labelsIdBuffer,0,sizeof(LabelIdType)*mbSize);
        }
        else if (m_labelType != labelNone)
        {
            m_labelsBuffer = new ElemType[mbSize];
            memset(m_labelsBuffer,0,sizeof(ElemType)*mbSize);
        }
    }
    m_mbSize = mbSize;

    if (requestedEpochSamples == requestDataSize)
    {
        // keep an already determined epoch size once the end of the dataset has been seen
        if (!m_endReached)
        {
            m_epochSize = requestDataSize;
        }
    }
    else
    {
        m_epochSize = requestedEpochSamples;
        if (!m_partialMinibatch)
            m_epochSize = RoundUp(requestedEpochSamples, mbSize);
        if (m_epochSize != requestedEpochSamples)
            fprintf(stderr, "epochSize rounded up to %d to fit an integral number of minibatches\n", (int)m_epochSize);
    }

    // set the randomization range for randomizationAuto
    // or if it's invalid less than the minibatch size, we need to make it at least minibatch size
    if (m_randomizeRange != randomizeNone)
    {
        if (m_epochSize != requestDataSize && m_randomizeRange == randomizeAuto)
        {
            m_randomizeRange = m_epochSize;
        }
        m_randomizeRange = max(m_randomizeRange, m_mbSize);
        if (m_randomizeRange != randomizeAuto)
        {
            if ((m_epochSize != requestDataSize && m_epochSize % m_randomizeRange != 0) || (m_randomizeRange % m_mbSize != 0))
                RuntimeError("randomizeRange must be an even multiple of mbSize and an integral factor of epochSize");
            m_randomordering.resize(m_randomizeRange, m_randomizeRange);
        }
    }

    // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set
    size_t epochSize = m_epochSize == requestDataSize?1000:m_epochSize;
    m_epoch = epoch;
    m_mbStartSample = epoch*m_epochSize;

    // allocate room for the data
    m_featureData.reserve(m_featureCount*epochSize);
    if (m_labelType == labelCategory)
        m_labelIdData.reserve(epochSize);
    else if (m_labelType != labelNone)
        m_labelData.reserve(epochSize);

    SetupEpoch();
}
// StoreLabel - store a label in an ElemType slot by way of its numeric id
// required for string labels, which can't be stored in ElemType arrays
// labelStore - [out] receives the id of the label converted to ElemType
// labelValue - the label to look up in m_mapLabelToId
template<class ElemType>
void UCIFastReader<ElemType>::StoreLabel(ElemType& labelStore, const LabelType& labelValue)
{
    // NOTE: operator[] adds an entry to the map when the label is not present yet
    const LabelIdType mappedId = m_mapLabelToId[labelValue];
    labelStore = (ElemType)mappedId;
}
// GetMinibatch - Get the next minibatch (features and labels)
// matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponding matrix,
//            [out] each matrix resized if necessary containing data.
// returns - true if there are more minibatches, false if no more minibatches remain
template<class ElemType>
bool UCIFastReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
    if (m_cachingReader)
    {
        return m_cachingReader->GetMinibatch(matrices);
    }

    // get the features array
    if (matrices.find(m_featuresName) == matrices.end())
        RuntimeError("Features matrix not found in config file, there should be a section '%ls=[...]' in the configuration file.", m_featuresName.c_str());
    Matrix<ElemType>& features = *matrices[m_featuresName];

    // get out if they didn't call StartMinibatchLoop() first
    if (m_mbSize == 0)
        return false;

    // check to see if we have changed epochs, if so we are done with this one.
    if (m_mbStartSample / m_epochSize != m_epoch)
        return false;

    bool randomize = Randomize();
    bool moreData = EnsureDataAvailable(m_mbStartSample);

    // figure which sweep of the randomization we are on
    size_t epochSample = m_mbStartSample % m_epochSize; // where the minibatch starts in this epoch
    size_t randomizeSet = randomize?RandomizeSweep(m_mbStartSample):0;
    const auto & tmap = m_randomordering(randomizeSet);
    size_t epochEnd = m_epochSize;
    size_t recordStart = m_totalSamples?m_mbStartSample%m_totalSamples:m_mbStartSample;

    // actual size is either what requested, or total number of samples read so far
    size_t actualmbsize = min(m_totalSamples, m_mbSize); // it may still return less if at end of sweep

    // check for an odd sized last minibatch
    if (epochSample + actualmbsize > epochEnd)
    {
        actualmbsize = epochEnd - epochSample;
    }

    // hit the end of the dataset, we should only get here in "one-pass mode"
    if (!moreData)
    {
        // make sure we take into account hitting the end of the dataset (not wrapping around)
        actualmbsize = min(m_totalSamples-recordStart,actualmbsize);
    }

    // clear the portion of the label buffers that is about to be filled
    if (m_labelType == labelCategory)
    {
        memset(m_labelsBuffer,0,sizeof(ElemType)*m_labelDim*actualmbsize);
        memset(m_labelsIdBuffer,0,sizeof(LabelIdType)*actualmbsize);
    }
    else if (m_labelType != labelNone)
    {
        memset(m_labelsBuffer,0,sizeof(ElemType)*1*actualmbsize);
    }

    if (actualmbsize > 0)
    {
        // loop through and copy data to the minibatch buffers
        size_t j = 0; // index of the sample within the minibatch

        // determine randomization base index
        size_t randBase = 0; // (keep compiler happy)
        if (randomize)
            randBase = epochSample - epochSample%m_randomizeRange;

        //loop through all the samples
        for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample)
        {
            // pick the right sample with randomization if desired
            size_t jRand = randomize?(randBase + tmap[jSample%m_randomizeRange]):jSample;
            jRand %= m_epochSize;

            // vector of feature data goes into matrix column
            memcpy(&m_featuresBuffer[j*m_featureCount],&m_featureData[jRand*m_featureCount],sizeof(ElemType)*m_featureCount);
            if (m_labelType == labelCategory)
            {
                // set the entry for this label id in column j, and remember the id itself
                m_labelsBuffer[j*m_labelDim + m_labelIdData[jRand]] = (ElemType)1;
                m_labelsIdBuffer[j] = m_labelIdData[jRand];
            }
            else if (m_labelType != labelNone)
            {
                if (m_labelType == labelRegression)
                {
                    m_labelsBuffer[j] = (ElemType)atof(m_labelData[jRand].c_str());
                }
                else
                {
                    StoreLabel(m_labelsBuffer[j],m_labelData[jRand]);
                }
            }
        }
    }

    // create the respective MBLayout
    // Every sample is returned as a sequence of 1 frame.
    m_pMBLayout->Init(actualmbsize, 1, false/*means it is not sequential*/);

    // if we are writing out to the caching writer, do it now
    if (m_cachingWriter)
    {
        map<std::wstring, void*, nocase_compare> writeBuffer;
        writeBuffer[m_featuresName] = m_featuresBuffer;
        if (m_labelType == labelCategory)
        {
            writeBuffer[m_labelsName] = m_labelsIdBuffer;
            if (!m_labelsCategoryName.empty())
                writeBuffer[m_labelsCategoryName] = m_labelsBuffer;
        }
        else if (m_labelType != labelNone)
        {
            writeBuffer[m_labelsName] = m_labelsBuffer;
        }

        // write out the data, on a second pass compute statistics as needed
        bool moreToWrite = m_cachingWriter->SaveData(m_mbStartSample, writeBuffer, actualmbsize, m_totalSamples, 0);

        // done writing
        if (!moreToWrite)
        {
            // write out the mapping table as necessary
            if (m_labelType == labelCategory && !m_labelsMapName.empty())
            {
                m_cachingWriter->SaveMapping(m_labelsMapName, m_mapIdToLabel);
            }
            WriteLabelFile();

            // now close the cache writer
            delete m_cachingWriter;
            m_cachingWriter = NULL;
        }
    }

    // advance to the next minibatch
    m_mbStartSample += actualmbsize;

    // if they don't want partial minibatches, skip data transfer and return
    if ((actualmbsize < m_mbSize && !m_partialMinibatch)
        || actualmbsize == 0) // no records found (end of minibatch)
    {
        return false;
    }

    // now hand the buffers to the matrices (each on its own device)
    features.SetValue(m_featureCount, actualmbsize, features.GetDeviceId(), m_featuresBuffer, matrixFlagNormal);
    if (m_labelType == labelCategory)
    {
        auto labelEntry = matrices.find(m_labelsName);
        if (labelEntry != matrices.end())
        {
            Matrix<ElemType>* labels = labelEntry->second;
            if (labels != nullptr)
                labels->SetValue(m_labelDim, actualmbsize, labels->GetDeviceId(), m_labelsBuffer,matrixFlagNormal);
        }
    }
    else if (m_labelType != labelNone)
    {
        auto labelEntry = matrices.find(m_labelsName);
        if (labelEntry != matrices.end())
        {
            Matrix<ElemType>* labels = labelEntry->second;
            if (labels != nullptr)
                labels->SetValue(1, actualmbsize, labels->GetDeviceId(), m_labelsBuffer,matrixFlagNormal);
        }
    }

    // we read some records, so process them
    return true;
}
// GetLabelMapping - Gets the label mapping from integer index to label type
// sectionName - forwarded to the caching reader when one is active, otherwise unused
// returns - a map from numeric datatype to native label type
template<class ElemType>
const std::map<typename IDataReader<ElemType>::LabelIdType, typename IDataReader<ElemType>::LabelType>& UCIFastReader<ElemType>::GetLabelMapping(const std::wstring& sectionName)
{
    // without a caching reader the mapping is the one held by this reader
    if (!m_cachingReader)
        return m_mapIdToLabel;

    return m_cachingReader->GetLabelMapping(sectionName);
}
// SetLabelMapping - Sets the label mapping from integer index to label
// sectionName - unused
// labelMapping - mapping table from label values to IDs (must be 0-n)
// note: for tasks with labels, the mapping table must be the same between a training run and a testing run
// Raises a RuntimeError when the caching reader is in use.
template<class ElemType>
void UCIFastReader<ElemType>::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map<typename IDataReader<ElemType>::LabelIdType, LabelType>& labelMapping)
{
    if (m_cachingReader)
    {
        RuntimeError("Cannot set mapping table when the caching reader is being used");
    }
    m_mapIdToLabel = labelMapping;

    // rebuild the reverse (label -> id) table from scratch
    m_mapLabelToId.clear();
    for (const auto& entry : labelMapping) // by reference: no per-element pair copy/conversion
    {
        m_mapLabelToId[entry.second] = entry.first;
    }
}
// GetData - Gets metadata from the specified section (into CPU memory)
// sectionName - section name to retrieve data from
// numRecords - number of records to read
// data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accommodate request
// dataBufferSize - [in] size of the databuffer in bytes
//                  [out] size of buffer filled with data
// recordStart - record to start reading from, defaults to zero (start of data)
// returns: true if data remains to be read, false if the end of data was reached
// Only supported through the caching reader; otherwise a RuntimeError is raised.
template<class ElemType>
bool UCIFastReader<ElemType>::GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart)
{
    if (m_cachingReader)
    {
        return m_cachingReader->GetData(sectionName, numRecords, data, dataBufferSize, recordStart);
    }
    // NOTE(review): no return follows; this relies on RuntimeError not returning — confirm
    RuntimeError("GetData not supported in UCIFastReader");
}
// DataEnd - Query whether the given kind of "end" has been reached
// endDataType - which boundary to test (epoch, dataset, sentence)
// returns - true if that boundary has been reached, false otherwise
template<class ElemType>
bool UCIFastReader<ElemType>::DataEnd(EndDataType endDataType)
{
    if (m_cachingReader)
    {
        return m_cachingReader->DataEnd(endDataType);
    }
    bool ret = false;
    switch (endDataType)
    {
    case endDataNull:
        assert(false);
        break;
    case endDataEpoch:
        // the current minibatch start has moved into another epoch
        ret = (m_mbStartSample / m_epochSize != m_epoch);
        break;
    case endDataSet:
        // EnsureDataAvailable() returns true while there is MORE to read,
        // so the end of the dataset is its negation
        ret = !EnsureDataAvailable(m_mbStartSample, true);
        break;
    case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true
        ret = true;
        break;
    }
    return ret;
}
// instantiate all the combinations we expect to be used
// (the member functions are defined in this .cpp file, so the double and float
// versions are explicitly instantiated here)
template class UCIFastReader<double>;
template class UCIFastReader<float>;
}}}