536 строки
19 KiB
C++
536 строки
19 KiB
C++
//
|
|
// <copyright file="UCIReader.cpp" company="Microsoft">
|
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// </copyright>
|
|
//
|
|
// UCIReader.cpp : Defines the exported functions for the DLL application.
|
|
//
|
|
|
|
#include "stdafx.h"
|
|
#include "File.h"
|
|
#define DATAREADER_EXPORTS // creating the exports here
|
|
#include "DataReader.h"
|
|
#include "UCIReader.h"
|
|
|
|
namespace Microsoft { namespace MSR { namespace CNTK {
|
|
|
|
template<class ElemType, typename LabelType>
|
|
size_t UCIReader<ElemType, LabelType>::RandomizeSweep(size_t epochSample)
|
|
{
|
|
size_t randomRangePerEpoch = (m_epochSize+m_randomizeRange-1)/m_randomizeRange;
|
|
return m_epoch*randomRangePerEpoch + epochSample/m_randomizeRange;
|
|
}
|
|
|
|
// ReadLine - Read a line
|
|
// readSample - sample to read in global sample space
|
|
// returns - true if we successfully read a record, otherwise false
|
|
template<class ElemType, typename LabelType>
|
|
bool UCIReader<ElemType, LabelType>::ReadRecord(size_t readSample)
|
|
{
|
|
bool readRecord = false;
|
|
File& file = *m_file;
|
|
if (!file.IsEOF())
|
|
{
|
|
LabelType label;
|
|
//std::wstring wstr;
|
|
//file.GetLine(wstr);
|
|
if (m_labelFirst && m_labelType != labelNone)
|
|
{
|
|
file >> label;
|
|
}
|
|
|
|
// get the sample index in this epoch (not global sample)
|
|
size_t epochSample = readSample % m_epochSize;
|
|
size_t idxFeature = epochSample*m_featureCount;
|
|
|
|
int cntFeatures=0;
|
|
bool read = true; // set the read amount to something valid
|
|
ElemType* feature = &m_featureData[idxFeature];
|
|
while (cntFeatures < m_featureCount && read)
|
|
{
|
|
ElemType elem;
|
|
|
|
// try and get an element, if it doesn't work (reading a string as an number, etc.)
|
|
// read will return 0, and we exit the loop
|
|
read = file.TryGetText(elem);
|
|
if (read)
|
|
*feature++ = elem;
|
|
++cntFeatures;
|
|
}
|
|
|
|
// get end of line if it exists
|
|
bool eol = file.EndOfLineOrEOF(true);
|
|
|
|
// if label is last pop it off the vector
|
|
if (!m_labelFirst && m_labelType != labelNone)
|
|
{
|
|
file >> label;
|
|
if (!file.EndOfLineOrEOF(true))
|
|
ERROR("end of line/file not found after label");
|
|
}
|
|
|
|
readRecord = true;
|
|
|
|
// add the new values to the arrays
|
|
if (m_labelType == labelCategory)
|
|
{
|
|
// check to see if we have seen this label before
|
|
auto value = m_mapLabelToId.find(label);
|
|
LabelIdType labelId;
|
|
if (value == m_mapLabelToId.end())
|
|
{
|
|
// new label so add it to the mapping tables
|
|
m_mapLabelToId[label] = m_labelIdMax;
|
|
m_mapIdToLabel[m_labelIdMax] = label;
|
|
labelId = m_labelIdMax++;
|
|
|
|
// if our label dimension is lower than the current labelId then increase it
|
|
if (m_labelDim < m_labelIdMax)
|
|
m_labelDim = m_labelIdMax;
|
|
}
|
|
else
|
|
{
|
|
labelId = value->second;
|
|
}
|
|
|
|
// now add the label id to the label data array
|
|
m_labelIdData[epochSample] = labelId;
|
|
}
|
|
else if (m_labelType != labelNone)
|
|
{
|
|
m_labelData[epochSample] = label;
|
|
}
|
|
}
|
|
return readRecord; // we read a record
|
|
}
|
|
|
|
// EnsureDataAvailable - Read enough lines so we can request a minibatch starting as requested
|
|
// mbStartSample - the starting sample we are ensureing are good
|
|
// numberRead - [out] returns the actual number read
|
|
// returns - true if we have more to read, false if we hit the end of the dataset
|
|
template<class ElemType, typename LabelType>
|
|
bool UCIReader<ElemType, LabelType>::EnsureDataAvailable(size_t mbStartSample, size_t& numberRead)
|
|
{
|
|
assert(mbStartSample >= m_epochStartSample);
|
|
// determine how far ahead we need to read
|
|
bool randomize = Randomize();
|
|
// need to read to the end of the next minibatch
|
|
size_t epochSample = mbStartSample;
|
|
epochSample %= m_epochSize;
|
|
|
|
// determine number left to read for this epoch
|
|
size_t numberToRead = m_epochSize - epochSample;
|
|
// we will take either a minibatch or the number left in the epoch
|
|
numberToRead = min(numberToRead, m_mbSize);
|
|
|
|
size_t randomRangePerEpoch = 1;
|
|
if (randomize)
|
|
{
|
|
size_t randomizeSweep = RandomizeSweep(epochSample);
|
|
// if first read or read takes us to another randomization range
|
|
// we need to read at least randomization range records
|
|
if (m_randomizeRange != randomizeAuto && // if we are randomizing and know the range
|
|
randomizeSweep != m_randomordering.CurrentSeed()) // the range has changed since last time
|
|
{
|
|
numberToRead = m_randomizeRange;
|
|
}
|
|
}
|
|
|
|
// check to see if we have the proper records read already
|
|
if (m_readNextSample >= mbStartSample+numberToRead && mbStartSample >= m_epochStartSample)
|
|
return true;
|
|
|
|
// read in the samples
|
|
File& file = *m_file;
|
|
numberRead=0;
|
|
bool readRecords = true;
|
|
while (readRecords && numberRead < numberToRead)
|
|
{
|
|
size_t next = numberRead+1;
|
|
if (!(next% 10000))
|
|
fprintf(stderr,"#");
|
|
else if (!(next% 1000))
|
|
fprintf(stderr,"+");
|
|
else if (!(next % 100))
|
|
fprintf(stderr, ".");
|
|
readRecords = ReadRecord(m_readNextSample);
|
|
if (readRecords)
|
|
{
|
|
numberRead++;
|
|
++m_readNextSample;
|
|
if (!m_endReached)
|
|
++m_totalSamples; // total number of records in the dataset
|
|
}
|
|
}
|
|
|
|
// if we hit the end of the records, we now have the total number of Samples in the dataset
|
|
if (!readRecords)
|
|
{
|
|
UpdateDataVariables();
|
|
}
|
|
return readRecords;
|
|
}
|
|
|
|
// UpdateDataVariables - Update variables that depend on the dataset being completely read
|
|
template<class ElemType, typename LabelType>
|
|
void UCIReader<ElemType, LabelType>::UpdateDataVariables()
|
|
{
|
|
// if we already reached the end before no need to set again.
|
|
if (m_endReached)
|
|
return;
|
|
|
|
// get the size of the dataset
|
|
assert(m_totalSamples*m_featureCount >= m_featureData.size());
|
|
|
|
if (m_epochSize == requestDataSize)
|
|
m_epochSize = m_totalSamples;
|
|
|
|
// make sure randomization range is within the sample bounds
|
|
if (m_randomizeRange > m_epochSize)
|
|
{
|
|
m_randomizeRange = m_epochSize;
|
|
m_randomordering.resize(m_randomizeRange,m_randomizeRange);
|
|
}
|
|
|
|
// update the label dimension if it is not big enough, add something on
|
|
if (m_labelType == labelCategory && m_labelIdMax > m_labelDim)
|
|
m_labelDim = m_labelIdMax; // update the label dimensions if different
|
|
|
|
// we got to the end of the dataset
|
|
m_endReached = true;
|
|
}
|
|
|
|
// Reader Initialize
|
|
// vdim - [out] number of elements in a single Sample of feature values (single precision values)
|
|
// udim - [out] number of columns in the label matrix
|
|
// filepaths - [in] and array of file paths to necessary files, it is variable depending on the reader
|
|
// options - [in] string of options (i.e. "-windowsize:11 -addenergy") data reader specific
|
|
// randomize - number of samples to randomize, defaults to randomizeAuto
|
|
template<class ElemType, typename LabelType>
|
|
void UCIReader<ElemType, LabelType>::Init(size_t& vdim, size_t& udim, const std::vector<std::wstring>& filepaths, const ConfigParameters& config)
|
|
{
|
|
|
|
// initialize all the variables
|
|
m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = 0;
|
|
m_labelIdMax = m_labelDim = 0;
|
|
m_partialMinibatch = m_labelFirst = m_endReached = false;
|
|
m_labelType = labelCategory;
|
|
m_featureCount = vdim;
|
|
m_readNextSample = 0;
|
|
|
|
// set the feature count to at least one (we better have one feature...)
|
|
assert (m_featureCount != 0);
|
|
|
|
fprintf(stderr, "reading uci file %ws", filepaths[0].c_str());
|
|
m_file = new File(filepaths[0], fileOptionsRead | fileOptionsText | fileOptionsSequential);
|
|
File& file = *m_file;
|
|
|
|
ConfigParameters readerConfig = config("reader");
|
|
if (readerConfig.Exists("randomize"))
|
|
{
|
|
string randomizeString = readerConfig("randomize");
|
|
if (randomizeString == "None")
|
|
{
|
|
m_randomizeRange = randomizeNone;
|
|
}
|
|
else if (randomizeString == "Auto")
|
|
{
|
|
m_randomizeRange = randomizeAuto;
|
|
}
|
|
else
|
|
{
|
|
m_randomizeRange = readerConfig("randomize");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
m_randomizeRange = randomizeAuto;
|
|
}
|
|
|
|
// determine if we have first or last label
|
|
std::string labelFirst(readerConfig("labelPosition","First"));
|
|
m_labelFirst = labelFirst == "First";
|
|
|
|
// determine if we partial minibatches are desired
|
|
std::string minibatchMode(readerConfig("minibatchMode","Partial"));
|
|
m_partialMinibatch = minibatchMode == "Parital";
|
|
|
|
// determine if we partial minibatches are desired
|
|
std::string labelType(readerConfig("labelType","Category"));
|
|
if (labelType == "Category")
|
|
{
|
|
m_labelType = labelCategory;
|
|
}
|
|
else if (labelType == "Regression")
|
|
{
|
|
m_labelType = labelRegression;
|
|
}
|
|
else if (labelType == "None")
|
|
{
|
|
m_labelType = labelNone;
|
|
}
|
|
|
|
// if we know the size of the randomization now, resize, otherwise wait until we know the epochSize in StartMinibatchLoop()
|
|
if (Randomize() && m_randomizeRange != randomizeAuto)
|
|
m_randomordering.resize(m_randomizeRange, m_randomizeRange);
|
|
|
|
// if the value they passed in as udim is not big enough, add something on
|
|
if (udim < m_labelIdMax)
|
|
udim = m_labelIdMax;
|
|
m_labelDim = (LabelIdType)udim;
|
|
}
|
|
|
|
// destructor - virtual so it gets called properly
|
|
template<class ElemType, typename LabelType>
|
|
UCIReader<ElemType, LabelType>::~UCIReader()
|
|
{
|
|
delete m_file;
|
|
}
|
|
|
|
//SetupEpoch - Setup the proper position in the file, and other variable settings to start a particular epoch
|
|
template<class ElemType, typename LabelType>
|
|
void UCIReader<ElemType, LabelType>::SetupEpoch()
|
|
{
|
|
size_t sweep = 0;
|
|
size_t sweepsPerDS = 1;
|
|
// if we know the total number of records
|
|
if (m_endReached)
|
|
{
|
|
sweepsPerDS = (m_totalSamples+m_epochSize-1)/m_epochSize;
|
|
sweep = m_epoch / sweepsPerDS;
|
|
}
|
|
else
|
|
{ // don't know yet, haven't reached the end
|
|
sweepsPerDS = m_epoch+1;
|
|
}
|
|
|
|
// if we need to start in the middle of the dataset, we better already be there
|
|
if (m_epoch % sweepsPerDS != 0)
|
|
{
|
|
// make sure we are in the correct location already for mid-dataset epochs
|
|
fprintf(stderr, "starting epoch %d midway through file at position %ld\n", m_epoch, m_mbStartSample);
|
|
assert(m_mbStartSample % m_epochSize == 0);
|
|
m_epochStartSample = m_mbStartSample;
|
|
// future, we would need to seek to the proper location
|
|
}
|
|
else
|
|
{
|
|
// starting over in the dataset
|
|
m_readNextSample = m_epochStartSample = m_mbStartSample = m_epoch * m_epochSize;
|
|
if (sweepsPerDS > 1)
|
|
{
|
|
// restarting an epoch at the beginning of the dataset
|
|
fprintf(stderr, "restarting file read, for epoch %d\n", m_epoch);
|
|
m_file->SetPosition(0);
|
|
}
|
|
else if (m_epoch > 0) // if we have read the data once already
|
|
{
|
|
assert(m_totalSamples <= m_epochSize);
|
|
assert(m_featureData.size()/m_featureCount >= m_totalSamples);
|
|
fprintf(stderr, "all data already resident for epoch %d\n", m_epoch);
|
|
// move the read pointer to the end since we have everything already in memory.
|
|
m_readNextSample += m_totalSamples;
|
|
}
|
|
}
|
|
}
|
|
|
|
//StartMinibatchLoop - Startup a minibatch loop
|
|
// mbSize - [in] size of the minibatch (number of Samples, etc.)
|
|
// epoch - [in] epoch number for this loop
|
|
// requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset
|
|
template<class ElemType, typename LabelType>
|
|
void UCIReader<ElemType, LabelType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
|
|
{
|
|
m_mbSize = mbSize;
|
|
if (requestedEpochSamples == requestDataSize)
|
|
{
|
|
if (m_endReached)
|
|
{
|
|
m_epochSize = m_totalSamples;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
m_epochSize = requestedEpochSamples;
|
|
}
|
|
|
|
// set the randomization range for randomizationAuto
|
|
// or if it's invalid less than the minibatch size, we need to make it at least minibatch size
|
|
if (m_randomizeRange == randomizeAuto
|
|
|| (m_randomizeRange != randomizeNone && m_randomizeRange < mbSize))
|
|
{
|
|
if (m_epochSize != requestDataSize)
|
|
{
|
|
m_randomizeRange = m_epochSize;
|
|
m_randomordering.resize(m_randomizeRange, m_randomizeRange);
|
|
}
|
|
else if (m_randomizeRange < mbSize)
|
|
{
|
|
m_randomizeRange = max(m_randomizeRange, m_mbSize);
|
|
m_randomordering.resize(m_randomizeRange, m_randomizeRange);
|
|
}
|
|
}
|
|
|
|
m_epoch = epoch;
|
|
m_mbStartSample = epoch*m_epochSize;
|
|
SetupEpoch();
|
|
|
|
// allocate room for the data
|
|
m_featureData.resize(m_featureCount*m_epochSize);
|
|
if (m_labelType == labelCategory)
|
|
m_labelIdData.resize(m_epochSize);
|
|
else if (m_labelType != labelNone)
|
|
m_labelData.resize(m_epochSize);
|
|
}
|
|
|
|
// GetMinibatch - Get the next minibatch
|
|
// features - [out] returns minibatch in passed in matrix, will resize and replace existing data. Number of columns returned may be less than requested mbSize if end of dataset has been reached
|
|
// labels - [out] returns matrix of label values as normalized integers (0-x) for class labels, and will replace existing data.
|
|
// return - true if we read some records to process, otherwise false;
|
|
template<class ElemType, typename LabelType>
|
|
bool UCIReader<ElemType, LabelType>::GetMinibatch(Matrix<ElemType>& features, Matrix<ElemType>& labels)
|
|
{
|
|
// get out if they didn't call StartMinibatchLoop() first
|
|
if (m_mbSize == 0)
|
|
return false;
|
|
|
|
// check to see if we have changed epochs, if so we are done with this one.
|
|
if (m_mbStartSample / m_epochSize != m_epoch)
|
|
return false;
|
|
|
|
bool randomize = Randomize();
|
|
size_t recordsRead = 0;
|
|
bool moreData = EnsureDataAvailable(m_mbStartSample, recordsRead);
|
|
|
|
// figure which sweep of the randomization we are on
|
|
size_t epochSample = m_mbStartSample % m_epochSize; // where the minibatch starts in this epoch
|
|
size_t samplesExtra = m_totalSamples % m_epochSize; // extra samples at the end of an epoch
|
|
size_t epochsDS = (m_totalSamples+m_epochSize-1)/m_epochSize; // how many epochs per dataset
|
|
size_t randomizeSet = randomize?RandomizeSweep(epochSample):0;
|
|
const auto & tmap = m_randomordering(randomizeSet);
|
|
size_t epochEnd = m_epochSize;
|
|
|
|
// actual size is either what requested, or total number of samples read so far
|
|
size_t actualmbsize = min(m_totalSamples, m_mbSize); // it may still return less if at end of sweep
|
|
|
|
// if we have extra records at the end of the dataset
|
|
// and we are in the epoch where they would occur
|
|
if (samplesExtra && !((m_epoch+1)%epochsDS))
|
|
{
|
|
epochEnd = samplesExtra;
|
|
}
|
|
|
|
// check for an odd sized last minibatch
|
|
if (epochSample + actualmbsize > epochEnd)
|
|
{
|
|
actualmbsize = epochEnd - epochSample;
|
|
}
|
|
|
|
// hit the end of the dataset, so see how many records we REALLY got
|
|
if (!moreData)
|
|
{
|
|
// we started a new epoch and hit the end of the file before we read any records so reset the epoch and keep going
|
|
// we know this is the case when our epoch starts on the same sample as the next read sample and we are at the beginning.
|
|
if (epochSample == 0 && m_epochStartSample == m_readNextSample)
|
|
{
|
|
SetupEpoch();
|
|
moreData = EnsureDataAvailable(m_mbStartSample, recordsRead);
|
|
}
|
|
// if we are out of records return now
|
|
else if (actualmbsize == 0)
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// if they don't want partial minibatches, skip and return
|
|
if (actualmbsize < m_mbSize && !m_partialMinibatch)
|
|
{
|
|
m_mbStartSample += actualmbsize;
|
|
return false;
|
|
}
|
|
|
|
// resize the features array to be big enough
|
|
features.Resize(m_featureCount, actualmbsize);
|
|
|
|
if (m_labelType == labelCategory)
|
|
{
|
|
// make the label array big enough, this should be a sparse array when that is supported
|
|
labels.Resize(m_labelDim, actualmbsize);
|
|
labels.SetValue((ElemType)0);
|
|
}
|
|
else if (m_labelType != labelNone)
|
|
{
|
|
labels.Resize(1, actualmbsize);
|
|
}
|
|
|
|
// loop through and copy data to matrix
|
|
int j = 0; // vector of vectors of feature data
|
|
// determine randomization base index
|
|
size_t randBase;
|
|
if (randomize)
|
|
randBase = epochSample - epochSample%m_randomizeRange;
|
|
|
|
for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample)
|
|
{
|
|
// pick the right sample with randomization if desired
|
|
size_t jRand = randomize?(randBase + tmap[jSample%m_randomizeRange]):jSample;
|
|
jRand %= m_epochSize;
|
|
size_t sampleCount = m_featureData[jRand*m_featureCount];
|
|
|
|
// vector of feature data goes into matrix column
|
|
for (int i = 0;i < m_featureCount; ++i)
|
|
{
|
|
features(i, j) = m_featureData[jRand*m_featureCount+i];
|
|
}
|
|
|
|
if (m_labelType == labelCategory)
|
|
{
|
|
// they all have to be in dimensions
|
|
assert(m_labelIdData[jRand] < m_labelDim);
|
|
labels(m_labelIdData[jRand], j) = (ElemType)1;
|
|
}
|
|
else if (m_labelType != labelNone)
|
|
{
|
|
// how do we support string labels?
|
|
labels(0, j) = m_labelData[jRand];
|
|
}
|
|
}
|
|
|
|
// advance to the next minibatch
|
|
m_mbStartSample += actualmbsize;
|
|
|
|
// we read some records, so process them
|
|
return true;
|
|
}
|
|
|
|
// GetLabelMapping - Gets the label mapping from integer index to label type
|
|
// returns - a map from numeric datatype to native label type
|
|
template<class ElemType, typename LabelType>
|
|
const std::map<unsigned, LabelType>& UCIReader<ElemType, LabelType>::GetLabelMapping( )
|
|
{
|
|
return m_mapIdToLabel;
|
|
}
|
|
|
|
// SetLabelMapping - Sets the label mapping from integer index to label
|
|
// labelMapping - mapping table from label values to IDs (must be 0-n)
|
|
// note: for tasks with labels, the mapping table must be the same between a training run and a testing run
|
|
template<class ElemType, typename LabelType>
|
|
void UCIReader<ElemType, LabelType>::SetLabelMapping(const std::map<unsigned, LabelType>& labelMapping)
|
|
{
|
|
m_mapIdToLabel = labelMapping;
|
|
m_mapLabelToId.clear();
|
|
for each (std::pair<unsigned, LabelType> var in labelMapping)
|
|
{
|
|
m_mapLabelToId[var.second] = var.first;
|
|
}
|
|
}
|
|
|
|
// instantiate all the combinations we expect to be used
|
|
//template class UCIReader<double, std::wstring>;
|
|
//template class UCIReader<float, std::wstring>;
|
|
template class UCIReader<float, int>;
|
|
template class UCIReader<double, int>;
|
|
template class UCIReader<float, float>;
|
|
template class UCIReader<double, double>;
|
|
}}} |