CNTK/MachineLearning/CNTKEval/EvalReader.h

247 строки
9.1 KiB
C++

//
// <copyright file="EvalReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once
#define DATAREADER_LOCAL
#include "DataReader.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Evaluation Reader class
// interface to pass to evaluation DLL
template<class ElemType>
class EvalReader : public IDataReader<ElemType>
{
typedef typename IDataReader<ElemType>::LabelType LabelType;
typedef typename IDataReader<ElemType>::LabelIdType LabelIdType;
private:
std::map<std::wstring, std::vector<ElemType>*>* m_inputs; // our input data
std::map<std::wstring, size_t>* m_dimensions; // the number of rows for the input data
size_t m_recordCount; // count of records in this data
size_t m_currentRecord; // next record number to read
size_t m_mbSize;
vector<size_t> m_switchFrame;
size_t m_oldSig;
public:
// Method to setup the data for the reader
void SetData(std::map<std::wstring, std::vector<ElemType>*>* inputs, std::map<std::wstring, size_t>* dimensions)
{
m_inputs = inputs;
m_dimensions = dimensions;
m_currentRecord = 0;
m_recordCount = 0;
for (auto iter = inputs->begin(); iter != inputs->end(); ++iter)
{
// figure out the dimension of the data
const std::wstring& val = iter->first;
size_t count = (*inputs)[val]->size();
size_t rows = (*dimensions)[val];
size_t recordCount = count/rows;
if (m_recordCount != 0)
{
// record count must be the same for all the data
if (recordCount != m_recordCount)
RuntimeError("Record Count of %ls (%lux%lu) does not match the record count of previous entries (%lu).", val.c_str(), rows, recordCount, m_recordCount);
}
else
{
m_recordCount = recordCount;
}
}
}
void SetBoundary (size_t newSig)
{
if (m_switchFrame.size()==0)
{
m_oldSig = newSig;
m_switchFrame.assign(1,0);
} else
{
if (m_oldSig==newSig)
{
m_switchFrame[0] = m_mbSize+8888; // TODO: WTF??
}
else
{
m_switchFrame[0] = 0;
m_oldSig = newSig;
}
}
}
virtual void Init(const ConfigParameters & /*config*/) override { }
virtual void Init(const ScriptableObjects::IConfigRecord & /*config*/) override { }
// Destroy - cleanup and remove this class
// NOTE: this destroys the object, and it can't be used past this point
virtual void Destroy()
{
delete this;
}
// EvalReader Constructor
// config - [in] configuration parameters for the datareader
template<class ConfigRecordType>
EvalReader(const ConfigRecordType& config)
{
m_recordCount = m_currentRecord = 0;
Init(config);
}
// Destructor - free up the matrix values we allocated
virtual ~EvalReader()
{
}
//StartMinibatchLoop - Startup a minibatch loop
// mbSize - [in] size of the minibatch (number of frames, etc.)
// epoch - [in] epoch number for this loop
// requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset
virtual void StartMinibatchLoop(size_t mbSize, size_t /*epoch*/, size_t /*requestedEpochSamples=requestDataSize*/)
{
m_mbSize = min(mbSize,m_recordCount);
}
// GetMinibatch - Get the next minibatch (features and labels)
// matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponing matrix,
// [out] each matrix resized if necessary containing data.
// returns - true if there are more minibatches, false if no more minibatchs remain
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
// how many records are we reading this time
size_t recordCount = min(m_mbSize, m_recordCount-m_currentRecord);
// check to see if we are out of records in this current dataset
if (m_currentRecord >= m_recordCount)
return false;
// loop through all the input vectors to copy the data over
for (auto iter = m_inputs->begin(); iter != m_inputs->end(); ++iter)
{
// figure out the dimension of the data
std::wstring val = iter->first;
size_t rows = (*m_dimensions)[val];
//size_t count = rows*recordCount;
// find the output matrix we want to fill
auto iterIn = matrices.find(val);
// allocate the matrix if we don't have one yet
if (iterIn == matrices.end())
{
RuntimeError("No matrix data found for key '%ls', cannot continue", val.c_str());
}
Matrix<ElemType>* matrix = iterIn->second;
// resize to the proper size to hold the data
matrix->Resize(rows, recordCount);
// copy over the data
std::vector<ElemType>* data = iter->second;
//size_t = m_currentRecord*rows;
void* mat = &(*matrix)(0,0);
size_t matSize = matrix->GetNumElements()*sizeof(ElemType);
void* dataPtr = (void*)((ElemType*)data->data() + m_currentRecord*rows);
size_t dataSize = rows*recordCount*sizeof(ElemType);
memcpy_s(mat, matSize, dataPtr, dataSize);
}
// increment our record pointer
m_currentRecord += recordCount;
// return true if we returned any data whatsoever
return true;
}
size_t GetNumParallelSequences() { return 1; }
void SetNumParallelSequences(const size_t ) {}
void SetSentenceSegBatch(std::vector<size_t> &sentenceEnd)
{
sentenceEnd.resize(m_switchFrame.size());
for (size_t i = 0; i < m_switchFrame.size(); i++)
{
sentenceEnd[i] = m_switchFrame[i];
}
}
void CopyMBLayoutTo(MBLayoutPtr pMBLayout)
{
assert(m_switchFrame.size() == 1);
pMBLayout->Init(1, m_mbSize, true/*sequential*/); // TODO: not sure if this is always sequential
if (m_switchFrame[0] < m_mbSize) /* there is a switch frame within the minibatch*/
{
pMBLayout->Set(0, m_switchFrame[0], MinibatchPackingFlags::SequenceStart);
if (m_switchFrame[0] > 0)
pMBLayout->SetWithoutOr(0, m_switchFrame[0] - 1, MinibatchPackingFlags::SequenceEnd); // TODO: can't we use Set()?
}
}
void GetSentenceBoundary(std::vector<size_t> boundaryInfo)
{
m_switchFrame.resize(boundaryInfo.size());
for (size_t i = 0; i < m_switchFrame.size(); i ++)
m_switchFrame[i] = boundaryInfo[i];
}
void SetRandomSeed(int) { NOT_IMPLEMENTED; }
// GetLabelMapping - Gets the label mapping from integer index to label type
// returns - a map from numeric datatype to native label type
virtual const std::map<typename EvalReader<ElemType>::LabelIdType, typename EvalReader<ElemType>::LabelType>& GetLabelMapping(const std::wstring& /*sectionName*/)
{
static std::map<typename EvalReader<ElemType>::LabelIdType, typename EvalReader<ElemType>::LabelType> labelMap;
return labelMap;
}
// SetLabelMapping - Sets the label mapping from integer index to label
// labelMapping - mapping table from label values to IDs (must be 0-n)
// note: for tasks with labels, the mapping table must be the same between a training run and a testing run
virtual void SetLabelMapping(const std::wstring& /*sectionName*/, const std::map<typename EvalReader<ElemType>::LabelIdType, typename EvalReader<ElemType>::LabelType>& /*labelMapping*/) {}
// GetData - Gets metadata from the specified section (into CPU memory)
// sectionName - section name to retrieve data from
// numRecords - number of records to read
// data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accomidate request
// dataBufferSize - [in] size of the databuffer in bytes
// [out] size of buffer filled with data
// recordStart - record to start reading from, defaults to zero (start of data)
// returns: true if data remains to be read, false if the end of data was reached
virtual bool GetData(const std::wstring& /*sectionName*/, size_t /*numRecords*/, void* /*data*/, size_t& /*dataBufferSize*/, size_t /*recordStart=0*/)
{
return false;
}
virtual bool DataEnd(EndDataType /*endDataType*/)
{
return m_currentRecord < m_recordCount;
}
virtual bool GetMinibatch4SE(std::vector<shared_ptr<const msra::dbn::latticepair>> & /*latticeinput*/, vector<size_t> & /*uids*/,
vector<size_t> & /*boundaries*/, vector<size_t> &/*extrauttmap*/)
{
return true;
}
virtual bool GetHmmData(msra::asr::simplesenonehmm * /*hmm*/)
{
return true;
}
virtual void SetValidFrameInBatch(vector<size_t> &/*validFrame*/)
{
return;
}
};
}}}