CNTK/MachineLearning/CNTKEval/EvalReader.h

//
// <copyright file="EvalReader.h" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>
//
#pragma once

#define DATAREADER_LOCAL
#include "DataReader.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// Evaluation Reader class
// Interface to pass to the evaluation DLL: an IDataReader that serves minibatches
// out of in-memory vectors handed over through SetData().
// The reader does NOT own the maps/vectors passed to SetData(); they must stay alive
// (and unmodified) for as long as GetMinibatch() may be called.
template<class ElemType>
class EvalReader : public IDataReader<ElemType>
{
    typedef typename IDataReader<ElemType>::LabelType LabelType;
    typedef typename IDataReader<ElemType>::LabelIdType LabelIdType;
private:
    std::map<std::wstring, std::vector<ElemType>*>* m_inputs; // our input data (not owned), keyed by input name
    std::map<std::wstring, size_t>* m_dimensions; // the number of rows for the input data (not owned)
    size_t m_recordCount; // count of records in this data
    size_t m_currentRecord; // next record number to read
    size_t m_mbSize; // minibatch size in records, set by StartMinibatchLoop()
    std::vector<size_t> m_switchFrame; // boundary frame position(s); only element 0 is written by SetBoundary()
    size_t m_oldSig; // signature passed to the previous SetBoundary() call
public:
    // SetData - Method to setup the data for the reader
    // inputs - [in] map from input name to the flat data vector for that input
    // dimensions - [in] map from input name to the number of rows per record for that input
    // Resets the read position to the first record. Raises a RuntimeError if an input has
    // no data vector, no (or a zero) dimension, or if the inputs disagree on the record count.
    void SetData(std::map<std::wstring, std::vector<ElemType>*>* inputs, std::map<std::wstring, size_t>* dimensions)
    {
        m_inputs = inputs;
        m_dimensions = dimensions;
        m_currentRecord = 0;
        // stay "empty" until validation has succeeded, so a failed SetData() can never
        // leave a record count behind that does not match the data
        m_recordCount = 0;

        bool haveRecordCount = false;   // explicit flag: a legitimate record count of 0 must also be compared
        size_t commonRecordCount = 0;
        for (const auto& input : *inputs)
        {
            const std::wstring& name = input.first;
            const std::vector<ElemType>* values = input.second;
            if (values == nullptr)
                RuntimeError("No data vector provided for input '%ls'.", name.c_str());

            // look the dimension up without inserting into the caller's map;
            // a missing or zero dimension would otherwise lead to a division by zero below
            auto dimIter = dimensions->find(name);
            if (dimIter == dimensions->end() || dimIter->second == 0)
                RuntimeError("No valid (non-zero) dimension provided for input '%ls'.", name.c_str());

            // figure out the dimension of the data
            const size_t rows = dimIter->second;
            const size_t recordCount = values->size() / rows;

            if (!haveRecordCount)
            {
                commonRecordCount = recordCount;
                haveRecordCount = true;
            }
            else if (recordCount != commonRecordCount)
            {
                // record count must be the same for all the data
                // (size_t is cast explicitly: it is not necessarily the same width as unsigned long)
                RuntimeError("Record Count of %ls (%lux%lu) does not match the record count of previous entries (%lu).",
                             name.c_str(),
                             static_cast<unsigned long>(rows),
                             static_cast<unsigned long>(recordCount),
                             static_cast<unsigned long>(commonRecordCount));
            }
        }

        m_recordCount = commonRecordCount;
    }

    // SetBoundary - record where the sequence boundary lies for the upcoming data
    // newSig - [in] signature value; compared against the value passed to the previous call
    // First call, or a signature different from the previous one: boundary at frame 0.
    // Same signature as the previous call: boundary moved beyond the minibatch.
    void SetBoundary (size_t newSig)
    {
        if (m_switchFrame.empty())
        {
            m_oldSig = newSig;
            m_switchFrame.assign(1,0);
        }
        else if (m_oldSig==newSig)
        {
            // Sentinel: any value >= m_mbSize makes CopyMBLayoutTo() see no boundary inside the minibatch.
            // NOTE(review): this uses the m_mbSize known at this point; if StartMinibatchLoop() later
            // grows the minibatch by more than 8888 the sentinel no longer holds - confirm with callers.
            m_switchFrame[0] = m_mbSize+8888;
        }
        else
        {
            m_switchFrame[0] = 0;
            m_oldSig = newSig;
        }
    }

    // Init - nothing to configure; all data arrives through SetData()
    virtual void Init(const ConfigParameters & /*config*/) override { }
    virtual void Init(const ScriptableObjects::IConfigRecord & /*config*/) override { }

    // Destroy - cleanup and remove this class
    // NOTE: this destroys the object, and it can't be used past this point
    virtual void Destroy()
    {
        delete this;
    }

    // EvalReader Constructor
    // config - [in] configuration parameters for the datareader
    // All members start in a defined "no data" state; call SetData() and
    // StartMinibatchLoop() before reading minibatches.
    template<class ConfigRecordType>
    EvalReader(const ConfigRecordType& config)
        : m_inputs(nullptr), m_dimensions(nullptr),
          m_recordCount(0), m_currentRecord(0),
          m_mbSize(0), m_oldSig(0)
    {
        Init(config);
    }

    // Destructor - nothing to free, the input data is owned by the caller of SetData()
    virtual ~EvalReader()
    {
    }

    //StartMinibatchLoop - Startup a minibatch loop
    // mbSize - [in] size of the minibatch (number of frames, etc.); clamped to the number of records available
    // epoch - [in] epoch number for this loop (ignored)
    // requestedEpochSamples - [in] number of samples to randomize (ignored)
    virtual void StartMinibatchLoop(size_t mbSize, size_t /*epoch*/, size_t /*requestedEpochSamples=requestDataSize*/)
    {
        m_mbSize = min(mbSize,m_recordCount);
    }

    // GetMinibatch - Get the next minibatch (features and labels)
    // matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponding matrix,
    //             [out] each matrix resized if necessary containing data.
    // returns - true if a minibatch was delivered, false if no more minibatches remain
    // Raises a RuntimeError if an input has no matching matrix or dimension.
    virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices)
    {
        // check to see if we are out of records in this current dataset
        // (also covers the case where SetData() was never called: m_recordCount is 0 then)
        if (m_currentRecord >= m_recordCount)
            return false;

        // how many records are we reading this time
        const size_t recordCount = min(m_mbSize, m_recordCount-m_currentRecord);

        // a zero minibatch size (StartMinibatchLoop() not called, or called with 0) can make no
        // progress; report "no data" rather than handing out empty minibatches forever
        if (recordCount == 0)
            return false;

        // loop through all the input vectors to copy the data over
        for (const auto& input : *m_inputs)
        {
            const std::wstring& name = input.first;

            // figure out the dimension of the data (lookup only, never insert)
            auto dimIter = m_dimensions->find(name);
            if (dimIter == m_dimensions->end())
            {
                RuntimeError("No dimension found for key '%ls', cannot continue", name.c_str());
            }
            const size_t rows = dimIter->second;

            // find the output matrix we want to fill; it must have been provided by the caller
            auto iterIn = matrices.find(name);
            if (iterIn == matrices.end())
            {
                RuntimeError("No matrix data found for key '%ls', cannot continue", name.c_str());
            }

            Matrix<ElemType>* matrix = iterIn->second;

            // resize to the proper size to hold the data
            matrix->Resize(rows, recordCount);

            // copy over the data: records are stored back to back, 'rows' elements each
            const std::vector<ElemType>* data = input.second;
            void* mat = &(*matrix)(0,0);
            const size_t matSize = matrix->GetNumElements()*sizeof(ElemType);
            const void* dataPtr = data->data() + m_currentRecord*rows;
            const size_t dataSize = rows*recordCount*sizeof(ElemType);
            memcpy_s(mat, matSize, dataPtr, dataSize);
        }

        // increment our record pointer
        m_currentRecord += recordCount;

        // we delivered data
        return true;
    }

    // this reader always delivers a single sequence
    size_t GetNumParallelSequences() { return 1; }

    void SetNumParallelSequences(const size_t ) {}

    // SetSentenceSegBatch - copy the current boundary frame position(s) out to the caller
    // sentenceEnd - [out] receives a copy of the stored boundary information
    void SetSentenceSegBatch(std::vector<size_t> &sentenceEnd)
    {
        sentenceEnd = m_switchFrame;
    }

    // CopyMBLayoutTo - describe the minibatch as one sequence of m_mbSize frames and mark the
    // boundary recorded by SetBoundary() if it falls inside the minibatch
    // pMBLayout - [out] layout object to initialize
    // NOTE(review): the layout is sized with m_mbSize even when the last minibatch returned by
    // GetMinibatch() was shorter - confirm whether callers rely on that.
    void CopyMBLayoutTo(MBLayoutPtr pMBLayout)
    {
        assert(m_switchFrame.size() == 1);
        pMBLayout->Init(1, m_mbSize, true/*sequential*/);   // TODO: not sure if this is always sequential

        if (m_switchFrame[0] < m_mbSize) /* there is a switch frame within the minibatch*/
        {
            pMBLayout->Set(0, m_switchFrame[0], MinibatchPackingFlags::SequenceStart);
            // the frame just before a sequence start is the end of the previous sequence
            if (m_switchFrame[0] > 0)
                pMBLayout->SetWithoutOr(0, m_switchFrame[0] - 1, MinibatchPackingFlags::SequenceEnd);   // TODO: can't we use Set()?
        }
    }

    // GetSentenceBoundary - despite the name, this stores the given boundary information in the reader
    // boundaryInfo - [in] boundary frame position(s) to remember
    void GetSentenceBoundary(std::vector<size_t> boundaryInfo)
    {
        m_switchFrame = boundaryInfo;
    }

    void SetRandomSeed(int) { NOT_IMPLEMENTED;  }

    // GetLabelMapping - Gets the label mapping from integer index to label type
    // returns - a map from numeric datatype to native label type (always empty for this reader)
    virtual const std::map<typename EvalReader<ElemType>::LabelIdType, typename EvalReader<ElemType>::LabelType>& GetLabelMapping(const std::wstring& /*sectionName*/)
    {
        static std::map<typename EvalReader<ElemType>::LabelIdType, typename EvalReader<ElemType>::LabelType> labelMap;
        return labelMap;
    }

    // SetLabelMapping - Sets the label mapping from integer index to label
    // labelMapping - mapping table from label values to IDs (must be 0-n)
    // note: for tasks with labels, the mapping table must be the same between a training run and a testing run
    // This reader ignores the mapping.
    virtual void SetLabelMapping(const std::wstring& /*sectionName*/, const std::map<typename EvalReader<ElemType>::LabelIdType, typename EvalReader<ElemType>::LabelType>& /*labelMapping*/) {}

    // GetData - Gets metadata from the specified section (into CPU memory)
    // sectionName - section name to retrieve data from
    // numRecords - number of records to read
    // data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accommodate request
    // dataBufferSize - [in] size of the databuffer in bytes
    //                  [out] size of buffer filled with data
    // recordStart - record to start reading from, defaults to zero (start of data)
    // returns: true if data remains to be read, false if the end of data was reached
    // This reader has no such sections: nothing is written and false is always returned.
    virtual bool GetData(const std::wstring& /*sectionName*/, size_t /*numRecords*/, void* /*data*/, size_t& /*dataBufferSize*/, size_t /*recordStart=0*/)
    {
        return false;
    }

    // DataEnd - returns true while unread records remain, false once all records were read
    // NOTE(review): this looks inverted relative to the method's name; confirm against the
    // IDataReader::DataEnd contract and its callers before changing it.
    virtual bool DataEnd(EndDataType /*endDataType*/)
    {
        return m_currentRecord < m_recordCount;
    }

    // GetMinibatch4SE - not supported by this reader; leaves all arguments untouched and returns true
    virtual bool GetMinibatch4SE(std::vector<std::shared_ptr<const msra::dbn::latticepair>> & /*latticeinput*/, std::vector<size_t> & /*uids*/,
        std::vector<size_t> & /*boundaries*/, std::vector<size_t> &/*extrauttmap*/)
    {
        return true;
    }

    // GetHmmData - not supported by this reader; leaves the argument untouched and returns true
    virtual bool GetHmmData(msra::asr::simplesenonehmm * /*hmm*/)
    {
        return true;
    }

    // SetValidFrameInBatch - ignored by this reader
    virtual void SetValidFrameInBatch(std::vector<size_t> &/*validFrame*/)
    {
        return;
    }

};

}}}