CNTK/DataReader/HTKMLFReader/HTKMLFReader.cpp

//
// <copyright file="HTKMLFReader.cpp" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>
//
// HTKMLFReader.cpp : Defines the exported functions for the DLL application.
//

#include "stdafx.h"
#ifdef _WIN32
#include <objbase.h>
#endif
#include "basetypes.h"

#include "htkfeatio.h"                  // for reading HTK features
#include "latticearchive.h"             // for reading HTK phoneme lattices (MMI training)
#include "simplesenonehmm.h"            // for MMI scoring
#include "msra_mgram.h"                 // for unigram scores of ground-truth path in sequence training

#include "rollingwindowsource.h"        // minibatch sources
#include "utterancesourcemulti.h"
#include "utterancesource.h"
#include "utterancesourcemulti.h"
#ifdef _WIN32
#include "readaheadsource.h"
#endif
#include "chunkevalsource.h"
#include "minibatchiterator.h"
#define DATAREADER_EXPORTS  // creating the exports here
#include "DataReader.h"
#include "commandArgUtil.h"
#include "HTKMLFReader.h"
#ifdef LEAKDETECT
#include <vld.h> // for memory leak detection
#endif

// Non-Windows builds: pull in limits.h (PATH_MAX is used further down in this file) and
// provide the Windows-style integer type names.
// NOTE(review): UNINT32 looks like a typo for UINT32 -- confirm whether anything depends on this spelling.
#ifdef __unix__
#include <limits.h>
typedef unsigned long DWORD;
typedef unsigned short WORD;
typedef unsigned int UNINT32;
#endif
#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this

// Windows builds only: definition of the NUMA node override variable; it is initialized to -1 here.
#ifdef _WIN32
int msra::numa::node_override = -1;     // for numahelpers.h
#endif

// Out-of-class definition of the static constant mgram_map::nindex; -1 converted to index_t
// serves as the "invalid index" marker.
namespace msra { namespace lm {
/*static*/ const mgram_map::index_t mgram_map::nindex = (mgram_map::index_t) -1; // invalid index
}}

namespace Microsoft { namespace MSR { namespace CNTK {

    // Create a Data Reader
    //DATAREADER_API IDataReader* DataReaderFactory(void)

    // Init - configure the reader from its config section.
    // readerConfig - [in] reader configuration; the keys read directly here are "Truncated",
    //                "nbruttsineachrecurrentiter", "action" and "legacyMode"
    // Resets the owned pointers, validates the per-epoch number of parallel utterances, sizes the
    // per-utterance bookkeeping vectors for the first epoch, and then hands over to
    // PrepareForWriting() (action == "write") or PrepareForTrainingOrTesting() (any other action).
    template<class ElemType>
        void HTKMLFReader<ElemType>::Init(const ConfigParameters& readerConfig)
        {
            // nothing is owned yet; the Prepare*() call below allocates what the chosen mode needs
            m_cudaAllocator = nullptr;
            m_mbiter = NULL;
            m_frameSource = NULL;
#ifdef _WIN32
            m_readAheadSource = NULL;
#endif
            m_lattices = NULL;
            m_fileEvalSource = NULL;    // only PrepareForWriting() allocates it; keep it well-defined in train/test mode

            m_truncated = readerConfig("Truncated", "false");
            m_convertLabelsToTargets = false;

            ConfigArray numberOfuttsPerMinibatchForAllEpochs = readerConfig("nbruttsineachrecurrentiter", "1");
            m_numberOfuttsPerMinibatchForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;

            // validate the setting of every epoch up front (unsigned index to match size())
            for (size_t i = 0; i < m_numberOfuttsPerMinibatchForAllEpochs.size(); i++)
            {
                m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[i];
                if (m_numberOfuttsPerMinibatch < 1)
                {
                    LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
                }

                if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
                {
                    LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
                }
            }

            // the loop above left the last epoch's value behind; start out with the first epoch's value
            m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[0];

            m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
            m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
            m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
            m_toProcess.assign(m_numberOfuttsPerMinibatch,0);
            m_switchFrame.assign(m_numberOfuttsPerMinibatch,0);
            m_noData = false;

            string command(readerConfig("action",L"")); //look up in the config for the master command to determine whether we're writing output (inputs only) or training/evaluating (inputs and outputs)

            if (readerConfig.Exists("legacyMode"))
                RuntimeError("legacy mode has been deprecated\n");

            if (command == "write"){
                m_trainOrTest = false;
                PrepareForWriting(readerConfig);
            }
            else{
                m_trainOrTest = true;
                PrepareForTrainingOrTesting(readerConfig);
            }

        }

    // PrepareForTrainingOrTesting - set up all input and output streams for training/evaluation.
    // Note that the term "features" implies real-valued quantities and
    // "labels" implies categorical quantities, irrespective of whether they
    // are inputs or targets for the network.
    // readerConfig - [in] reader configuration. Per-feature sections are read for "dim",
    //                "contextWindow", "type" and "scpFile"; per-label sections for "labelDim"/"dim",
    //                "labelType"/"type", "labelMappingFile", "mlfFile", "labelToTargetMappingFile"
    //                and "targetDim". Reader-level keys: "randomize", "frameMode", "verbosity",
    //                "minibatchMode", "readMethod", "readAhead" (Windows only), "unigram", "pageFilePath".
    // On return m_frameSource (and, for blockRandomize, m_lattices) has been allocated.
    // Configuration problems are reported through RuntimeError() or std::runtime_error.
    template<class ElemType>
        void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigParameters& readerConfig)
        {
            vector<wstring> scriptpaths;
            vector<wstring> mlfpaths;
            vector<vector<wstring>>mlfpathsmulti;
            size_t firstfilesonly = SIZE_MAX;   // set to a lower value for testing
            vector<vector<wstring>> infilesmulti;
            vector<wstring> filelist;
            size_t numFiles;
            wstring unigrampath(L"");
            //wstring statelistpath(L"");
            size_t randomize = randomizeAuto;
            size_t iFeat, iLabel;
            iFeat = iLabel = 0;
            vector<wstring> statelistpaths;
            vector<size_t> numContextLeft;
            vector<size_t> numContextRight;

            // for the multi-utterance process
            m_featuresBufferMultiUtt.assign(m_numberOfuttsPerMinibatch,NULL);
            m_featuresBufferAllocatedMultiUtt.assign(m_numberOfuttsPerMinibatch,0);
            m_labelsBufferMultiUtt.assign(m_numberOfuttsPerMinibatch,NULL);
            m_labelsBufferAllocatedMultiUtt.assign(m_numberOfuttsPerMinibatch,0);

            std::vector<std::wstring> featureNames;
            std::vector<std::wstring> labelNames;
            GetDataNamesFromConfig(readerConfig, featureNames, labelNames);
            // NOTE(review): this only rejects fewer than two streams in total; it does not by itself
            // guarantee one input AND one output as the message suggests.
            if (featureNames.size() + labelNames.size() <= 1)
            {
                RuntimeError("network needs at least 1 input and 1 output specified!");
            }

            //load data for all real-valued inputs (features)
            foreach_index(i, featureNames)
            {
                ConfigParameters thisFeature = readerConfig(featureNames[i]);
                m_featDims.push_back(thisFeature("dim"));
                ConfigArray contextWindow = thisFeature("contextWindow", "1");
                if (contextWindow.size() == 1) // symmetric
                {
                    size_t windowFrames = contextWindow[0];
                    if (windowFrames % 2 == 0 )
                        RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", (int)windowFrames);    // cast: %d expects an int, not a size_t
                    size_t context = windowFrames / 2;           // extend each side by this
                    numContextLeft.push_back(context);
                    numContextRight.push_back(context);

                }
                else if (contextWindow.size() == 2) // left context, right context
                {
                    numContextLeft.push_back(contextWindow[0]);
                    numContextRight.push_back(contextWindow[1]);
                }
                else
                {
                    RuntimeError("contextFrames must have 1 or 2 values specified, found %d", (int)contextWindow.size());
                }
                // update m_featDims to reflect the total input dimension (featDim x contextWindow), not the native feature dimension
                // that is what the lower level feature readers expect
                m_featDims[i] = m_featDims[i] * (1 + numContextLeft[i] + numContextRight[i]);

                string type = thisFeature("type","Real");
                if (type=="Real"){
                    m_nameToTypeMap[featureNames[i]] = InputOutputTypes::real;
                }
                else{
                    RuntimeError("feature type must be Real");
                }

                m_featureNameToIdMap[featureNames[i]]= iFeat;
                scriptpaths.push_back(thisFeature("scpFile"));
                m_featureNameToDimMap[featureNames[i]] = m_featDims[i];

                m_featuresBufferMultiIO.push_back(nullptr);
                m_featuresBufferAllocatedMultiIO.push_back(0);

                iFeat++;
            }

            //load data for all categorical outputs (labels)
            foreach_index(i, labelNames)
            {
                ConfigParameters thisLabel = readerConfig(labelNames[i]);
                if (thisLabel.Exists("labelDim"))
                    m_labelDims.push_back(thisLabel("labelDim"));
                else if (thisLabel.Exists("dim"))
                    m_labelDims.push_back(thisLabel("dim"));
                else
                    RuntimeError("labels must specify dim or labelDim");

                string type;
                if (thisLabel.Exists("labelType"))
                    type = thisLabel("labelType"); // let's deprecate this eventually and just use "type"...
                else
                    type = thisLabel("type","Category"); // outputs should default to category

                if (type=="Category")
                    m_nameToTypeMap[labelNames[i]] = InputOutputTypes::category;
                else
                    RuntimeError("label type must be Category");

                statelistpaths.push_back(thisLabel("labelMappingFile",L""));

                m_labelNameToIdMap[labelNames[i]]=iLabel;
                m_labelNameToDimMap[labelNames[i]]=m_labelDims[i];
                mlfpaths.clear();
                mlfpaths.push_back(thisLabel("mlfFile"));
                mlfpathsmulti.push_back(mlfpaths);

                m_labelsBufferMultiIO.push_back(nullptr);
                m_labelsBufferAllocatedMultiIO.push_back(0);

                iLabel++;

                // optional mapping from labels to target vectors; when present, targetDim replaces the label dimension
                wstring labelToTargetMappingFile(thisLabel("labelToTargetMappingFile",L""));
                if (labelToTargetMappingFile != L"")
                {
                    std::vector<std::vector<ElemType>> labelToTargetMap;
                    m_convertLabelsToTargetsMultiIO.push_back(true);
                    if (thisLabel.Exists("targetDim"))
                    {
                        m_labelNameToDimMap[labelNames[i]]=m_labelDims[i]=thisLabel("targetDim");
                    }
                    else
                        RuntimeError("output must specify targetDim if labelToTargetMappingFile specified!");
                    size_t targetDim = ReadLabelToTargetMappingFile (labelToTargetMappingFile,statelistpaths[i], labelToTargetMap);
                    if (targetDim!=m_labelDims[i])
                        RuntimeError("mismatch between targetDim and dim found in labelToTargetMappingFile");
                    m_labelToTargetMapMultiIO.push_back(labelToTargetMap);
                }
                else
                {
                    m_convertLabelsToTargetsMultiIO.push_back(false);
                    m_labelToTargetMapMultiIO.push_back(std::vector<std::vector<ElemType>>());
                }
            }

            if (iFeat!=scriptpaths.size() || iLabel!=mlfpathsmulti.size())
                throw std::runtime_error(msra::strfun::strprintf ("# of inputs files vs. # of inputs or # of output files vs # of outputs inconsistent\n"));

            // "randomize" is either one of the keywords None/Auto or a numeric value
            if (readerConfig.Exists("randomize"))
            {
                const std::string& randomizeString = readerConfig("randomize");
                if (randomizeString == "None")
                {
                    randomize = randomizeNone;
                }
                else if (randomizeString == "Auto")
                {
                    randomize = randomizeAuto;
                }
                else
                {
                    randomize = readerConfig("randomize");
                }
            }

            m_framemode = readerConfig("frameMode", "true");

            int verbosity = readerConfig("verbosity","2");

            // determine if partial minibatches are desired
            std::string minibatchMode(readerConfig("minibatchMode","Partial"));
            m_partialMinibatch = !_stricmp(minibatchMode.c_str(),"Partial");

            // get the read method, defaults to "blockRandomize" other option is "rollingWindow"
            std::string readMethod(readerConfig("readMethod","blockRandomize"));

            // compare case-insensitively, exactly like the dispatch on readMethod further down,
            // so that e.g. "BlockRandomize" gets the same treatment
            if (!_stricmp(readMethod.c_str(),"blockRandomize") && randomize == randomizeNone)
            {
                fprintf(stderr, "WARNING: Randomize cannot be set to None when readMethod is set to blockRandomize. Change it Auto\n");
                randomize = randomizeAuto;
            }

            // see if they want to use readAhead
#ifdef _WIN32
            m_readAhead = readerConfig("readAhead", "false");
#endif

            // read all input files (from multiple inputs)
            // every script file must list the same number of entries (checked below)
            numFiles=0;
            foreach_index(i,scriptpaths)
            {
                filelist.clear();
                std::wstring scriptpath = scriptpaths[i];
                fprintf(stderr, "reading script file %S ...", scriptpath.c_str());
                size_t n = 0;
                for (msra::files::textreader reader(scriptpath); reader && filelist.size() <= firstfilesonly/*optimization*/; )
                {
                    filelist.push_back (reader.wgetline());
                    n++;
                }

                fprintf (stderr, " %d entries\n", (int)n);

                if (i==0)
                    numFiles=n;
                else
                    if (n!=numFiles)
                        throw std::runtime_error (msra::strfun::strprintf ("number of files in each scriptfile inconsistent (%d vs. %d)", (int)numFiles, (int)n));

                /*
                   do "..." expansion if SCP uses relative path names
                   "..." in the SCP means full path is the same as the SCP file
                   for example, if scp file is "//aaa/bbb/ccc/ddd.scp"
                   and contains entry like
                   .../file1.feat
                   .../file2.feat
                   etc.
                   the features will be read from
                //aaa/bbb/ccc/file1.feat
                //aaa/bbb/ccc/file2.feat
                etc.
                This works well if you store the scp file with the features but
                do not want different scp files everytime you move or create new features
                */
                wstring scpdircached;
                for (auto & entry : filelist)
                    ExpandDotDotDot(entry, scriptpath, scpdircached);

                infilesmulti.push_back(filelist);
            }

            // everything below keys off the first feature stream, so there has to be one
            if (infilesmulti.empty())
                RuntimeError("at least one feature input (scpFile) must be specified");

            if (readerConfig.Exists("unigram"))
                unigrampath = (wstring)readerConfig("unigram");

            // load a unigram if needed (this is used for MMI training)
            msra::lm::CSymbolSet unigramsymbols;
            std::unique_ptr<msra::lm::CMGramLM> unigram;
            size_t silencewordid = SIZE_MAX;
            size_t startwordid = SIZE_MAX;
            size_t endwordid = SIZE_MAX;
            if (unigrampath != L"")
            {
                unigram.reset (new msra::lm::CMGramLM());
                unigram->read (unigrampath, unigramsymbols, false/*filterVocabulary--false will build the symbol map*/, 1/*maxM--unigram only*/);
                silencewordid = unigramsymbols["!silence"];     // give this an id (even if not in the LM vocabulary)
                startwordid = unigramsymbols["<s>"];
                endwordid = unigramsymbols["</s>"];
            }

            if (!unigram)
                fprintf (stderr, "trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion\n");

            // currently assumes all mlfs will have same root name (key)
            set<wstring> restrictmlftokeys;     // restrict MLF reader to these files--will make stuff much faster without having to use shortened input files
            if (infilesmulti[0].size() <= 100)
            {
                foreach_index (i, infilesmulti[0])
                {
                    msra::asr::htkfeatreader::parsedpath ppath (infilesmulti[0][i]);
                    const wstring key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring());  // delete extension (or not if none)
                    restrictmlftokeys.insert (key);
                }
            }
            // get labels

            //if (readerConfig.Exists("statelist"))
            //    statelistpath = readerConfig("statelist");

            double htktimetoframe = 100000.0;           // default is 10ms
            //std::vector<msra::asr::htkmlfreader<msra::asr::htkmlfentry,msra::lattices::lattice::htkmlfwordsequence>> labelsmulti;
            std::vector<std::map<std::wstring,std::vector<msra::asr::htkmlfentry>>> labelsmulti;
            //std::vector<std::wstring> pagepath;
            foreach_index(i, mlfpathsmulti)
            {
                const msra::lm::CSymbolSet* wordmap = unigram ? &unigramsymbols : NULL;
                msra::asr::htkmlfreader<msra::asr::htkmlfentry,msra::lattices::lattice::htkmlfwordsequence>
                labels(mlfpathsmulti[i], restrictmlftokeys, statelistpaths[i], wordmap, (map<string,size_t>*) NULL, htktimetoframe);      // label MLF
                // get the temp file name for the page file
                labelsmulti.push_back(labels);
            }

            if (!_stricmp(readMethod.c_str(),"blockRandomize"))
            {
                // construct all the parameters we don't need, but need to be passed to the constructor...
                std::pair<std::vector<wstring>,std::vector<wstring>> latticetocs;
                std::unordered_map<std::string,size_t> modelsymmap;
                m_lattices = new msra::dbn::latticesource(latticetocs, modelsymmap);

                // now get the frame source. This has better randomization and doesn't create temp files
                m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, m_framemode);
                m_frameSource->setverbosity(verbosity);
                //m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, m_framemode);

            }
            else if (!_stricmp(readMethod.c_str(),"rollingWindow"))
            {
                // Windows: pageFilePath is the directory handed to GetTempFileName().
                // Unix:    pageFilePath is a mkstemp() template (a path ending in XXXXXX).
#ifdef _WIN32
                std::wstring pageFilePath;
#else
                std::string pageFilePath;
#endif
                std::vector<std::wstring> pagePaths;
                if (readerConfig.Exists("pageFilePath"))
                {
                    pageFilePath = readerConfig("pageFilePath");

#ifdef _WIN32
                    // replace any '/' with '\' for compat with default path
                    // (Windows only: on other platforms '/' is the separator and must be left alone)
                    std::replace(pageFilePath.begin(), pageFilePath.end(), '/','\\');

                    // verify path exists
                    DWORD attrib = GetFileAttributes(pageFilePath.c_str());
                    if (attrib==INVALID_FILE_ATTRIBUTES || !(attrib & FILE_ATTRIBUTE_DIRECTORY))
                        throw std::runtime_error ("pageFilePath does not exist");
#endif
#ifdef __unix__
                    // verify path exists and is a directory (same requirement as in the Windows branch)
                    struct stat statbuf;
                    if (stat(pageFilePath.c_str(), &statbuf)==-1 || !S_ISDIR(statbuf.st_mode))
                    {
                        throw std::runtime_error ("pageFilePath does not exist");
                    }

                    // turn the directory into a template that mkstemp() accepts
                    pageFilePath += "/CNTK.XXXXXX";
#endif
                }
                else  // using default temporary path
                {
#ifdef _WIN32
                    // the buffer must really be that large (resize, not reserve) before the API writes
                    // into it; afterwards it is trimmed to the length the API reports
                    pageFilePath.resize(MAX_PATH + 1);
                    const DWORD tempPathLen = GetTempPath((DWORD)pageFilePath.size(), &pageFilePath[0]);
                    if (tempPathLen == 0 || tempPathLen > pageFilePath.size())
                        throw std::runtime_error ("failed to determine the default temporary path");
                    pageFilePath.resize(tempPathLen);
#endif
#ifdef __unix__
                    pageFilePath = "/tmp/temp.CNTK.XXXXXX";
#endif
                }

#ifdef _WIN32
                if (pageFilePath.size()>MAX_PATH-14) // max length of input to GetTempFileName is MAX_PATH-14
                    throw std::runtime_error (msra::strfun::strprintf ("pageFilePath must be less than %d characters", MAX_PATH-14));
#endif
#ifdef __unix__
                if (pageFilePath.size()>PATH_MAX-14) // max length of input to GetTempFileName is PATH_MAX-14
                    throw std::runtime_error (msra::strfun::strprintf ("pageFilePath must be less than %d characters", PATH_MAX-14));
#endif
                // one page file name per input stream
                foreach_index(i, infilesmulti)
                {
#ifdef _WIN32
                    wchar_t tempFile[MAX_PATH];
                    if (GetTempFileName(pageFilePath.c_str(), L"CNTK", 0, tempFile) == 0)
                        throw std::runtime_error ("failed to create a temporary page file");
                    pagePaths.push_back(tempFile);
#endif
#ifdef __unix__
                    // mkstemp() overwrites the trailing XXXXXX in place, so each stream gets its own
                    // writable, null-terminated copy of the template (reusing one buffer would hand
                    // the second call a name that no longer ends in XXXXXX)
                    std::vector<char> tempFile(pageFilePath.begin(), pageFilePath.end());
                    tempFile.push_back('\0');
                    int fid = mkstemp(&tempFile[0]);
                    if (fid == -1)
                        throw std::runtime_error ("failed to create a temporary page file");
                    // only the unique name is passed on; the file itself is removed again right away
                    unlink (&tempFile[0]);
                    close (fid);
                    pagePaths.push_back(GetWC(&tempFile[0]));
#endif
                }

                const bool mayhavenoframe=false;
                int addEnergy = 0;

                //m_frameSourceMultiIO = new msra::dbn::minibatchframesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, randomize, pagepath, mayhavenoframe, addEnergy);
                //m_frameSourceMultiIO->setverbosity(verbosity);
                m_frameSource = new msra::dbn::minibatchframesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, pagePaths, mayhavenoframe, addEnergy);
                m_frameSource->setverbosity(verbosity);
            }
            else
            {
                RuntimeError("readMethod must be rollingWindow or blockRandomize");
            }

        }

    // PrepareForWriting - set up the feature inputs for the "write" action.
    // Only real-valued inputs (features) are accepted in this mode; configuring any label stream is an error.
    // readerConfig - [in] reader configuration; per-feature sections are read for "dim",
    //                "contextWindow", "type" and "scpFile"
    // On return m_inputFilesMultiIO holds the file list of every feature stream and
    // m_fileEvalSource has been allocated.
    // Configuration problems are reported through RuntimeError() or std::runtime_error.
    template<class ElemType>
        void HTKMLFReader<ElemType>::PrepareForWriting(const ConfigParameters& readerConfig)
        {
            vector<wstring> scriptpaths;
            vector<wstring> filelist;
            size_t numFiles;
            size_t firstfilesonly = SIZE_MAX;   // set to a lower value for testing
            size_t evalchunksize = 2048;
            vector<size_t> realDims;
            size_t iFeat = 0;
            vector<size_t> numContextLeft;
            vector<size_t> numContextRight;

            std::vector<std::wstring> featureNames;
            std::vector<std::wstring> labelNames;
            GetDataNamesFromConfig(readerConfig, featureNames, labelNames);

            foreach_index(i, featureNames)
            {
                ConfigParameters thisFeature = readerConfig(featureNames[i]);
                realDims.push_back(thisFeature("dim"));

                ConfigArray contextWindow = thisFeature("contextWindow", "1");
                if (contextWindow.size() == 1) // symmetric
                {
                    size_t windowFrames = contextWindow[0];
                    if (windowFrames % 2 == 0)
                        RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", (int)windowFrames);    // cast: %d expects an int, not a size_t
                    size_t context = windowFrames / 2;           // extend each side by this
                    numContextLeft.push_back(context);
                    numContextRight.push_back(context);

                }
                else if (contextWindow.size() == 2) // left context, right context
                {
                    numContextLeft.push_back(contextWindow[0]);
                    numContextRight.push_back(contextWindow[1]);
                }
                else
                {
                    RuntimeError("contextFrames must have 1 or 2 values specified, found %d", (int)contextWindow.size());
                }
                // update realDims to reflect the total input dimension (featDim x contextWindow), not the native feature dimension
                // that is what the lower level feature readers expect
                realDims[i] = realDims[i] * (1 + numContextLeft[i] + numContextRight[i]);

                string type = thisFeature("type","Real");
                if (type=="Real"){
                    m_nameToTypeMap[featureNames[i]] = InputOutputTypes::real;
                }
                else{
                    RuntimeError("feature type must be Real");
                }

                m_featureNameToIdMap[featureNames[i]]= iFeat;
                scriptpaths.push_back(thisFeature("scpFile"));
                m_featureNameToDimMap[featureNames[i]] = realDims[i];

                m_featuresBufferMultiIO.push_back(nullptr);
                m_featuresBufferAllocatedMultiIO.push_back(0);
                iFeat++;
            }

            if (labelNames.size()>0)
                RuntimeError("writer mode does not support labels as inputs, only features");

            // read the file list of every feature stream; all lists must have the same length
            numFiles=0;
            foreach_index(i,scriptpaths)
            {
                filelist.clear();
                std::wstring scriptpath = scriptpaths[i];
                fprintf(stderr, "reading script file %S ...", scriptpath.c_str());
                size_t n = 0;
                for (msra::files::textreader reader(scriptpath); reader && filelist.size() <= firstfilesonly/*optimization*/; )
                {
                    filelist.push_back (reader.wgetline());
                    n++;
                }

                fprintf (stderr, " %d entries\n", (int)n);

                if (i==0)
                    numFiles=n;
                else
                    if (n!=numFiles)
                        throw std::runtime_error (msra::strfun::strprintf ("HTKMLFReader::InitEvalReader: number of files in each scriptfile inconsistent (%d vs. %d)", (int)numFiles, (int)n));

                m_inputFilesMultiIO.push_back(filelist);
            }

            m_fileEvalSource = new msra::dbn::FileEvalSource(realDims, numContextLeft, numContextRight, evalchunksize);
        }


    // destructor - releases the iterator, the sources, the lattice source and the buffers owned by the reader
    // NOTE(review): m_fileEvalSource (allocated in PrepareForWriting) is not released here --
    // confirm whether it is freed somewhere else.
    template<class ElemType>
        HTKMLFReader<ElemType>::~HTKMLFReader()
        {
            delete m_mbiter;
#ifdef _WIN32
            delete m_readAheadSource;
#endif
            delete m_frameSource;
            delete m_lattices;

            // drop the multi-IO buffer handles (only if the first one is set, as before)
            if (!m_featuresBufferMultiIO.empty())
            {
                if (m_featuresBufferMultiIO[0] != nullptr)
                {
                    foreach_index(i, m_featuresBufferMultiIO)
                    {
                        m_featuresBufferMultiIO[i] = nullptr;
                    }
                }
            }
            if (!m_labelsBufferMultiIO.empty())
            {
                if (m_labelsBufferMultiIO[0] != nullptr)
                {
                    foreach_index(i, m_labelsBufferMultiIO)
                    {
                        m_labelsBufferMultiIO[i] = nullptr;
                    }
                }
            }
            if (/*m_numberOfuttsPerMinibatch > 1 && */m_truncated)
            {
                // Walk each vector by its own size rather than by m_numberOfuttsPerMinibatch:
                // the vectors are only sized in PrepareForTrainingOrTesting() (they stay empty in write mode),
                // and m_numberOfuttsPerMinibatch is changed again by StartDistributedMinibatchLoop().
                // delete[] on a null pointer is a no-op, so no separate null test is needed.
                for (size_t i = 0; i < m_featuresBufferMultiUtt.size(); i ++)
                {
                    delete[] m_featuresBufferMultiUtt[i];
                    m_featuresBufferMultiUtt[i] = NULL;
                }
                for (size_t i = 0; i < m_labelsBufferMultiUtt.size(); i ++)
                {
                    delete[] m_labelsBufferMultiUtt[i];
                    m_labelsBufferMultiUtt[i] = NULL;
                }
            }

            delete m_cudaAllocator;
        }

    //StartDistributedMinibatchLoop - Startup a minibatch loop, optionally reading only one subset of the data
    // mbSize - [in] size of the minibatch (number of frames, etc.)
    // epoch - [in] epoch number for this loop
    // subsetNum - [in] index of the subset read by this reader; must be less than numSubsets
    // numSubsets - [in] total number of subsets; must be 1 (with subsetNum 0) in write mode
    // requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset
    template<class ElemType>
        void HTKMLFReader<ElemType>::StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples /*= requestDataSize*/)
        {
            assert(subsetNum < numSubsets);
            assert(((subsetNum == 0) && (numSubsets == 1)) || this->SupportsDistributedMBRead());

            m_mbSize = mbSize;

            // NOTE(review): relies on the container's operator[] coping with an epoch index beyond the
            // number of configured entries -- confirm against the declaration of this member.
            m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[epoch];

            // reset the per-utterance bookkeeping for this epoch's number of parallel utterances
            m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
            m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
            m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
            m_toProcess.assign(m_numberOfuttsPerMinibatch, 0);
            m_switchFrame.assign(m_numberOfuttsPerMinibatch, 0);

            if (m_trainOrTest)
            {
                // For distributed reading under truncated BPTT of LSTMs, we distribute the utterances per minibatch among all the subsets
                if (m_truncated)
                {
                    if ((numSubsets > 1) && (m_numberOfuttsPerMinibatch < numSubsets))
                    {
                        // casts: %d expects an int, the operands are not ints
                        LogicError("Insufficient value of 'nbruttsineachrecurrentiter'=%d for distributed reading with %d subsets", (int)m_numberOfuttsPerMinibatch, (int)numSubsets);
                    }

                    // even share per subset; the first (count % numSubsets) subsets take one extra utterance
                    m_numberOfuttsPerMinibatch = (m_numberOfuttsPerMinibatch / numSubsets) + ((subsetNum < (m_numberOfuttsPerMinibatch % numSubsets)) ? 1 : 0);
                }

                StartMinibatchLoopToTrainOrTest(mbSize, epoch, subsetNum, numSubsets, requestedEpochSamples);
            }
            else
            {
                // No distributed reading of mini-batches for write
                if ((subsetNum != 0) || (numSubsets != 1))
                {
                    LogicError("Distributed reading of mini-batches is only supported for training or testing");
                }

                StartMinibatchLoopToWrite(mbSize,epoch,requestedEpochSamples);
            }
            m_checkDictionaryKeys=true;
        }

    // StartMinibatchLoopToTrainOrTest - (re)create the minibatch iterator and reset the intermediate buffers for a new epoch
    // mbSize - minibatch size in frames; must be greater than zero (it is used as a divisor)
    // epoch - epoch number, handed through to the minibatch iterator
    // subsetNum, numSubsets - handed through to the minibatch iterator (presumably select the data subset for distributed reading - confirm)
    // requestedEpochSamples - number of samples wanted for this epoch; requestDataSize means "all frames of the frame source"
    // throws std::logic_error if mbSize is zero
    template<class ElemType>
        void HTKMLFReader<ElemType>::StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples)
        {
            // mbSize is a divisor below; reject zero instead of running into an integer division by zero
            if (mbSize == 0)
                throw std::logic_error("StartMinibatchLoopToTrainOrTest: minibatch size must be greater than zero");

            size_t datapasses = 1;
            size_t totalFrames = m_frameSource->totalframes();

            const size_t extraFrames = totalFrames % mbSize;   // frames left over after the last full minibatch of a sweep
            const size_t minibatches = totalFrames / mbSize;   // number of full minibatches in one sweep

            // if we are allowing partial minibatches, do nothing, and let it go through
            if (!m_partialMinibatch)
            {
                // we don't want any partial minibatches, so round total frames down to a multiple of mbSize
                if (totalFrames > mbSize)
                    totalFrames -= extraFrames;

                if (requestedEpochSamples == requestDataSize)
                {
                    requestedEpochSamples = totalFrames;
                }
                else if (minibatches > 0 && requestedEpochSamples > 0)   // at least one full minibatch; the >0 check keeps (requestedEpochSamples-1) from wrapping
                {
                    // since we skip the extraFrames in every sweep, add them back to get the actual number of frames to request;
                    // subtract 1 before dividing so that an exact multiple of totalFrames does not count an extra sweep
                    const size_t sweeps = (requestedEpochSamples - 1) / totalFrames;
                    requestedEpochSamples += extraFrames * sweeps;
                }
            }
            else if (requestedEpochSamples == requestDataSize)
            {
                requestedEpochSamples = totalFrames;
            }

            // delete the old iterator first (in case called more than once); clear the pointer right away so that
            // it never dangles if one of the allocations below throws
            delete m_mbiter;
            m_mbiter = nullptr;
            msra::dbn::minibatchsource* source = m_frameSource;
#ifdef _WIN32
            if (m_readAhead)
            {
                // an existing read-ahead source can only be reused if it was created for the same epoch size
                if (m_readAheadSource != NULL && m_readAheadSource->epochsize() != requestedEpochSamples)
                {
                    delete m_readAheadSource;
                    m_readAheadSource = NULL;
                }
                if (m_readAheadSource == NULL)
                {
                    m_readAheadSource = new msra::dbn::minibatchreadaheadsource (*source, requestedEpochSamples);
                }
                source = m_readAheadSource;
            }
#endif
            m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, mbSize, subsetNum, numSubsets, datapasses);

            // drop the intermediate feature buffers; they get re-allocated on demand while reading minibatches
            if (!m_featuresBufferMultiIO.empty())
            {
                if (m_featuresBufferMultiIO[0] != nullptr) // check first feature, if it isn't NULL, safe to assume all are not NULL?
                {
                    foreach_index(i, m_featuresBufferMultiIO)
                    {
                        m_featuresBufferMultiIO[i] = nullptr;
                        m_featuresBufferAllocatedMultiIO[i] = 0;
                    }
                }
            }
            // same for the intermediate label buffers
            if (!m_labelsBufferMultiIO.empty())
            {
                if (m_labelsBufferMultiIO[0] != nullptr)
                {
                    foreach_index(i, m_labelsBufferMultiIO)
                    {
                        m_labelsBufferMultiIO[i] = nullptr;
                        m_labelsBufferAllocatedMultiIO[i] = 0;
                    }
                }
            }

            // truncated mode with parallel utterance streams: reset the per-stream state and refill every stream
            if (m_numberOfuttsPerMinibatch && m_truncated == true)
            {
                m_noData = false;
                m_featuresStartIndexMultiUtt.assign(m_featuresBufferMultiIO.size()*m_numberOfuttsPerMinibatch,0);
                m_labelsStartIndexMultiUtt.assign(m_labelsBufferMultiIO.size()*m_numberOfuttsPerMinibatch,0);
                for (size_t u = 0; u < m_numberOfuttsPerMinibatch; u++)
                {
                    // free the utterance buffers left over from a previous epoch
                    if (m_featuresBufferMultiUtt[u] != NULL)
                    {
                        delete[] m_featuresBufferMultiUtt[u];
                        m_featuresBufferMultiUtt[u] = NULL;
                        m_featuresBufferAllocatedMultiUtt[u] = 0;
                    }
                    if (m_labelsBufferMultiUtt[u] != NULL)
                    {
                        delete[] m_labelsBufferMultiUtt[u];
                        m_labelsBufferMultiUtt[u] = NULL;
                        m_labelsBufferAllocatedMultiUtt[u] = 0;
                    }
                    ReNewBufferForMultiIO(u);
                }
            }
        }

    // StartMinibatchLoopToWrite - prepare the reader for a pass that reads the input files for writing/evaluation
    // mbSize - minibatch size, forwarded to the file evaluation source
    // (epoch and requestedEpochSamples are accepted for signature compatibility but not used)
    template<class ElemType>
        void HTKMLFReader<ElemType>::StartMinibatchLoopToWrite(size_t mbSize, size_t /*epoch*/, size_t /*requestedEpochSamples*/)
        {
            m_fileEvalSource->Reset();
            m_fileEvalSource->SetMinibatchSize(mbSize);
            m_inputFileIndex = 0;   // start again from the first input file

            // Drop the intermediate feature buffers. Guard against an empty container before looking at
            // element 0, the same way StartMinibatchLoopToTrainOrTest does.
            if (!m_featuresBufferMultiIO.empty() && m_featuresBufferMultiIO[0] != nullptr) // check first feature, if it isn't NULL, safe to assume all are not NULL?
            {
                foreach_index(i, m_featuresBufferMultiIO)
                {
                    m_featuresBufferMultiIO[i] = nullptr;
                    m_featuresBufferAllocatedMultiIO[i] = 0;
                }
            }
        }

    // GetMinibatch - fetch the next minibatch (features and labels)
    // matrices - [in] map from matrix name (e.g. 'features', 'labels') to the corresponding matrix,
    //            [out] the matrices filled in by the mode-specific helper
    // returns - the result of the helper selected by m_trainOrTest
    //           (true if there are more minibatches, false if no more minibatches remain)
    template<class ElemType>
        bool HTKMLFReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices)
        {
            // training/testing and writing use separate code paths
            return m_trainOrTest ? GetMinibatchToTrainOrTest(matrices)
                                 : GetMinibatchToWrite(matrices);
        }

    // GetMinibatchToTrainOrTest - fill the requested matrices with the next minibatch for training/testing
    // matrices - [in] map from input/output node name to the matrix to fill; every name must be known to m_nameToTypeMap
    //            [out] each matrix is set from the reader's intermediate feature or label buffer
    // returns - false if no minibatch could be produced (frame mode: m_mbiter evaluates to false;
    //           truncated mode: m_noData is set and m_processedFrame equals m_toProcess for every stream), true otherwise
    // throws std::runtime_error if a requested name is not in m_nameToTypeMap (checked while m_checkDictionaryKeys is set),
    //        or if a name maps to an unknown input/output type in frame mode
    //
    // Two modes, selected by m_truncated:
    //  - frame mode (m_truncated == false): features and labels are taken from m_mbiter. When partial minibatches are
    //    not allowed, a minibatch that is shorter than requested is dropped and the next one is tried.
    //  - truncated mode: m_numberOfuttsPerMinibatch utterance streams are interleaved, m_mbSize time steps each;
    //    time step k of stream i goes to output column (k * m_numberOfuttsPerMinibatch + i).
    // Both modes also record sequence boundaries in m_sentenceBegin and m_minibatchPackingFlag.
    template<class ElemType>
        bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>& matrices)
        {
            // id/dim: index and row dimension of the feature or label stream currently being handled
            size_t id;
            size_t dim;
            // skip: set in frame mode when the current minibatch must be dropped and another one fetched
            bool skip = false;

            // on first minibatch, make sure we can supply data for requested nodes
            // NOTE(review): this iterator is shadowed by the loop-local 'iter' variables below and looks unused
            std::map<std::wstring,size_t>::iterator iter;
            if     (m_checkDictionaryKeys)
            {
                for (auto iter=matrices.begin();iter!=matrices.end();iter++)
                {
                    if (m_nameToTypeMap.find(iter->first)==m_nameToTypeMap.end())
                        throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ws not found in reader - cannot generate input\n",iter->first.c_str()));

                }
                m_checkDictionaryKeys=false;
            }

            // the loop body runs once per attempt; it is only repeated when frame mode decided to skip a minibatch
            do
            {
                if (m_truncated == false)
                {
                    // frame mode: nothing left to read
                    if (!(*m_mbiter))
                        return false;

                    // now, access all features and labels by iterating over map of "matrices"
                    // 'first' makes sure the sequence-boundary bookkeeping is set up only once per minibatch
                    bool first = true;
                    typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
                    for (iter = matrices.begin();iter!=matrices.end(); iter++)
                    {
                        // dereference matrix that corresponds to key (input/output name) and
                        // populate based on whether it's a feature or a label
                        Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels

                        if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
                        {

                            id = m_featureNameToIdMap[iter->first];
                            dim = m_featureNameToDimMap[iter->first];
                            const msra::dbn::matrixstripe feat = m_mbiter->frames(id);
                            const size_t actualmbsize = feat.cols();   // it may still return less if at end of sweep TODO: this check probably only needs to happen once
                            if (first)
                            {
                                // one row of boundary markers: first column is a sequence start, last column a sequence end,
                                // everything in between is mid-sequence
                                m_sentenceBegin.Resize((size_t)1, (size_t)feat.cols());
                                m_minibatchPackingFlag.resize(feat.cols());

                                m_sentenceBegin.SetValue((ElemType) SEQUENCE_MIDDLE);
                                m_sentenceBegin.SetValue(0, 0, (ElemType) SEQUENCE_START);
                                m_sentenceBegin.SetValue(0, (size_t)feat.cols()-1, (ElemType) SEQUENCE_END);
                                std::fill(m_minibatchPackingFlag.begin(), m_minibatchPackingFlag.end(), MinibatchPackingFlag::None);
                                m_minibatchPackingFlag[0] = MinibatchPackingFlag::SequenceStart;
                                m_minibatchPackingFlag[(size_t)feat.cols() - 1] = MinibatchPackingFlag::SequenceEnd;
                                first = false;
                            }

                            assert (actualmbsize == m_mbiter->currentmbframes());
                            // skip when partial minibatches are not allowed, we got a different number of frames than
                            // requested, and the frame source holds more frames than this minibatch
                            skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize);

                            // check to see if we got the number of frames we requested
                            if (!skip)
                            {
                                assert(feat.rows()==dim); // check feature dimension matches what's expected

                                // (re)allocate the intermediate buffer if there is none yet or it is too small
                                if ((m_featuresBufferMultiIO[id] == nullptr) ||
                                    (m_featuresBufferAllocatedMultiIO[id] < (feat.rows() * feat.cols())) /*buffer size changed. can be partial minibatch*/)
                                {
                                    m_featuresBufferMultiIO[id] = AllocateIntermediateBuffer(data.GetDeviceId(), feat.rows() * feat.cols());
                                    m_featuresBufferAllocatedMultiIO[id] = feat.rows() * feat.cols();
                                }

                                // copy the features over to our array type
                                // (bytewise when ElemType has the size of float - presumably feat stores floats, confirm -
                                // element by element with conversion otherwise)
                                if (sizeof(ElemType) == sizeof(float))
                                {
                                    for (int j=0; j < feat.cols(); j++) // column major, so iterate columns
                                    {
                                        // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
                                        memcpy_s(&m_featuresBufferMultiIO[id].get()[j * feat.rows()], sizeof(ElemType) * feat.rows(), &feat(0, j), sizeof(ElemType) * feat.rows());
                                    }
                                }
                                else
                                {
                                    for (int j=0; j < feat.cols(); j++) // column major, so iterate columns in outside loop
                                    {
                                        for (int i = 0; i < feat.rows(); i++)
                                        {
                                            m_featuresBufferMultiIO[id].get()[j * feat.rows() + i] = feat(i, j);
                                        }
                                    }
                                }
                                data.SetValue(feat.rows(), feat.cols(), m_featuresBufferMultiIO[id].get(), matrixFlagNormal);
                            }
                        }
                        else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
                        {
                            id = m_labelNameToIdMap[iter->first];
                            dim = m_labelNameToDimMap[iter->first];
                            const vector<size_t> & uids = m_mbiter->labels(id);

                            // need skip logic here too in case labels are first in map not features
                            const size_t actualmbsize = uids.size();   // it may still return less if at end of sweep TODO: this check probably only needs to happen once
                            assert (actualmbsize == m_mbiter->currentmbframes());
                            skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize);

                            if (!skip)
                            {
                                // copy the labels over to array type
                                //data.Resize(udims[id], uids.size());
                                //data.SetValue((ElemType)0);

                                // loop through the columns and set one value to 1
                                // in the future we want to use a sparse matrix here
                                //for (int i = 0; i < uids.size(); i++)
                                //{
                                //    assert(uids[i] <udims[id]);
                                //    data(uids[i], i) = (ElemType)1;
                                //}

                                // (re)allocate the intermediate buffer if there is none yet or it is too small
                                if ((m_labelsBufferMultiIO[id] == nullptr) ||
                                    (m_labelsBufferAllocatedMultiIO[id] < (dim * uids.size())))
                                {
                                    m_labelsBufferMultiIO[id] = AllocateIntermediateBuffer(data.GetDeviceId(), dim * uids.size());
                                    m_labelsBufferAllocatedMultiIO[id] = dim * uids.size();
                                }
                                // start from all zeros; the branches below fill in the non-zero entries
                                memset(m_labelsBufferMultiIO[id].get(), 0, sizeof(ElemType) * dim * uids.size());

                                if (m_convertLabelsToTargetsMultiIO[id])
                                {
                                    // each label id selects a target vector of length dim from the label-to-target map
                                    size_t labelDim = m_labelToTargetMapMultiIO[id].size();
                                    for (int i = 0; i < uids.size(); i++)
                                    {
                                        // the bare 'labelDim;' is a no-op statement, presumably there to avoid an
                                        // unused-variable warning when assert is compiled out
                                        assert(uids[i] < labelDim); labelDim;
                                        size_t labelId = uids[i];
                                        for (int j = 0; j < dim; j++)
                                        {
                                            m_labelsBufferMultiIO[id].get()[i * dim + j] = m_labelToTargetMapMultiIO[id][labelId][j];
                                        }
                                    }
                                }
                                else
                                {
                                    // loop through the columns and set one value to 1
                                    // in the future we want to use a sparse matrix here
                                    for (int i = 0; i < uids.size(); i++)
                                    {
                                        assert(uids[i] < dim);
                                        //labels(uids[i], i) = (ElemType)1;
                                        m_labelsBufferMultiIO[id].get()[i * dim + uids[i]] = (ElemType)1;
                                    }
                                }


                                data.SetValue(dim, uids.size(), m_labelsBufferMultiIO[id].get(), matrixFlagNormal);
                            }
                        }
                        else{
                            //default:
                            throw runtime_error(msra::strfun::strprintf("GetMinibatchMultiIO:: unknown InputOutputType for %S\n",(iter->first).c_str()));
                        }

                    }
                    // advance to the next minibatch
                    (*m_mbiter)++;
                }
                else
                {
                    // truncated mode
                    // once m_noData is set, the epoch ends as soon as every stream has consumed all its frames
                    if (m_noData)
                    {
                        bool endEpoch = true;
                        for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++)
                        {
                            if (m_processedFrame[i] != m_toProcess[i])
                            {
                                endEpoch = false;
                            }
                        }
                        if(endEpoch)
                        {
                            return false;
                        }
                    }
                    size_t numOfFea = m_featuresBufferMultiIO.size();
                    size_t numOfLabel = m_labelsBufferMultiIO.size();

                    // one row of boundary markers per stream, one column per time step; everything starts as mid-sequence
                    m_sentenceBegin.Resize(m_numberOfuttsPerMinibatch, m_mbSize);
                    m_minibatchPackingFlag.resize(m_mbSize);

                    for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++)
                    {
                        for (size_t j = 0; j < m_mbSize; j++)
                        {
                            m_sentenceBegin.SetValue(i,j,(ElemType) SEQUENCE_MIDDLE);
                        }
                    }
                    std::fill(m_minibatchPackingFlag.begin(), m_minibatchPackingFlag.end(), MinibatchPackingFlag::None);


                    // actualmbsize[i]: number of frames taken from the current utterance of stream i
                    vector<size_t> actualmbsize;
                    actualmbsize.assign(m_numberOfuttsPerMinibatch,0);
                    for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++)
                    {
                        size_t startFr = m_processedFrame[i];
                        size_t endFr = 0;
                        // case 1: more than m_mbSize frames are left in the current utterance of stream i,
                        // so the whole minibatch for this stream comes from it
                        if ((m_processedFrame[i] + m_mbSize) < m_toProcess[i])
                        {
                            if(m_processedFrame[i] > 0)
                            {
                                // continuing an utterance that was started in an earlier minibatch
                                m_sentenceEnd[i] = false;
                                m_switchFrame[i] = m_mbSize+1;
                                // NOTE(review): column 0 is marked SEQUENCE_END only when exactly one frame has been
                                // processed so far; the intent is not evident from this function - confirm
                                if (m_processedFrame[i] == 1)
                                {
                                    m_sentenceBegin.SetValue(i, 0, (ElemType)SEQUENCE_END);
                                    m_minibatchPackingFlag[0] = MinibatchPackingFlag::SequenceEnd;
                                }
                            }
                            else
                            {
                                // the utterance starts at the first column of this minibatch
                                m_switchFrame[i] = 0;
                                m_sentenceEnd[i] = true;
                                m_sentenceBegin.SetValue(i, 0, (ElemType)SEQUENCE_START);
                                // NOTE(review): plain assignment replaces whatever another stream stored for column 0,
                                // while the tail handling further down combines flags with |= - confirm this is intended
                                m_minibatchPackingFlag[0] = MinibatchPackingFlag::SequenceStart;
                            }
                            actualmbsize[i] = m_mbSize;
                            endFr = startFr + actualmbsize[i];
                            typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
                            for (iter = matrices.begin();iter!=matrices.end(); iter++)
                            {
                                // dereference matrix that corresponds to key (input/output name) and
                                // populate based on whether it's a feature or a label
                                Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels

                                if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
                                {
                                    id = m_featureNameToIdMap[iter->first];
                                    dim = m_featureNameToDimMap[iter->first];

                                    // the intermediate buffer holds the full interleaved minibatch of all streams
                                    if ((m_featuresBufferMultiIO[id] == nullptr) ||
                                        (m_featuresBufferAllocatedMultiIO[id] < (dim * m_mbSize * m_numberOfuttsPerMinibatch)) /*buffer size changed. can be partial minibatch*/)
                                    {
                                        m_featuresBufferMultiIO[id] = AllocateIntermediateBuffer(data.GetDeviceId(), dim * m_mbSize * m_numberOfuttsPerMinibatch);
                                        m_featuresBufferAllocatedMultiIO[id] = dim * m_mbSize * m_numberOfuttsPerMinibatch;
                                    }

                                    // j walks the frames of the utterance buffer, k the time steps of the minibatch
                                    if (sizeof(ElemType) == sizeof(float))
                                    {
                                        for (size_t j = startFr,k = 0; j < endFr; j++,k++) // column major, so iterate columns
                                        {
                                            // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
                                            memcpy_s(&m_featuresBufferMultiIO[id].get()[(k * m_numberOfuttsPerMinibatch + i) * dim], sizeof(ElemType) * dim, &m_featuresBufferMultiUtt[i][j * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]], sizeof(ElemType) * dim);
                                        }
                                    }
                                    else
                                    {
                                        for (size_t j=startFr,k=0; j < endFr; j++,k++) // column major, so iterate columns in outside loop
                                        {
                                            for (int d = 0; d < dim; d++)
                                            {
                                                m_featuresBufferMultiIO[id].get()[(k * m_numberOfuttsPerMinibatch + i) * dim + d] = m_featuresBufferMultiUtt[i][j * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
                                            }
                                        }
                                    }
                                }
                                else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
                                {
                                    id = m_labelNameToIdMap[iter->first];
                                    dim = m_labelNameToDimMap[iter->first];
                                    if ((m_labelsBufferMultiIO[id] == nullptr) ||
                                        (m_labelsBufferAllocatedMultiIO[id] < (dim * m_mbSize * m_numberOfuttsPerMinibatch)))
                                    {
                                        m_labelsBufferMultiIO[id] = AllocateIntermediateBuffer(data.GetDeviceId(), dim * m_mbSize * m_numberOfuttsPerMinibatch);
                                        m_labelsBufferAllocatedMultiIO[id] = dim * m_mbSize * m_numberOfuttsPerMinibatch;
                                    }

                                    // labels are copied element by element from the per-utterance label buffer
                                    for (size_t j = startFr,k=0; j < endFr; j++,k++)
                                    {
                                        for (int d = 0; d < dim; d++)
                                        {
                                            m_labelsBufferMultiIO[id].get()[(k * m_numberOfuttsPerMinibatch + i) * dim + d] = m_labelsBufferMultiUtt[i][j * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
                                        }
                                    }
                                }
                            }
                            m_processedFrame[i] += m_mbSize;
                        }
                        else
                        {
                            // case 2: at most m_mbSize frames are left in the current utterance of stream i.
                            // Copy the remainder first, then refill the rest of the minibatch after renewing the stream.
                            actualmbsize[i] = m_toProcess[i] - m_processedFrame[i];
                            endFr = startFr + actualmbsize[i];

                            typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
                            for (iter = matrices.begin();iter!=matrices.end(); iter++)
                            {
                                // dereference matrix that corresponds to key (input/output name) and
                                // populate based on whether it's a feature or a label
                                Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels

                                if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
                                {
                                    id = m_featureNameToIdMap[iter->first];
                                    dim = m_featureNameToDimMap[iter->first];

                                    if ((m_featuresBufferMultiIO[id] == nullptr) ||
                                        (m_featuresBufferAllocatedMultiIO[id] < (dim * m_mbSize * m_numberOfuttsPerMinibatch)) /*buffer size changed. can be partial minibatch*/)
                                    {
                                        m_featuresBufferMultiIO[id] = AllocateIntermediateBuffer(data.GetDeviceId(), dim * m_mbSize * m_numberOfuttsPerMinibatch);
                                        m_featuresBufferAllocatedMultiIO[id] = dim * m_mbSize * m_numberOfuttsPerMinibatch;
                                    }

                                    if (sizeof(ElemType) == sizeof(float))
                                    {
                                        for (size_t j = startFr,k = 0; j < endFr; j++,k++) // column major, so iterate columns
                                        {
                                            // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
                                            memcpy_s(&m_featuresBufferMultiIO[id].get()[(k * m_numberOfuttsPerMinibatch + i) * dim], sizeof(ElemType) * dim, &m_featuresBufferMultiUtt[i][j * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]], sizeof(ElemType) * dim);
                                        }
                                    }
                                    else
                                    {
                                        for (size_t j=startFr,k=0; j < endFr; j++,k++) // column major, so iterate columns in outside loop
                                        {
                                            for (int d = 0; d < dim; d++)
                                            {
                                                m_featuresBufferMultiIO[id].get()[(k * m_numberOfuttsPerMinibatch + i) * dim + d] = m_featuresBufferMultiUtt[i][j * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
                                            }
                                        }
                                    }
                                }
                                else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
                                {
                                    id = m_labelNameToIdMap[iter->first];
                                    dim = m_labelNameToDimMap[iter->first];
                                    if ((m_labelsBufferMultiIO[id] == nullptr) ||
                                        (m_labelsBufferAllocatedMultiIO[id] < (dim * m_mbSize * m_numberOfuttsPerMinibatch)))
                                    {
                                        m_labelsBufferMultiIO[id] = AllocateIntermediateBuffer(data.GetDeviceId(), dim * m_mbSize * m_numberOfuttsPerMinibatch);
                                        m_labelsBufferAllocatedMultiIO[id] = dim * m_mbSize * m_numberOfuttsPerMinibatch;
                                    }
                                    for (size_t j = startFr,k=0; j < endFr; j++,k++)
                                    {
                                        for (int d = 0; d < dim; d++)
                                        {
                                            m_labelsBufferMultiIO[id].get()[(k * m_numberOfuttsPerMinibatch + i) * dim + d] = m_labelsBufferMultiUtt[i][j * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
                                        }
                                    }
                                }
                            }
                            m_processedFrame[i] += (endFr-startFr);
                            // the column at which stream i switches over to its next utterance
                            m_switchFrame[i] = actualmbsize[i];
                            // utterance ended before the end of the minibatch: the following column starts a new sequence
                            if (actualmbsize[i] < m_mbSize)
                            {
                                m_sentenceBegin.SetValue(i, actualmbsize[i], (ElemType)SEQUENCE_START);
                                m_minibatchPackingFlag[actualmbsize[i]] |= MinibatchPackingFlag::SequenceStart;
                            }
                            // utterance ended exactly at the end of the minibatch: mark the last column as sequence end
                            if (actualmbsize[i] == m_mbSize)
                            {
                                m_sentenceBegin.SetValue(i, actualmbsize[i]-1, (ElemType)SEQUENCE_END);
                                m_minibatchPackingFlag[actualmbsize[i]-1] |= MinibatchPackingFlag::SequenceEnd;
                            }
                            // fill the remaining columns [m_switchFrame[i], m_mbSize) from the start of the utterance
                            // buffer after renewing it (presumably ReNewBufferForMultiIO loads the next utterance for
                            // stream i - confirm against its definition)
                            startFr = m_switchFrame[i];
                            endFr = m_mbSize;
                            bool reNewSucc = ReNewBufferForMultiIO(i);
                            // NOTE(review): the copy below runs even when ReNewBufferForMultiIO returned false; what
                            // the utterance buffers hold in that case is not visible from this function
                            for (iter = matrices.begin();iter!=matrices.end(); iter++)
                            {
                                // dereference matrix that corresponds to key (input/output name) and
                                // populate based on whether it's a feature or a label
                                //Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels

                                if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
                                {
                                    id = m_featureNameToIdMap[iter->first];
                                    dim = m_featureNameToDimMap[iter->first];
                                    // here j is the time step in the minibatch and k the frame in the utterance buffer
                                    if (sizeof(ElemType) == sizeof(float))
                                    {
                                        for (size_t j = startFr,k = 0; j < endFr; j++,k++) // column major, so iterate columns
                                        {
                                            // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
                                            memcpy_s(&m_featuresBufferMultiIO[id].get()[(j * m_numberOfuttsPerMinibatch + i) * dim], sizeof(ElemType) * dim, &m_featuresBufferMultiUtt[i][k * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]], sizeof(ElemType) * dim);
                                        }
                                    }
                                    else
                                    {
                                        for (size_t j=startFr,k=0; j < endFr; j++,k++) // column major, so iterate columns in outside loop
                                        {
                                            for (int d = 0; d < dim; d++)
                                            {
                                                m_featuresBufferMultiIO[id].get()[(j * m_numberOfuttsPerMinibatch + i) * dim + d] = m_featuresBufferMultiUtt[i][k * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
                                            }
                                        }
                                    }
                                }
                                else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
                                {
                                    id = m_labelNameToIdMap[iter->first];
                                    dim = m_labelNameToDimMap[iter->first];
                                    for (size_t j = startFr,k=0; j < endFr; j++,k++)
                                    {
                                        for (int d = 0; d < dim; d++)
                                        {
                                            m_labelsBufferMultiIO[id].get()[(j * m_numberOfuttsPerMinibatch + i) * dim + d] = m_labelsBufferMultiUtt[i][k * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
                                        }
                                    }
                                }
                            }

                            // only count the refilled frames as processed if the renewal succeeded
                            if (reNewSucc) m_processedFrame[i] += (endFr-startFr);

                        }
                    }
                    // all streams are packed; hand the interleaved buffers over to the output matrices
                    typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
                    for (iter = matrices.begin();iter!=matrices.end(); iter++)
                    {
                        // dereference matrix that corresponds to key (input/output name) and
                        // populate based on whether it's a feature or a label
                        Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
                        if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
                        {
                            id = m_featureNameToIdMap[iter->first];
                            dim = m_featureNameToDimMap[iter->first];
                            data.SetValue(dim, m_mbSize*m_numberOfuttsPerMinibatch, m_featuresBufferMultiIO[id].get(), matrixFlagNormal);
                        }
                        else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
                        {
                            id = m_labelNameToIdMap[iter->first];
                            dim = m_labelNameToDimMap[iter->first];
                            data.SetValue(dim, m_mbSize*m_numberOfuttsPerMinibatch, m_labelsBufferMultiIO[id].get(), matrixFlagNormal);
                        }
                    }
                    // truncated mode never drops a minibatch
                    skip=false;
                }
            }   // keep going if we didn't get the right size minibatch
            while(skip);

            return true;
        }

    // GetMinibatchToWrite - loads the next input file of every feature stream and copies its
    // frames into the caller's input matrices (one whole file per call)
    // matrices - [in,out] map from input node name to the matrix to fill; only names that
    //            m_nameToTypeMap registers as InputOutputTypes::real are written
    // returns: true if a file was loaded and the matrices were populated,
    //          false once m_inputFileIndex has moved past the last file of stream 0
    // throws std::runtime_error if, while m_checkDictionaryKeys is set, a feature name known to
    //        the reader has no entry in 'matrices' (the flag is cleared once the check passes)
    template<class ElemType>
        bool HTKMLFReader<ElemType>::GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>& matrices)
        {
            if (m_checkDictionaryKeys)
            {
                for (auto iter = m_featureNameToIdMap.begin(); iter != m_featureNameToIdMap.end(); ++iter)
                {
                    if (matrices.find(iter->first) == matrices.end())
                    {
                        fprintf(stderr,"GetMinibatchToWrite: feature node %ls specified in reader not found in the network\n", iter->first.c_str());
                        throw std::runtime_error("GetMinibatchToWrite: feature node specified in reader not found in the network.");
                    }
                }
                /*
                   for (auto iter=matrices.begin();iter!=matrices.end();iter++)
                   {
                   if (m_featureNameToIdMap.find(iter->first)==m_featureNameToIdMap.end())
                   throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ws not found in reader - cannot generate input\n",iter->first.c_str()));
                   }
                   */
                m_checkDictionaryKeys = false;
            }

            // all files consumed: nothing left to write
            if (m_inputFileIndex >= m_inputFilesMultiIO[0].size())
                return false;

            m_fileEvalSource->Reset();

            // load next file (or set of files): one file per feature stream
            foreach_index(i, m_inputFilesMultiIO)
            {
                msra::asr::htkfeatreader reader;

                const auto path = reader.parse(m_inputFilesMultiIO[i][m_inputFileIndex]);
                // read file
                msra::dbn::matrix feat;
                string featkind;
                unsigned int sampperiod;
                msra::util::attempt (5, [&]()
                        {
                        reader.read (path, featkind, sampperiod, feat);   // whole file read as columns of feature vectors
                        });
                fprintf (stderr, "evaluate: reading %d frames of %S\n", (int)feat.cols(), ((wstring)path).c_str());
                m_fileEvalSource->AddFile(feat, featkind, sampperiod, i);
            }
            m_inputFileIndex++;

            // turn frames into minibatch (augment neighbors, etc)
            m_fileEvalSource->CreateEvalMinibatch();

            // populate input matrices
            bool first = true;
            for (auto iter = matrices.begin(); iter != matrices.end(); ++iter)
            {
                const std::wstring& name = iter->first;

                // only real-valued (feature) inputs known to the reader are filled here
                const auto typeIter = m_nameToTypeMap.find(name);
                if (typeIter != m_nameToTypeMap.end() && typeIter->second == InputOutputTypes::real)
                {
                    Matrix<ElemType>& data = *iter->second;
                    const size_t id = m_featureNameToIdMap[name];
                    const size_t dim = m_featureNameToDimMap[name];

                    const msra::dbn::matrix feat = m_fileEvalSource->ChunkOfFrames(id);
                    const size_t numRows = feat.rows();
                    const size_t numCols = feat.cols();

                    if (first)
                    {
                        // the whole file is treated as one sequence: first frame starts it, last frame ends it
                        m_sentenceBegin.Resize((size_t)1, numCols);
                        m_minibatchPackingFlag.resize(numCols);
                        m_sentenceBegin.SetValue((ElemType)SEQUENCE_MIDDLE);
                        std::fill(m_minibatchPackingFlag.begin(), m_minibatchPackingFlag.end(), MinibatchPackingFlag::None);
                        if (numCols > 0) // numCols - 1 would wrap around and index out of range for an empty chunk
                        {
                            m_sentenceBegin.SetValue(0, 0, (ElemType)SEQUENCE_START);
                            m_sentenceBegin.SetValue(0, numCols - 1, (ElemType)SEQUENCE_END);
                            m_minibatchPackingFlag[0] = MinibatchPackingFlag::SequenceStart;
                            m_minibatchPackingFlag[numCols - 1] = MinibatchPackingFlag::SequenceEnd;
                        }
                        first = false;
                    }

                    // copy the features over to our array type
                    assert(numRows==dim); dim; // check feature dimension matches what's expected

                    const size_t numElements = numRows * numCols;
                    if ((m_featuresBufferMultiIO[id] == nullptr) ||
                        (m_featuresBufferAllocatedMultiIO[id] < numElements) /*buffer size changed. can be partial minibatch*/)
                    {
                        m_featuresBufferMultiIO[id] = AllocateIntermediateBuffer(data.GetDeviceId(), numElements);
                        m_featuresBufferAllocatedMultiIO[id] = numElements;
                    }

                    ElemType* const buffer = m_featuresBufferMultiIO[id].get();
                    if (sizeof(ElemType) == sizeof(float))
                    {
                        for (size_t j = 0; j < numCols; j++) // column major, so iterate columns
                        {
                            // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
                            memcpy_s(&buffer[j * numRows], sizeof(ElemType) * numRows, &feat(0, j), sizeof(ElemType) * numRows);
                        }
                    }
                    else
                    {
                        // element sizes differ, so convert value by value
                        for (size_t j = 0; j < numCols; j++) // column major, so iterate columns in outside loop
                        {
                            for (size_t r = 0; r < numRows; r++)
                            {
                                buffer[j * numRows + r] = feat(r, j);
                            }
                        }
                    }
                    data.SetValue(numRows, numCols, buffer, matrixFlagNormal);
                }
            }
            return true;
        }


    // ReNewBufferForMultiIO - copies the minibatch m_mbiter currently points at into the staging
    // buffers of slot i (all feature streams back to back, then all label streams back to back),
    // resets the slot's processed-frame counter and advances m_mbiter
    // i - slot index into the *MultiUtt buffers (presumably one slot per parallel utterance -- confirm with caller)
    // returns: false without touching anything if m_noData is already set, true otherwise;
    //          m_noData is set when m_mbiter is exhausted after the advance
    // throws std::runtime_error if the feature streams do not all deliver the same number of frames
    template<class ElemType>
        bool HTKMLFReader<ElemType>::ReNewBufferForMultiIO(size_t i)
        {
            if (m_noData)
            {
                return false;
            }
            const size_t numOfFea = m_featuresBufferMultiIO.size();
            const size_t numOfLabel = m_labelsBufferMultiIO.size();

            // pass 1 (features): record where each stream starts inside the slot's buffer and the total size
            size_t totalFeatNum = 0;
            foreach_index(id, m_featuresBufferAllocatedMultiIO)
            {
                const msra::dbn::matrixstripe featOri = m_mbiter->frames(id);
                m_featuresStartIndexMultiUtt[id+i*numOfFea] = totalFeatNum;
                totalFeatNum += featOri.rows() * featOri.cols();
            }
            if (m_featuresBufferMultiUtt[i] == NULL || m_featuresBufferAllocatedMultiUtt[i] < totalFeatNum) //buffer missing or size changed. can be partial minibatch
            {
                // drop the old buffer and clear the bookkeeping first, so that a throwing 'new'
                // cannot leave a dangling pointer behind in the member
                delete[] m_featuresBufferMultiUtt[i];
                m_featuresBufferMultiUtt[i] = NULL;
                m_featuresBufferAllocatedMultiUtt[i] = 0;
                m_featuresBufferMultiUtt[i] = new ElemType[totalFeatNum];
                m_featuresBufferAllocatedMultiUtt[i] = totalFeatNum;
            }

            // pass 1 (labels): same layout computation, 'dim' values per frame
            size_t totalLabelsNum = 0;
            for (auto it = m_labelNameToIdMap.begin(); it != m_labelNameToIdMap.end(); ++it)
            {
                const size_t id = it->second;
                const size_t dim = m_labelNameToDimMap[it->first];

                const vector<size_t> & uids = m_mbiter->labels(id);
                m_labelsStartIndexMultiUtt[id+i*numOfLabel] = totalLabelsNum;
                totalLabelsNum += dim * uids.size();
            }

            if (m_labelsBufferMultiUtt[i] == NULL || m_labelsBufferAllocatedMultiUtt[i] < totalLabelsNum)
            {
                // same exception-safe reallocation as for the feature buffer
                delete[] m_labelsBufferMultiUtt[i];
                m_labelsBufferMultiUtt[i] = NULL;
                m_labelsBufferAllocatedMultiUtt[i] = 0;
                m_labelsBufferMultiUtt[i] = new ElemType[totalLabelsNum];
                m_labelsBufferAllocatedMultiUtt[i] = totalLabelsNum;
            }

            // labels are written sparsely below (one-hot), so start from all zeros
            memset(m_labelsBufferMultiUtt[i],0,sizeof(ElemType)*totalLabelsNum);

            // pass 2 (features): copy every stream into its region of the slot's buffer
            bool first = true;
            foreach_index(id, m_featuresBufferMultiIO)
            {
                const msra::dbn::matrixstripe featOri = m_mbiter->frames(id);
                const size_t actualmbsizeOri = featOri.cols();
                const size_t fdim = featOri.rows();
                if (first)
                {
                    m_toProcess[i] = actualmbsizeOri;
                    first = false;
                }
                else if (m_toProcess[i] != actualmbsizeOri)
                {
                    throw std::runtime_error("The multi-IO features has inconsistent number of frames!");
                }
                assert (actualmbsizeOri == m_mbiter->currentmbframes());

                ElemType* const buffer = m_featuresBufferMultiUtt[i];
                const size_t base = m_featuresStartIndexMultiUtt[id+i*numOfFea];
                if (sizeof(ElemType) == sizeof(float))
                {
                    for (size_t k = 0; k < actualmbsizeOri; k++) // column major, so iterate columns
                    {
                        // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
                        memcpy_s(&buffer[k*fdim+base],sizeof(ElemType)*fdim,&featOri(0,k),sizeof(ElemType)*fdim);
                    }
                }
                else
                {
                    // element sizes differ, so convert value by value
                    for (size_t k = 0; k < actualmbsizeOri; k++) // column major, so iterate columns in outside loop
                    {
                        for (size_t d = 0; d < fdim; d++)
                        {
                            buffer[k*fdim+d+base] = featOri(d,k);
                        }
                    }
                }
            }

            // pass 2 (labels): expand each label id into a target column
            for (auto it = m_labelNameToIdMap.begin(); it != m_labelNameToIdMap.end(); ++it)
            {
                const size_t id = it->second;
                const size_t dim = m_labelNameToDimMap[it->first];

                const vector<size_t> & uids = m_mbiter->labels(id);
                const size_t actualmbsizeOri = uids.size();

                ElemType* const buffer = m_labelsBufferMultiUtt[i];
                const size_t base = m_labelsStartIndexMultiUtt[id+i*numOfLabel];

                if (m_convertLabelsToTargetsMultiIO[id])
                {
                    // copy the target vector that the label-to-target table assigns to this label id
                    size_t labelDim = m_labelToTargetMapMultiIO[id].size();
                    for (size_t k = 0; k < actualmbsizeOri; k++)
                    {
                        assert(uids[k] < labelDim); labelDim;
                        const size_t labelId = uids[k];
                        for (size_t j = 0; j < dim; j++)
                        {
                            buffer[k*dim + j + base] = m_labelToTargetMapMultiIO[id][labelId][j];
                        }
                    }
                }
                else
                {
                    // loop through the columns and set one value to 1
                    // in the future we want to use a sparse matrix here
                    for (size_t k = 0; k < actualmbsizeOri; k++)
                    {
                        assert(uids[k] < dim);
                        buffer[k*dim+uids[k]+base]=(ElemType)1;
                    }
                }
            }
            m_processedFrame[i] = 0;

            // move on to the next minibatch; remember when the source is exhausted
            (*m_mbiter)++;
            if (!(*m_mbiter))
                m_noData = true;

            return true;
        }


    // GetLabelMapping - Gets the label mapping from integer ID to label
    // sectionName - unused by this reader (the parameter is unnamed)
    // returns: a const reference to the member m_idToLabelMap, i.e. the table that
    //          SetLabelMapping assigns; no copy is made
    template<class ElemType>
        const std::map<typename IDataReader<ElemType>::LabelIdType, typename IDataReader<ElemType>::LabelType>& HTKMLFReader<ElemType>::GetLabelMapping(const std::wstring& /*sectionName*/)
        {
            return m_idToLabelMap;
        }

    // SetLabelMapping - Sets the label mapping from integer index to label
    // sectionName - unused by this reader (the parameter is unnamed)
    // labelMapping - mapping table from label IDs to label values (IDs must be 0-n); it is copied
    //                into m_idToLabelMap, replacing whatever was stored before
    // note: for tasks with labels, the mapping table must be the same between a training run and a testing run
    template<class ElemType>
        void HTKMLFReader<ElemType>::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map<typename IDataReader<ElemType>::LabelIdType, typename IDataReader<ElemType>::LabelType>& labelMapping)
        {
            m_idToLabelMap = labelMapping;
        }

    // ReadLabelToTargetMappingFile - reads a table that maps every label to a 0/1 target vector
    // labelToTargetMappingFile - text file with one "<label> <string of 0s and 1s>" record per line
    // labelListFile - text file with one label per line; the mapping file must list exactly the
    //                 same labels in the same order
    // labelToTargetMap - [in,out] one target vector is appended per record of the mapping file
    // returns: the length of the target vectors (0 if the mapping file has no records)
    // throws std::runtime_error if labelListFile is empty; reports via RuntimeError when the two
    //        files differ in labels or length, when records differ in target length, or when a
    //        target contains characters other than '0' and '1'
    template<class ElemType>
        size_t HTKMLFReader<ElemType>::ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap)
        {
            if (labelListFile==L"")
                throw std::runtime_error("HTKMLFReader::ReadLabelToTargetMappingFile(): cannot read labelToTargetMappingFile without a labelMappingFile!");

            // read statelist first
            vector<std::wstring> labelList;
            msra::files::textreader labelReader(labelListFile);
            while(labelReader)
            {
                labelList.push_back(labelReader.wgetline());
            }

            size_t count = 0;       // number of mapping records consumed so far
            size_t targetDim = 0;   // target length, fixed by the first record
            msra::files::textreader mapReader(labelToTargetMappingFile);
            while(mapReader)
            {
                // a mapping file with more records than the label list must be rejected here,
                // before labelList[count] is read out of range
                if (count >= labelList.size())
                    RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): mismatch between lengths of labelMappingFile vs labelToTargetMappingFile");

                std::wstring line(mapReader.wgetline());
                // find white space as a demarcation
                std::wstring::size_type pos = line.find(L" ");
                std::wstring token = line.substr(0,pos);
                std::wstring targetstring = line.substr(pos+1);

                // verify that statelist and label2target mapping file are in same order (to match up with reader)
                if (labelList[count]!=token)
                    RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): mismatch between labelMappingFile and labelToTargetMappingFile");

                if (count==0)
                    targetDim = targetstring.length();
                else if (targetDim!=targetstring.length())
                    RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): inconsistent target length among records");

                std::vector<ElemType> targetVector(targetstring.length(),(ElemType)0.0);
                for (size_t i = 0; i < targetstring.length(); i++)
                {
                    if (targetstring[i] == L'1')
                        targetVector[i] = (ElemType)1.0;
                    else if (targetstring[i] != L'0')
                        RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): expecting label2target mapping to contain only 1's or 0's");
                }
                labelToTargetMap.push_back(targetVector);
                count++;
            }

            // a mapping file with fewer records than the label list is a mismatch as well
            if (count!=labelList.size())
                RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): mismatch between lengths of labelMappingFile vs labelToTargetMappingFile");

            return targetDim;
        }

    // GetData - Gets metadata from the specified section (into CPU memory)
    // NOT SUPPORTED by this reader: the implementation below ignores every argument (all
    // parameters are unnamed) and unconditionally throws std::runtime_error.
    // The parameter descriptions that follow describe the interface contract only:
    // sectionName - section name to retrieve data from
    // numRecords - number of records to read
    // data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accommodate request
    // dataBufferSize - [in] size of the databuffer in bytes
    //                  [out] size of buffer filled with data
    // recordStart - record to start reading from, defaults to zero (start of data)
    // returns: never returns normally in this reader (always throws)
    template<class ElemType>
        bool HTKMLFReader<ElemType>::GetData(const std::wstring& /*sectionName*/, size_t /*numRecords*/, void* /*data*/, size_t& /*dataBufferSize*/, size_t /*recordStart*/)
        {
            throw std::runtime_error("GetData not supported in HTKMLFReader");
        }


    // DataEnd - reports whether the end of a unit of data has been reached
    // endDataType - kind of boundary being asked about; only endDataSentence is supported
    // returns: for endDataSentence, m_sentenceEnd[0] when m_truncated is set (truncated BPTT needs
    //          to know whether the data ended), otherwise always true since each minibatch is
    //          considered a "sentence"; false for any value not handled below
    // throws std::logic_error for endDataNull, endDataEpoch and endDataSet
    template<class ElemType>
        bool HTKMLFReader<ElemType>::DataEnd(EndDataType endDataType)
        {
            if (endDataType == endDataSentence)
            {
                if (m_truncated)
                    return m_sentenceEnd[0];
                return true; // useless in current condition
            }

            const bool unsupported = (endDataType == endDataNull)
                                  || (endDataType == endDataEpoch)
                                  || (endDataType == endDataSet);
            if (unsupported)
                throw std::logic_error("DataEnd: does not support endDataTypes: endDataNull, endDataEpoch and endDataSet");

            return false;
        }

    // SetSentenceEndInBatch - copies the reader's m_switchFrame values to the caller
    // sentenceEnd - [out] resized to m_switchFrame.size() and overwritten element by element
    template<class ElemType>
        void HTKMLFReader<ElemType>::SetSentenceEndInBatch(vector<size_t> &sentenceEnd)
        {
            sentenceEnd.resize(m_switchFrame.size());
            std::copy(m_switchFrame.begin(), m_switchFrame.end(), sentenceEnd.begin());
        }

    // SetSentenceSegBatch - hands the reader's current sentence-boundary information to the caller
    // sentenceBegin - [out] set to the contents of m_sentenceBegin
    // minibatchPackingFlag - [out] replaced by a copy of m_minibatchPackingFlag
    // Neither output is touched when m_framemode is set.
    template<class ElemType>
        void HTKMLFReader<ElemType>::SetSentenceSegBatch(Matrix<float> &sentenceBegin, vector<MinibatchPackingFlag>& minibatchPackingFlag)
        {
            if (m_framemode)
                return;

            sentenceBegin.SetValue(m_sentenceBegin);
            minibatchPackingFlag = m_minibatchPackingFlag;
        }


    // GetDataNamesFromConfig - determine the names of the features and labels sections in the config file
    // readerConfig - reader configuration; every entry is inspected as a candidate sub-section
    // features - [in,out] a vector of feature name strings; the name of each entry that has an
    //            "scpFile" key is appended
    // labels - [in,out] a vector of label name strings; the name of each entry that has no
    //          "scpFile" key but has an "mlfFile" key is appended
    template<class ElemType>
        void HTKMLFReader<ElemType>::GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels)
        {
            for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter)
            {
                // view the entry's value as a config section so its own keys can be queried
                ConfigParameters temp = iter->second;
                if (temp.ExistsCurrent("scpFile"))
                {
                    features.push_back(msra::strfun::utf16(iter->first));
                }
                else if (temp.ExistsCurrent("mlfFile"))
                {
                    labels.push_back(msra::strfun::utf16(iter->first));
                }
            }
        }

    // ExpandDotDotDot - replaces the first "..." in a feature path by the directory of the SCP file
    // featPath - [in,out] feature path; rewritten in place only if it contains "..."
    // scpPath - path of the SCP file; used only when scpDirCached is empty on entry
    // scpDirCached - [in,out] directory part of scpPath (text before the last '/' or '\\');
    //                computed here whenever it is empty on entry. It stays empty when scpPath has
    //                no separator or nothing follows the last separator, so "..." then expands to
    //                nothing.
    template<class ElemType>
        void HTKMLFReader<ElemType>::ExpandDotDotDot(wstring & featPath, const wstring & scpPath, wstring & scpDirCached)
        {
            if (scpDirCached.empty())
            {
                const wstring::size_type lastSep = scpPath.find_last_of(L"/\\");
                // a directory is only taken when a separator exists AND a file name follows it
                const bool hasDirAndFile = (lastSep != wstring::npos) && (lastSep + 1 < scpPath.size());
                if (hasDirAndFile)
                    scpDirCached = scpPath.substr(0, lastSep);
            }

            const wstring::size_type dots = featPath.find(L"...");
            if (dots != wstring::npos)
                featPath.replace(dots, 3, scpDirCached);
        }

    // explicit instantiations of the reader for the two element types float and double
    template class HTKMLFReader<float>;
    template class HTKMLFReader<double>;
}}}