diff --git a/DataReader/HTKMLFReader/HTKMLFReader.cpp b/DataReader/HTKMLFReader/HTKMLFReader.cpp index 4ee055592..ebb659525 100644 --- a/DataReader/HTKMLFReader/HTKMLFReader.cpp +++ b/DataReader/HTKMLFReader/HTKMLFReader.cpp @@ -1,1541 +1,1541 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// HTKMLFReader.cpp : Defines the exported functions for the DLL application. -// - -#include "stdafx.h" -#include -#include "basetypes.h" - -#include "htkfeatio.h" // for reading HTK features -#include "latticearchive.h" // for reading HTK phoneme lattices (MMI training) -#include "simplesenonehmm.h" // for MMI scoring -#include "msra_mgram.h" // for unigram scores of ground-truth path in sequence training - -#include "rollingwindowsource.h" // minibatch sources -#include "utterancesourcemulti.h" -#include "utterancesource.h" -#include "utterancesourcemulti.h" -#include "readaheadsource.h" -#include "chunkevalsource.h" -#include "minibatchiterator.h" -#define DATAREADER_EXPORTS // creating the exports here -#include "DataReader.h" -#include "HTKMLFReader.h" -#ifdef LEAKDETECT -#include // for memory leak detection -#endif - -#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this - -int msra::numa::node_override = -1; // for numahelpers.h - -namespace Microsoft { namespace MSR { namespace CNTK { - - // Create a Data Reader - //DATAREADER_API IDataReader* DataReaderFactory(void) - - template - void HTKMLFReader::Init(const ConfigParameters& readerConfig) - { - m_mbiter = NULL; - m_frameSource = NULL; - m_readAheadSource = NULL; - m_lattices = NULL; - - m_truncated = readerConfig("Truncated", "false"); - m_convertLabelsToTargets = false; - - m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1"); - - if (m_numberOfuttsPerMinibatch < 1) - { - LogicError("nbrUttsInEachRecurrentIter cannot be less than 1."); - } - - if (!m_truncated && m_numberOfuttsPerMinibatch != 1) - { - LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false."); - } - - m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch; - m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true); - m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0); - m_toProcess.assign(m_numberOfuttsPerMinibatch,0); - m_switchFrame.assign(m_numberOfuttsPerMinibatch,0); - m_noData = false; - - string command(readerConfig("action",L"")); //look up in the config for the master command to determine whether we're writing output (inputs only) or training/evaluating (inputs and outputs) - - if (readerConfig.Exists("legacyMode")) - RuntimeError("legacy mode has been deprecated\n"); - - if (command == "write"){ - m_trainOrTest = false; - PrepareForWriting(readerConfig); - } - else{ - m_trainOrTest = true; - PrepareForTrainingOrTesting(readerConfig); - } - - } - - // Load all input and output data. - // Note that the terms features imply be real-valued quanities and - // labels imply categorical quantities, irrespective of whether they - // are inputs or targets for the network - template - void HTKMLFReader::PrepareForTrainingOrTesting(const ConfigParameters& readerConfig) - { - vector scriptpaths; - vector mlfpaths; - vector>mlfpathsmulti; - size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing - vector> infilesmulti; - vector filelist; - size_t numFiles; - wstring unigrampath(L""); - //wstring statelistpath(L""); - size_t randomize = randomizeAuto; - size_t iFeat, iLabel; - iFeat = iLabel = 0; - vector statelistpaths; - bool framemode = true; - vector numContextLeft; - vector numContextRight; - - // for the multi-utterance process - m_featuresBufferMultiUtt.assign(m_numberOfuttsPerMinibatch,NULL); - m_featuresBufferAllocatedMultiUtt.assign(m_numberOfuttsPerMinibatch,0); - m_labelsBufferMultiUtt.assign(m_numberOfuttsPerMinibatch,NULL); - m_labelsBufferAllocatedMultiUtt.assign(m_numberOfuttsPerMinibatch,0); - - std::vector featureNames; - std::vector labelNames; - GetDataNamesFromConfig(readerConfig, featureNames, labelNames); - if (featureNames.size() + labelNames.size() <= 1) - { - RuntimeError("network needs at least 1 input and 1 output specified!"); - } - - //load data for all real-valued inputs (features) - foreach_index(i, featureNames) - { - ConfigParameters thisFeature = readerConfig(featureNames[i]); - m_featDims.push_back(thisFeature("dim")); - ConfigArray contextWindow = thisFeature("contextWindow", "1"); - if (contextWindow.size() == 1) // symmetric - { - size_t windowFrames = contextWindow[0]; - if (windowFrames % 2 == 0 ) - RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", windowFrames); - size_t context = windowFrames / 2; // extend each side by this - numContextLeft.push_back(context); - numContextRight.push_back(context); - - } - else if (contextWindow.size() == 2) // left context, right context - { - numContextLeft.push_back(contextWindow[0]); - numContextRight.push_back(contextWindow[1]); - } - else - { - RuntimeError("contextFrames must have 1 or 2 values specified, found %d", contextWindow.size()); - } - // update m_featDims to reflect the total input dimension (featDim x contextWindow), not the native feature dimension - // that is what the lower level feature readers expect - m_featDims[i] = m_featDims[i] * (1 + numContextLeft[i] + numContextRight[i]); - - string type = thisFeature("type","Real"); - if (type=="Real"){ - m_nameToTypeMap[featureNames[i]] = InputOutputTypes::real; - } - else{ - RuntimeError("feature type must be Real"); - } - - m_featureNameToIdMap[featureNames[i]]= iFeat; - scriptpaths.push_back(thisFeature("scpFile")); - m_featureNameToDimMap[featureNames[i]] = m_featDims[i]; - - m_featuresBufferMultiIO.push_back(NULL); - m_featuresBufferAllocatedMultiIO.push_back(0); - - iFeat++; - } - - foreach_index(i, labelNames) - { - ConfigParameters thisLabel = readerConfig(labelNames[i]); - if (thisLabel.Exists("labelDim")) - m_labelDims.push_back(thisLabel("labelDim")); - else if (thisLabel.Exists("dim")) - m_labelDims.push_back(thisLabel("dim")); - else - RuntimeError("labels must specify dim or labelDim"); - - string type; - if (thisLabel.Exists("labelType")) - type = thisLabel("labelType"); // let's deprecate this eventually and just use "type"... - else - type = thisLabel("type","Category"); // outputs should default to category - - if (type=="Category") - m_nameToTypeMap[labelNames[i]] = InputOutputTypes::category; - else - RuntimeError("label type must be Category"); - - statelistpaths.push_back(thisLabel("labelMappingFile",L"")); - - m_labelNameToIdMap[labelNames[i]]=iLabel; - m_labelNameToDimMap[labelNames[i]]=m_labelDims[i]; - mlfpaths.clear(); - mlfpaths.push_back(thisLabel("mlfFile")); - mlfpathsmulti.push_back(mlfpaths); - - m_labelsBufferMultiIO.push_back(NULL); - m_labelsBufferAllocatedMultiIO.push_back(0); - - iLabel++; - - wstring labelToTargetMappingFile(thisLabel("labelToTargetMappingFile",L"")); - if (labelToTargetMappingFile != L"") - { - std::vector> labelToTargetMap; - m_convertLabelsToTargetsMultiIO.push_back(true); - if (thisLabel.Exists("targetDim")) - { - m_labelNameToDimMap[labelNames[i]]=m_labelDims[i]=thisLabel("targetDim"); - } - else - RuntimeError("output must specify targetDim if labelToTargetMappingFile specified!"); - size_t targetDim = ReadLabelToTargetMappingFile (labelToTargetMappingFile,statelistpaths[i], labelToTargetMap); - if (targetDim!=m_labelDims[i]) - RuntimeError("mismatch between targetDim and dim found in labelToTargetMappingFile"); - m_labelToTargetMapMultiIO.push_back(labelToTargetMap); - } - else - { - m_convertLabelsToTargetsMultiIO.push_back(false); - m_labelToTargetMapMultiIO.push_back(std::vector>()); - } - } - - if (iFeat!=scriptpaths.size() || iLabel!=mlfpathsmulti.size()) - throw std::runtime_error(msra::strfun::strprintf ("# of inputs files vs. # of inputs or # of output files vs # of outputs inconsistent\n")); - - if (readerConfig.Exists("randomize")) - { - const std::string& randomizeString = readerConfig("randomize"); - if (randomizeString == "None") - { - randomize = randomizeNone; - } - else if (randomizeString == "Auto") - { - randomize = randomizeAuto; - } - else - { - randomize = readerConfig("randomize"); - } - } - - if (readerConfig.Exists("frameMode")) - { - const std::string& framemodeString = readerConfig("frameMode"); - if (framemodeString == "false") - { - framemode = false; - } - } - - int verbosity = readerConfig("verbosity","2"); - - // determine if we partial minibatches are desired - std::string minibatchMode(readerConfig("minibatchMode","Partial")); - m_partialMinibatch = !_stricmp(minibatchMode.c_str(),"Partial"); - - // get the read method, defaults to "blockRandomize" other option is "rollingWindow" - std::string readMethod(readerConfig("readMethod","blockRandomize")); - - // see if they want to use readAhead - m_readAhead = readerConfig("readAhead", "false"); - - // read all input files (from multiple inputs) - // TO DO: check for consistency (same number of files in each script file) - numFiles=0; - foreach_index(i,scriptpaths) - { - filelist.clear(); - std::wstring scriptpath = scriptpaths[i]; - fprintf(stderr, "reading script file %S ...", scriptpath.c_str()); - size_t n = 0; - for (msra::files::textreader reader(scriptpath); reader && filelist.size() <= firstfilesonly/*optimization*/; ) - { - filelist.push_back (reader.wgetline()); - n++; - } - - fprintf (stderr, " %lu entries\n", n); - - if (i==0) - numFiles=n; - else - if (n!=numFiles) - throw std::runtime_error (msra::strfun::strprintf ("number of files in each scriptfile inconsistent (%d vs. %d)", numFiles,n)); - - infilesmulti.push_back(filelist); - } - - if (readerConfig.Exists("unigram")) - unigrampath = readerConfig("unigram"); - - // load a unigram if needed (this is used for MMI training) - msra::lm::CSymbolSet unigramsymbols; - std::unique_ptr unigram; - size_t silencewordid = SIZE_MAX; - size_t startwordid = SIZE_MAX; - size_t endwordid = SIZE_MAX; - if (unigrampath != L"") - { - unigram.reset (new msra::lm::CMGramLM()); - unigram->read (unigrampath, unigramsymbols, false/*filterVocabulary--false will build the symbol map*/, 1/*maxM--unigram only*/); - silencewordid = unigramsymbols["!silence"]; // give this an id (even if not in the LM vocabulary) - startwordid = unigramsymbols[""]; - endwordid = unigramsymbols[""]; - } - - if (!unigram) - fprintf (stderr, "trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion\n"); - - // currently assumes all mlfs will have same root name (key) - set restrictmlftokeys; // restrict MLF reader to these files--will make stuff much faster without having to use shortened input files - if (infilesmulti[0].size() <= 100) - { - foreach_index (i, infilesmulti[0]) - { - msra::asr::htkfeatreader::parsedpath ppath (infilesmulti[0][i]); - const wstring key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none) - restrictmlftokeys.insert (key); - } - } - // get labels - - //if (readerConfig.Exists("statelist")) - // statelistpath = readerConfig("statelist"); - - double htktimetoframe = 100000.0; // default is 10ms - //std::vector> labelsmulti; - std::vector>> labelsmulti; - //std::vector pagepath; - foreach_index(i, mlfpathsmulti) - { - msra::asr::htkmlfreader - labels(mlfpathsmulti[i], restrictmlftokeys, statelistpaths[i], unigram ? &unigramsymbols : NULL, (map*) NULL, htktimetoframe); // label MLF - // get the temp file name for the page file - labelsmulti.push_back(labels); - } - - - if (!_stricmp(readMethod.c_str(),"blockRandomize")) - { - // construct all the parameters we don't need, but need to be passed to the constructor... - std::pair,std::vector> latticetocs; - std::unordered_map modelsymmap; - m_lattices = new msra::dbn::latticesource(latticetocs, modelsymmap); - - // now get the frame source. This has better randomization and doesn't create temp files - m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode); - //m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode); - - } - else if (!_stricmp(readMethod.c_str(),"rollingWindow")) - { - std::wstring pageFilePath; - std::vector pagePaths; - if (readerConfig.Exists("pageFilePath")) - { - pageFilePath = readerConfig("pageFilePath"); - - // replace any '/' with '\' for compat with default path - std::replace(pageFilePath.begin(), pageFilePath.end(), '/','\\'); - - // verify path exists - DWORD attrib = GetFileAttributes(pageFilePath.c_str()); - if (attrib==INVALID_FILE_ATTRIBUTES || !(attrib & FILE_ATTRIBUTE_DIRECTORY)) - throw std::runtime_error ("pageFilePath does not exist"); - } - else // using default temporary path - { - pageFilePath.reserve(MAX_PATH); - GetTempPath(MAX_PATH, &pageFilePath[0]); - } - - if (pageFilePath.size()>MAX_PATH-14) // max length of input to GetTempFileName is MAX_PATH-14 - throw std::runtime_error (msra::strfun::strprintf ("pageFilePath must be less than %d characters", MAX_PATH-14)); - - foreach_index(i, infilesmulti) - { - - wchar_t tempFile[MAX_PATH]; - GetTempFileName(pageFilePath.c_str(), L"CNTK", 0, tempFile); - pagePaths.push_back(tempFile); - - } - - const bool mayhavenoframe=false; - int addEnergy = 0; - - //m_frameSourceMultiIO = new msra::dbn::minibatchframesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, randomize, pagepath, mayhavenoframe, addEnergy); - //m_frameSourceMultiIO->setverbosity(verbosity); - m_frameSource = new msra::dbn::minibatchframesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, pagePaths, mayhavenoframe, addEnergy); - m_frameSource->setverbosity(verbosity); - } - else - { - RuntimeError("readMethod must be rollingWindow or blockRandomize"); - } - - } - - // Load all input and output data. - // Note that the terms features imply be real-valued quanities and - // labels imply categorical quantities, irrespective of whether they - // are inputs or targets for the network - template - void HTKMLFReader::PrepareForWriting(const ConfigParameters& readerConfig) - { - vector scriptpaths; - vector filelist; - size_t numFiles; - size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing - size_t evalchunksize = 2048; - vector realDims; - size_t iFeat = 0; - vector numContextLeft; - vector numContextRight; - - std::vector featureNames; - std::vector labelNames; - GetDataNamesFromConfig(readerConfig, featureNames, labelNames); - - foreach_index(i, featureNames) - { - ConfigParameters thisFeature = readerConfig(featureNames[i]); - realDims.push_back(thisFeature("dim")); - - ConfigArray contextWindow = thisFeature("contextWindow", "1"); - if (contextWindow.size() == 1) // symmetric - { - size_t windowFrames = contextWindow[0]; - if (windowFrames % 2 == 0) - RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", windowFrames); - size_t context = windowFrames / 2; // extend each side by this - numContextLeft.push_back(context); - numContextRight.push_back(context); - - } - else if (contextWindow.size() == 2) // left context, right context - { - numContextLeft.push_back(contextWindow[0]); - numContextRight.push_back(contextWindow[1]); - } - else - { - RuntimeError("contextFrames must have 1 or 2 values specified, found %d", contextWindow.size()); - } - // update m_featDims to reflect the total input dimension (featDim x contextWindow), not the native feature dimension - // that is what the lower level feature readers expect - realDims[i] = realDims[i] * (1 + numContextLeft[i] + numContextRight[i]); - - string type = thisFeature("type","Real"); - if (type=="Real"){ - m_nameToTypeMap[featureNames[i]] = InputOutputTypes::real; - } - else{ - RuntimeError("feature type must be Real"); - } - - m_featureNameToIdMap[featureNames[i]]= iFeat; - scriptpaths.push_back(thisFeature("scpFile")); - m_featureNameToDimMap[featureNames[i]] = realDims[i]; - - m_featuresBufferMultiIO.push_back(NULL); - m_featuresBufferAllocatedMultiIO.push_back(0); - iFeat++; - } - - if (labelNames.size()>0) - RuntimeError("writer mode does not support labels as inputs, only features"); - - numFiles=0; - foreach_index(i,scriptpaths) - { - filelist.clear(); - std::wstring scriptpath = scriptpaths[i]; - fprintf(stderr, "reading script file %S ...", scriptpath.c_str()); - size_t n = 0; - for (msra::files::textreader reader(scriptpath); reader && filelist.size() <= firstfilesonly/*optimization*/; ) - { - filelist.push_back (reader.wgetline()); - n++; - } - - fprintf (stderr, " %d entries\n", n); - - if (i==0) - numFiles=n; - else - if (n!=numFiles) - throw std::runtime_error (msra::strfun::strprintf ("HTKMLFReader::InitEvalReader: number of files in each scriptfile inconsistent (%d vs. %d)", numFiles,n)); - - m_inputFilesMultiIO.push_back(filelist); - } - - m_fileEvalSource = new msra::dbn::FileEvalSource(realDims, numContextLeft, numContextRight, evalchunksize); - } - - - - // destructor - virtual so it gets called properly - template - HTKMLFReader::~HTKMLFReader() - { - delete m_mbiter; - delete m_readAheadSource; - delete m_frameSource; - delete m_lattices; - - if (!m_featuresBufferMultiIO.empty()) - { - if ( m_featuresBufferMultiIO[0] != NULL) - { - foreach_index(i, m_featuresBufferMultiIO) - { - delete[] m_featuresBufferMultiIO[i]; - m_featuresBufferMultiIO[i] = NULL; - } - } - } - if (!m_labelsBufferMultiIO.empty()) - { - if (m_labelsBufferMultiIO[0] != NULL) - { - foreach_index(i, m_labelsBufferMultiIO) - { - delete[] m_labelsBufferMultiIO[i]; - m_labelsBufferMultiIO[i] = NULL; - } - } - } - if (/*m_numberOfuttsPerMinibatch > 1 && */m_truncated) - { - for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i ++) - { - if (m_featuresBufferMultiUtt[i] != NULL) - { - delete[] m_featuresBufferMultiUtt[i]; - m_featuresBufferMultiUtt[i] = NULL; - } - if (m_labelsBufferMultiUtt[i] != NULL) - { - delete[] m_labelsBufferMultiUtt[i]; - m_labelsBufferMultiUtt[i] = NULL; - } - - } - } - } - - //StartMinibatchLoop - Startup a minibatch loop - // mbSize - [in] size of the minibatch (number of frames, etc.) - // epoch - [in] epoch number for this loop - // requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset - template - void HTKMLFReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) - { - m_mbSize = mbSize; - - if (m_trainOrTest) - { - StartMinibatchLoopToTrainOrTest(mbSize,epoch,requestedEpochSamples); - } - else - { - StartMinibatchLoopToWrite(mbSize,epoch,requestedEpochSamples); - } - m_checkDictionaryKeys=true; - } - - template - void HTKMLFReader::StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples) - { - size_t datapasses=1; - //size_t totalFrames = m_frameSource->totalframes(); - size_t totalFrames; - totalFrames = m_frameSource->totalframes(); - - size_t extraFrames = totalFrames%mbSize; - size_t minibatches = totalFrames/mbSize; - - // if we are allowing partial minibatches, do nothing, and let it go through - if (!m_partialMinibatch) - { - // we don't want any partial frames, so round total frames to be an even multiple of our mbSize - if (totalFrames > mbSize) - totalFrames -= extraFrames; - - if (requestedEpochSamples == requestDataSize) - { - requestedEpochSamples = totalFrames; - } - else if (minibatches > 0) // if we have any full minibatches - { - // since we skip the extraFrames, we need to add them to the total to get the actual number of frames requested - size_t sweeps = (requestedEpochSamples-1)/totalFrames; // want the number of sweeps we will skip the extra, so subtract 1 and divide - requestedEpochSamples += extraFrames*sweeps; - } - } - else if (requestedEpochSamples == requestDataSize) - { - requestedEpochSamples = totalFrames; - } - - // delete the old one first (in case called more than once) - delete m_mbiter; - msra::dbn::minibatchsource* source = m_frameSource; - if (m_readAhead) - { - if (m_readAheadSource == NULL) - { - m_readAheadSource = new msra::dbn::minibatchreadaheadsource (*source, requestedEpochSamples); - } - else if (m_readAheadSource->epochsize() != requestedEpochSamples) - { - delete m_readAheadSource; - m_readAheadSource = new msra::dbn::minibatchreadaheadsource (*source, requestedEpochSamples); - } - source = m_readAheadSource; - } - m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, mbSize, datapasses); - if (!m_featuresBufferMultiIO.empty()) - { - if (m_featuresBufferMultiIO[0]!=NULL) // check first feature, if it isn't NULL, safe to assume all are not NULL? - { - foreach_index(i, m_featuresBufferMultiIO) - { - delete[] m_featuresBufferMultiIO[i]; - m_featuresBufferMultiIO[i]=NULL; - m_featuresBufferAllocatedMultiIO[i]=0; - } - } - } - if (!m_labelsBufferMultiIO.empty()) - { - if (m_labelsBufferMultiIO[0]!=NULL) - { - foreach_index(i, m_labelsBufferMultiIO) - { - delete[] m_labelsBufferMultiIO[i]; - m_labelsBufferMultiIO[i]=NULL; - m_labelsBufferAllocatedMultiIO[i]=0; - } - } - } - if (m_numberOfuttsPerMinibatch && m_truncated == true) - { - m_noData = false; - m_featuresStartIndexMultiUtt.assign(m_featuresBufferMultiIO.size()*m_numberOfuttsPerMinibatch,0); - m_labelsStartIndexMultiUtt.assign(m_labelsBufferMultiIO.size()*m_numberOfuttsPerMinibatch,0); - for (size_t u = 0; u < m_numberOfuttsPerMinibatch; u ++) - { - if (m_featuresBufferMultiUtt[u] != NULL) - { - delete[] m_featuresBufferMultiUtt[u]; - m_featuresBufferMultiUtt[u] = NULL; - m_featuresBufferAllocatedMultiUtt[u] = 0; - } - if (m_labelsBufferMultiUtt[u] != NULL) - { - delete[] m_labelsBufferMultiUtt[u]; - m_labelsBufferMultiUtt[u] = NULL; - m_labelsBufferAllocatedMultiUtt[u] = 0; - } - ReNewBufferForMultiIO(u); - } - } - } - - template - void HTKMLFReader::StartMinibatchLoopToWrite(size_t mbSize, size_t /*epoch*/, size_t /*requestedEpochSamples*/) - { - m_fileEvalSource->Reset(); - m_fileEvalSource->SetMinibatchSize(mbSize); - //m_chunkEvalSourceMultiIO->reset(); - m_inputFileIndex=0; - - if (m_featuresBufferMultiIO[0]!=NULL) // check first feature, if it isn't NULL, safe to assume all are not NULL? - { - foreach_index(i, m_featuresBufferMultiIO) - { - delete[] m_featuresBufferMultiIO[i]; - m_featuresBufferMultiIO[i]=NULL; - m_featuresBufferAllocatedMultiIO[i]=0; - } - } - - } - - // GetMinibatch - Get the next minibatch (features and labels) - // matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponing matrix, - // [out] each matrix resized if necessary containing data. - // returns - true if there are more minibatches, false if no more minibatchs remain - template - bool HTKMLFReader::GetMinibatch(std::map*>& matrices) - { - if (m_trainOrTest) - { - return GetMinibatchToTrainOrTest(matrices); - } - else - { - return GetMinibatchToWrite(matrices); - } - } - - template - bool HTKMLFReader::GetMinibatchToTrainOrTest(std::map*>& matrices) - { - size_t id; - size_t dim; - bool skip = false; - - // on first minibatch, make sure we can supply data for requested nodes - std::map::iterator iter; - if (m_checkDictionaryKeys) - { - for (auto iter=matrices.begin();iter!=matrices.end();iter++) - { - if (m_nameToTypeMap.find(iter->first)==m_nameToTypeMap.end()) - throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ws not found in reader - cannot generate input\n",iter->first.c_str())); - - } - m_checkDictionaryKeys=false; - } - - do - { - if (m_truncated == false) - { - if (!(*m_mbiter)) - return false; - - // now, access all features and and labels by iterating over map of "matrices" - std::map*>::iterator iter; - for (iter = matrices.begin();iter!=matrices.end(); iter++) - { - // dereference matrix that corresponds to key (input/output name) and - // populate based on whether its a feature or a label - Matrix& data = *matrices[iter->first]; // can be features or labels - - if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) - { - - id = m_featureNameToIdMap[iter->first]; - dim = m_featureNameToDimMap[iter->first]; - const msra::dbn::matrixstripe feat = m_mbiter->frames(id); - const size_t actualmbsize = feat.cols(); // it may still return less if at end of sweep TODO: this check probably only needs to happen once - assert (actualmbsize == m_mbiter->currentmbframes()); - skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize); - - // check to see if we got the number of frames we requested - if (!skip) - { - // copy the features over to our array type - assert(feat.rows()==dim); // check feature dimension matches what's expected - - if (m_featuresBufferMultiIO[id]==NULL) - { - m_featuresBufferMultiIO[id] = new ElemType[feat.rows()*feat.cols()]; - m_featuresBufferAllocatedMultiIO[id] = feat.rows()*feat.cols(); - } - else if (m_featuresBufferAllocatedMultiIO[id]first] == InputOutputTypes::category) - { - id = m_labelNameToIdMap[iter->first]; - dim = m_labelNameToDimMap[iter->first]; - const vector & uids = m_mbiter->labels(id); - - // need skip logic here too in case labels are first in map not features - const size_t actualmbsize = uids.size(); // it may still return less if at end of sweep TODO: this check probably only needs to happen once - assert (actualmbsize == m_mbiter->currentmbframes()); - skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize); - - if (!skip) - { - // copy the labels over to array type - //data.Resize(udims[id], uids.size()); - //data.SetValue((ElemType)0); - - // loop through the columns and set one value to 1 - // in the future we want to use a sparse matrix here - //for (int i = 0; i < uids.size(); i++) - //{ - // assert(uids[i] first).c_str())); - } - - } - // advance to the next minibatch - (*m_mbiter)++; - } - else - { - if (m_noData) - { - bool endEpoch = true; - for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++) - { - if (m_processedFrame[i] != m_toProcess[i]) - { - endEpoch = false; - } - } - if(endEpoch) - { - return false; - } - } - size_t numOfFea = m_featuresBufferMultiIO.size(); - size_t numOfLabel = m_labelsBufferMultiIO.size(); - vector actualmbsize; - actualmbsize.assign(m_numberOfuttsPerMinibatch,0); - for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++) - { - size_t startFr = m_processedFrame[i]; - size_t endFr = 0; - if ((m_processedFrame[i] + m_mbSize) < m_toProcess[i]) - { - if(m_processedFrame[i] > 0) - { - m_sentenceEnd[i] = false; - m_switchFrame[i] = m_mbSize+1; - } - else - { - m_switchFrame[i] = 0; - m_sentenceEnd[i] = true; - } - actualmbsize[i] = m_mbSize; - endFr = startFr + actualmbsize[i]; - std::map*>::iterator iter; - for (iter = matrices.begin();iter!=matrices.end(); iter++) - { - // dereference matrix that corresponds to key (input/output name) and - // populate based on whether its a feature or a label - //Matrix& data = - *matrices[iter->first]; // can be features or labels - - if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) - { - id = m_featureNameToIdMap[iter->first]; - dim = m_featureNameToDimMap[iter->first]; - - if (m_featuresBufferMultiIO[id]==NULL) - { - m_featuresBufferMultiIO[id] = new ElemType[dim*m_mbSize*m_numberOfuttsPerMinibatch]; - m_featuresBufferAllocatedMultiIO[id] = dim*m_mbSize*m_numberOfuttsPerMinibatch; - } - else if (m_featuresBufferAllocatedMultiIO[id]first] == InputOutputTypes::category) - { - id = m_labelNameToIdMap[iter->first]; - dim = m_labelNameToDimMap[iter->first]; - if (m_labelsBufferMultiIO[id]==NULL) - { - m_labelsBufferMultiIO[id] = new ElemType[dim*m_mbSize*m_numberOfuttsPerMinibatch]; - m_labelsBufferAllocatedMultiIO[id] = dim*m_mbSize*m_numberOfuttsPerMinibatch; - } - else if (m_labelsBufferAllocatedMultiIO[id]*>::iterator iter; - for (iter = matrices.begin();iter!=matrices.end(); iter++) - { - // dereference matrix that corresponds to key (input/output name) and - // populate based on whether its a feature or a label - //Matrix& data = - *matrices[iter->first]; // can be features or labels - - if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) - { - id = m_featureNameToIdMap[iter->first]; - dim = m_featureNameToDimMap[iter->first]; - - if (m_featuresBufferMultiIO[id]==NULL) - { - m_featuresBufferMultiIO[id] = new ElemType[dim*m_mbSize*m_numberOfuttsPerMinibatch]; - m_featuresBufferAllocatedMultiIO[id] = dim*m_mbSize*m_numberOfuttsPerMinibatch; - } - else if (m_featuresBufferAllocatedMultiIO[id]first] == InputOutputTypes::category) - { - id = m_labelNameToIdMap[iter->first]; - dim = m_labelNameToDimMap[iter->first]; - if (m_labelsBufferMultiIO[id]==NULL) - { - m_labelsBufferMultiIO[id] = new ElemType[dim*m_mbSize*m_numberOfuttsPerMinibatch]; - m_labelsBufferAllocatedMultiIO[id] = dim*m_mbSize*m_numberOfuttsPerMinibatch; - } - else if (m_labelsBufferAllocatedMultiIO[id]& data = - *matrices[iter->first]; // can be features or labels - - if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) - { - id = m_featureNameToIdMap[iter->first]; - dim = m_featureNameToDimMap[iter->first]; - if (sizeof(ElemType) == sizeof(float)) - { - for (size_t j = startFr,k = 0; j < endFr; j++,k++) // column major, so iterate columns - { - // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns - memcpy_s(&m_featuresBufferMultiIO[id][(j*m_numberOfuttsPerMinibatch+i)*dim],sizeof(ElemType)*dim,&m_featuresBufferMultiUtt[i][k*dim+m_featuresStartIndexMultiUtt[id+i*numOfFea]],sizeof(ElemType)*dim); - } - } - else - { - for (size_t j=startFr,k=0; j < endFr; j++,k++) // column major, so iterate columns in outside loop - { - for (int d = 0; d < dim; d++) - { - m_featuresBufferMultiIO[id][(j*m_numberOfuttsPerMinibatch+i)*dim+d] = m_featuresBufferMultiUtt[i][k*dim+d+m_featuresStartIndexMultiUtt[id+i*numOfFea]]; - } - } - } - } - else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category) - { - id = m_labelNameToIdMap[iter->first]; - dim = m_labelNameToDimMap[iter->first]; - for (size_t j = startFr,k=0; j < endFr; j++,k++) - { - for (int d = 0; d < dim; d++) - { - m_labelsBufferMultiIO[id][(j*m_numberOfuttsPerMinibatch+i)*dim + d] = m_labelsBufferMultiUtt[i][k*dim+d+m_labelsStartIndexMultiUtt[id+i*numOfLabel]]; - } - } - } - } - - if (reNewSucc) m_processedFrame[i] += (endFr-startFr); - - } - } - std::map*>::iterator iter; - for (iter = matrices.begin();iter!=matrices.end(); iter++) - { - // dereference matrix that corresponds to key (input/output name) and - // populate based on whether its a feature or a label - Matrix& data = *matrices[iter->first]; // can be features or labels - if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) - { - id = m_featureNameToIdMap[iter->first]; - dim = m_featureNameToDimMap[iter->first]; - data.SetValue(dim, m_mbSize*m_numberOfuttsPerMinibatch, m_featuresBufferMultiIO[id],matrixFlagNormal); - } - else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category) - { - id = m_labelNameToIdMap[iter->first]; - dim = m_labelNameToDimMap[iter->first]; - data.SetValue(dim, m_mbSize*m_numberOfuttsPerMinibatch, m_labelsBufferMultiIO[id],matrixFlagNormal); - } - } - skip=false; - } - } // keep going if we didn't get the right size minibatch - while(skip); - - return true; - } - - template - bool HTKMLFReader::GetMinibatchToWrite(std::map*>& matrices) - { - std::map::iterator iter; - if (m_checkDictionaryKeys) - { - for (auto iter=m_featureNameToIdMap.begin();iter!=m_featureNameToIdMap.end();iter++) - { - if (matrices.find(iter->first)==matrices.end()) - { - fprintf(stderr,"GetMinibatchToWrite: feature node %ws specified in reader not found in the network\n",iter->first.c_str()); - throw std::runtime_error("GetMinibatchToWrite: feature node specified in reader not found in the network."); - } - } - /* - for (auto iter=matrices.begin();iter!=matrices.end();iter++) - { - if (m_featureNameToIdMap.find(iter->first)==m_featureNameToIdMap.end()) - throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ws not found in reader - cannot generate input\n",iter->first.c_str())); - } - */ - m_checkDictionaryKeys=false; - } - - if (m_inputFileIndexReset(); - - // load next file (or set of files) - foreach_index(i, m_inputFilesMultiIO) - { - msra::asr::htkfeatreader reader; - - const auto path = reader.parse(m_inputFilesMultiIO[i][m_inputFileIndex]); - // read file - msra::dbn::matrix feat; - string featkind; - unsigned int sampperiod; - msra::util::attempt (5, [&]() - { - reader.read (path, featkind, sampperiod, feat); // whole file read as columns of feature vectors - }); - fprintf (stderr, "evaluate: reading %d frames of %S\n", feat.cols(), ((wstring)path).c_str()); - m_fileEvalSource->AddFile(feat, featkind, sampperiod, i); - } - m_inputFileIndex++; - - // turn frames into minibatch (augment neighbors, etc) - m_fileEvalSource->CreateEvalMinibatch(); - - // populate input matrices - - std::map*>::iterator iter; - for (iter = matrices.begin();iter!=matrices.end(); iter++) - { - // dereference matrix that corresponds to key (input/output name) and - // populate based on whether its a feature or a label - - if (m_nameToTypeMap.find(iter->first)!=m_nameToTypeMap.end() && m_nameToTypeMap[iter->first] == InputOutputTypes::real) - { - Matrix& data = *matrices[iter->first]; // can be features or labels - size_t id = m_featureNameToIdMap[iter->first]; - size_t dim = m_featureNameToDimMap[iter->first]; - - const msra::dbn::matrix feat = m_fileEvalSource->ChunkOfFrames(id); - - // copy the features over to our array type - assert(feat.rows()==dim); dim; // check feature dimension matches what's expected - - if (m_featuresBufferMultiIO[id]==NULL) - { - m_featuresBufferMultiIO[id] = new ElemType[feat.rows()*feat.cols()]; - m_featuresBufferAllocatedMultiIO[id] = feat.rows()*feat.cols(); - } - else if (m_featuresBufferAllocatedMultiIO[id] - bool HTKMLFReader::ReNewBufferForMultiIO(size_t i) - { - if (m_noData) - { - return false; - } - size_t numOfFea = m_featuresBufferMultiIO.size(); - size_t numOfLabel = m_labelsBufferMultiIO.size(); - - size_t totalFeatNum = 0; - foreach_index(id, m_featuresBufferAllocatedMultiIO) - { - const msra::dbn::matrixstripe featOri = m_mbiter->frames(id); - size_t fdim = featOri.rows(); - const size_t actualmbsizeOri = featOri.cols(); - m_featuresStartIndexMultiUtt[id+i*numOfFea] = totalFeatNum; - totalFeatNum = fdim * actualmbsizeOri + m_featuresStartIndexMultiUtt[id+i*numOfFea]; - } - if (m_featuresBufferMultiUtt[i]==NULL) - { - m_featuresBufferMultiUtt[i] = new ElemType[totalFeatNum]; - m_featuresBufferAllocatedMultiUtt[i] = totalFeatNum; - } - else if (m_featuresBufferAllocatedMultiUtt[i] < totalFeatNum) //buffer size changed. can be partial minibatch - { - delete[] m_featuresBufferMultiUtt[i]; - m_featuresBufferMultiUtt[i] = new ElemType[totalFeatNum]; - m_featuresBufferAllocatedMultiUtt[i] = totalFeatNum; - } - - size_t totalLabelsNum = 0; - for (auto it = m_labelNameToIdMap.begin(); it != m_labelNameToIdMap.end(); ++it) - { - size_t id = m_labelNameToIdMap[it->first]; - size_t dim = m_labelNameToDimMap[it->first]; - - const vector & uids = m_mbiter->labels(id); - size_t actualmbsizeOri = uids.size(); - m_labelsStartIndexMultiUtt[id+i*numOfLabel] = totalLabelsNum; - totalLabelsNum = m_labelsStartIndexMultiUtt[id+i*numOfLabel] + dim * actualmbsizeOri; - } - - if (m_labelsBufferMultiUtt[i]==NULL) - { - m_labelsBufferMultiUtt[i] = new ElemType[totalLabelsNum]; - m_labelsBufferAllocatedMultiUtt[i] = totalLabelsNum; - } - else if (m_labelsBufferAllocatedMultiUtt[i] < totalLabelsNum) - { - delete[] m_labelsBufferMultiUtt[i]; - m_labelsBufferMultiUtt[i] = new ElemType[totalLabelsNum]; - m_labelsBufferAllocatedMultiUtt[i] = totalLabelsNum; - } - - memset(m_labelsBufferMultiUtt[i],0,sizeof(ElemType)*totalLabelsNum); - - bool first = true; - foreach_index(id, m_featuresBufferMultiIO) - { - const msra::dbn::matrixstripe featOri = m_mbiter->frames(id); - const size_t actualmbsizeOri = featOri.cols(); - size_t fdim = featOri.rows(); - if (first) - { - m_toProcess[i] = actualmbsizeOri; - first = false; - } - else - { - if (m_toProcess[i] != actualmbsizeOri) - { - throw std::runtime_error("The multi-IO features has inconsistent number of frames!"); - } - } - assert (actualmbsizeOri == m_mbiter->currentmbframes()); - - if (sizeof(ElemType) == sizeof(float)) - { - for (int k = 0; k < actualmbsizeOri; k++) // column major, so iterate columns - { - // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns - memcpy_s(&m_featuresBufferMultiUtt[i][k*fdim+m_featuresStartIndexMultiUtt[id+i*numOfFea]],sizeof(ElemType)*fdim,&featOri(0,k),sizeof(ElemType)*fdim); - } - } - else - { - for (int k=0; k < actualmbsizeOri; k++) // column major, so iterate columns in outside loop - { - for (int d = 0; d < featOri.rows(); d++) - { - m_featuresBufferMultiUtt[i][k*featOri.rows()+d+m_featuresStartIndexMultiUtt[id+i*numOfFea]] = featOri(d,k); - } - } - } - } - - for (auto it = m_labelNameToIdMap.begin(); it != m_labelNameToIdMap.end(); ++it) - { - size_t id = m_labelNameToIdMap[it->first]; - size_t dim = m_labelNameToDimMap[it->first]; - - const vector & uids = m_mbiter->labels(id); - size_t actualmbsizeOri = uids.size(); - - if (m_convertLabelsToTargetsMultiIO[id]) - { - size_t labelDim = m_labelToTargetMapMultiIO[id].size(); - for (int k=0; k < actualmbsizeOri; k++) - { - assert(uids[k] < labelDim); labelDim; - size_t labelId = uids[k]; - for (int j = 0; j < dim; j++) - { - m_labelsBufferMultiUtt[i][k*dim + j + m_labelsStartIndexMultiUtt[id+i*numOfLabel]] = m_labelToTargetMapMultiIO[id][labelId][j]; - } - } - } - else - { - // loop through the columns and set one value to 1 - // in the future we want to use a sparse matrix here - for (int k=0; k < actualmbsizeOri; k++) - { - assert(uids[k] < dim); - //labels(uids[i], i) = (ElemType)1; - m_labelsBufferMultiUtt[i][k*dim+uids[k]+m_labelsStartIndexMultiUtt[id+i*numOfLabel]]=(ElemType)1; - } - } - } - m_processedFrame[i] = 0; - - (*m_mbiter)++; - if (!(*m_mbiter)) - m_noData = true; - - return true; - } - - - - - // GetLabelMapping - Gets the label mapping from integer to type in file - // mappingTable - a map from numeric datatype to native label type stored as a string - template - const std::map::LabelIdType, typename IDataReader::LabelType>& HTKMLFReader::GetLabelMapping(const std::wstring& /*sectionName*/) - { - return m_idToLabelMap; - } - - // SetLabelMapping - Sets the label mapping from integer index to label - // labelMapping - mapping table from label values to IDs (must be 0-n) - // note: for tasks with labels, the mapping table must be the same between a training run and a testing run - template - void HTKMLFReader::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map::LabelIdType, typename IDataReader::LabelType>& labelMapping) - { - m_idToLabelMap = labelMapping; - } - - template - size_t HTKMLFReader::ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector>& labelToTargetMap) - { - if (labelListFile==L"") - throw std::runtime_error("HTKMLFReader::ReadLabelToTargetMappingFile(): cannot read labelToTargetMappingFile without a labelMappingFile!"); - - vector labelList; - size_t count, numLabels; - count=0; - // read statelist first - msra::files::textreader labelReader(labelListFile); - while(labelReader) - { - labelList.push_back(labelReader.wgetline()); - count++; - } - numLabels=count; - count=0; - msra::files::textreader mapReader(labelToTargetMappingFile); - size_t targetDim = 0; - while(mapReader) - { - std::wstring line(mapReader.wgetline()); - // find white space as a demarcation - std::wstring::size_type pos = line.find(L" "); - std::wstring token = line.substr(0,pos); - std::wstring targetstring = line.substr(pos+1); - - if (labelList[count]!=token) - RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): mismatch between labelMappingFile and labelToTargetMappingFile"); - - if (count==0) - targetDim = targetstring.length(); - else if (targetDim!=targetstring.length()) - RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): inconsistent target length among records"); - - std::vector targetVector(targetstring.length(),(ElemType)0.0); - foreach_index(i, targetstring) - { - if (targetstring.compare(i,1,L"1")==0) - targetVector[i] = (ElemType)1.0; - else if (targetstring.compare(i,1,L"0")!=0) - RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): expecting label2target mapping to contain only 1's or 0's"); - } - labelToTargetMap.push_back(targetVector); - count++; - } - - // verify that statelist and label2target mapping file are in same order (to match up with reader) while reading mapping - if (count!=labelList.size()) - RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): mismatch between lengths of labelMappingFile vs labelToTargetMappingFile"); - - return targetDim; - } - - // GetData - Gets metadata from the specified section (into CPU memory) - // sectionName - section name to retrieve data from - // numRecords - number of records to read - // data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accomidate request - // dataBufferSize - [in] size of the databuffer in bytes - // [out] size of buffer filled with data - // recordStart - record to start reading from, defaults to zero (start of data) - // returns: true if data remains to be read, false if the end of data was reached - template - bool HTKMLFReader::GetData(const std::wstring& /*sectionName*/, size_t /*numRecords*/, void* /*data*/, size_t& /*dataBufferSize*/, size_t /*recordStart*/) - { - throw std::runtime_error("GetData not supported in HTKMLFReader"); - } - - - template - bool HTKMLFReader::DataEnd(EndDataType endDataType) - { - // each minibatch is considered a "sentence" - // other datatypes not really supported... - // assert(endDataType == endDataSentence); - // for the truncated BPTT, we need to support check wether it's the end of data - bool ret = false; - switch (endDataType) - { - case endDataNull: - case endDataEpoch: - case endDataSet: - throw std::logic_error("DataEnd: does not support endDataTypes: endDataNull, endDataEpoch and endDataSet"); - break; - case endDataSentence: - if (m_truncated) - ret = m_sentenceEnd[0]; - else - ret = true; // useless in current condition - break; - } - return ret; - } - - template - void HTKMLFReader::SetSentenceEndInBatch(vector &sentenceEnd) - { - sentenceEnd.resize(m_switchFrame.size()); - for (size_t i = 0; i < m_switchFrame.size() ; i++) - { - sentenceEnd[i] = m_switchFrame[i]; - } - } - - // GetFileConfigNames - determine the names of the features and labels sections in the config file - // features - [in,out] a vector of feature name strings - // labels - [in,out] a vector of label name strings - template - void HTKMLFReader::GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels) - { - for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter) - { - auto pair = *iter; - ConfigParameters temp = iter->second; - // see if we have a config parameters that contains a "file" element, it's a sub key, use it - if (temp.ExistsCurrent("scpFile")) - { - features.push_back(msra::strfun::utf16(iter->first)); - } - else if (temp.ExistsCurrent("mlfFile")) - { - labels.push_back(msra::strfun::utf16(iter->first)); - } - - } - } - - template class HTKMLFReader; - template class HTKMLFReader; - }}} +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// HTKMLFReader.cpp : Defines the exported functions for the DLL application. +// + +#include "stdafx.h" +#include +#include "basetypes.h" + +#include "htkfeatio.h" // for reading HTK features +#include "latticearchive.h" // for reading HTK phoneme lattices (MMI training) +#include "simplesenonehmm.h" // for MMI scoring +#include "msra_mgram.h" // for unigram scores of ground-truth path in sequence training + +#include "rollingwindowsource.h" // minibatch sources +#include "utterancesourcemulti.h" +#include "utterancesource.h" +#include "utterancesourcemulti.h" +#include "readaheadsource.h" +#include "chunkevalsource.h" +#include "minibatchiterator.h" +#define DATAREADER_EXPORTS // creating the exports here +#include "DataReader.h" +#include "HTKMLFReader.h" +#ifdef LEAKDETECT +#include // for memory leak detection +#endif + +#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this + +int msra::numa::node_override = -1; // for numahelpers.h + +namespace Microsoft { namespace MSR { namespace CNTK { + + // Create a Data Reader + //DATAREADER_API IDataReader* DataReaderFactory(void) + + template + void HTKMLFReader::Init(const ConfigParameters& readerConfig) + { + m_mbiter = NULL; + m_frameSource = NULL; + m_readAheadSource = NULL; + m_lattices = NULL; + + m_truncated = readerConfig("Truncated", "false"); + m_convertLabelsToTargets = false; + + m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1"); + + if (m_numberOfuttsPerMinibatch < 1) + { + LogicError("nbrUttsInEachRecurrentIter cannot be less than 1."); + } + + if (!m_truncated && m_numberOfuttsPerMinibatch != 1) + { + LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false."); + } + + m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch; + m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true); + m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0); + m_toProcess.assign(m_numberOfuttsPerMinibatch,0); + m_switchFrame.assign(m_numberOfuttsPerMinibatch,0); + m_noData = false; + + string command(readerConfig("action",L"")); //look up in the config for the master command to determine whether we're writing output (inputs only) or training/evaluating (inputs and outputs) + + if (readerConfig.Exists("legacyMode")) + RuntimeError("legacy mode has been deprecated\n"); + + if (command == "write"){ + m_trainOrTest = false; + PrepareForWriting(readerConfig); + } + else{ + m_trainOrTest = true; + PrepareForTrainingOrTesting(readerConfig); + } + + } + + // Load all input and output data. + // Note that the terms features imply be real-valued quanities and + // labels imply categorical quantities, irrespective of whether they + // are inputs or targets for the network + template + void HTKMLFReader::PrepareForTrainingOrTesting(const ConfigParameters& readerConfig) + { + vector scriptpaths; + vector mlfpaths; + vector>mlfpathsmulti; + size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing + vector> infilesmulti; + vector filelist; + size_t numFiles; + wstring unigrampath(L""); + //wstring statelistpath(L""); + size_t randomize = randomizeAuto; + size_t iFeat, iLabel; + iFeat = iLabel = 0; + vector statelistpaths; + bool framemode = true; + vector numContextLeft; + vector numContextRight; + + // for the multi-utterance process + m_featuresBufferMultiUtt.assign(m_numberOfuttsPerMinibatch,NULL); + m_featuresBufferAllocatedMultiUtt.assign(m_numberOfuttsPerMinibatch,0); + m_labelsBufferMultiUtt.assign(m_numberOfuttsPerMinibatch,NULL); + m_labelsBufferAllocatedMultiUtt.assign(m_numberOfuttsPerMinibatch,0); + + std::vector featureNames; + std::vector labelNames; + GetDataNamesFromConfig(readerConfig, featureNames, labelNames); + if (featureNames.size() + labelNames.size() <= 1) + { + RuntimeError("network needs at least 1 input and 1 output specified!"); + } + + //load data for all real-valued inputs (features) + foreach_index(i, featureNames) + { + ConfigParameters thisFeature = readerConfig(featureNames[i]); + m_featDims.push_back(thisFeature("dim")); + ConfigArray contextWindow = thisFeature("contextWindow", "1"); + if (contextWindow.size() == 1) // symmetric + { + size_t windowFrames = contextWindow[0]; + if (windowFrames % 2 == 0 ) + RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", windowFrames); + size_t context = windowFrames / 2; // extend each side by this + numContextLeft.push_back(context); + numContextRight.push_back(context); + + } + else if (contextWindow.size() == 2) // left context, right context + { + numContextLeft.push_back(contextWindow[0]); + numContextRight.push_back(contextWindow[1]); + } + else + { + RuntimeError("contextFrames must have 1 or 2 values specified, found %d", contextWindow.size()); + } + // update m_featDims to reflect the total input dimension (featDim x contextWindow), not the native feature dimension + // that is what the lower level feature readers expect + m_featDims[i] = m_featDims[i] * (1 + numContextLeft[i] + numContextRight[i]); + + string type = thisFeature("type","Real"); + if (type=="Real"){ + m_nameToTypeMap[featureNames[i]] = InputOutputTypes::real; + } + else{ + RuntimeError("feature type must be Real"); + } + + m_featureNameToIdMap[featureNames[i]]= iFeat; + scriptpaths.push_back(thisFeature("scpFile")); + m_featureNameToDimMap[featureNames[i]] = m_featDims[i]; + + m_featuresBufferMultiIO.push_back(NULL); + m_featuresBufferAllocatedMultiIO.push_back(0); + + iFeat++; + } + + foreach_index(i, labelNames) + { + ConfigParameters thisLabel = readerConfig(labelNames[i]); + if (thisLabel.Exists("labelDim")) + m_labelDims.push_back(thisLabel("labelDim")); + else if (thisLabel.Exists("dim")) + m_labelDims.push_back(thisLabel("dim")); + else + RuntimeError("labels must specify dim or labelDim"); + + string type; + if (thisLabel.Exists("labelType")) + type = thisLabel("labelType"); // let's deprecate this eventually and just use "type"... + else + type = thisLabel("type","Category"); // outputs should default to category + + if (type=="Category") + m_nameToTypeMap[labelNames[i]] = InputOutputTypes::category; + else + RuntimeError("label type must be Category"); + + statelistpaths.push_back(thisLabel("labelMappingFile",L"")); + + m_labelNameToIdMap[labelNames[i]]=iLabel; + m_labelNameToDimMap[labelNames[i]]=m_labelDims[i]; + mlfpaths.clear(); + mlfpaths.push_back(thisLabel("mlfFile")); + mlfpathsmulti.push_back(mlfpaths); + + m_labelsBufferMultiIO.push_back(NULL); + m_labelsBufferAllocatedMultiIO.push_back(0); + + iLabel++; + + wstring labelToTargetMappingFile(thisLabel("labelToTargetMappingFile",L"")); + if (labelToTargetMappingFile != L"") + { + std::vector> labelToTargetMap; + m_convertLabelsToTargetsMultiIO.push_back(true); + if (thisLabel.Exists("targetDim")) + { + m_labelNameToDimMap[labelNames[i]]=m_labelDims[i]=thisLabel("targetDim"); + } + else + RuntimeError("output must specify targetDim if labelToTargetMappingFile specified!"); + size_t targetDim = ReadLabelToTargetMappingFile (labelToTargetMappingFile,statelistpaths[i], labelToTargetMap); + if (targetDim!=m_labelDims[i]) + RuntimeError("mismatch between targetDim and dim found in labelToTargetMappingFile"); + m_labelToTargetMapMultiIO.push_back(labelToTargetMap); + } + else + { + m_convertLabelsToTargetsMultiIO.push_back(false); + m_labelToTargetMapMultiIO.push_back(std::vector>()); + } + } + + if (iFeat!=scriptpaths.size() || iLabel!=mlfpathsmulti.size()) + throw std::runtime_error(msra::strfun::strprintf ("# of inputs files vs. # of inputs or # of output files vs # of outputs inconsistent\n")); + + if (readerConfig.Exists("randomize")) + { + const std::string& randomizeString = readerConfig("randomize"); + if (randomizeString == "None") + { + randomize = m_htkRandomizeDisable; // randomizeNone; + } + else if (randomizeString == "Auto") + { + randomize = m_htkRandomizeAuto; // randomizeAuto; + } + else + { + randomize = readerConfig("randomize"); + } + } + + if (readerConfig.Exists("frameMode")) + { + const std::string& framemodeString = readerConfig("frameMode"); + if (framemodeString == "false") + { + framemode = false; + } + } + + int verbosity = readerConfig("verbosity","2"); + + // determine if we partial minibatches are desired + std::string minibatchMode(readerConfig("minibatchMode","Partial")); + m_partialMinibatch = !_stricmp(minibatchMode.c_str(),"Partial"); + + // get the read method, defaults to "blockRandomize" other option is "rollingWindow" + std::string readMethod(readerConfig("readMethod","blockRandomize")); + + // see if they want to use readAhead + m_readAhead = readerConfig("readAhead", "false"); + + // read all input files (from multiple inputs) + // TO DO: check for consistency (same number of files in each script file) + numFiles=0; + foreach_index(i,scriptpaths) + { + filelist.clear(); + std::wstring scriptpath = scriptpaths[i]; + fprintf(stderr, "reading script file %S ...", scriptpath.c_str()); + size_t n = 0; + for (msra::files::textreader reader(scriptpath); reader && filelist.size() <= firstfilesonly/*optimization*/; ) + { + filelist.push_back (reader.wgetline()); + n++; + } + + fprintf (stderr, " %lu entries\n", n); + + if (i==0) + numFiles=n; + else + if (n!=numFiles) + throw std::runtime_error (msra::strfun::strprintf ("number of files in each scriptfile inconsistent (%d vs. %d)", numFiles,n)); + + infilesmulti.push_back(filelist); + } + + if (readerConfig.Exists("unigram")) + unigrampath = readerConfig("unigram"); + + // load a unigram if needed (this is used for MMI training) + msra::lm::CSymbolSet unigramsymbols; + std::unique_ptr unigram; + size_t silencewordid = SIZE_MAX; + size_t startwordid = SIZE_MAX; + size_t endwordid = SIZE_MAX; + if (unigrampath != L"") + { + unigram.reset (new msra::lm::CMGramLM()); + unigram->read (unigrampath, unigramsymbols, false/*filterVocabulary--false will build the symbol map*/, 1/*maxM--unigram only*/); + silencewordid = unigramsymbols["!silence"]; // give this an id (even if not in the LM vocabulary) + startwordid = unigramsymbols[""]; + endwordid = unigramsymbols[""]; + } + + if (!unigram) + fprintf (stderr, "trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion\n"); + + // currently assumes all mlfs will have same root name (key) + set restrictmlftokeys; // restrict MLF reader to these files--will make stuff much faster without having to use shortened input files + if (infilesmulti[0].size() <= 100) + { + foreach_index (i, infilesmulti[0]) + { + msra::asr::htkfeatreader::parsedpath ppath (infilesmulti[0][i]); + const wstring key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none) + restrictmlftokeys.insert (key); + } + } + // get labels + + //if (readerConfig.Exists("statelist")) + // statelistpath = readerConfig("statelist"); + + double htktimetoframe = 100000.0; // default is 10ms + //std::vector> labelsmulti; + std::vector>> labelsmulti; + //std::vector pagepath; + foreach_index(i, mlfpathsmulti) + { + msra::asr::htkmlfreader + labels(mlfpathsmulti[i], restrictmlftokeys, statelistpaths[i], unigram ? &unigramsymbols : NULL, (map*) NULL, htktimetoframe); // label MLF + // get the temp file name for the page file + labelsmulti.push_back(labels); + } + + + if (!_stricmp(readMethod.c_str(),"blockRandomize")) + { + // construct all the parameters we don't need, but need to be passed to the constructor... + std::pair,std::vector> latticetocs; + std::unordered_map modelsymmap; + m_lattices = new msra::dbn::latticesource(latticetocs, modelsymmap); + + // now get the frame source. This has better randomization and doesn't create temp files + m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode); + //m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode); + + } + else if (!_stricmp(readMethod.c_str(),"rollingWindow")) + { + std::wstring pageFilePath; + std::vector pagePaths; + if (readerConfig.Exists("pageFilePath")) + { + pageFilePath = readerConfig("pageFilePath"); + + // replace any '/' with '\' for compat with default path + std::replace(pageFilePath.begin(), pageFilePath.end(), '/','\\'); + + // verify path exists + DWORD attrib = GetFileAttributes(pageFilePath.c_str()); + if (attrib==INVALID_FILE_ATTRIBUTES || !(attrib & FILE_ATTRIBUTE_DIRECTORY)) + throw std::runtime_error ("pageFilePath does not exist"); + } + else // using default temporary path + { + pageFilePath.reserve(MAX_PATH); + GetTempPath(MAX_PATH, &pageFilePath[0]); + } + + if (pageFilePath.size()>MAX_PATH-14) // max length of input to GetTempFileName is MAX_PATH-14 + throw std::runtime_error (msra::strfun::strprintf ("pageFilePath must be less than %d characters", MAX_PATH-14)); + + foreach_index(i, infilesmulti) + { + + wchar_t tempFile[MAX_PATH]; + GetTempFileName(pageFilePath.c_str(), L"CNTK", 0, tempFile); + pagePaths.push_back(tempFile); + + } + + const bool mayhavenoframe=false; + int addEnergy = 0; + + //m_frameSourceMultiIO = new msra::dbn::minibatchframesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, randomize, pagepath, mayhavenoframe, addEnergy); + //m_frameSourceMultiIO->setverbosity(verbosity); + m_frameSource = new msra::dbn::minibatchframesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, pagePaths, mayhavenoframe, addEnergy); + m_frameSource->setverbosity(verbosity); + } + else + { + RuntimeError("readMethod must be rollingWindow or blockRandomize"); + } + + } + + // Load all input and output data. + // Note that the terms features imply be real-valued quanities and + // labels imply categorical quantities, irrespective of whether they + // are inputs or targets for the network + template + void HTKMLFReader::PrepareForWriting(const ConfigParameters& readerConfig) + { + vector scriptpaths; + vector filelist; + size_t numFiles; + size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing + size_t evalchunksize = 2048; + vector realDims; + size_t iFeat = 0; + vector numContextLeft; + vector numContextRight; + + std::vector featureNames; + std::vector labelNames; + GetDataNamesFromConfig(readerConfig, featureNames, labelNames); + + foreach_index(i, featureNames) + { + ConfigParameters thisFeature = readerConfig(featureNames[i]); + realDims.push_back(thisFeature("dim")); + + ConfigArray contextWindow = thisFeature("contextWindow", "1"); + if (contextWindow.size() == 1) // symmetric + { + size_t windowFrames = contextWindow[0]; + if (windowFrames % 2 == 0) + RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", windowFrames); + size_t context = windowFrames / 2; // extend each side by this + numContextLeft.push_back(context); + numContextRight.push_back(context); + + } + else if (contextWindow.size() == 2) // left context, right context + { + numContextLeft.push_back(contextWindow[0]); + numContextRight.push_back(contextWindow[1]); + } + else + { + RuntimeError("contextFrames must have 1 or 2 values specified, found %d", contextWindow.size()); + } + // update m_featDims to reflect the total input dimension (featDim x contextWindow), not the native feature dimension + // that is what the lower level feature readers expect + realDims[i] = realDims[i] * (1 + numContextLeft[i] + numContextRight[i]); + + string type = thisFeature("type","Real"); + if (type=="Real"){ + m_nameToTypeMap[featureNames[i]] = InputOutputTypes::real; + } + else{ + RuntimeError("feature type must be Real"); + } + + m_featureNameToIdMap[featureNames[i]]= iFeat; + scriptpaths.push_back(thisFeature("scpFile")); + m_featureNameToDimMap[featureNames[i]] = realDims[i]; + + m_featuresBufferMultiIO.push_back(NULL); + m_featuresBufferAllocatedMultiIO.push_back(0); + iFeat++; + } + + if (labelNames.size()>0) + RuntimeError("writer mode does not support labels as inputs, only features"); + + numFiles=0; + foreach_index(i,scriptpaths) + { + filelist.clear(); + std::wstring scriptpath = scriptpaths[i]; + fprintf(stderr, "reading script file %S ...", scriptpath.c_str()); + size_t n = 0; + for (msra::files::textreader reader(scriptpath); reader && filelist.size() <= firstfilesonly/*optimization*/; ) + { + filelist.push_back (reader.wgetline()); + n++; + } + + fprintf (stderr, " %d entries\n", n); + + if (i==0) + numFiles=n; + else + if (n!=numFiles) + throw std::runtime_error (msra::strfun::strprintf ("HTKMLFReader::InitEvalReader: number of files in each scriptfile inconsistent (%d vs. %d)", numFiles,n)); + + m_inputFilesMultiIO.push_back(filelist); + } + + m_fileEvalSource = new msra::dbn::FileEvalSource(realDims, numContextLeft, numContextRight, evalchunksize); + } + + + + // destructor - virtual so it gets called properly + template + HTKMLFReader::~HTKMLFReader() + { + delete m_mbiter; + delete m_readAheadSource; + delete m_frameSource; + delete m_lattices; + + if (!m_featuresBufferMultiIO.empty()) + { + if ( m_featuresBufferMultiIO[0] != NULL) + { + foreach_index(i, m_featuresBufferMultiIO) + { + delete[] m_featuresBufferMultiIO[i]; + m_featuresBufferMultiIO[i] = NULL; + } + } + } + if (!m_labelsBufferMultiIO.empty()) + { + if (m_labelsBufferMultiIO[0] != NULL) + { + foreach_index(i, m_labelsBufferMultiIO) + { + delete[] m_labelsBufferMultiIO[i]; + m_labelsBufferMultiIO[i] = NULL; + } + } + } + if (/*m_numberOfuttsPerMinibatch > 1 && */m_truncated) + { + for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i ++) + { + if (m_featuresBufferMultiUtt[i] != NULL) + { + delete[] m_featuresBufferMultiUtt[i]; + m_featuresBufferMultiUtt[i] = NULL; + } + if (m_labelsBufferMultiUtt[i] != NULL) + { + delete[] m_labelsBufferMultiUtt[i]; + m_labelsBufferMultiUtt[i] = NULL; + } + + } + } + } + + //StartMinibatchLoop - Startup a minibatch loop + // mbSize - [in] size of the minibatch (number of frames, etc.) + // epoch - [in] epoch number for this loop + // requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset + template + void HTKMLFReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) + { + m_mbSize = mbSize; + + if (m_trainOrTest) + { + StartMinibatchLoopToTrainOrTest(mbSize,epoch,requestedEpochSamples); + } + else + { + StartMinibatchLoopToWrite(mbSize,epoch,requestedEpochSamples); + } + m_checkDictionaryKeys=true; + } + + template + void HTKMLFReader::StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples) + { + size_t datapasses=1; + //size_t totalFrames = m_frameSource->totalframes(); + size_t totalFrames; + totalFrames = m_frameSource->totalframes(); + + size_t extraFrames = totalFrames%mbSize; + size_t minibatches = totalFrames/mbSize; + + // if we are allowing partial minibatches, do nothing, and let it go through + if (!m_partialMinibatch) + { + // we don't want any partial frames, so round total frames to be an even multiple of our mbSize + if (totalFrames > mbSize) + totalFrames -= extraFrames; + + if (requestedEpochSamples == requestDataSize) + { + requestedEpochSamples = totalFrames; + } + else if (minibatches > 0) // if we have any full minibatches + { + // since we skip the extraFrames, we need to add them to the total to get the actual number of frames requested + size_t sweeps = (requestedEpochSamples-1)/totalFrames; // want the number of sweeps we will skip the extra, so subtract 1 and divide + requestedEpochSamples += extraFrames*sweeps; + } + } + else if (requestedEpochSamples == requestDataSize) + { + requestedEpochSamples = totalFrames; + } + + // delete the old one first (in case called more than once) + delete m_mbiter; + msra::dbn::minibatchsource* source = m_frameSource; + if (m_readAhead) + { + if (m_readAheadSource == NULL) + { + m_readAheadSource = new msra::dbn::minibatchreadaheadsource (*source, requestedEpochSamples); + } + else if (m_readAheadSource->epochsize() != requestedEpochSamples) + { + delete m_readAheadSource; + m_readAheadSource = new msra::dbn::minibatchreadaheadsource (*source, requestedEpochSamples); + } + source = m_readAheadSource; + } + m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, mbSize, datapasses); + if (!m_featuresBufferMultiIO.empty()) + { + if (m_featuresBufferMultiIO[0]!=NULL) // check first feature, if it isn't NULL, safe to assume all are not NULL? + { + foreach_index(i, m_featuresBufferMultiIO) + { + delete[] m_featuresBufferMultiIO[i]; + m_featuresBufferMultiIO[i]=NULL; + m_featuresBufferAllocatedMultiIO[i]=0; + } + } + } + if (!m_labelsBufferMultiIO.empty()) + { + if (m_labelsBufferMultiIO[0]!=NULL) + { + foreach_index(i, m_labelsBufferMultiIO) + { + delete[] m_labelsBufferMultiIO[i]; + m_labelsBufferMultiIO[i]=NULL; + m_labelsBufferAllocatedMultiIO[i]=0; + } + } + } + if (m_numberOfuttsPerMinibatch && m_truncated == true) + { + m_noData = false; + m_featuresStartIndexMultiUtt.assign(m_featuresBufferMultiIO.size()*m_numberOfuttsPerMinibatch,0); + m_labelsStartIndexMultiUtt.assign(m_labelsBufferMultiIO.size()*m_numberOfuttsPerMinibatch,0); + for (size_t u = 0; u < m_numberOfuttsPerMinibatch; u ++) + { + if (m_featuresBufferMultiUtt[u] != NULL) + { + delete[] m_featuresBufferMultiUtt[u]; + m_featuresBufferMultiUtt[u] = NULL; + m_featuresBufferAllocatedMultiUtt[u] = 0; + } + if (m_labelsBufferMultiUtt[u] != NULL) + { + delete[] m_labelsBufferMultiUtt[u]; + m_labelsBufferMultiUtt[u] = NULL; + m_labelsBufferAllocatedMultiUtt[u] = 0; + } + ReNewBufferForMultiIO(u); + } + } + } + + template + void HTKMLFReader::StartMinibatchLoopToWrite(size_t mbSize, size_t /*epoch*/, size_t /*requestedEpochSamples*/) + { + m_fileEvalSource->Reset(); + m_fileEvalSource->SetMinibatchSize(mbSize); + //m_chunkEvalSourceMultiIO->reset(); + m_inputFileIndex=0; + + if (m_featuresBufferMultiIO[0]!=NULL) // check first feature, if it isn't NULL, safe to assume all are not NULL? + { + foreach_index(i, m_featuresBufferMultiIO) + { + delete[] m_featuresBufferMultiIO[i]; + m_featuresBufferMultiIO[i]=NULL; + m_featuresBufferAllocatedMultiIO[i]=0; + } + } + + } + + // GetMinibatch - Get the next minibatch (features and labels) + // matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponing matrix, + // [out] each matrix resized if necessary containing data. + // returns - true if there are more minibatches, false if no more minibatchs remain + template + bool HTKMLFReader::GetMinibatch(std::map*>& matrices) + { + if (m_trainOrTest) + { + return GetMinibatchToTrainOrTest(matrices); + } + else + { + return GetMinibatchToWrite(matrices); + } + } + + template + bool HTKMLFReader::GetMinibatchToTrainOrTest(std::map*>& matrices) + { + size_t id; + size_t dim; + bool skip = false; + + // on first minibatch, make sure we can supply data for requested nodes + std::map::iterator iter; + if (m_checkDictionaryKeys) + { + for (auto iter=matrices.begin();iter!=matrices.end();iter++) + { + if (m_nameToTypeMap.find(iter->first)==m_nameToTypeMap.end()) + throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ws not found in reader - cannot generate input\n",iter->first.c_str())); + + } + m_checkDictionaryKeys=false; + } + + do + { + if (m_truncated == false) + { + if (!(*m_mbiter)) + return false; + + // now, access all features and and labels by iterating over map of "matrices" + std::map*>::iterator iter; + for (iter = matrices.begin();iter!=matrices.end(); iter++) + { + // dereference matrix that corresponds to key (input/output name) and + // populate based on whether its a feature or a label + Matrix& data = *matrices[iter->first]; // can be features or labels + + if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) + { + + id = m_featureNameToIdMap[iter->first]; + dim = m_featureNameToDimMap[iter->first]; + const msra::dbn::matrixstripe feat = m_mbiter->frames(id); + const size_t actualmbsize = feat.cols(); // it may still return less if at end of sweep TODO: this check probably only needs to happen once + assert (actualmbsize == m_mbiter->currentmbframes()); + skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize); + + // check to see if we got the number of frames we requested + if (!skip) + { + // copy the features over to our array type + assert(feat.rows()==dim); // check feature dimension matches what's expected + + if (m_featuresBufferMultiIO[id]==NULL) + { + m_featuresBufferMultiIO[id] = new ElemType[feat.rows()*feat.cols()]; + m_featuresBufferAllocatedMultiIO[id] = feat.rows()*feat.cols(); + } + else if (m_featuresBufferAllocatedMultiIO[id]first] == InputOutputTypes::category) + { + id = m_labelNameToIdMap[iter->first]; + dim = m_labelNameToDimMap[iter->first]; + const vector & uids = m_mbiter->labels(id); + + // need skip logic here too in case labels are first in map not features + const size_t actualmbsize = uids.size(); // it may still return less if at end of sweep TODO: this check probably only needs to happen once + assert (actualmbsize == m_mbiter->currentmbframes()); + skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize); + + if (!skip) + { + // copy the labels over to array type + //data.Resize(udims[id], uids.size()); + //data.SetValue((ElemType)0); + + // loop through the columns and set one value to 1 + // in the future we want to use a sparse matrix here + //for (int i = 0; i < uids.size(); i++) + //{ + // assert(uids[i] first).c_str())); + } + + } + // advance to the next minibatch + (*m_mbiter)++; + } + else + { + if (m_noData) + { + bool endEpoch = true; + for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++) + { + if (m_processedFrame[i] != m_toProcess[i]) + { + endEpoch = false; + } + } + if(endEpoch) + { + return false; + } + } + size_t numOfFea = m_featuresBufferMultiIO.size(); + size_t numOfLabel = m_labelsBufferMultiIO.size(); + vector actualmbsize; + actualmbsize.assign(m_numberOfuttsPerMinibatch,0); + for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++) + { + size_t startFr = m_processedFrame[i]; + size_t endFr = 0; + if ((m_processedFrame[i] + m_mbSize) < m_toProcess[i]) + { + if(m_processedFrame[i] > 0) + { + m_sentenceEnd[i] = false; + m_switchFrame[i] = m_mbSize+1; + } + else + { + m_switchFrame[i] = 0; + m_sentenceEnd[i] = true; + } + actualmbsize[i] = m_mbSize; + endFr = startFr + actualmbsize[i]; + std::map*>::iterator iter; + for (iter = matrices.begin();iter!=matrices.end(); iter++) + { + // dereference matrix that corresponds to key (input/output name) and + // populate based on whether its a feature or a label + //Matrix& data = + *matrices[iter->first]; // can be features or labels + + if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) + { + id = m_featureNameToIdMap[iter->first]; + dim = m_featureNameToDimMap[iter->first]; + + if (m_featuresBufferMultiIO[id]==NULL) + { + m_featuresBufferMultiIO[id] = new ElemType[dim*m_mbSize*m_numberOfuttsPerMinibatch]; + m_featuresBufferAllocatedMultiIO[id] = dim*m_mbSize*m_numberOfuttsPerMinibatch; + } + else if (m_featuresBufferAllocatedMultiIO[id]first] == InputOutputTypes::category) + { + id = m_labelNameToIdMap[iter->first]; + dim = m_labelNameToDimMap[iter->first]; + if (m_labelsBufferMultiIO[id]==NULL) + { + m_labelsBufferMultiIO[id] = new ElemType[dim*m_mbSize*m_numberOfuttsPerMinibatch]; + m_labelsBufferAllocatedMultiIO[id] = dim*m_mbSize*m_numberOfuttsPerMinibatch; + } + else if (m_labelsBufferAllocatedMultiIO[id]*>::iterator iter; + for (iter = matrices.begin();iter!=matrices.end(); iter++) + { + // dereference matrix that corresponds to key (input/output name) and + // populate based on whether its a feature or a label + //Matrix& data = + *matrices[iter->first]; // can be features or labels + + if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) + { + id = m_featureNameToIdMap[iter->first]; + dim = m_featureNameToDimMap[iter->first]; + + if (m_featuresBufferMultiIO[id]==NULL) + { + m_featuresBufferMultiIO[id] = new ElemType[dim*m_mbSize*m_numberOfuttsPerMinibatch]; + m_featuresBufferAllocatedMultiIO[id] = dim*m_mbSize*m_numberOfuttsPerMinibatch; + } + else if (m_featuresBufferAllocatedMultiIO[id]first] == InputOutputTypes::category) + { + id = m_labelNameToIdMap[iter->first]; + dim = m_labelNameToDimMap[iter->first]; + if (m_labelsBufferMultiIO[id]==NULL) + { + m_labelsBufferMultiIO[id] = new ElemType[dim*m_mbSize*m_numberOfuttsPerMinibatch]; + m_labelsBufferAllocatedMultiIO[id] = dim*m_mbSize*m_numberOfuttsPerMinibatch; + } + else if (m_labelsBufferAllocatedMultiIO[id]& data = + *matrices[iter->first]; // can be features or labels + + if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) + { + id = m_featureNameToIdMap[iter->first]; + dim = m_featureNameToDimMap[iter->first]; + if (sizeof(ElemType) == sizeof(float)) + { + for (size_t j = startFr,k = 0; j < endFr; j++,k++) // column major, so iterate columns + { + // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns + memcpy_s(&m_featuresBufferMultiIO[id][(j*m_numberOfuttsPerMinibatch+i)*dim],sizeof(ElemType)*dim,&m_featuresBufferMultiUtt[i][k*dim+m_featuresStartIndexMultiUtt[id+i*numOfFea]],sizeof(ElemType)*dim); + } + } + else + { + for (size_t j=startFr,k=0; j < endFr; j++,k++) // column major, so iterate columns in outside loop + { + for (int d = 0; d < dim; d++) + { + m_featuresBufferMultiIO[id][(j*m_numberOfuttsPerMinibatch+i)*dim+d] = m_featuresBufferMultiUtt[i][k*dim+d+m_featuresStartIndexMultiUtt[id+i*numOfFea]]; + } + } + } + } + else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category) + { + id = m_labelNameToIdMap[iter->first]; + dim = m_labelNameToDimMap[iter->first]; + for (size_t j = startFr,k=0; j < endFr; j++,k++) + { + for (int d = 0; d < dim; d++) + { + m_labelsBufferMultiIO[id][(j*m_numberOfuttsPerMinibatch+i)*dim + d] = m_labelsBufferMultiUtt[i][k*dim+d+m_labelsStartIndexMultiUtt[id+i*numOfLabel]]; + } + } + } + } + + if (reNewSucc) m_processedFrame[i] += (endFr-startFr); + + } + } + std::map*>::iterator iter; + for (iter = matrices.begin();iter!=matrices.end(); iter++) + { + // dereference matrix that corresponds to key (input/output name) and + // populate based on whether its a feature or a label + Matrix& data = *matrices[iter->first]; // can be features or labels + if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) + { + id = m_featureNameToIdMap[iter->first]; + dim = m_featureNameToDimMap[iter->first]; + data.SetValue(dim, m_mbSize*m_numberOfuttsPerMinibatch, m_featuresBufferMultiIO[id],matrixFlagNormal); + } + else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category) + { + id = m_labelNameToIdMap[iter->first]; + dim = m_labelNameToDimMap[iter->first]; + data.SetValue(dim, m_mbSize*m_numberOfuttsPerMinibatch, m_labelsBufferMultiIO[id],matrixFlagNormal); + } + } + skip=false; + } + } // keep going if we didn't get the right size minibatch + while(skip); + + return true; + } + + template + bool HTKMLFReader::GetMinibatchToWrite(std::map*>& matrices) + { + std::map::iterator iter; + if (m_checkDictionaryKeys) + { + for (auto iter=m_featureNameToIdMap.begin();iter!=m_featureNameToIdMap.end();iter++) + { + if (matrices.find(iter->first)==matrices.end()) + { + fprintf(stderr,"GetMinibatchToWrite: feature node %ws specified in reader not found in the network\n",iter->first.c_str()); + throw std::runtime_error("GetMinibatchToWrite: feature node specified in reader not found in the network."); + } + } + /* + for (auto iter=matrices.begin();iter!=matrices.end();iter++) + { + if (m_featureNameToIdMap.find(iter->first)==m_featureNameToIdMap.end()) + throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ws not found in reader - cannot generate input\n",iter->first.c_str())); + } + */ + m_checkDictionaryKeys=false; + } + + if (m_inputFileIndexReset(); + + // load next file (or set of files) + foreach_index(i, m_inputFilesMultiIO) + { + msra::asr::htkfeatreader reader; + + const auto path = reader.parse(m_inputFilesMultiIO[i][m_inputFileIndex]); + // read file + msra::dbn::matrix feat; + string featkind; + unsigned int sampperiod; + msra::util::attempt (5, [&]() + { + reader.read (path, featkind, sampperiod, feat); // whole file read as columns of feature vectors + }); + fprintf (stderr, "evaluate: reading %d frames of %S\n", feat.cols(), ((wstring)path).c_str()); + m_fileEvalSource->AddFile(feat, featkind, sampperiod, i); + } + m_inputFileIndex++; + + // turn frames into minibatch (augment neighbors, etc) + m_fileEvalSource->CreateEvalMinibatch(); + + // populate input matrices + + std::map*>::iterator iter; + for (iter = matrices.begin();iter!=matrices.end(); iter++) + { + // dereference matrix that corresponds to key (input/output name) and + // populate based on whether its a feature or a label + + if (m_nameToTypeMap.find(iter->first)!=m_nameToTypeMap.end() && m_nameToTypeMap[iter->first] == InputOutputTypes::real) + { + Matrix& data = *matrices[iter->first]; // can be features or labels + size_t id = m_featureNameToIdMap[iter->first]; + size_t dim = m_featureNameToDimMap[iter->first]; + + const msra::dbn::matrix feat = m_fileEvalSource->ChunkOfFrames(id); + + // copy the features over to our array type + assert(feat.rows()==dim); dim; // check feature dimension matches what's expected + + if (m_featuresBufferMultiIO[id]==NULL) + { + m_featuresBufferMultiIO[id] = new ElemType[feat.rows()*feat.cols()]; + m_featuresBufferAllocatedMultiIO[id] = feat.rows()*feat.cols(); + } + else if (m_featuresBufferAllocatedMultiIO[id] + bool HTKMLFReader::ReNewBufferForMultiIO(size_t i) + { + if (m_noData) + { + return false; + } + size_t numOfFea = m_featuresBufferMultiIO.size(); + size_t numOfLabel = m_labelsBufferMultiIO.size(); + + size_t totalFeatNum = 0; + foreach_index(id, m_featuresBufferAllocatedMultiIO) + { + const msra::dbn::matrixstripe featOri = m_mbiter->frames(id); + size_t fdim = featOri.rows(); + const size_t actualmbsizeOri = featOri.cols(); + m_featuresStartIndexMultiUtt[id+i*numOfFea] = totalFeatNum; + totalFeatNum = fdim * actualmbsizeOri + m_featuresStartIndexMultiUtt[id+i*numOfFea]; + } + if (m_featuresBufferMultiUtt[i]==NULL) + { + m_featuresBufferMultiUtt[i] = new ElemType[totalFeatNum]; + m_featuresBufferAllocatedMultiUtt[i] = totalFeatNum; + } + else if (m_featuresBufferAllocatedMultiUtt[i] < totalFeatNum) //buffer size changed. can be partial minibatch + { + delete[] m_featuresBufferMultiUtt[i]; + m_featuresBufferMultiUtt[i] = new ElemType[totalFeatNum]; + m_featuresBufferAllocatedMultiUtt[i] = totalFeatNum; + } + + size_t totalLabelsNum = 0; + for (auto it = m_labelNameToIdMap.begin(); it != m_labelNameToIdMap.end(); ++it) + { + size_t id = m_labelNameToIdMap[it->first]; + size_t dim = m_labelNameToDimMap[it->first]; + + const vector & uids = m_mbiter->labels(id); + size_t actualmbsizeOri = uids.size(); + m_labelsStartIndexMultiUtt[id+i*numOfLabel] = totalLabelsNum; + totalLabelsNum = m_labelsStartIndexMultiUtt[id+i*numOfLabel] + dim * actualmbsizeOri; + } + + if (m_labelsBufferMultiUtt[i]==NULL) + { + m_labelsBufferMultiUtt[i] = new ElemType[totalLabelsNum]; + m_labelsBufferAllocatedMultiUtt[i] = totalLabelsNum; + } + else if (m_labelsBufferAllocatedMultiUtt[i] < totalLabelsNum) + { + delete[] m_labelsBufferMultiUtt[i]; + m_labelsBufferMultiUtt[i] = new ElemType[totalLabelsNum]; + m_labelsBufferAllocatedMultiUtt[i] = totalLabelsNum; + } + + memset(m_labelsBufferMultiUtt[i],0,sizeof(ElemType)*totalLabelsNum); + + bool first = true; + foreach_index(id, m_featuresBufferMultiIO) + { + const msra::dbn::matrixstripe featOri = m_mbiter->frames(id); + const size_t actualmbsizeOri = featOri.cols(); + size_t fdim = featOri.rows(); + if (first) + { + m_toProcess[i] = actualmbsizeOri; + first = false; + } + else + { + if (m_toProcess[i] != actualmbsizeOri) + { + throw std::runtime_error("The multi-IO features has inconsistent number of frames!"); + } + } + assert (actualmbsizeOri == m_mbiter->currentmbframes()); + + if (sizeof(ElemType) == sizeof(float)) + { + for (int k = 0; k < actualmbsizeOri; k++) // column major, so iterate columns + { + // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns + memcpy_s(&m_featuresBufferMultiUtt[i][k*fdim+m_featuresStartIndexMultiUtt[id+i*numOfFea]],sizeof(ElemType)*fdim,&featOri(0,k),sizeof(ElemType)*fdim); + } + } + else + { + for (int k=0; k < actualmbsizeOri; k++) // column major, so iterate columns in outside loop + { + for (int d = 0; d < featOri.rows(); d++) + { + m_featuresBufferMultiUtt[i][k*featOri.rows()+d+m_featuresStartIndexMultiUtt[id+i*numOfFea]] = featOri(d,k); + } + } + } + } + + for (auto it = m_labelNameToIdMap.begin(); it != m_labelNameToIdMap.end(); ++it) + { + size_t id = m_labelNameToIdMap[it->first]; + size_t dim = m_labelNameToDimMap[it->first]; + + const vector & uids = m_mbiter->labels(id); + size_t actualmbsizeOri = uids.size(); + + if (m_convertLabelsToTargetsMultiIO[id]) + { + size_t labelDim = m_labelToTargetMapMultiIO[id].size(); + for (int k=0; k < actualmbsizeOri; k++) + { + assert(uids[k] < labelDim); labelDim; + size_t labelId = uids[k]; + for (int j = 0; j < dim; j++) + { + m_labelsBufferMultiUtt[i][k*dim + j + m_labelsStartIndexMultiUtt[id+i*numOfLabel]] = m_labelToTargetMapMultiIO[id][labelId][j]; + } + } + } + else + { + // loop through the columns and set one value to 1 + // in the future we want to use a sparse matrix here + for (int k=0; k < actualmbsizeOri; k++) + { + assert(uids[k] < dim); + //labels(uids[i], i) = (ElemType)1; + m_labelsBufferMultiUtt[i][k*dim+uids[k]+m_labelsStartIndexMultiUtt[id+i*numOfLabel]]=(ElemType)1; + } + } + } + m_processedFrame[i] = 0; + + (*m_mbiter)++; + if (!(*m_mbiter)) + m_noData = true; + + return true; + } + + + + + // GetLabelMapping - Gets the label mapping from integer to type in file + // mappingTable - a map from numeric datatype to native label type stored as a string + template + const std::map::LabelIdType, typename IDataReader::LabelType>& HTKMLFReader::GetLabelMapping(const std::wstring& /*sectionName*/) + { + return m_idToLabelMap; + } + + // SetLabelMapping - Sets the label mapping from integer index to label + // labelMapping - mapping table from label values to IDs (must be 0-n) + // note: for tasks with labels, the mapping table must be the same between a training run and a testing run + template + void HTKMLFReader::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map::LabelIdType, typename IDataReader::LabelType>& labelMapping) + { + m_idToLabelMap = labelMapping; + } + + template + size_t HTKMLFReader::ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector>& labelToTargetMap) + { + if (labelListFile==L"") + throw std::runtime_error("HTKMLFReader::ReadLabelToTargetMappingFile(): cannot read labelToTargetMappingFile without a labelMappingFile!"); + + vector labelList; + size_t count, numLabels; + count=0; + // read statelist first + msra::files::textreader labelReader(labelListFile); + while(labelReader) + { + labelList.push_back(labelReader.wgetline()); + count++; + } + numLabels=count; + count=0; + msra::files::textreader mapReader(labelToTargetMappingFile); + size_t targetDim = 0; + while(mapReader) + { + std::wstring line(mapReader.wgetline()); + // find white space as a demarcation + std::wstring::size_type pos = line.find(L" "); + std::wstring token = line.substr(0,pos); + std::wstring targetstring = line.substr(pos+1); + + if (labelList[count]!=token) + RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): mismatch between labelMappingFile and labelToTargetMappingFile"); + + if (count==0) + targetDim = targetstring.length(); + else if (targetDim!=targetstring.length()) + RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): inconsistent target length among records"); + + std::vector targetVector(targetstring.length(),(ElemType)0.0); + foreach_index(i, targetstring) + { + if (targetstring.compare(i,1,L"1")==0) + targetVector[i] = (ElemType)1.0; + else if (targetstring.compare(i,1,L"0")!=0) + RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): expecting label2target mapping to contain only 1's or 0's"); + } + labelToTargetMap.push_back(targetVector); + count++; + } + + // verify that statelist and label2target mapping file are in same order (to match up with reader) while reading mapping + if (count!=labelList.size()) + RuntimeError("HTKMLFReader::ReadLabelToTargetMappingFile(): mismatch between lengths of labelMappingFile vs labelToTargetMappingFile"); + + return targetDim; + } + + // GetData - Gets metadata from the specified section (into CPU memory) + // sectionName - section name to retrieve data from + // numRecords - number of records to read + // data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accomidate request + // dataBufferSize - [in] size of the databuffer in bytes + // [out] size of buffer filled with data + // recordStart - record to start reading from, defaults to zero (start of data) + // returns: true if data remains to be read, false if the end of data was reached + template + bool HTKMLFReader::GetData(const std::wstring& /*sectionName*/, size_t /*numRecords*/, void* /*data*/, size_t& /*dataBufferSize*/, size_t /*recordStart*/) + { + throw std::runtime_error("GetData not supported in HTKMLFReader"); + } + + + template + bool HTKMLFReader::DataEnd(EndDataType endDataType) + { + // each minibatch is considered a "sentence" + // other datatypes not really supported... + // assert(endDataType == endDataSentence); + // for the truncated BPTT, we need to support check wether it's the end of data + bool ret = false; + switch (endDataType) + { + case endDataNull: + case endDataEpoch: + case endDataSet: + throw std::logic_error("DataEnd: does not support endDataTypes: endDataNull, endDataEpoch and endDataSet"); + break; + case endDataSentence: + if (m_truncated) + ret = m_sentenceEnd[0]; + else + ret = true; // useless in current condition + break; + } + return ret; + } + + template + void HTKMLFReader::SetSentenceEndInBatch(vector &sentenceEnd) + { + sentenceEnd.resize(m_switchFrame.size()); + for (size_t i = 0; i < m_switchFrame.size() ; i++) + { + sentenceEnd[i] = m_switchFrame[i]; + } + } + + // GetFileConfigNames - determine the names of the features and labels sections in the config file + // features - [in,out] a vector of feature name strings + // labels - [in,out] a vector of label name strings + template + void HTKMLFReader::GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels) + { + for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter) + { + auto pair = *iter; + ConfigParameters temp = iter->second; + // see if we have a config parameters that contains a "file" element, it's a sub key, use it + if (temp.ExistsCurrent("scpFile")) + { + features.push_back(msra::strfun::utf16(iter->first)); + } + else if (temp.ExistsCurrent("mlfFile")) + { + labels.push_back(msra::strfun::utf16(iter->first)); + } + + } + } + + template class HTKMLFReader; + template class HTKMLFReader; + }}} diff --git a/DataReader/HTKMLFReader/HTKMLFReader.h b/DataReader/HTKMLFReader/HTKMLFReader.h index 78f8d41c7..3b7692f4b 100644 --- a/DataReader/HTKMLFReader/HTKMLFReader.h +++ b/DataReader/HTKMLFReader/HTKMLFReader.h @@ -1,110 +1,114 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// HTKMLFReader.h - Include file for the MTK and MLF format of features and samples -#pragma once -#include "DataReader.h" - -namespace Microsoft { namespace MSR { namespace CNTK { - -template -class HTKMLFReader : public IDataReader -{ -private: - msra::dbn::minibatchiterator* m_mbiter; - msra::dbn::minibatchsource* m_frameSource; - msra::dbn::minibatchreadaheadsource* m_readAheadSource; - msra::dbn::FileEvalSource* m_fileEvalSource; - msra::dbn::latticesource* m_lattices; - map m_latticeMap; - - vector m_sentenceEnd; - bool m_readAhead; - bool m_truncated; - vector m_processedFrame; - size_t m_numberOfuttsPerMinibatch; - size_t m_actualnumberOfuttsPerMinibatch; - size_t m_mbSize; - vector m_toProcess; - vector m_switchFrame; - bool m_noData; - - bool m_trainOrTest; // if false, in file writing mode - - std::map m_idToLabelMap; - - bool m_partialMinibatch; // allow partial minibatches? - - std::vector m_featuresBufferMultiUtt; - std::vector m_featuresBufferAllocatedMultiUtt; - std::vector m_labelsBufferMultiUtt; - std::vector m_labelsBufferAllocatedMultiUtt; - std::vector m_featuresStartIndexMultiUtt; - std::vector m_labelsStartIndexMultiUtt; - - std::vector m_featuresBufferMultiIO; - std::vector m_featuresBufferAllocatedMultiIO; - std::vector m_labelsBufferMultiIO; - std::vector m_labelsBufferAllocatedMultiIO; - - std::map m_featureNameToIdMap; - std::map m_labelNameToIdMap; - std::map m_nameToTypeMap; - std::map m_featureNameToDimMap; - std::map m_labelNameToDimMap; - // for writing outputs to files (standard single input/output network) - deprecate eventually - bool m_checkDictionaryKeys; - bool m_convertLabelsToTargets; - std::vector m_convertLabelsToTargetsMultiIO; - std::vector> m_inputFilesMultiIO; - - size_t m_inputFileIndex; - std::vector m_featDims; - std::vector m_labelDims; - - std::vector>>m_labelToTargetMapMultiIO; - - void PrepareForTrainingOrTesting(const ConfigParameters& config); - void PrepareForWriting(const ConfigParameters& config); - - bool GetMinibatchToTrainOrTest(std::map*>&matrices); - bool GetMinibatchToWrite(std::map*>&matrices); - - void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); - void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); - - bool ReNewBufferForMultiIO(size_t i); - - size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;} - void SetNbrSlicesEachRecurrentIter(const size_t) { }; - - void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels); - - - size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector>& labelToTargetMap); - enum InputOutputTypes - { - real, - category, - }; - - - -public: - virtual void Init(const ConfigParameters& config); - virtual void Destroy() {delete this;} - virtual ~HTKMLFReader(); - virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); - virtual bool GetMinibatch(std::map*>& matrices); - virtual const std::map& GetLabelMapping(const std::wstring& sectionName); - virtual void SetLabelMapping(const std::wstring& sectionName, const std::map& labelMapping); - virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0); - - virtual bool DataEnd(EndDataType endDataType); - void SetSentenceEndInBatch(vector &/*sentenceEnd*/); - void SetSentenceEnd(int /*actualMbSize*/){}; -}; - +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +// HTKMLFReader.h - Include file for the MTK and MLF format of features and samples +#pragma once +#include "DataReader.h" + +namespace Microsoft { namespace MSR { namespace CNTK { + +template +class HTKMLFReader : public IDataReader +{ +private: + + const static size_t m_htkRandomizeAuto = 0; + const static size_t m_htkRandomizeDisable = (size_t)-1; + + msra::dbn::minibatchiterator* m_mbiter; + msra::dbn::minibatchsource* m_frameSource; + msra::dbn::minibatchreadaheadsource* m_readAheadSource; + msra::dbn::FileEvalSource* m_fileEvalSource; + msra::dbn::latticesource* m_lattices; + map m_latticeMap; + + vector m_sentenceEnd; + bool m_readAhead; + bool m_truncated; + vector m_processedFrame; + size_t m_numberOfuttsPerMinibatch; + size_t m_actualnumberOfuttsPerMinibatch; + size_t m_mbSize; + vector m_toProcess; + vector m_switchFrame; + bool m_noData; + + bool m_trainOrTest; // if false, in file writing mode + + std::map m_idToLabelMap; + + bool m_partialMinibatch; // allow partial minibatches? + + std::vector m_featuresBufferMultiUtt; + std::vector m_featuresBufferAllocatedMultiUtt; + std::vector m_labelsBufferMultiUtt; + std::vector m_labelsBufferAllocatedMultiUtt; + std::vector m_featuresStartIndexMultiUtt; + std::vector m_labelsStartIndexMultiUtt; + + std::vector m_featuresBufferMultiIO; + std::vector m_featuresBufferAllocatedMultiIO; + std::vector m_labelsBufferMultiIO; + std::vector m_labelsBufferAllocatedMultiIO; + + std::map m_featureNameToIdMap; + std::map m_labelNameToIdMap; + std::map m_nameToTypeMap; + std::map m_featureNameToDimMap; + std::map m_labelNameToDimMap; + // for writing outputs to files (standard single input/output network) - deprecate eventually + bool m_checkDictionaryKeys; + bool m_convertLabelsToTargets; + std::vector m_convertLabelsToTargetsMultiIO; + std::vector> m_inputFilesMultiIO; + + size_t m_inputFileIndex; + std::vector m_featDims; + std::vector m_labelDims; + + std::vector>>m_labelToTargetMapMultiIO; + + void PrepareForTrainingOrTesting(const ConfigParameters& config); + void PrepareForWriting(const ConfigParameters& config); + + bool GetMinibatchToTrainOrTest(std::map*>&matrices); + bool GetMinibatchToWrite(std::map*>&matrices); + + void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); + void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); + + bool ReNewBufferForMultiIO(size_t i); + + size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;} + void SetNbrSlicesEachRecurrentIter(const size_t) { }; + + void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels); + + + size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector>& labelToTargetMap); + enum InputOutputTypes + { + real, + category, + }; + + + +public: + virtual void Init(const ConfigParameters& config); + virtual void Destroy() {delete this;} + virtual ~HTKMLFReader(); + virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); + virtual bool GetMinibatch(std::map*>& matrices); + virtual const std::map& GetLabelMapping(const std::wstring& sectionName); + virtual void SetLabelMapping(const std::wstring& sectionName, const std::map& labelMapping); + virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0); + + virtual bool DataEnd(EndDataType endDataType); + void SetSentenceEndInBatch(vector &/*sentenceEnd*/); + void SetSentenceEnd(int /*actualMbSize*/){}; +}; + }}} \ No newline at end of file diff --git a/DataReader/HTKMLFReader/chunkevalsource.h b/DataReader/HTKMLFReader/chunkevalsource.h index 3ca69ff19..871be63b2 100644 --- a/DataReader/HTKMLFReader/chunkevalsource.h +++ b/DataReader/HTKMLFReader/chunkevalsource.h @@ -339,7 +339,20 @@ namespace msra { namespace dbn { feat[i].resize(vdims[i], framesInBlock); // input features for whole utt (col vectors) // augment the features //msra::dbn::augmentneighbors(framesMulti[i], boundaryFlags, 0, leftcontext[i], rightcontext[i],) - msra::dbn::augmentneighbors (framesMulti[i], boundaryFlags, leftcontext[i], rightcontext[i], 0, framesInBlock, feat[i]); + + size_t leftextent, rightextent; + // page in the needed range of frames + if (leftcontext[i] == 0 && rightcontext[i] == 0) + { + leftextent = rightextent = augmentationextent(framesMulti[i][0].size(), vdims[i]); + } + else + { + leftextent = leftcontext[i]; + rightextent = rightcontext[i]; + } + + msra::dbn::augmentneighbors (framesMulti[i], boundaryFlags, leftextent, rightextent, 0, framesInBlock, feat[i]); } minibatchReady=true; } diff --git a/MachineLearning/cn/cn.vcxproj b/MachineLearning/cn/cn.vcxproj index 7c7bfcc69..4f301d7aa 100644 --- a/MachineLearning/cn/cn.vcxproj +++ b/MachineLearning/cn/cn.vcxproj @@ -112,7 +112,7 @@ Level4 Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + DISPLAY_DEBUG;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) true true true