bug fixes in HTKMLFReader: randomizationNone for rollingwindow and context window setting for write

This commit is contained in:
Mike Seltzer 2015-02-05 19:26:36 -08:00
Родитель 8fbcf8f5b7
Коммит c7085c7e2d
4 изменённых файлов: 1669 добавлений и 1652 удалений

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,110 +1,114 @@
//
// <copyright file="HTKMLFReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// HTKMLFReader.h - Include file for the MTK and MLF format of features and samples
#pragma once
#include "DataReader.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class HTKMLFReader : public IDataReader<ElemType>
{
private:
msra::dbn::minibatchiterator* m_mbiter;
msra::dbn::minibatchsource* m_frameSource;
msra::dbn::minibatchreadaheadsource* m_readAheadSource;
msra::dbn::FileEvalSource* m_fileEvalSource;
msra::dbn::latticesource* m_lattices;
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
vector<bool> m_sentenceEnd;
bool m_readAhead;
bool m_truncated;
vector<size_t> m_processedFrame;
size_t m_numberOfuttsPerMinibatch;
size_t m_actualnumberOfuttsPerMinibatch;
size_t m_mbSize;
vector<size_t> m_toProcess;
vector<size_t> m_switchFrame;
bool m_noData;
bool m_trainOrTest; // if false, in file writing mode
std::map<LabelIdType, LabelType> m_idToLabelMap;
bool m_partialMinibatch; // allow partial minibatches?
std::vector<ElemType*> m_featuresBufferMultiUtt;
std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
std::vector<ElemType*> m_labelsBufferMultiUtt;
std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
std::vector<size_t> m_featuresStartIndexMultiUtt;
std::vector<size_t> m_labelsStartIndexMultiUtt;
std::vector<ElemType*> m_featuresBufferMultiIO;
std::vector<size_t> m_featuresBufferAllocatedMultiIO;
std::vector<ElemType*> m_labelsBufferMultiIO;
std::vector<size_t> m_labelsBufferAllocatedMultiIO;
std::map<std::wstring,size_t> m_featureNameToIdMap;
std::map<std::wstring,size_t> m_labelNameToIdMap;
std::map<std::wstring,size_t> m_nameToTypeMap;
std::map<std::wstring,size_t> m_featureNameToDimMap;
std::map<std::wstring,size_t> m_labelNameToDimMap;
// for writing outputs to files (standard single input/output network) - deprecate eventually
bool m_checkDictionaryKeys;
bool m_convertLabelsToTargets;
std::vector <bool> m_convertLabelsToTargetsMultiIO;
std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
size_t m_inputFileIndex;
std::vector<size_t> m_featDims;
std::vector<size_t> m_labelDims;
std::vector<std::vector<std::vector<ElemType>>>m_labelToTargetMapMultiIO;
void PrepareForTrainingOrTesting(const ConfigParameters& config);
void PrepareForWriting(const ConfigParameters& config);
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>&matrices);
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>&matrices);
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
bool ReNewBufferForMultiIO(size_t i);
size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;}
void SetNbrSlicesEachRecurrentIter(const size_t) { };
void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
enum InputOutputTypes
{
real,
category,
};
public:
virtual void Init(const ConfigParameters& config);
virtual void Destroy() {delete this;}
virtual ~HTKMLFReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<unsigned, LabelType>& labelMapping);
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
virtual bool DataEnd(EndDataType endDataType);
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
void SetSentenceEnd(int /*actualMbSize*/){};
};
//
// <copyright file="HTKMLFReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// HTKMLFReader.h - Include file for the MTK and MLF format of features and samples
#pragma once
#include "DataReader.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class HTKMLFReader : public IDataReader<ElemType>
{
private:
const static size_t m_htkRandomizeAuto = 0;
const static size_t m_htkRandomizeDisable = (size_t)-1;
msra::dbn::minibatchiterator* m_mbiter;
msra::dbn::minibatchsource* m_frameSource;
msra::dbn::minibatchreadaheadsource* m_readAheadSource;
msra::dbn::FileEvalSource* m_fileEvalSource;
msra::dbn::latticesource* m_lattices;
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
vector<bool> m_sentenceEnd;
bool m_readAhead;
bool m_truncated;
vector<size_t> m_processedFrame;
size_t m_numberOfuttsPerMinibatch;
size_t m_actualnumberOfuttsPerMinibatch;
size_t m_mbSize;
vector<size_t> m_toProcess;
vector<size_t> m_switchFrame;
bool m_noData;
bool m_trainOrTest; // if false, in file writing mode
std::map<LabelIdType, LabelType> m_idToLabelMap;
bool m_partialMinibatch; // allow partial minibatches?
std::vector<ElemType*> m_featuresBufferMultiUtt;
std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
std::vector<ElemType*> m_labelsBufferMultiUtt;
std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
std::vector<size_t> m_featuresStartIndexMultiUtt;
std::vector<size_t> m_labelsStartIndexMultiUtt;
std::vector<ElemType*> m_featuresBufferMultiIO;
std::vector<size_t> m_featuresBufferAllocatedMultiIO;
std::vector<ElemType*> m_labelsBufferMultiIO;
std::vector<size_t> m_labelsBufferAllocatedMultiIO;
std::map<std::wstring,size_t> m_featureNameToIdMap;
std::map<std::wstring,size_t> m_labelNameToIdMap;
std::map<std::wstring,size_t> m_nameToTypeMap;
std::map<std::wstring,size_t> m_featureNameToDimMap;
std::map<std::wstring,size_t> m_labelNameToDimMap;
// for writing outputs to files (standard single input/output network) - deprecate eventually
bool m_checkDictionaryKeys;
bool m_convertLabelsToTargets;
std::vector <bool> m_convertLabelsToTargetsMultiIO;
std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
size_t m_inputFileIndex;
std::vector<size_t> m_featDims;
std::vector<size_t> m_labelDims;
std::vector<std::vector<std::vector<ElemType>>>m_labelToTargetMapMultiIO;
void PrepareForTrainingOrTesting(const ConfigParameters& config);
void PrepareForWriting(const ConfigParameters& config);
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>&matrices);
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>&matrices);
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
bool ReNewBufferForMultiIO(size_t i);
size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;}
void SetNbrSlicesEachRecurrentIter(const size_t) { };
void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
enum InputOutputTypes
{
real,
category,
};
public:
virtual void Init(const ConfigParameters& config);
virtual void Destroy() {delete this;}
virtual ~HTKMLFReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<unsigned, LabelType>& labelMapping);
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
virtual bool DataEnd(EndDataType endDataType);
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
void SetSentenceEnd(int /*actualMbSize*/){};
};
}}}

Просмотреть файл

@ -339,7 +339,20 @@ namespace msra { namespace dbn {
feat[i].resize(vdims[i], framesInBlock); // input features for whole utt (col vectors)
// augment the features
//msra::dbn::augmentneighbors(framesMulti[i], boundaryFlags, 0, leftcontext[i], rightcontext[i],)
msra::dbn::augmentneighbors (framesMulti[i], boundaryFlags, leftcontext[i], rightcontext[i], 0, framesInBlock, feat[i]);
size_t leftextent, rightextent;
// page in the needed range of frames
if (leftcontext[i] == 0 && rightcontext[i] == 0)
{
leftextent = rightextent = augmentationextent(framesMulti[i][0].size(), vdims[i]);
}
else
{
leftextent = leftcontext[i];
rightextent = rightcontext[i];
}
msra::dbn::augmentneighbors (framesMulti[i], boundaryFlags, leftextent, rightextent, 0, framesInBlock, feat[i]);
}
minibatchReady=true;
}

Просмотреть файл

@ -112,7 +112,7 @@
</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>DISPLAY_DEBUG;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<OpenMPSupport>true</OpenMPSupport>
<TreatWarningAsError>true</TreatWarningAsError>