bug fixes in HTKMLFReader: randomizationNone for rollingwindow and context window setting for write
This commit is contained in:
Родитель
8fbcf8f5b7
Коммит
c7085c7e2d
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,110 +1,114 @@
|
|||
//
|
||||
// <copyright file="HTKMLFReader.h" company="Microsoft">
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// </copyright>
|
||||
//
|
||||
// HTKMLFReader.h - Include file for the HTK and MLF format of features and samples
|
||||
#pragma once
|
||||
#include "DataReader.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
template<class ElemType>
|
||||
class HTKMLFReader : public IDataReader<ElemType>
|
||||
{
|
||||
private:
|
||||
msra::dbn::minibatchiterator* m_mbiter;
|
||||
msra::dbn::minibatchsource* m_frameSource;
|
||||
msra::dbn::minibatchreadaheadsource* m_readAheadSource;
|
||||
msra::dbn::FileEvalSource* m_fileEvalSource;
|
||||
msra::dbn::latticesource* m_lattices;
|
||||
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
|
||||
|
||||
vector<bool> m_sentenceEnd;
|
||||
bool m_readAhead;
|
||||
bool m_truncated;
|
||||
vector<size_t> m_processedFrame;
|
||||
size_t m_numberOfuttsPerMinibatch;
|
||||
size_t m_actualnumberOfuttsPerMinibatch;
|
||||
size_t m_mbSize;
|
||||
vector<size_t> m_toProcess;
|
||||
vector<size_t> m_switchFrame;
|
||||
bool m_noData;
|
||||
|
||||
bool m_trainOrTest; // if false, in file writing mode
|
||||
|
||||
std::map<LabelIdType, LabelType> m_idToLabelMap;
|
||||
|
||||
bool m_partialMinibatch; // allow partial minibatches?
|
||||
|
||||
std::vector<ElemType*> m_featuresBufferMultiUtt;
|
||||
std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
|
||||
std::vector<ElemType*> m_labelsBufferMultiUtt;
|
||||
std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
|
||||
std::vector<size_t> m_featuresStartIndexMultiUtt;
|
||||
std::vector<size_t> m_labelsStartIndexMultiUtt;
|
||||
|
||||
std::vector<ElemType*> m_featuresBufferMultiIO;
|
||||
std::vector<size_t> m_featuresBufferAllocatedMultiIO;
|
||||
std::vector<ElemType*> m_labelsBufferMultiIO;
|
||||
std::vector<size_t> m_labelsBufferAllocatedMultiIO;
|
||||
|
||||
std::map<std::wstring,size_t> m_featureNameToIdMap;
|
||||
std::map<std::wstring,size_t> m_labelNameToIdMap;
|
||||
std::map<std::wstring,size_t> m_nameToTypeMap;
|
||||
std::map<std::wstring,size_t> m_featureNameToDimMap;
|
||||
std::map<std::wstring,size_t> m_labelNameToDimMap;
|
||||
// for writing outputs to files (standard single input/output network) - deprecate eventually
|
||||
bool m_checkDictionaryKeys;
|
||||
bool m_convertLabelsToTargets;
|
||||
std::vector <bool> m_convertLabelsToTargetsMultiIO;
|
||||
std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
|
||||
|
||||
size_t m_inputFileIndex;
|
||||
std::vector<size_t> m_featDims;
|
||||
std::vector<size_t> m_labelDims;
|
||||
|
||||
std::vector<std::vector<std::vector<ElemType>>>m_labelToTargetMapMultiIO;
|
||||
|
||||
void PrepareForTrainingOrTesting(const ConfigParameters& config);
|
||||
void PrepareForWriting(const ConfigParameters& config);
|
||||
|
||||
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>&matrices);
|
||||
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>&matrices);
|
||||
|
||||
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
|
||||
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
|
||||
|
||||
bool ReNewBufferForMultiIO(size_t i);
|
||||
|
||||
size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;}
|
||||
void SetNbrSlicesEachRecurrentIter(const size_t) { };
|
||||
|
||||
void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
|
||||
|
||||
|
||||
size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
|
||||
enum InputOutputTypes
|
||||
{
|
||||
real,
|
||||
category,
|
||||
};
|
||||
|
||||
|
||||
|
||||
public:
|
||||
virtual void Init(const ConfigParameters& config);
|
||||
virtual void Destroy() {delete this;}
|
||||
virtual ~HTKMLFReader();
|
||||
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
|
||||
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
|
||||
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
|
||||
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<unsigned, LabelType>& labelMapping);
|
||||
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
|
||||
|
||||
virtual bool DataEnd(EndDataType endDataType);
|
||||
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
|
||||
void SetSentenceEnd(int /*actualMbSize*/){};
|
||||
};
|
||||
|
||||
//
|
||||
// <copyright file="HTKMLFReader.h" company="Microsoft">
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// </copyright>
|
||||
//
|
||||
// HTKMLFReader.h - Include file for the HTK and MLF format of features and samples
|
||||
#pragma once
|
||||
#include "DataReader.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
template<class ElemType>
|
||||
class HTKMLFReader : public IDataReader<ElemType>
|
||||
{
|
||||
private:
|
||||
|
||||
const static size_t m_htkRandomizeAuto = 0;
|
||||
const static size_t m_htkRandomizeDisable = (size_t)-1;
|
||||
|
||||
msra::dbn::minibatchiterator* m_mbiter;
|
||||
msra::dbn::minibatchsource* m_frameSource;
|
||||
msra::dbn::minibatchreadaheadsource* m_readAheadSource;
|
||||
msra::dbn::FileEvalSource* m_fileEvalSource;
|
||||
msra::dbn::latticesource* m_lattices;
|
||||
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
|
||||
|
||||
vector<bool> m_sentenceEnd;
|
||||
bool m_readAhead;
|
||||
bool m_truncated;
|
||||
vector<size_t> m_processedFrame;
|
||||
size_t m_numberOfuttsPerMinibatch;
|
||||
size_t m_actualnumberOfuttsPerMinibatch;
|
||||
size_t m_mbSize;
|
||||
vector<size_t> m_toProcess;
|
||||
vector<size_t> m_switchFrame;
|
||||
bool m_noData;
|
||||
|
||||
bool m_trainOrTest; // if false, in file writing mode
|
||||
|
||||
std::map<LabelIdType, LabelType> m_idToLabelMap;
|
||||
|
||||
bool m_partialMinibatch; // allow partial minibatches?
|
||||
|
||||
std::vector<ElemType*> m_featuresBufferMultiUtt;
|
||||
std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
|
||||
std::vector<ElemType*> m_labelsBufferMultiUtt;
|
||||
std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
|
||||
std::vector<size_t> m_featuresStartIndexMultiUtt;
|
||||
std::vector<size_t> m_labelsStartIndexMultiUtt;
|
||||
|
||||
std::vector<ElemType*> m_featuresBufferMultiIO;
|
||||
std::vector<size_t> m_featuresBufferAllocatedMultiIO;
|
||||
std::vector<ElemType*> m_labelsBufferMultiIO;
|
||||
std::vector<size_t> m_labelsBufferAllocatedMultiIO;
|
||||
|
||||
std::map<std::wstring,size_t> m_featureNameToIdMap;
|
||||
std::map<std::wstring,size_t> m_labelNameToIdMap;
|
||||
std::map<std::wstring,size_t> m_nameToTypeMap;
|
||||
std::map<std::wstring,size_t> m_featureNameToDimMap;
|
||||
std::map<std::wstring,size_t> m_labelNameToDimMap;
|
||||
// for writing outputs to files (standard single input/output network) - deprecate eventually
|
||||
bool m_checkDictionaryKeys;
|
||||
bool m_convertLabelsToTargets;
|
||||
std::vector <bool> m_convertLabelsToTargetsMultiIO;
|
||||
std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
|
||||
|
||||
size_t m_inputFileIndex;
|
||||
std::vector<size_t> m_featDims;
|
||||
std::vector<size_t> m_labelDims;
|
||||
|
||||
std::vector<std::vector<std::vector<ElemType>>>m_labelToTargetMapMultiIO;
|
||||
|
||||
void PrepareForTrainingOrTesting(const ConfigParameters& config);
|
||||
void PrepareForWriting(const ConfigParameters& config);
|
||||
|
||||
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>&matrices);
|
||||
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>&matrices);
|
||||
|
||||
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
|
||||
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
|
||||
|
||||
bool ReNewBufferForMultiIO(size_t i);
|
||||
|
||||
size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;}
|
||||
void SetNbrSlicesEachRecurrentIter(const size_t) { };
|
||||
|
||||
void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
|
||||
|
||||
|
||||
size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
|
||||
enum InputOutputTypes
|
||||
{
|
||||
real,
|
||||
category,
|
||||
};
|
||||
|
||||
|
||||
|
||||
public:
|
||||
virtual void Init(const ConfigParameters& config);
|
||||
virtual void Destroy() {delete this;}
|
||||
virtual ~HTKMLFReader();
|
||||
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
|
||||
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
|
||||
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
|
||||
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<unsigned, LabelType>& labelMapping);
|
||||
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
|
||||
|
||||
virtual bool DataEnd(EndDataType endDataType);
|
||||
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
|
||||
void SetSentenceEnd(int /*actualMbSize*/){};
|
||||
};
|
||||
|
||||
}}}
|
|
@ -339,7 +339,20 @@ namespace msra { namespace dbn {
|
|||
feat[i].resize(vdims[i], framesInBlock); // input features for whole utt (col vectors)
|
||||
// augment the features
|
||||
//msra::dbn::augmentneighbors(framesMulti[i], boundaryFlags, 0, leftcontext[i], rightcontext[i],)
|
||||
msra::dbn::augmentneighbors (framesMulti[i], boundaryFlags, leftcontext[i], rightcontext[i], 0, framesInBlock, feat[i]);
|
||||
|
||||
size_t leftextent, rightextent;
|
||||
// page in the needed range of frames
|
||||
if (leftcontext[i] == 0 && rightcontext[i] == 0)
|
||||
{
|
||||
leftextent = rightextent = augmentationextent(framesMulti[i][0].size(), vdims[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
leftextent = leftcontext[i];
|
||||
rightextent = rightcontext[i];
|
||||
}
|
||||
|
||||
msra::dbn::augmentneighbors (framesMulti[i], boundaryFlags, leftextent, rightextent, 0, framesInBlock, feat[i]);
|
||||
}
|
||||
minibatchReady=true;
|
||||
}
|
||||
|
|
|
@ -112,7 +112,7 @@
|
|||
</PrecompiledHeader>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>DISPLAY_DEBUG;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<OpenMPSupport>true</OpenMPSupport>
|
||||
<TreatWarningAsError>true</TreatWarningAsError>
|
||||
|
|
Загрузка…
Ссылка в новой задаче