Adding support for parallelized sequence training in Kaldi2Reader

chenguoguo 2015-07-24 16:55:21 +00:00
Parent fc676c579a
Commit 73c6db513d
7 changed files with 783 additions and 332 deletions

View file

@@ -225,20 +225,28 @@ void DataReader<ElemType>::SetRandomSeed(int seed)
}
template<class ElemType>
bool DataReader<ElemType>::GetForkedUtterance(std::wstring& uttID, std::map<std::wstring, Matrix<ElemType>*>& matrices)
bool DataReader<ElemType>::GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
std::map<std::wstring, Matrix<ElemType>*>& matrices,
Matrix<ElemType>& sentenceBegin,
std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
bool ans = false;
for (size_t i = 0; i < m_ioNames.size(); i++)
ans = (m_dataReader[m_ioNames[i]]->GetForkedUtterance(uttID, matrices) || ans);
ans = (m_dataReader[m_ioNames[i]]->GetMinibatchCopy(uttInfo, matrices, sentenceBegin, minibatchPackingFlag) || ans);
return ans;
}
template<class ElemType>
bool DataReader<ElemType>::ComputeDerivativeFeatures(const std::wstring& uttID, const Matrix<ElemType>& outputs)
bool DataReader<ElemType>::SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
bool ans = false;
for (size_t i = 0; i < m_ioNames.size(); i++)
ans = (m_dataReader[m_ioNames[i]]->ComputeDerivativeFeatures(uttID, outputs) || ans);
ans = (m_dataReader[m_ioNames[i]]->SetNetOutput(uttInfo, outputs, sentenceBegin, minibatchPackingFlag) || ans);
return ans;
}

View file

@@ -85,14 +85,27 @@ public:
void SetDoRandomize(bool b){ mDoRandomize = b; }
// Gets utterance before getting the actual minibatch, which will not affect
// getting the minibatches. This can be useful in sequence training.
virtual bool GetForkedUtterance(std::wstring& , std::map<std::wstring, Matrix<ElemType>*>& ) { return false; }
// Gets a copy of the minibatch for the forward computation. This can be
// useful if some of the computation has to happen in the reader.
virtual bool GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& /*uttInfo*/,
std::map<std::wstring, Matrix<ElemType>*>& /*matrices*/,
Matrix<ElemType>& /*sentenceBegin*/,
std::vector<MinibatchPackingFlag>& /*minibatchPackingFlag*/)
{
return false;
}
// Computes certain derivatives given outputs from neural networks, which
// will later be fed to the neural network as features. This can be useful
// in sequence training.
virtual bool ComputeDerivativeFeatures(const std::wstring& , const Matrix<ElemType>& ) { return false; }
// Sets the neural network output to the reader. This can be useful if some
// of the computation has to happen in the reader.
virtual bool SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& /*uttInfo*/,
const Matrix<ElemType>& /*outputs*/,
const Matrix<ElemType>& /*sentenceBegin*/,
const std::vector<MinibatchPackingFlag>& /*minibatchPackingFlag*/)
{
return false;
}
};
// GetReader - get a reader type from the DLL
@@ -193,14 +206,21 @@ public:
virtual bool DataEnd(EndDataType endDataType);
// Gets utterance before getting the actual minibatch, which will not affect
// getting the minibatches. This can be useful in sequence training.
virtual bool GetForkedUtterance(std::wstring& uttID, std::map<std::wstring, Matrix<ElemType>*>& matrices);
// Gets a copy of the minibatch for the forward computation. This can be
// useful if some of the computation has to happen in the reader.
virtual bool GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
std::map<std::wstring, Matrix<ElemType>*>& matrices,
Matrix<ElemType>& sentenceBegin,
std::vector<MinibatchPackingFlag>& minibatchPackingFlag);
// Computes certain derivatives given outputs from neural networks, which
// will later be fed to the neural network as features. This can be useful
// in sequence training.
virtual bool ComputeDerivativeFeatures(const std::wstring& uttID, const Matrix<ElemType>& outputs);
// Sets the neural network output to the reader. This can be useful if some
// of the computation has to happen in the reader.
virtual bool SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag);
void SetSentenceSegBatch(Matrix<ElemType> & sentenceBegin, vector<MinibatchPackingFlag>& minibatchPackingFlag);
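A hedged sketch of the calling protocol behind these two methods, as a trainer would drive them (mirroring the SGD hunk at the end of this commit); reader, net, inputMatrices and outputNode are placeholder names, not part of the diff:

std::vector<std::vector<std::pair<std::wstring, size_t>>> uttInfo;
Matrix<float> sentenceBegin;
std::vector<MinibatchPackingFlag> packingFlag;
// Forward-only passes: the reader hands out copies of upcoming minibatches
// until it has collected enough likelihoods to compute the derivatives.
while (reader.GetMinibatchCopy(uttInfo, inputMatrices, sentenceBegin, packingFlag))
{
net.Evaluate(outputNode); // forward computation only
reader.SetNetOutput(uttInfo, outputNode->FunctionValues(),
sentenceBegin, packingFlag);
}
// Subsequent regular GetMinibatch() calls replay the buffered minibatches,
// now with the seqTrainDeriv/seqTrainObj inputs filled from the derivatives
// and objectives computed inside the reader.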

View file

@@ -49,9 +49,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_frameSource = NULL;
m_lattices = NULL;
m_sequenceTrainingIO = NULL;
m_minibatchBuffer.resize(0);
m_minibatchBufferIndex = 0;
m_minibatchBufferLeftovers = 0;
m_noData = false;
m_convertLabelsToTargets = false;
m_doSeqTrain = false;
m_getMinibatchCopy = false;
if (readerConfig.Exists("legacyMode"))
{
@@ -60,7 +64,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// If <m_framemode> is false, throw away any utterance that is longer
// than the specified <m_maxUtteranceLength>.
m_maxUtteranceLength = readerConfig("maxUtteranceLength", "1500");
m_maxUtteranceLength = readerConfig("maxUtteranceLength", "10000");
// m_truncated:
// If true, truncate utterances to fit the minibatch size. Otherwise
@@ -172,7 +176,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_sequenceTrainingIO = new KaldiSequenceTrainingIO<ElemType>(
denlatRspecifier, aliRspecifier, transModelFilename,
silencePhoneStr, m_seqTrainCriterion, oldAcousticScale,
acousticScale, lmScale, oneSilenceClass);
acousticScale, lmScale,
oneSilenceClass, m_numberOfuttsPerMinibatch);
// Scans the configurations to get "seqTrainDeriv" type input and
// "seqTrainObj" type input. Both are feature nodes, we feed derivatives
@@ -293,6 +298,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
m_featureNameToIdMap[featureNames[i]] = iFeat;
assert(iFeat == m_featureIdToNameMap.size());
m_featureIdToNameMap.push_back(featureNames[i]);
scriptpaths.push_back(new msra::asr::FeatureSection(thisFeature("scpFile"), thisFeature("rx"), thisFeature("featureTransform", "")));
m_featureNameToDimMap[featureNames[i]] = m_featDims[i];
@@ -334,6 +341,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
statelistpaths.push_back(thisLabel("labelMappingFile",L""));
m_labelNameToIdMap[labelNames[i]] = iLabel;
assert(iLabel == m_labelIdToNameMap.size());
m_labelIdToNameMap.push_back(labelNames[i]);
m_labelNameToDimMap[labelNames[i]] = m_labelDims[i];
mlfpaths.clear();
mlfpaths.push_back(thisLabel("mlfFile"));
@@ -599,6 +608,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
m_featureNameToIdMap[featureNames[i]]= iFeat;
assert(iFeat == m_featureIdToNameMap.size());
m_featureIdToNameMap.push_back(featureNames[i]);
scriptpaths.push_back(new msra::asr::FeatureSection(thisFeature("scpFile"), thisFeature("rx"), thisFeature("featureTransform", "")));
m_featureNameToDimMap[featureNames[i]] = realDims[i];
@@ -736,6 +747,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void HTKMLFReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
{
m_mbSize = mbSize;
m_currentMBSize = mbSize;
if (m_trainOrTest)
{
@@ -788,7 +800,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_mbiter = NULL;
}
msra::dbn::minibatchsource* source = m_frameSource;
m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, mbSize, datapasses);
size_t currentMBSize = (m_framemode == true) ? mbSize : 1;
m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, currentMBSize, datapasses);
// Clears feature and label buffer.
if (!m_featuresBufferMultiIO.empty())
@@ -882,7 +895,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// if startFrame = 5, endFrame = 10, then we copy frames 5, 6, 7, 8, 9.
template<class ElemType>
bool HTKMLFReader<ElemType>::PopulateUtteranceInMinibatch(
std::map<std::wstring, Matrix<ElemType>*>& matrices,
const std::map<std::wstring, Matrix<ElemType>*>& matrices,
size_t uttIndex, size_t startFrame,
size_t endFrame, size_t mbSize, size_t mbOffset)
{
@@ -897,15 +910,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
return false;
}
if (m_doSeqTrain && m_numberOfuttsPerMinibatch > 1)
{
LogicError("nbrUttsInEachRecurrentIter has to be 1 in sequence training.\n");
}
size_t numOfFea = m_featuresBufferMultiIO.size();
size_t numOfLabel = m_labelsBufferMultiIO.size();
typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
for (iter = matrices.begin(); iter != matrices.end(); iter++)
for (auto iter = matrices.begin(); iter != matrices.end(); iter++)
{
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{ // Features.
@@ -972,65 +980,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
}
else if (m_doSeqTrain)
{
// TODO(GUOGUO): if we are going to allow "m_truncate" for
// sequence training, we will have to modify the
// following -- the following always assume we
// start filling the minibatch from index 0.
// If we do sequence training we have to populate the derivative
// features as well as the objective features. But unlike the
// features and labels, we put them in to <matrices> directly.
// We assume we only process one utterance at a time in the
// current implementation.
assert(uttIndex == 0);
if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainDeriv)
{
wstring uttID = m_uttInfo[uttIndex][0].first;
Matrix<ElemType>& data = *matrices[iter->first];
if (m_sequenceTrainingIO->HasDerivatives(uttID))
m_sequenceTrainingIO->GetDerivatives(startFrame, endFrame, mbSize, uttID, data);
else
{
data.Resize(data.GetNumRows(), mbSize);
data.SetValue(0);
}
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainObj)
{
wstring uttID = m_uttInfo[uttIndex][0].first;
Matrix<ElemType>& data = *matrices[iter->first];
if (m_sequenceTrainingIO->HasDerivatives(uttID))
m_sequenceTrainingIO->GetObjectives(startFrame, endFrame, uttID, data);
else
data.SetValue(0);
}
}
}
return success;
}
template<class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>& matrices)
bool HTKMLFReader<ElemType>::GetOneMinibatchToTrainOrTestDataBuffer(
const std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
bool skip = false;
// On first minibatch, check if we have input for given names.
if (m_checkDictionaryKeys)
{
std::map<std::wstring,size_t>::iterator iter;
for (auto iter = matrices.begin(); iter != matrices.end(); iter++)
{
if (m_nameToTypeMap.find(iter->first) == m_nameToTypeMap.end())
{
throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %S not found in reader - cannot generate input\n", iter->first.c_str()));
throw std::runtime_error(msra::strfun::strprintf(
"minibatch requested for input node %S not found in"
"reader - cannot generate input\n", iter->first.c_str()));
}
}
m_checkDictionaryKeys=false;
}
size_t currentMBSize = m_mbSize;
// If we are doing sequence training, we need to keep the utterance
// information.
if (m_doSeqTrain)
{
m_minibatchUttInfo.assign(m_numberOfuttsPerMinibatch,
std::vector<std::pair<wstring, size_t>>(0));
}
m_currentMBSize = m_mbSize;
do
{
// Checks if we have finished all the utterances.
@@ -1050,28 +1034,28 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// If <m_truncated> is true, <currentMBSize> is <m_mbSize>
// If <m_truncated> is false, <currentMBSize> equals to the longest
// If <m_truncated> is true, <m_currentMBSize> is <m_mbSize>
// If <m_truncated> is false, <m_currentMBSize> equals to the longest
// utterance in the minibatch.
if (!m_truncated)
{
currentMBSize = 0;
m_currentMBSize = 0;
for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++)
{
if (m_currentBufferFrames[i] > currentMBSize)
if (m_currentBufferFrames[i] > m_currentMBSize)
{
currentMBSize = m_currentBufferFrames[i];
m_currentMBSize = m_currentBufferFrames[i];
}
}
}
// We initialize the sentence boundary information before we process
// the utterances.
m_sentenceBegin.Resize(m_numberOfuttsPerMinibatch, currentMBSize);
m_minibatchPackingFlag.resize(currentMBSize);
m_sentenceBegin.Resize(m_numberOfuttsPerMinibatch, m_currentMBSize);
m_minibatchPackingFlag.resize(m_currentMBSize);
for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++)
{
for (size_t j = 0; j < currentMBSize; j++)
for (size_t j = 0; j < m_currentMBSize; j++)
{
m_sentenceBegin.SetValue(i, j, (ElemType) SENTENCE_MIDDLE);
}
@@ -1085,7 +1069,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t startFrame = m_processedFrame[i];
size_t endFrame = 0;
if ((startFrame + currentMBSize) < m_toProcess[i])
if ((startFrame + m_currentMBSize) < m_toProcess[i])
{
// There is only 1 case:
// 1. <m_framemode> is false, and <m_truncated> is true.
@@ -1099,11 +1083,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_minibatchPackingFlag[0] |= MinibatchPackingFlag::UtteranceStart;
}
endFrame = startFrame + currentMBSize;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, currentMBSize);
m_processedFrame[i] += currentMBSize;
endFrame = startFrame + m_currentMBSize;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
m_processedFrame[i] += m_currentMBSize;
}
else if ((startFrame + currentMBSize) == m_toProcess[i])
else if ((startFrame + m_currentMBSize) == m_toProcess[i])
{
// There are 3 cases:
// 1. <m_framemode> is false, and <m_truncated> is true,
@@ -1132,9 +1117,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Now puts the utterance into the minibatch, and loads the
// next one.
endFrame = startFrame + currentMBSize;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, currentMBSize);
m_processedFrame[i] += currentMBSize;
endFrame = startFrame + m_currentMBSize;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
m_processedFrame[i] += m_currentMBSize;
bool reNewSucc = ReNewBufferForMultiIO(i);
}
else
@@ -1151,7 +1137,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Checks if we have reached the end of the minibatch.
if (startFrame == m_toProcess[i])
{
for (size_t k = 0; k < currentMBSize; k++)
for (size_t k = 0; k < m_currentMBSize; k++)
{
m_sentenceBegin.SetValue(i, k, (ElemType) NO_LABELS);
m_minibatchPackingFlag[k] |= MinibatchPackingFlag::NoLabel;
@ -1159,7 +1145,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Populates <NO_LABELS> with real features, the
// following implementation is not efficient...
assert(m_toProcess[i] > 0);
PopulateUtteranceInMinibatch(matrices, i, 0, 1, currentMBSize, k);
PopulateUtteranceInMinibatch(matrices, i, 0, 1, m_currentMBSize, k);
}
continue;
}
@@ -1194,13 +1180,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
endFrame = m_toProcess[i];
size_t currentMBFilled = endFrame - startFrame;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, currentMBSize);
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
m_processedFrame[i] += currentMBFilled;
bool reNewSucc = ReNewBufferForMultiIO(i);
// Third, if the next utterance can fit into the current
// minibatch, we also pack the next utterance.
while (reNewSucc && (currentMBFilled + m_toProcess[i] <= currentMBSize))
while (reNewSucc && (currentMBFilled + m_toProcess[i] <= m_currentMBSize))
{
// Sets the utterance boundary.
assert(currentMBFilled + m_toProcess[i] <= m_sentenceBegin.GetNumCols());
@@ -1208,7 +1195,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_minibatchPackingFlag[currentMBFilled] |= MinibatchPackingFlag::UtteranceStart;
m_sentenceBegin.SetValue(i, currentMBFilled + m_toProcess[i] - 1, (ElemType)SENTENCE_END);
m_minibatchPackingFlag[currentMBFilled + m_toProcess[i] - 1] |= MinibatchPackingFlag::UtteranceEnd;
populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_toProcess[i], currentMBSize, currentMBFilled);
populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_toProcess[i], m_currentMBSize, currentMBFilled);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
assert(m_processedFrame[i] == 0);
m_processedFrame[i] = m_toProcess[i];
currentMBFilled += m_toProcess[i];
@@ -1219,9 +1207,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// minibatch is not full.
if (reNewSucc && !m_framemode && m_truncated)
{
populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, currentMBSize - currentMBFilled, currentMBSize, currentMBFilled);
m_processedFrame[i] += currentMBSize - currentMBFilled;
if (currentMBFilled < currentMBSize)
populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_currentMBSize - currentMBFilled, m_currentMBSize, currentMBFilled);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
m_processedFrame[i] += m_currentMBSize - currentMBFilled;
if (currentMBFilled < m_currentMBSize)
{
m_sentenceBegin.SetValue(i, currentMBFilled, (ElemType)SENTENCE_BEGIN);
m_minibatchPackingFlag[currentMBFilled] |= MinibatchPackingFlag::UtteranceStart;
@@ -1229,7 +1218,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
for (size_t k = currentMBFilled; k < currentMBSize; k++)
for (size_t k = currentMBFilled; k < m_currentMBSize; k++)
{
m_sentenceBegin.SetValue(i, k, (ElemType) NO_LABELS);
m_minibatchPackingFlag[k] |= MinibatchPackingFlag::NoLabel;
@@ -1237,29 +1226,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Populates <NO_LABELS> with real features, the
// following implementation is not efficient...
assert(m_toProcess[i] > 0);
PopulateUtteranceInMinibatch(matrices, i, 0, 1, currentMBSize, k);
PopulateUtteranceInMinibatch(matrices, i, 0, 1, m_currentMBSize, k);
}
}
}
}
typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
for (iter = matrices.begin(); iter != matrices.end(); iter++)
{
Matrix<ElemType>& data = *matrices[iter->first];
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
size_t id = m_featureNameToIdMap[iter->first];
size_t dim = m_featureNameToDimMap[iter->first];
data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_featuresBufferMultiIO[id] , matrixFlagNormal);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
size_t id = m_labelNameToIdMap[iter->first];
size_t dim = m_labelNameToDimMap[iter->first];
data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_labelsBufferMultiIO[id], matrixFlagNormal);
}
}
skip=false;
}
while(skip);
@@ -1267,6 +1239,209 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return true;
}
template<class ElemType>
bool HTKMLFReader<ElemType>::ShouldCopyMinibatchFromBuffer()
{
if (m_doSeqTrain)
{
// If <m_getMinibatchCopy> is false, then we should copy data from
// buffer for back-propagation.
if (m_getMinibatchCopy == false && m_minibatchBuffer.size() > 0)
{
m_minibatchBufferIndex = 0;
m_minibatchBufferLeftovers = m_minibatchBuffer.size() - 1; // Will pop one more.
return true;
}
// If <m_getMinibatchCopy> is true, we first have to re-compute
// the likelihood for the frames that are already in the buffer.
if (m_getMinibatchCopy == true && m_minibatchBufferLeftovers > 0)
{
if (m_minibatchBufferLeftovers == m_minibatchBuffer.size())
{
m_minibatchBufferIndex = 0;
}
else
{
m_minibatchBufferIndex += 1;
}
m_minibatchBufferLeftovers -= 1;
return true;
}
}
return false;
}
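The bookkeeping above can be modelled in isolation; a minimal standalone sketch (assumption: the members map one-to-one onto the reader's m_minibatchBuffer, m_minibatchBufferIndex and m_minibatchBufferLeftovers, with int payloads standing in for MinibatchBufferUnit):

#include <cassert>
#include <cstddef>
#include <deque>
// Standalone model of ShouldCopyMinibatchFromBuffer()'s state machine.
struct BufferModel
{
std::deque<int> buffer; // m_minibatchBuffer
std::size_t index = 0; // m_minibatchBufferIndex
std::size_t leftovers = 0; // m_minibatchBufferLeftovers
bool ShouldCopy(bool getMinibatchCopy)
{
if (!getMinibatchCopy && !buffer.empty())
{
index = 0;
leftovers = buffer.size() - 1; // the front entry will be popped
return true;
}
if (getMinibatchCopy && leftovers > 0)
{
index = (leftovers == buffer.size()) ? 0 : index + 1;
leftovers -= 1;
return true;
}
return false;
}
};
int main()
{
BufferModel m;
assert(!m.ShouldCopy(true)); // nothing buffered yet: read fresh minibatches
m.buffer = {10, 11, 12}; // three minibatches buffered by the copy pass
assert(m.ShouldCopy(false) && m.index == 0); // back-propagation: replay front
m.buffer.pop_front(); // done by CopyMinibatchFromBufferToMatrix()
assert(m.ShouldCopy(true) && m.index == 0); // re-compute likelihoods for the
assert(m.ShouldCopy(true) && m.index == 1); // two leftover minibatches
assert(!m.ShouldCopy(true)); // leftovers exhausted: read fresh again
return 0;
}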
template<class ElemType>
void HTKMLFReader<ElemType>::CopyMinibatchToBuffer()
{
MinibatchBufferUnit currentMinibatch;
// Stores variables related to the current minibatch.
currentMinibatch.sentenceBegin.SetValue(m_sentenceBegin);
currentMinibatch.minibatchPackingFlag = m_minibatchPackingFlag;
currentMinibatch.currentMBSize = m_currentMBSize;
currentMinibatch.minibatchUttInfo = m_minibatchUttInfo;
size_t size = m_currentMBSize * m_numberOfuttsPerMinibatch;
// Copies features.
currentMinibatch.features.resize(0);
for (size_t i = 0; i < m_featuresBufferMultiIO.size(); ++i)
{
std::vector<ElemType> tmpFeatures(m_featuresBufferMultiIO[i],
m_featuresBufferMultiIO[i] + size * m_featureNameToDimMap[m_featureIdToNameMap[i]]);
currentMinibatch.features.push_back(tmpFeatures);
}
// Copies labels.
currentMinibatch.labels.resize(0);
for (size_t i = 0; i < m_labelsBufferMultiIO.size(); ++i)
{
std::vector<ElemType> tmpLabels(m_labelsBufferMultiIO[i],
m_labelsBufferMultiIO[i] + size * m_labelNameToDimMap[m_labelIdToNameMap[i]]);
currentMinibatch.labels.push_back(tmpLabels);
}
m_minibatchBuffer.push_back(currentMinibatch);
}
template<class ElemType>
void HTKMLFReader<ElemType>::CopyMinibatchFromBufferToMatrix(
size_t index,
std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
assert(m_minibatchBuffer.size() > index);
// Restores the variables related to the minibatch.
m_sentenceBegin.SetValue(m_minibatchBuffer[index].sentenceBegin);
m_minibatchPackingFlag = m_minibatchBuffer[index].minibatchPackingFlag;
m_currentMBSize = m_minibatchBuffer[index].currentMBSize;
m_minibatchUttInfo = m_minibatchBuffer[index].minibatchUttInfo;
// Copies data to the matrix.
for (auto iter = matrices.begin(); iter != matrices.end(); iter++)
{
Matrix<ElemType>& data = *matrices[iter->first];
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
size_t id = m_featureNameToIdMap[iter->first];
size_t dim = m_featureNameToDimMap[iter->first];
assert(id < m_minibatchBuffer[index].features.size());
data.SetValue(dim,
m_minibatchBuffer[index].features[id].size() / dim,
m_minibatchBuffer[index].features[id].data(),
matrixFlagNormal);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
size_t id = m_labelNameToIdMap[iter->first];
size_t dim = m_labelNameToDimMap[iter->first];
assert(id < m_minibatchBuffer[index].labels.size());
data.SetValue(dim,
m_minibatchBuffer[index].labels[id].size() / dim,
m_minibatchBuffer[index].labels[id].data(),
matrixFlagNormal);
}
else if (m_doSeqTrain && !m_getMinibatchCopy)
{
if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainDeriv)
{
m_sequenceTrainingIO->GetDerivative(
m_minibatchUttInfo, m_sentenceBegin,
m_minibatchPackingFlag, matrices[iter->first]);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainObj)
{
m_sequenceTrainingIO->GetObjective(m_minibatchUttInfo,
matrices[iter->first]);
}
}
}
// If we are not in the minibatch copy mode, then we can remove the
// minibatch from buffer.
if (m_getMinibatchCopy == false)
{
assert(index == 0);
m_minibatchBuffer.pop_front();
}
}
template<class ElemType>
void HTKMLFReader<ElemType>::CopyMinibatchToMatrix(
size_t size,
const vector<ElemType*>& featureBuffer,
const vector<ElemType*>& labelBuffer,
std::map<std::wstring, Matrix<ElemType>*>& matrices) const
{
for (auto iter = matrices.begin(); iter != matrices.end(); iter++)
{
Matrix<ElemType>& data = *matrices[iter->first];
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
size_t id = m_featureNameToIdMap[iter->first];
size_t dim = m_featureNameToDimMap[iter->first];
assert(id < featureBuffer.size());
data.SetValue(dim, size, featureBuffer[id] , matrixFlagNormal);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
size_t id = m_labelNameToIdMap[iter->first];
size_t dim = m_labelNameToDimMap[iter->first];
assert(id < labelBuffer.size());
data.SetValue(dim, size, labelBuffer[id], matrixFlagNormal);
}
else if (m_doSeqTrain)
{
if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainDeriv)
{
data.Resize(data.GetNumRows(), m_currentMBSize);
data.SetValue(0);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainObj)
{
data.SetValue(0);
}
}
}
}
template<class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(
std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
// We either copy a new minibatch from buffer or read one from minibatch
// iterator.
bool success = false;
if (ShouldCopyMinibatchFromBuffer())
{
CopyMinibatchFromBufferToMatrix(m_minibatchBufferIndex, matrices);
return true;
}
else
{
success = GetOneMinibatchToTrainOrTestDataBuffer(matrices);
if (success)
{
CopyMinibatchToMatrix(
m_currentMBSize * m_numberOfuttsPerMinibatch,
m_featuresBufferMultiIO, m_labelsBufferMultiIO, matrices);
}
// Checks if we need to move the current minibatch to buffer.
if (success && m_getMinibatchCopy)
{
CopyMinibatchToBuffer();
}
return success;
}
return false;
}
template<class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
@@ -1567,82 +1742,60 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (!(*m_mbiter))
m_noData = true;
return true;
}
// Gets a copy of the utterances that correspond to the current minibatch,
// which will be used to do a neural network forward computation.
template<class ElemType>
bool HTKMLFReader<ElemType>::GetForkedUtterance(std::wstring& uttID,
std::map<std::wstring, Matrix<ElemType>*>& matrices)
bool HTKMLFReader<ElemType>::GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
std::map<std::wstring, Matrix<ElemType>*>& matrices,
Matrix<ElemType>& sentenceBegin,
std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
if (!m_doSeqTrain)
// We need to get a "copy" of the minibatch to do the forward
// computation for sequence training.
if (m_doSeqTrain)
{
assert(m_framemode == false);
if (m_sequenceTrainingIO->NeedLikelihoodToComputeDerivative())
{
m_getMinibatchCopy = true;
if (GetMinibatchToTrainOrTest(matrices))
{
sentenceBegin.SetValue(m_sentenceBegin);
minibatchPackingFlag = m_minibatchPackingFlag;
uttInfo = m_minibatchUttInfo;
m_getMinibatchCopy = false;
return true;
}
m_getMinibatchCopy = false;
}
return false;
}
assert(m_framemode == false);
// For the moment we only support single utterance.
if (m_numberOfuttsPerMinibatch != 1)
{
RuntimeError("The current sequence training implementation does not support multiple utterances.\n");
}
// Under our current assumption, we only have one utterance at a time.
uttID = m_uttInfo[0][0].first;
if (!m_sequenceTrainingIO->HasDerivatives(uttID))
{
size_t startFrame = 0;
size_t endFrame = m_uttInfo[0][0].second;
size_t currentMBSize = endFrame - startFrame;
bool populateSucc = PopulateUtteranceInMinibatch(
matrices, 0, startFrame, endFrame, currentMBSize);
if (!populateSucc)
{
return false;
}
// Sets sentence boundary.
m_sentenceBegin.Resize(1, currentMBSize);
m_minibatchPackingFlag.resize(currentMBSize);
for (size_t i = 0; i < currentMBSize; i++)
{
m_sentenceBegin.SetValue(0, i, (ElemType) SENTENCE_MIDDLE);
}
std::fill(m_minibatchPackingFlag.begin(), m_minibatchPackingFlag.end(), MinibatchPackingFlag::None);
m_sentenceBegin.SetValue(0, 0, (ElemType)SENTENCE_BEGIN);
m_sentenceBegin.SetValue(0, m_sentenceBegin.GetNumCols() - 1, (ElemType) SENTENCE_END);
m_minibatchPackingFlag[0] = MinibatchPackingFlag::UtteranceStart;
m_minibatchPackingFlag[m_sentenceBegin.GetNumCols() - 1] = MinibatchPackingFlag::UtteranceEnd;
typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
for (iter = matrices.begin(); iter != matrices.end(); iter++)
{
Matrix<ElemType>& data = *matrices[iter->first];
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
size_t id = m_featureNameToIdMap[iter->first];
size_t dim = m_featureNameToDimMap[iter->first];
data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_featuresBufferMultiIO[id] , matrixFlagNormal);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
size_t id = m_labelNameToIdMap[iter->first];
size_t dim = m_labelNameToDimMap[iter->first];
data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_labelsBufferMultiIO[id], matrixFlagNormal);
}
}
return true;
}
return false;
}
template<class ElemType>
bool HTKMLFReader<ElemType>::ComputeDerivativeFeatures(const std::wstring& uttID,
const Matrix<ElemType>& outputs)
bool HTKMLFReader<ElemType>::SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
return m_sequenceTrainingIO->ComputeDerivatives(uttID, outputs);
// Sets the likelihoods for the utterance, with which we can compute the
// derivatives. Note that the minibatch may only contain partial output
// for the utterance; <m_sequenceTrainingIO> takes care of "pasting"
// the pieces together.
if (m_doSeqTrain)
{
assert(m_framemode == false);
return m_sequenceTrainingIO->SetLikelihood(uttInfo, outputs,
sentenceBegin,
minibatchPackingFlag);
}
return false;
}

View file

@@ -24,14 +24,25 @@ private:
msra::dbn::latticesource* m_lattices;
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
// Sequence training related. Note that for now we only support single
// utterance in sequence training. But the utterance information holders
// are designed as if they support multiple utterances -- in case we will
// extend this soon.
// Sequence training related members.
struct MinibatchBufferUnit
{
std::vector<std::vector<ElemType>> features;
std::vector<std::vector<ElemType>> labels;
Matrix<ElemType> sentenceBegin;
vector<MinibatchPackingFlag> minibatchPackingFlag;
std::vector<std::vector<std::pair<wstring, size_t>>> minibatchUttInfo;
size_t currentMBSize;
};
bool m_doSeqTrain;
bool m_getMinibatchCopy;
size_t m_minibatchBufferIndex;
size_t m_minibatchBufferLeftovers;
wstring m_seqTrainCriterion;
KaldiSequenceTrainingIO<ElemType>* m_sequenceTrainingIO;
std::deque<MinibatchBufferUnit> m_minibatchBuffer;
std::vector<std::vector<std::pair<wstring, size_t>>> m_uttInfo;
std::vector<std::vector<std::pair<wstring, size_t>>> m_minibatchUttInfo;
vector<bool> m_sentenceEnd;
bool m_readAhead;
@@ -42,6 +53,7 @@ private:
size_t m_numberOfuttsPerMinibatch;
size_t m_actualnumberOfuttsPerMinibatch;
size_t m_mbSize;
size_t m_currentMBSize;
vector<size_t> m_currentBufferFrames;
vector<size_t> m_toProcess;
vector<size_t> m_switchFrame;
@@ -72,6 +84,8 @@ private:
std::map<std::wstring,size_t> m_nameToTypeMap;
std::map<std::wstring,size_t> m_featureNameToDimMap;
std::map<std::wstring,size_t> m_labelNameToDimMap;
std::vector<std::wstring> m_featureIdToNameMap;
std::vector<std::wstring> m_labelIdToNameMap;
// for writing outputs to files (standard single input/output network) - deprecate eventually
bool m_checkDictionaryKeys;
bool m_convertLabelsToTargets;
@@ -89,10 +103,22 @@ private:
void PrepareForSequenceTraining(const ConfigParameters& config);
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>& matrices);
bool GetOneMinibatchToTrainOrTestDataBuffer(const std::map<std::wstring, Matrix<ElemType>*>& matrices);
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>& matrices);
bool PopulateUtteranceInMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices, size_t uttIndex, size_t startFrame, size_t endFrame, size_t mbSize, size_t mbOffset = 0);
bool PopulateUtteranceInMinibatch(const std::map<std::wstring, Matrix<ElemType>*>& matrices, size_t uttIndex, size_t startFrame, size_t endFrame, size_t mbSize, size_t mbOffset = 0);
//-void GetCurrentUtteranceInfo(size_t uttIndex, size_t startFrame, size_t endFrame, wstring& uttID, size_t& startFrameInUtt, size_t& endFrameInUtt);
// If we have to read the current minibatch from buffer, return true,
// otherwise return false.
bool ShouldCopyMinibatchFromBuffer();
// Copies the current minibatch to buffer.
void CopyMinibatchToBuffer();
// Copies one minibatch from buffer to matrix.
void CopyMinibatchFromBufferToMatrix(size_t index, std::map<std::wstring, Matrix<ElemType>*>& matrices);
// Copies one minibatch from <m_featuresBufferMultiIO> to matrix.
void CopyMinibatchToMatrix(size_t size, const std::vector<ElemType*>& featureBuffer, const std::vector<ElemType*>& labelBuffer, std::map<std::wstring, Matrix<ElemType>*>& matrices) const;
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
@@ -157,9 +183,16 @@ public:
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<LabelIdType, LabelType>& labelMapping);
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
virtual bool GetForkedUtterance(std::wstring& uttID, std::map<std::wstring, Matrix<ElemType>*>& matrices);
virtual bool ComputeDerivativeFeatures(const std::wstring& uttID, const Matrix<ElemType>& outputs);
virtual bool GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
std::map<std::wstring, Matrix<ElemType>*>& matrices,
Matrix<ElemType>& sentenceBegin,
vector<MinibatchPackingFlag>& sentenceExistsBeginOrNoLabels);
virtual bool SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const vector<MinibatchPackingFlag>& sentenceExistsBeginOrNoLabels);
virtual bool DataEnd(EndDataType endDataType);
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);

View file

@@ -11,7 +11,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const wstring& transModelFilename, const wstring& silencePhoneStr,
const wstring& trainCriterion,
ElemType oldAcousticScale, ElemType acousticScale,
ElemType lmScale, bool oneSilenceClass)
ElemType lmScale, bool oneSilenceClass, size_t numberOfuttsPerMinibatch)
{
using namespace msra::asr;
assert(denlatRspecifier != L"");
@@ -26,8 +26,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_lmScale = lmScale;
m_trainCriterion = trainCriterion;
m_oneSilenceClass = oneSilenceClass;
m_objective = 0;
m_posteriors.clear();
m_numUttsPerMinibatch = numberOfuttsPerMinibatch;
m_needLikelihood = true;
m_currentObj = 0;
m_minibatchIndex = 1;
m_lastCompleteMinibatch.assign(m_numUttsPerMinibatch, 0);
if (!kaldi::SplitStringToIntegers(toStr(silencePhoneStr),
":", false, &m_silencePhones))
{
@@ -35,13 +38,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
if (m_trainCriterion != L"mpfe" && m_trainCriterion != L"smbr")
{
LogicError("Supported sequence training criterion are: mpfe, smbr.\n");
LogicError("Supported sequence training criterion: mpfe, smbr.\n");
}
m_derivRead = false;
m_objRead = false;
m_currentUttHasDeriv = false;
m_currentUttID = L"";
m_currentUttLength = 0;
}
// Destructor.
@@ -61,50 +59,43 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
bool KaldiSequenceTrainingIO<ElemType>::HasDerivatives(const wstring& uttID)
bool KaldiSequenceTrainingIO<ElemType>::ComputeDerivative(
const wstring& uttID)
{
if (uttID == m_currentUttID && m_currentUttHasDeriv)
{
return true;
}
else
{
return false;
}
}
template<class ElemType>
bool KaldiSequenceTrainingIO<ElemType>::ComputeDerivatives(
const wstring& uttID, const Matrix<ElemType>& logLikelihoodIn)
{
// Checks if we need to move data to CPU.
Matrix<ElemType> logLikelihood(logLikelihoodIn);
if (logLikelihood.GetDeviceId() >= 0)
logLikelihood.TransferFromDeviceToDevice(logLikelihood.GetDeviceId(), CPUDEVICE, true, false, false);
assert(m_uttPool.find(uttID) != m_uttPool.end());
assert(m_uttPool[uttID].hasDerivative == false);
Matrix<ElemType>& logLikelihood = m_uttPool[uttID].logLikelihood;
std::string uttIDStr = msra::asr::toStr(uttID);
// Sanity check.
if (m_transModel.NumPdfs() != logLikelihood.GetNumRows())
{
RuntimeError("Number of labels in logLikelihood does not match that in the Kaldi model for utterance %S: %d v.s. %d\n", uttID.c_str(), logLikelihood.GetNumRows(), m_transModel.NumPdfs());
RuntimeError("Number of labels in logLikelihood does not match that"
" in the Kaldi model for utterance %S: %d v.s. %d\n",
uttID.c_str(), logLikelihood.GetNumRows(),
m_transModel.NumPdfs());
}
// Reads alignment.
if (!m_aliReader->HasKey(uttIDStr))
{
RuntimeError("Alignment not found for utterance %s\n", uttIDStr.c_str());
RuntimeError("Alignment not found for utterance %s\n",
uttIDStr.c_str());
}
const std::vector<int32> ali = m_aliReader->Value(uttIDStr);
if (ali.size() != logLikelihood.GetNumCols())
{
RuntimeError("Number of frames in logLikelihood does not match that in the alignment for utterance %S: %d v.s. %d\n", uttID.c_str(), logLikelihood.GetNumCols(), ali.size());
RuntimeError("Number of frames in logLikelihood does not match that"
" in the alignment for utterance %S: %d v.s. %d\n",
uttID.c_str(), logLikelihood.GetNumCols(), ali.size());
}
// Reads denominator lattice.
if (!m_denlatReader->HasKey(uttIDStr))
{
RuntimeError("Denominator lattice not found for utterance %S\n", uttID.c_str());
RuntimeError("Denominator lattice not found for utterance %S\n",
uttID.c_str());
}
kaldi::CompactLattice clat = m_denlatReader->Value(uttIDStr);
fst::CreateSuperFinal(&clat); /* One final state with weight One() */
@@ -115,7 +106,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// acoustic scale to 0.
if (m_oldAcousticScale != 1.0)
{
fst::ScaleLattice(fst::AcousticLatticeScale(m_oldAcousticScale), &lat);
fst::ScaleLattice(fst::AcousticLatticeScale(m_oldAcousticScale),
&lat);
}
// Topsort lattice.
@@ -133,7 +125,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
kaldi::int32 maxTime = kaldi::LatticeStateTimes(lat, &stateTimes);
if (maxTime != logLikelihood.GetNumCols())
{
RuntimeError("Number of frames in the logLikelihood does not match that in the denominator lattice for utterance %S\n", uttID.c_str(), logLikelihood.GetNumRows(), maxTime);
RuntimeError("Number of frames in the logLikelihood does not match"
" that in the denominator lattice for utterance %S\n",
uttID.c_str(), logLikelihood.GetNumRows(), maxTime);
}
// Does lattice acoustic rescoring with the new posteriors from the
@@ -143,7 +137,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Second pass acoustic and language model scale.
if (m_acousticScale != 1.0 || m_lmScale != 1.0)
{
fst::ScaleLattice(fst::LatticeScale(m_lmScale, m_acousticScale), &lat);
fst::ScaleLattice(fst::LatticeScale(m_lmScale, m_acousticScale),
&lat);
}
// Forward-backward on the lattice.
@@ -152,39 +147,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_trainCriterion == L"smbr")
{
thisObj = kaldi::LatticeForwardBackwardMpeVariants(
m_transModel, m_silencePhones, lat, ali, "smbr", m_oneSilenceClass, &post);
m_transModel, m_silencePhones, lat,
ali, "smbr", m_oneSilenceClass, &post);
}
else if (m_trainCriterion == L"mpfe")
{
thisObj = kaldi::LatticeForwardBackwardMpeVariants(
m_transModel, m_silencePhones, lat, ali, "mpfe", m_oneSilenceClass, &post);
m_transModel, m_silencePhones, lat,
ali, "mpfe", m_oneSilenceClass, &post);
}
kaldi::ConvertPosteriorToPdfs(m_transModel, post, &m_posteriors);
kaldi::ConvertPosteriorToPdfs(m_transModel,
post, &(m_uttPool[uttID].posterior));
// Uses "expected error rate" instead of "expected accuracy".
m_objective = logLikelihood.GetNumCols() - thisObj;
m_uttPool[uttID].objective = logLikelihood.GetNumCols() - thisObj;
assert(m_posteriors.size() == logLikelihood.GetNumCols());
assert(m_uttPool[uttID].posterior.size() == logLikelihood.GetNumCols());
m_derivRead = false;
m_objRead = false;
m_currentUttHasDeriv = true;
m_currentUttID = uttID;
m_currentUttLength = logLikelihood.GetNumCols();
return true;
}
template<class ElemType>
void KaldiSequenceTrainingIO<ElemType>::LatticeAcousticRescore(
const std::vector<kaldi::int32>& stateTimes,
const Matrix<ElemType>& logLikelihood, kaldi::Lattice* lat)
const Matrix<ElemType>& logLikelihood, kaldi::Lattice* lat) const
{
std::vector<std::vector<kaldi::int32>> timeStateMap(logLikelihood.GetNumCols());
std::vector<std::vector<kaldi::int32>> timeStateMap(
logLikelihood.GetNumCols());
size_t num_states = lat->NumStates();
for (size_t s = 0; s < num_states; s++)
{
assert(stateTimes[s] >= 0 && stateTimes[s] <= logLikelihood.GetNumCols());
assert(stateTimes[s] >= 0
&& stateTimes[s] <= logLikelihood.GetNumCols());
if (stateTimes[s] < logLikelihood.GetNumCols())
{
timeStateMap[stateTimes[s]].push_back(s);
@@ -196,14 +191,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t i = 0; i < timeStateMap[t].size(); ++i)
{
kaldi::int32 state = timeStateMap[t][i];
for (fst::MutableArcIterator<kaldi::Lattice> aiter(lat, state); !aiter.Done(); aiter.Next())
for (fst::MutableArcIterator<kaldi::Lattice> aiter(lat, state);
!aiter.Done(); aiter.Next())
{
kaldi::LatticeArc arc = aiter.Value();
kaldi::int32 trans_id = arc.ilabel;
if (trans_id != 0)
{
kaldi::int32 pdf_id = m_transModel.TransitionIdToPdf(trans_id);
arc.weight.SetValue2(-logLikelihood(pdf_id, t) + arc.weight.Value2());
kaldi::int32 pdf_id =
m_transModel.TransitionIdToPdf(trans_id);
arc.weight.SetValue2(-logLikelihood(pdf_id, t)
+ arc.weight.Value2());
aiter.SetValue(arc);
}
}
@@ -219,97 +217,285 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
void KaldiSequenceTrainingIO<ElemType>::GetDerivatives(size_t startFrame,
size_t endFrame,
size_t mbSize,
const std::wstring& uttID,
Matrix<ElemType>& derivativesIn)
void KaldiSequenceTrainingIO<ElemType>::ProcessUttInfo(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag,
std::vector<std::vector<std::pair<wstring, std::pair<size_t, size_t>>>>* uttInfoInMinibatch) const
{
Matrix<ElemType> derivatives(CPUDEVICE);
// Does some sanity check first.
if (uttID != m_currentUttID)
assert(uttInfoInMinibatch != NULL);
assert(uttInfo.size() == m_numUttsPerMinibatch);
assert(sentenceBegin.GetNumRows() == m_numUttsPerMinibatch);
assert(minibatchPackingFlag.size() == sentenceBegin.GetNumCols());
uttInfoInMinibatch->clear();
uttInfoInMinibatch->resize(uttInfo.size());
for (size_t i = 0; i < uttInfo.size(); ++i)
{
RuntimeError("Requested utterance does not matched the utterance that we have computed derivatives for: %S v.s. %S\n", uttID.c_str(), m_currentUttID.c_str());
}
if (!m_currentUttHasDeriv)
{
RuntimeError("Derivatives have not been computed, you have to call KaldiSequenceTrainingIO::ComputeDerivative() before using it.\n");
}
assert(startFrame >= 0);
assert(endFrame <= m_currentUttLength);
derivatives.Resize(m_transModel.NumPdfs(), mbSize);
derivatives.SetValue(0);
for (size_t t = startFrame; t < endFrame; ++t)
{
for (size_t i = 0; i < m_posteriors[t].size(); ++i)
size_t startFrameIndexInMinibatch = 0;
size_t numFrames = 0;
for (size_t j = 0; j < sentenceBegin.GetNumCols(); ++j)
{
size_t pdf_id = m_posteriors[t][i].first;
assert(pdf_id < m_transModel.NumPdfs());
derivatives(pdf_id, t - startFrame) -= m_posteriors[t][i].second; /* Flip the sign */
}
}
// Checks if we need to move data to GPU.
if (derivativesIn.GetDeviceId() >= 0)
derivatives.TransferFromDeviceToDevice(CPUDEVICE, derivativesIn.GetDeviceId(), true, false, false);
derivativesIn.SetValue(derivatives);
// We've used up all the derivatives, reset it.
if (endFrame >= m_currentUttLength)
{
m_derivRead = true;
if (m_objRead)
{
m_currentUttID = L"";
m_currentUttHasDeriv = false;
m_currentUttLength = 0;
if (((size_t)sentenceBegin(i, j) & NO_LABELS) == NO_LABELS)
{
continue;
}
numFrames += 1;
if ((((size_t)sentenceBegin(i, j) & SENTENCE_END) == SENTENCE_END)
|| j == sentenceBegin.GetNumCols() - 1)
{
size_t uttIndex = (*uttInfoInMinibatch)[i].size();
wstring uttID = uttInfo[i][uttIndex].first;
(*uttInfoInMinibatch)[i].push_back(
make_pair(uttID, make_pair(startFrameIndexInMinibatch, numFrames)));
startFrameIndexInMinibatch = j + 1;
numFrames = 0;
}
}
assert(uttInfo[i].size() == (*uttInfoInMinibatch)[i].size());
}
}
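For concreteness, a hypothetical two-stream example (utterance names and frame counts invented for illustration): if stream 0 carries utterance A for all 10 columns and stream 1 carries B for columns 0-4 followed by C for columns 5-9, the decoded <uttInfoInMinibatch> would be:

#include <cstddef>
#include <string>
#include <utility>
#include <vector>
// Per-stream lists of (uttID, (startFrameIndexInMinibatch, numFrames)).
std::vector<std::vector<std::pair<std::wstring, std::pair<std::size_t, std::size_t>>>>
uttInfoInMinibatch =
{
{ { L"A", { 0, 10 } } }, // stream 0
{ { L"B", { 0, 5 } }, { L"C", { 5, 5 } } }, // stream 1
};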
// Suppose we have 3 streams a, b, c; the <logLikelihoodIn> is in the
// following format:
// 1: a11 b11 c11 a12 b12 c12...
// 2: a21 b21 c21 a22 b22 c22...
// 3: a31 b31 c31 a32 b32 c32...
template<class ElemType>
void KaldiSequenceTrainingIO<ElemType>::GetObjectives(size_t startFrame,
size_t endFrame,
const std::wstring& uttID,
Matrix<ElemType>& objectivesIn)
bool KaldiSequenceTrainingIO<ElemType>::SetLikelihood(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& logLikelihoodIn,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
Matrix<ElemType> objectives(CPUDEVICE);
assert(m_needLikelihood == true);
std::vector<std::vector<
std::pair<wstring, std::pair<size_t, size_t>>>> uttInfoInMinibatch;
ProcessUttInfo(uttInfo, sentenceBegin,
minibatchPackingFlag, &uttInfoInMinibatch);
// Does some sanity check first.
if (uttID != m_currentUttID)
// Checks if we need to move data to CPU.
Matrix<ElemType> logLikelihood(logLikelihoodIn);
if (logLikelihood.GetDeviceId() >= 0)
{
RuntimeError("Requested utterance does not matched the utterance that we have computed objectives for: %S v.s. %S\n", uttID.c_str(), m_currentUttID.c_str());
logLikelihood.TransferFromDeviceToDevice(
logLikelihood.GetDeviceId(), CPUDEVICE, true, false, false);
}
if (!m_currentUttHasDeriv)
bool minibatchComplete = true;
size_t currentMBSize = minibatchPackingFlag.size();
for (size_t i = 0; i < uttInfo.size(); ++i)
{
RuntimeError("Objectives have not been computed, you have to call KaldiSequenceTrainingIO::ComputeDerivative() before using it.\n");
}
assert(startFrame >= 0);
assert(endFrame <= m_currentUttLength);
objectives.Resize(1, 1);
objectives.SetValue(m_objective * static_cast<ElemType>(endFrame - startFrame) / static_cast<ElemType>(m_currentUttLength));
// Checks if we need to move data to GPU.
if (objectivesIn.GetDeviceId() >= 0)
objectives.TransferFromDeviceToDevice(CPUDEVICE, objectivesIn.GetDeviceId(), true, false, false);
objectivesIn.SetValue(objectives);
// We've used up all the objectives, reset it.
if (endFrame >= m_currentUttLength)
{
m_objRead = true;
if (m_derivRead)
assert(uttInfo[i].size() == uttInfoInMinibatch[i].size());
for (size_t j = 0; j < uttInfo[i].size(); ++j)
{
m_currentUttID = L"";
m_currentUttHasDeriv = false;
m_currentUttLength = 0;
wstring uttID = uttInfo[i][j].first;
if (m_uttPool.find(uttID) == m_uttPool.end())
{
UtteranceDerivativeUnit tmpUttUnit;
tmpUttUnit.hasDerivative = false;
tmpUttUnit.uttLength = uttInfo[i][j].second;
tmpUttUnit.progress = 0;
tmpUttUnit.streamID = i;
tmpUttUnit.logLikelihood.Resize(m_transModel.NumPdfs(),
tmpUttUnit.uttLength);
m_uttPool[uttID] = tmpUttUnit;
}
// Sets the likelihood and computes derivatives.
assert(m_uttPool.find(uttID) != m_uttPool.end());
if (m_uttPool[uttID].hasDerivative == false)
{
assert(uttID == uttInfoInMinibatch[i][j].first);
size_t startFrame = uttInfoInMinibatch[i][j].second.first;
size_t numFrames = uttInfoInMinibatch[i][j].second.second;
assert(m_uttPool[uttID].progress + numFrames
<= m_uttPool[uttID].uttLength);
// Sets the likelihood.
for (size_t k = 0; k < numFrames; ++k)
{
m_uttPool[uttID].logLikelihood.SetColumn(
logLikelihood.ColumnSlice(
(startFrame + k) * m_numUttsPerMinibatch + i, 1),
m_uttPool[uttID].progress + k);
}
m_uttPool[uttID].progress += numFrames;
if (m_uttPool[uttID].progress == m_uttPool[uttID].uttLength)
{
ComputeDerivative(uttID);
m_uttPool[uttID].hasDerivative = true;
m_uttPool[uttID].progress = 0;
if (startFrame + numFrames == currentMBSize)
{
m_lastCompleteMinibatch[m_uttPool[uttID].streamID]
= m_minibatchIndex;
}
else
{
m_lastCompleteMinibatch[m_uttPool[uttID].streamID]
= m_minibatchIndex - 1;
}
}
}
}
}
// Checks if we are ready to provide derivatives.
m_minCompleteMinibatchIndex = *std::min_element(
m_lastCompleteMinibatch.begin(), m_lastCompleteMinibatch.end());
m_needLikelihood = (m_minCompleteMinibatchIndex < 1);
m_minibatchIndex += 1;
return true; // the function is declared bool; the likelihoods were accepted
}
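The layout comments in this file imply a simple column arithmetic, used verbatim above as (startFrame + k) * m_numUttsPerMinibatch + i; a standalone check with illustrative names:

#include <cassert>
#include <cstddef>
// Column of frame t of stream i in a minibatch whose columns interleave
// the parallel utterance streams.
std::size_t InterleavedColumn(std::size_t t, std::size_t i, std::size_t numStreams)
{
return t * numStreams + i;
}
int main()
{
// Three streams a, b, c laid out as a1 b1 c1 a2 b2 c2 ...:
assert(InterleavedColumn(0, 1, 3) == 1); // frame 0 of b is column 1
assert(InterleavedColumn(1, 2, 3) == 5); // frame 1 of c is column 5
return 0;
}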
// Suppose we have 3 streams a, b, c; the <derivativesOut> should be in the
// following format:
// 1: a11 b11 c11 a12 b12 c12...
// 2: a21 b21 c21 a22 b22 c22...
// 3: a31 b31 c31 a32 b32 c32...
template<class ElemType>
bool KaldiSequenceTrainingIO<ElemType>::GetDerivative(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag,
Matrix<ElemType>* derivativesOut)
{
assert(derivativesOut != NULL);
std::vector<std::vector<
std::pair<wstring, std::pair<size_t, size_t>>>> uttInfoInMinibatch;
ProcessUttInfo(uttInfo, sentenceBegin,
minibatchPackingFlag, &uttInfoInMinibatch);
Matrix<ElemType> derivatives(CPUDEVICE);
derivatives.Resize(m_transModel.NumPdfs(),
sentenceBegin.GetNumCols() * sentenceBegin.GetNumRows());
derivatives.SetValue(0);
m_currentObj = 0;
for (size_t i = 0; i < uttInfo.size(); ++i)
{
assert(uttInfo[i].size() == uttInfoInMinibatch[i].size());
for (size_t j = 0; j < uttInfo[i].size(); ++j)
{
wstring uttID = uttInfo[i][j].first;
// Checks if we have derivatives.
if (m_uttPool.find(uttID) == m_uttPool.end()
|| m_uttPool[uttID].hasDerivative == false)
{
RuntimeError("Derivatives are not ready for utterance:"
" %S\n", uttID.c_str());
}
// Assign the derivatives.
assert(uttID == uttInfoInMinibatch[i][j].first);
size_t startFrame = uttInfoInMinibatch[i][j].second.first;
size_t startFrameInUtt = m_uttPool[uttID].progress;
size_t numFrames = uttInfoInMinibatch[i][j].second.second;
for (size_t k = 0; k < numFrames; ++k)
{
size_t posStart = startFrameInUtt + k;
for (size_t l = 0;
l < m_uttPool[uttID].posterior[posStart].size(); ++l)
{
size_t pdf_id =
m_uttPool[uttID].posterior[posStart][l].first;
assert(pdf_id < m_transModel.NumPdfs());
derivatives(pdf_id,
(startFrame + k) * m_numUttsPerMinibatch + i) -=
m_uttPool[uttID].posterior[posStart][l].second;
}
}
m_currentObj += m_uttPool[uttID].objective
* numFrames / m_uttPool[uttID].uttLength;
m_uttPool[uttID].progress += numFrames;
assert(m_uttPool[uttID].progress <= m_uttPool[uttID].uttLength);
if (m_uttPool[uttID].progress == m_uttPool[uttID].uttLength)
{
m_uttPool.erase(uttID);
}
}
}
// Checks if we need to move data to GPU.
if (derivativesOut->GetDeviceId() >= 0)
{
derivatives.TransferFromDeviceToDevice(
CPUDEVICE, derivativesOut->GetDeviceId(), true, false, false);
}
derivativesOut->SetValue(derivatives);
// Keeps the utterance information so that we can check it next time
// when we give the objectives.
m_currentUttInfo = uttInfo;
// Checks if we need to read more loglikelihoods.
m_needLikelihood = false;
m_minCompleteMinibatchIndex -= 1;
if (m_minCompleteMinibatchIndex <= 0)
{
m_needLikelihood = true;
m_minibatchIndex = 1;
m_lastCompleteMinibatch.assign(m_numUttsPerMinibatch, 0);
// Undoes the logLikelihood progress for partial utterances.
for (auto iter = m_uttPool.begin(); iter != m_uttPool.end(); ++iter)
{
if (iter->second.hasDerivative == false)
{
iter->second.progress = 0;
}
}
}
return true;
}
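Since a minibatch may cover only part of an utterance, GetDerivative() attributes the objective fractionally by numFrames / uttLength, so the per-minibatch contributions of one utterance sum to its total objective. A worked example with invented numbers:

#include <cassert>
int main()
{
// Illustrative only: an utterance of 100 frames with total objective 4.0
// contributes 4.0 * 25 / 100 = 1.0 to a minibatch covering 25 of its
// frames; four such minibatches together account for the full 4.0.
double objective = 4.0;
double contribution = objective * 25 / 100;
assert(contribution == 1.0);
return 0;
}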
template<class ElemType>
bool KaldiSequenceTrainingIO<ElemType>::GetObjective(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
Matrix<ElemType>* objectivesIn)
{
assert(objectivesIn != NULL);
// Checks utterance information.
bool match = true;
if (uttInfo.size() == m_currentUttInfo.size())
{
for (size_t i = 0; i < uttInfo.size(); ++i)
{
if (uttInfo[i].size() != m_currentUttInfo[i].size())
{
match = false;
break;
}
for (size_t j = 0; j < uttInfo[i].size(); ++j)
{
if (uttInfo[i][j].first != m_currentUttInfo[i][j].first ||
uttInfo[i][j].second != m_currentUttInfo[i][j].second)
{
match = false;
break;
}
}
}
}
else
{
match = false;
}
if (!match)
{
RuntimeError("Current objective does not correspond to the"
" minibatch utterance information, perhaps you did not"
" run GetObjective() right after GetDerivatives()?");
}
// Sets the objectives...
objectivesIn->Resize(1, 1);
objectivesIn->SetValue(m_currentObj);
return true;
}
template class KaldiSequenceTrainingIO<float>;

View file

@@ -2,6 +2,7 @@
#include "kaldi.h"
#include "Matrix.h"
#include "basetypes.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@@ -12,50 +13,93 @@ class KaldiSequenceTrainingIO
{
private:
bool m_oneSilenceClass;
bool m_currentUttHasDeriv;
bool m_derivRead;
bool m_objRead;
bool m_needLikelihood;
size_t m_numUttsPerMinibatch;
wstring m_trainCriterion;
wstring m_currentUttID;
ElemType m_oldAcousticScale;
ElemType m_acousticScale;
ElemType m_lmScale;
ElemType m_objective;
std::vector<kaldi::int32> m_silencePhones;
size_t m_currentUttLength;
kaldi::TransitionModel m_transModel;
kaldi::Posterior m_posteriors;
kaldi::RandomAccessCompactLatticeReader* m_denlatReader; /*denominator lattices*/
kaldi::RandomAccessInt32VectorReader* m_aliReader; /*alignment*/
kaldi::RandomAccessCompactLatticeReader* m_denlatReader;
kaldi::RandomAccessInt32VectorReader* m_aliReader;
struct UtteranceDerivativeUnit
{
bool hasDerivative;
size_t uttLength;
size_t progress;
size_t streamID;
Matrix<ElemType> logLikelihood;
kaldi::Posterior posterior;
ElemType objective;
UtteranceDerivativeUnit() : logLikelihood(CPUDEVICE)
{
hasDerivative = false;
uttLength = 0;
progress = 0;
streamID = 0;
}
};
ElemType m_currentObj;
int m_minCompleteMinibatchIndex;
size_t m_minibatchIndex;
std::vector<size_t> m_lastCompleteMinibatch;
std::vector<std::vector<std::pair<wstring, size_t>>> m_currentUttInfo;
unordered_map<wstring, UtteranceDerivativeUnit> m_uttPool;
// Rescores the lattice with the latest posteriors from the neural network.
void LatticeAcousticRescore(const std::vector<kaldi::int32>& stateTimes,
const Matrix<ElemType>& outputs, kaldi::Lattice* lat);
void LatticeAcousticRescore(
const std::vector<kaldi::int32>& stateTimes,
const Matrix<ElemType>& outputs, kaldi::Lattice* lat) const;
// <uttInfoInMinibatch> is a vector of vector of the following:
// uttID startFrameIndexInMinibatch numFrames
void ProcessUttInfo(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag,
std::vector<std::vector<std::pair<
wstring, std::pair<size_t, size_t>>>>* uttInfoInMinibatch) const;
bool ComputeDerivative(const wstring& uttID);
public:
// Constructor.
KaldiSequenceTrainingIO(const wstring& denlatRspecifier, const wstring& aliRspecifier,
const wstring& transModelFilename, const wstring& silencePhoneStr,
KaldiSequenceTrainingIO(const wstring& denlatRspecifier,
const wstring& aliRspecifier,
const wstring& transModelFilename,
const wstring& silencePhoneStr,
const wstring& trainCriterion,
ElemType oldAcousticScale,
ElemType acousticScale,
ElemType lmScale,
bool oneSilenceClass);
bool oneSilenceClass,
size_t numberOfuttsPerMinibatch);
// Destructor.
~KaldiSequenceTrainingIO();
bool HasDerivatives(const wstring& uttID);
bool NeedLikelihoodToComputeDerivative() const { return m_needLikelihood; }
bool ComputeDerivatives(const wstring& uttID, const Matrix<ElemType>& outputs);
bool SetLikelihood(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag);
// Gets the computed derivatives for given utterance.
void GetDerivatives(size_t startFrame, size_t endFrame, size_t mbSize,
const std::wstring& uttID, Matrix<ElemType>& derivatives);
bool GetDerivative(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag,
Matrix<ElemType>* derivativesOut);
// Gets the computed objectives for given utterance.
void GetObjectives(size_t startFrame, size_t endFrame,
const std::wstring& uttID, Matrix<ElemType>& derivatives);
bool GetObjective(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
Matrix<ElemType>* objectivesIn);
};
}}}

View file

@@ -1631,13 +1631,17 @@ protected:
// Tries to read an utterance and run forward computation on the
// whole utterance.
assert(trainSetDataReader != NULL);
std::wstring uttID;
if (trainSetDataReader->GetForkedUtterance(uttID, *inputMatrices))
std::vector<std::vector<std::pair<wstring, size_t>>> uttInfo;
Matrix<ElemType> sentenceBoundary;
std::vector<MinibatchPackingFlag> minibatchPackingFlag;
while (trainSetDataReader->GetMinibatchCopy(uttInfo, *inputMatrices,
sentenceBoundary,
minibatchPackingFlag))
{
UpdateEvalTimeStamps(FeatureNodes);
std::vector<ComputationNodePtr>* outputNodes = net.OutputNodes();
if (outputNodes->size() < 1)
{
throw std::logic_error("no output node was found.");
}
@@ -1645,8 +1649,11 @@ protected:
net.SetActualMiniBatchSize(actualMBSize);
net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter());
trainSetDataReader->SetSentenceSegBatch(net.SentenceBoundary(), net.MinibatchPackingFlags());
net.Evaluate((*outputNodes)[0]); // Only evaluate the first output
trainSetDataReader->ComputeDerivativeFeatures(uttID, (*outputNodes)[0]->FunctionValues());
trainSetDataReader->SetNetOutput(uttInfo,
(*outputNodes)[0]->FunctionValues(),
sentenceBoundary,
minibatchPackingFlag);
}
}