diff --git a/Common/DataReader.cpp b/Common/DataReader.cpp index 2739f7ea6..f034cd1a5 100644 --- a/Common/DataReader.cpp +++ b/Common/DataReader.cpp @@ -225,20 +225,28 @@ void DataReader::SetRandomSeed(int seed) } template -bool DataReader::GetForkedUtterance(std::wstring& uttID, std::map*>& matrices) +bool DataReader::GetMinibatchCopy( + std::vector>>& uttInfo, + std::map*>& matrices, + Matrix& sentenceBegin, + std::vector& minibatchPackingFlag) { bool ans = false; for (size_t i = 0; i < m_ioNames.size(); i++) - ans = (m_dataReader[m_ioNames[i]]->GetForkedUtterance(uttID, matrices) || ans); + ans = (m_dataReader[m_ioNames[i]]->GetMinibatchCopy(uttInfo, matrices, sentenceBegin, minibatchPackingFlag) || ans); return ans; } template -bool DataReader::ComputeDerivativeFeatures(const std::wstring& uttID, const Matrix& outputs) +bool DataReader::SetNetOutput( + const std::vector>>& uttInfo, + const Matrix& outputs, + const Matrix& sentenceBegin, + const std::vector& minibatchPackingFlag) { bool ans = false; for (size_t i = 0; i < m_ioNames.size(); i++) - ans = (m_dataReader[m_ioNames[i]]->ComputeDerivativeFeatures(uttID, outputs) || ans); + ans = (m_dataReader[m_ioNames[i]]->SetNetOutput(uttInfo, outputs, sentenceBegin, minibatchPackingFlag) || ans); return ans; } diff --git a/Common/Include/DataReader.h b/Common/Include/DataReader.h index e269844b2..2ed8483dd 100644 --- a/Common/Include/DataReader.h +++ b/Common/Include/DataReader.h @@ -85,14 +85,27 @@ public: void SetDoRandomize(bool b){ mDoRandomize = b; } - // Gets utterance before getting the actual minibatch, which will not affect - // getting the minibatches. This can be useful in sequence training. - virtual bool GetForkedUtterance(std::wstring& , std::map*>& ) { return false; } + // Gets a copy of the minibatch for the forward computation. This can be + // useful if some of the computation has to happen in the reader. 
+ virtual bool GetMinibatchCopy( + std::vector>>& /*uttInfo*/, + std::map*>& /*matrices*/, + Matrix& /*sentenceBegin*/, + std::vector& /*minibatchPackingFlag*/) + { + return false; + } - // Computes certain derivatives given outputs from neural networks, which - // will later be fed to the neural network as features. This can be useful - // in sequence training. - virtual bool ComputeDerivativeFeatures(const std::wstring& , const Matrix& ) { return false; } + // Sets the neural network output to the reader. This can be useful if some + // of the computation has to happen in the reader. + virtual bool SetNetOutput( + const std::vector>>& /*uttInfo*/, + const Matrix& /*outputs*/, + const Matrix& /*sentenceBegin*/, + const std::vector& /*minibatchPackingFlag*/) + { + return false; + } }; // GetReader - get a reader type from the DLL @@ -193,14 +206,21 @@ public: virtual bool DataEnd(EndDataType endDataType); - // Gets utterance before getting the actual minibatch, which will not affect - // getting the minibatches. This can be useful in sequence training. - virtual bool GetForkedUtterance(std::wstring& uttID, std::map*>& matrices); + // Gets a copy of the minibatch for the forward computation. This can be + // useful if some of the computation has to happen in the reader. + virtual bool GetMinibatchCopy( + std::vector>>& uttInfo, + std::map*>& matrices, + Matrix& sentenceBegin, + std::vector& minibatchPackingFlag); - // Computes certain derivatives given outputs from neural networks, which - // will later be fed to the neural network as features. This can be useful - // in sequence training. - virtual bool ComputeDerivativeFeatures(const std::wstring& uttID, const Matrix& outputs); + // Sets the neural network output to the reader. This can be useful if some + // of the computation has to happen in the reader. 
+ virtual bool SetNetOutput( + const std::vector>>& uttInfo, + const Matrix& outputs, + const Matrix& sentenceBegin, + const std::vector& minibatchPackingFlag); void SetSentenceSegBatch(Matrix & sentenceBegin, vector& minibatchPackingFlag); diff --git a/DataReader/Kaldi2Reader/HTKMLFReader.cpp b/DataReader/Kaldi2Reader/HTKMLFReader.cpp index aab749d5f..be8d7dc67 100644 --- a/DataReader/Kaldi2Reader/HTKMLFReader.cpp +++ b/DataReader/Kaldi2Reader/HTKMLFReader.cpp @@ -49,9 +49,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_frameSource = NULL; m_lattices = NULL; m_sequenceTrainingIO = NULL; + m_minibatchBuffer.resize(0); + m_minibatchBufferIndex = 0; + m_minibatchBufferLeftovers = 0; m_noData = false; m_convertLabelsToTargets = false; m_doSeqTrain = false; + m_getMinibatchCopy = false; if (readerConfig.Exists("legacyMode")) { @@ -60,7 +64,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If is false, throw away any utterance that is longer // than the specified . - m_maxUtteranceLength = readerConfig("maxUtteranceLength", "1500"); + m_maxUtteranceLength = readerConfig("maxUtteranceLength", "10000"); // m_truncated: // If true, truncate utterances to fit the minibatch size. Otherwise @@ -172,7 +176,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_sequenceTrainingIO = new KaldiSequenceTrainingIO( denlatRspecifier, aliRspecifier, transModelFilename, silencePhoneStr, m_seqTrainCriterion, oldAcousticScale, - acousticScale, lmScale, oneSilenceClass); + acousticScale, lmScale, + oneSilenceClass, m_numberOfuttsPerMinibatch); // Scans the configurations to get "seqTrainDeriv" type input and // "seqTrainObj" type input. 
Both are feature nodes, we feed derivatives @@ -293,6 +298,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { } m_featureNameToIdMap[featureNames[i]] = iFeat; + assert(iFeat == m_featureIdToNameMap.size()); + m_featureIdToNameMap.push_back(featureNames[i]); scriptpaths.push_back(new msra::asr::FeatureSection(thisFeature("scpFile"), thisFeature("rx"), thisFeature("featureTransform", ""))); m_featureNameToDimMap[featureNames[i]] = m_featDims[i]; @@ -334,6 +341,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { statelistpaths.push_back(thisLabel("labelMappingFile",L"")); m_labelNameToIdMap[labelNames[i]] = iLabel; + assert(iLabel == m_labelIdToNameMap.size()); + m_labelIdToNameMap.push_back(labelNames[i]); m_labelNameToDimMap[labelNames[i]] = m_labelDims[i]; mlfpaths.clear(); mlfpaths.push_back(thisLabel("mlfFile")); @@ -599,6 +608,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { } m_featureNameToIdMap[featureNames[i]]= iFeat; + assert(iFeat == m_featureIdToNameMap.size()); + m_featureIdToNameMap.push_back(featureNames[i]); scriptpaths.push_back(new msra::asr::FeatureSection(thisFeature("scpFile"), thisFeature("rx"), thisFeature("featureTransform", ""))); m_featureNameToDimMap[featureNames[i]] = realDims[i]; @@ -736,6 +747,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void HTKMLFReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) { m_mbSize = mbSize; + m_currentMBSize = mbSize; if (m_trainOrTest) { @@ -788,7 +800,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_mbiter = NULL; } msra::dbn::minibatchsource* source = m_frameSource; - m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, mbSize, datapasses); + size_t currentMBSize = (m_framemode == true) ? mbSize : 1; + m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, currentMBSize, datapasses); // Clears feature and label buffer. 
if (!m_featuresBufferMultiIO.empty()) @@ -882,7 +895,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // if startFrame = 5, endFrame = 10, then we copy frames 5, 6, 7, 8, 9. template bool HTKMLFReader::PopulateUtteranceInMinibatch( - std::map*>& matrices, + const std::map*>& matrices, size_t uttIndex, size_t startFrame, size_t endFrame, size_t mbSize, size_t mbOffset) { @@ -897,15 +910,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { return false; } - if (m_doSeqTrain && m_numberOfuttsPerMinibatch > 1) - { - LogicError("nbrUttsInEachRecurrentIter has to be 1 in sequence training.\n"); - } size_t numOfFea = m_featuresBufferMultiIO.size(); size_t numOfLabel = m_labelsBufferMultiIO.size(); - typename std::map*>::iterator iter; - for (iter = matrices.begin(); iter != matrices.end(); iter++) + for (auto iter = matrices.begin(); iter != matrices.end(); iter++) { if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) { // Features. @@ -972,65 +980,41 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } } - else if (m_doSeqTrain) - { - // TODO(GUOGUO): if we are going to allow "m_truncate" for - // sequence training, we will have to modify the - // following -- the following always assume we - // start filling the minibatch from index 0. - // If we do sequence training we have to populate the derivative - // features as well as the objective features. But unlike the - // features and labels, we put them in to directly. - // We assume we only process one utterance at a time in the - // current implementation. 
- assert(uttIndex == 0); - if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainDeriv) - { - wstring uttID = m_uttInfo[uttIndex][0].first; - Matrix& data = *matrices[iter->first]; - if (m_sequenceTrainingIO->HasDerivatives(uttID)) - m_sequenceTrainingIO->GetDerivatives(startFrame, endFrame, mbSize, uttID, data); - else - { - data.Resize(data.GetNumRows(), mbSize); - data.SetValue(0); - } - } - else if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainObj) - { - wstring uttID = m_uttInfo[uttIndex][0].first; - Matrix& data = *matrices[iter->first]; - if (m_sequenceTrainingIO->HasDerivatives(uttID)) - m_sequenceTrainingIO->GetObjectives(startFrame, endFrame, uttID, data); - else - data.SetValue(0); - } - } } return success; } template - bool HTKMLFReader::GetMinibatchToTrainOrTest(std::map*>& matrices) + bool HTKMLFReader::GetOneMinibatchToTrainOrTestDataBuffer( + const std::map*>& matrices) { bool skip = false; // On first minibatch, check if we have input for given names. if (m_checkDictionaryKeys) { - std::map::iterator iter; for (auto iter = matrices.begin(); iter != matrices.end(); iter++) { if (m_nameToTypeMap.find(iter->first) == m_nameToTypeMap.end()) { - throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %S not found in reader - cannot generate input\n", iter->first.c_str())); + throw std::runtime_error(msra::strfun::strprintf( + "minibatch requested for input node %S not found in" + "reader - cannot generate input\n", iter->first.c_str())); } } m_checkDictionaryKeys=false; } - size_t currentMBSize = m_mbSize; + // If we are doing sequence training, we need to keep the utterance + // information. + if (m_doSeqTrain) + { + m_minibatchUttInfo.assign(m_numberOfuttsPerMinibatch, + std::vector>(0)); + } + + m_currentMBSize = m_mbSize; do { // Checks if we have finished all the utterances. 
@@ -1050,28 +1034,28 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - // If is true, is - // If is false, equals to the longest + // If is true, is + // If is false, equals to the longest // utterance in the minibatch. if (!m_truncated) { - currentMBSize = 0; + m_currentMBSize = 0; for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++) { - if (m_currentBufferFrames[i] > currentMBSize) + if (m_currentBufferFrames[i] > m_currentMBSize) { - currentMBSize = m_currentBufferFrames[i]; + m_currentMBSize = m_currentBufferFrames[i]; } } } // We initialize the sentence boundary information before we process // the utterances. - m_sentenceBegin.Resize(m_numberOfuttsPerMinibatch, currentMBSize); - m_minibatchPackingFlag.resize(currentMBSize); + m_sentenceBegin.Resize(m_numberOfuttsPerMinibatch, m_currentMBSize); + m_minibatchPackingFlag.resize(m_currentMBSize); for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++) { - for (size_t j = 0; j < currentMBSize; j++) + for (size_t j = 0; j < m_currentMBSize; j++) { m_sentenceBegin.SetValue(i, j, (ElemType) SENTENCE_MIDDLE); } @@ -1085,7 +1069,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t startFrame = m_processedFrame[i]; size_t endFrame = 0; - if ((startFrame + currentMBSize) < m_toProcess[i]) + if ((startFrame + m_currentMBSize) < m_toProcess[i]) { // There is only 1 case: // 1. is false, and is true. 
@@ -1099,11 +1083,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_minibatchPackingFlag[0] |= MinibatchPackingFlag::UtteranceStart; } - endFrame = startFrame + currentMBSize; - bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, currentMBSize); - m_processedFrame[i] += currentMBSize; + endFrame = startFrame + m_currentMBSize; + bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize); + if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); } + m_processedFrame[i] += m_currentMBSize; } - else if ((startFrame + currentMBSize) == m_toProcess[i]) + else if ((startFrame + m_currentMBSize) == m_toProcess[i]) { // There are 3 cases: // 1. is false, and is true, @@ -1132,9 +1117,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Now puts the utterance into the minibatch, and loads the // next one. - endFrame = startFrame + currentMBSize; - bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, currentMBSize); - m_processedFrame[i] += currentMBSize; + endFrame = startFrame + m_currentMBSize; + bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize); + if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); } + m_processedFrame[i] += m_currentMBSize; bool reNewSucc = ReNewBufferForMultiIO(i); } else @@ -1151,7 +1137,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Checks if we have reached the end of the minibatch. if (startFrame == m_toProcess[i]) { - for (size_t k = 0; k < currentMBSize; k++) + for (size_t k = 0; k < m_currentMBSize; k++) { m_sentenceBegin.SetValue(i, k, (ElemType) NO_LABELS); m_minibatchPackingFlag[k] |= MinibatchPackingFlag::NoLabel; @@ -1159,7 +1145,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Populates with real features, the // following implementation is not efficient... 
assert(m_toProcess[i] > 0); - PopulateUtteranceInMinibatch(matrices, i, 0, 1, currentMBSize, k); + PopulateUtteranceInMinibatch(matrices, i, 0, 1, m_currentMBSize, k); } continue; } @@ -1194,13 +1180,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { } endFrame = m_toProcess[i]; size_t currentMBFilled = endFrame - startFrame; - bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, currentMBSize); + bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize); + if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); } m_processedFrame[i] += currentMBFilled; bool reNewSucc = ReNewBufferForMultiIO(i); // Third, if the next utterance can fit into the current // minibatch, we also pack the next utterance. - while (reNewSucc && (currentMBFilled + m_toProcess[i] <= currentMBSize)) + while (reNewSucc && (currentMBFilled + m_toProcess[i] <= m_currentMBSize)) { // Sets the utterance boundary. assert(currentMBFilled + m_toProcess[i] <= m_sentenceBegin.GetNumCols()); @@ -1208,7 +1195,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_minibatchPackingFlag[currentMBFilled] |= MinibatchPackingFlag::UtteranceStart; m_sentenceBegin.SetValue(i, currentMBFilled + m_toProcess[i] - 1, (ElemType)SENTENCE_END); m_minibatchPackingFlag[currentMBFilled + m_toProcess[i] - 1] |= MinibatchPackingFlag::UtteranceEnd; - populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_toProcess[i], currentMBSize, currentMBFilled); + populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_toProcess[i], m_currentMBSize, currentMBFilled); + if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); } assert(m_processedFrame[i] == 0); m_processedFrame[i] = m_toProcess[i]; currentMBFilled += m_toProcess[i]; @@ -1219,9 +1207,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { // minibatch is not full. 
if (reNewSucc && !m_framemode && m_truncated) { - populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, currentMBSize - currentMBFilled, currentMBSize, currentMBFilled); - m_processedFrame[i] += currentMBSize - currentMBFilled; - if (currentMBFilled < currentMBSize) + populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_currentMBSize - currentMBFilled, m_currentMBSize, currentMBFilled); + if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); } + m_processedFrame[i] += m_currentMBSize - currentMBFilled; + if (currentMBFilled < m_currentMBSize) { m_sentenceBegin.SetValue(i, currentMBFilled, (ElemType)SENTENCE_BEGIN); m_minibatchPackingFlag[currentMBFilled] |= MinibatchPackingFlag::UtteranceStart; @@ -1229,7 +1218,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else { - for (size_t k = currentMBFilled; k < currentMBSize; k++) + for (size_t k = currentMBFilled; k < m_currentMBSize; k++) { m_sentenceBegin.SetValue(i, k, (ElemType) NO_LABELS); m_minibatchPackingFlag[k] |= MinibatchPackingFlag::NoLabel; @@ -1237,29 +1226,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Populates with real features, the // following implementation is not efficient... 
assert(m_toProcess[i] > 0); - PopulateUtteranceInMinibatch(matrices, i, 0, 1, currentMBSize, k); + PopulateUtteranceInMinibatch(matrices, i, 0, 1, m_currentMBSize, k); } } } } - typename std::map*>::iterator iter; - for (iter = matrices.begin(); iter != matrices.end(); iter++) - { - Matrix& data = *matrices[iter->first]; - if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) - { - size_t id = m_featureNameToIdMap[iter->first]; - size_t dim = m_featureNameToDimMap[iter->first]; - data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_featuresBufferMultiIO[id] , matrixFlagNormal); - } - else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category) - { - size_t id = m_labelNameToIdMap[iter->first]; - size_t dim = m_labelNameToDimMap[iter->first]; - data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_labelsBufferMultiIO[id], matrixFlagNormal); - } - } skip=false; } while(skip); @@ -1267,6 +1239,209 @@ namespace Microsoft { namespace MSR { namespace CNTK { return true; } + template + bool HTKMLFReader::ShouldCopyMinibatchFromBuffer() + { + if (m_doSeqTrain) + { + // If is false, then we should copy data from + // buffer for back-propagation. + if (m_getMinibatchCopy == false && m_minibatchBuffer.size() > 0) + { + m_minibatchBufferIndex = 0; + m_minibatchBufferLeftovers = m_minibatchBuffer.size() - 1; // Will pop one more. + return true; + } + + // If is true, we first have to re-compute + // the likelihood for the frames that are already in the buffer. + if (m_getMinibatchCopy == true && m_minibatchBufferLeftovers > 0) + { + if (m_minibatchBufferLeftovers == m_minibatchBuffer.size()) + { + m_minibatchBufferIndex = 0; + } + else + { + m_minibatchBufferIndex += 1; + } + m_minibatchBufferLeftovers -= 1; + return true; + } + } + + return false; + } + + template + void HTKMLFReader::CopyMinibatchToBuffer() + { + MinibatchBufferUnit currentMinibatch; + + // Stores variables realted to the current minibatch. 
+ currentMinibatch.sentenceBegin.SetValue(m_sentenceBegin); + currentMinibatch.minibatchPackingFlag = m_minibatchPackingFlag; + currentMinibatch.currentMBSize = m_currentMBSize; + currentMinibatch.minibatchUttInfo = m_minibatchUttInfo; + + size_t size = m_currentMBSize * m_numberOfuttsPerMinibatch; + + // Copies features. + currentMinibatch.features.resize(0); + for (size_t i = 0; i < m_featuresBufferMultiIO.size(); ++i) + { + std::vector tmpFeatures(m_featuresBufferMultiIO[i], + m_featuresBufferMultiIO[i] + size * m_featureNameToDimMap[m_featureIdToNameMap[i]]); + currentMinibatch.features.push_back(tmpFeatures); + } + + // Copies labels. + currentMinibatch.labels.resize(0); + for (size_t i = 0; i < m_labelsBufferMultiIO.size(); ++i) + { + std::vector tmpLabels(m_labelsBufferMultiIO[i], + m_labelsBufferMultiIO[i] + size * m_labelNameToDimMap[m_labelIdToNameMap[i]]); + currentMinibatch.labels.push_back(tmpLabels); + } + + m_minibatchBuffer.push_back(currentMinibatch); + } + + template + void HTKMLFReader::CopyMinibatchFromBufferToMatrix( + size_t index, + std::map*>& matrices) + { + assert(m_minibatchBuffer.size() > index); + + // Restores the variables related to the minibatch. + m_sentenceBegin.SetValue(m_minibatchBuffer[index].sentenceBegin); + m_minibatchPackingFlag = m_minibatchBuffer[index].minibatchPackingFlag; + m_currentMBSize = m_minibatchBuffer[index].currentMBSize; + m_minibatchUttInfo = m_minibatchBuffer[index].minibatchUttInfo; + + // Copies data to the matrix. 
+ for (auto iter = matrices.begin(); iter != matrices.end(); iter++) + { + Matrix& data = *matrices[iter->first]; + if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) + { + size_t id = m_featureNameToIdMap[iter->first]; + size_t dim = m_featureNameToDimMap[iter->first]; + assert(id < m_minibatchBuffer[index].features.size()); + data.SetValue(dim, + m_minibatchBuffer[index].features[id].size() / dim, + m_minibatchBuffer[index].features[id].data(), + matrixFlagNormal); + } + else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category) + { + size_t id = m_labelNameToIdMap[iter->first]; + size_t dim = m_labelNameToDimMap[iter->first]; + assert(id < m_minibatchBuffer[index].labels.size()); + data.SetValue(dim, + m_minibatchBuffer[index].labels[id].size() / dim, + m_minibatchBuffer[index].labels[id].data(), + matrixFlagNormal); + } + else if (m_doSeqTrain && !m_getMinibatchCopy) + { + if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainDeriv) + { + m_sequenceTrainingIO->GetDerivative( + m_minibatchUttInfo, m_sentenceBegin, + m_minibatchPackingFlag, matrices[iter->first]); + } + else if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainObj) + { + m_sequenceTrainingIO->GetObjective(m_minibatchUttInfo, + matrices[iter->first]); + } + } + } + + // If we are not in the minibatch copy mode, then we can remove the + // minibatch from buffer. 
+ if (m_getMinibatchCopy == false) + { + assert(index == 0); + m_minibatchBuffer.pop_front(); + } + } + + template + void HTKMLFReader::CopyMinibatchToMatrix( + size_t size, + const vector& featureBuffer, + const vector& labelBuffer, + std::map*>& matrices) const + { + for (auto iter = matrices.begin(); iter != matrices.end(); iter++) + { + Matrix& data = *matrices[iter->first]; + if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) + { + size_t id = m_featureNameToIdMap[iter->first]; + size_t dim = m_featureNameToDimMap[iter->first]; + assert(id < featureBuffer.size()); + data.SetValue(dim, size, featureBuffer[id] , matrixFlagNormal); + } + else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category) + { + size_t id = m_labelNameToIdMap[iter->first]; + size_t dim = m_labelNameToDimMap[iter->first]; + assert(id < labelBuffer.size()); + data.SetValue(dim, size, labelBuffer[id], matrixFlagNormal); + } + else if (m_doSeqTrain) + { + if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainDeriv) + { + data.Resize(data.GetNumRows(), m_currentMBSize); + data.SetValue(0); + } + else if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainObj) + { + data.SetValue(0); + } + } + } + } + + template + bool HTKMLFReader::GetMinibatchToTrainOrTest( + std::map*>& matrices) + { + // We either copy a new minibatch from buffer or read one from minibatch + // iterator. + bool success = false; + if (ShouldCopyMinibatchFromBuffer()) + { + CopyMinibatchFromBufferToMatrix(m_minibatchBufferIndex, matrices); + return true; + } + else + { + success = GetOneMinibatchToTrainOrTestDataBuffer(matrices); + if (success) + { + CopyMinibatchToMatrix( + m_currentMBSize * m_numberOfuttsPerMinibatch, + m_featuresBufferMultiIO, m_labelsBufferMultiIO, matrices); + } + + // Checks if we need to move the current minibatch to buffer. 
+ if (success && m_getMinibatchCopy) + { + CopyMinibatchToBuffer(); + } + + return success; + } + + return false; + } + template bool HTKMLFReader::GetMinibatchToWrite(std::map*>& matrices) { @@ -1567,82 +1742,60 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!(*m_mbiter)) m_noData = true; - return true; + return true; } // Gets a copy of the utterance that corresponds to the current minibatches, // which will be used to do a neural network forward computation. template - bool HTKMLFReader::GetForkedUtterance(std::wstring& uttID, - std::map*>& matrices) + bool HTKMLFReader::GetMinibatchCopy( + std::vector>>& uttInfo, + std::map*>& matrices, + Matrix& sentenceBegin, + std::vector& minibatchPackingFlag) { - if (!m_doSeqTrain) + // We need to get a "copy" of the minibatch to do the forward + // computation for sequence training. + if (m_doSeqTrain) { + assert(m_framemode == false); + if (m_sequenceTrainingIO->NeedLikelihoodToComputeDerivative()) + { + m_getMinibatchCopy = true; + if (GetMinibatchToTrainOrTest(matrices)) + { + sentenceBegin.SetValue(m_sentenceBegin); + minibatchPackingFlag = m_minibatchPackingFlag; + uttInfo = m_minibatchUttInfo; + m_getMinibatchCopy = false; + return true; + } + m_getMinibatchCopy = false; + } return false; } - assert(m_framemode == false); - - // For the moment we only support single utterance. - if (m_numberOfuttsPerMinibatch != 1) - { - RuntimeError("The current sequence training implementation does not support multiple utterances.\n"); - } - - // Under our current assumption, we only have one utterance at a time. - uttID = m_uttInfo[0][0].first; - if (!m_sequenceTrainingIO->HasDerivatives(uttID)) - { - size_t startFrame = 0; - size_t endFrame = m_uttInfo[0][0].second; - size_t currentMBSize = endFrame - startFrame; - bool populateSucc = PopulateUtteranceInMinibatch( - matrices, 0, startFrame, endFrame, currentMBSize); - if (!populateSucc) - { - return false; - } - - // Sets sentence boundary. 
- m_sentenceBegin.Resize(1, currentMBSize); - m_minibatchPackingFlag.resize(currentMBSize); - for (size_t i = 0; i < currentMBSize; i++) - { - m_sentenceBegin.SetValue(0, i, (ElemType) SENTENCE_MIDDLE); - } - std::fill(m_minibatchPackingFlag.begin(), m_minibatchPackingFlag.end(), MinibatchPackingFlag::None); - m_sentenceBegin.SetValue(0, 0, (ElemType)SENTENCE_BEGIN); - m_sentenceBegin.SetValue(0, m_sentenceBegin.GetNumCols() - 1, (ElemType) SENTENCE_END); - m_minibatchPackingFlag[0] = MinibatchPackingFlag::UtteranceStart; - m_minibatchPackingFlag[m_sentenceBegin.GetNumCols() - 1] = MinibatchPackingFlag::UtteranceEnd; - - typename std::map*>::iterator iter; - for (iter = matrices.begin(); iter != matrices.end(); iter++) - { - Matrix& data = *matrices[iter->first]; - if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) - { - size_t id = m_featureNameToIdMap[iter->first]; - size_t dim = m_featureNameToDimMap[iter->first]; - data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_featuresBufferMultiIO[id] , matrixFlagNormal); - } - else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category) - { - size_t id = m_labelNameToIdMap[iter->first]; - size_t dim = m_labelNameToDimMap[iter->first]; - data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_labelsBufferMultiIO[id], matrixFlagNormal); - } - } - return true; - } - return false; } template - bool HTKMLFReader::ComputeDerivativeFeatures(const std::wstring& uttID, - const Matrix& outputs) + bool HTKMLFReader::SetNetOutput( + const std::vector>>& uttInfo, + const Matrix& outputs, + const Matrix& sentenceBegin, + const std::vector& minibatchPackingFlag) { - return m_sequenceTrainingIO->ComputeDerivatives(uttID, outputs); + // Set the likelihoods for the utterance with which we can comput the + // derivatives. Note that the minibatch may only contain partial output + // for the utterance, takes care of "pasting" + // them together. 
+ if (m_doSeqTrain) + { + assert(m_framemode == false); + return m_sequenceTrainingIO->SetLikelihood(uttInfo, outputs, + sentenceBegin, + minibatchPackingFlag); + } + return false; } diff --git a/DataReader/Kaldi2Reader/HTKMLFReader.h b/DataReader/Kaldi2Reader/HTKMLFReader.h index 9298d555e..8ee1c5777 100644 --- a/DataReader/Kaldi2Reader/HTKMLFReader.h +++ b/DataReader/Kaldi2Reader/HTKMLFReader.h @@ -24,14 +24,25 @@ private: msra::dbn::latticesource* m_lattices; map m_latticeMap; - // Sequence training related. Note that for now we only support single - // utterance in sequence training. But the utterance information holders - // are designed as if they support multiple utterances -- in case we will - // extend this soon. + // Sequence training realted members. + struct MinibatchBufferUnit + { + std::vector> features; + std::vector> labels; + Matrix sentenceBegin; + vector minibatchPackingFlag; + std::vector>> minibatchUttInfo; + size_t currentMBSize; + }; bool m_doSeqTrain; + bool m_getMinibatchCopy; + size_t m_minibatchBufferIndex; + size_t m_minibatchBufferLeftovers; wstring m_seqTrainCriterion; KaldiSequenceTrainingIO* m_sequenceTrainingIO; + std::deque m_minibatchBuffer; std::vector>> m_uttInfo; + std::vector>> m_minibatchUttInfo; vector m_sentenceEnd; bool m_readAhead; @@ -42,6 +53,7 @@ private: size_t m_numberOfuttsPerMinibatch; size_t m_actualnumberOfuttsPerMinibatch; size_t m_mbSize; + size_t m_currentMBSize; vector m_currentBufferFrames; vector m_toProcess; vector m_switchFrame; @@ -72,6 +84,8 @@ private: std::map m_nameToTypeMap; std::map m_featureNameToDimMap; std::map m_labelNameToDimMap; + std::vector m_featureIdToNameMap; + std::vector m_labelIdToNameMap; // for writing outputs to files (standard single input/output network) - deprecate eventually bool m_checkDictionaryKeys; bool m_convertLabelsToTargets; @@ -89,10 +103,22 @@ private: void PrepareForSequenceTraining(const ConfigParameters& config); bool GetMinibatchToTrainOrTest(std::map*>& 
matrices); + bool GetOneMinibatchToTrainOrTestDataBuffer(const std::map*>& matrices); bool GetMinibatchToWrite(std::map*>& matrices); - bool PopulateUtteranceInMinibatch(std::map*>& matrices, size_t uttIndex, size_t startFrame, size_t endFrame, size_t mbSize, size_t mbOffset = 0); + bool PopulateUtteranceInMinibatch(const std::map*>& matrices, size_t uttIndex, size_t startFrame, size_t endFrame, size_t mbSize, size_t mbOffset = 0); - //-void GetCurrentUtteranceInfo(size_t uttIndex, size_t startFrame, size_t endFrame, wstring& uttID, size_t& startFrameInUtt, size_t& endFrameInUtt); + // If we have to read the current minibatch from buffer, return true, + // otherwise return false. + bool ShouldCopyMinibatchFromBuffer(); + + // Copys the current minibatch to buffer. + void CopyMinibatchToBuffer(); + + // Copys one minibatch from buffer to matrix. + void CopyMinibatchFromBufferToMatrix(size_t index, std::map*>& matrices); + + // Copys one minibatch from to matrix. + void CopyMinibatchToMatrix(size_t size, const std::vector& featureBuffer, const std::vector& labelBuffer, std::map*>& matrices) const; void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); @@ -157,9 +183,16 @@ public: virtual void SetLabelMapping(const std::wstring& sectionName, const std::map& labelMapping); virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0); - virtual bool GetForkedUtterance(std::wstring& uttID, std::map*>& matrices); - virtual bool ComputeDerivativeFeatures(const std::wstring& uttID, const Matrix& outputs); - + virtual bool GetMinibatchCopy( + std::vector>>& uttInfo, + std::map*>& matrices, + Matrix& sentenceBegin, + vector& sentenceExistsBeginOrNoLabels); + virtual bool SetNetOutput( + const std::vector>>& uttInfo, + const Matrix& outputs, + 
const Matrix& sentenceBegin, + const vector& sentenceExistsBeginOrNoLabels); virtual bool DataEnd(EndDataType endDataType); void SetSentenceEndInBatch(vector &/*sentenceEnd*/); diff --git a/DataReader/Kaldi2Reader/KaldiSequenceTrainingIO.cpp b/DataReader/Kaldi2Reader/KaldiSequenceTrainingIO.cpp index ea656ac30..9abf9aa5c 100644 --- a/DataReader/Kaldi2Reader/KaldiSequenceTrainingIO.cpp +++ b/DataReader/Kaldi2Reader/KaldiSequenceTrainingIO.cpp @@ -11,7 +11,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { const wstring& transModelFilename, const wstring& silencePhoneStr, const wstring& trainCriterion, ElemType oldAcousticScale, ElemType acousticScale, - ElemType lmScale, bool oneSilenceClass) + ElemType lmScale, bool oneSilenceClass, size_t numberOfuttsPerMinibatch) { using namespace msra::asr; assert(denlatRspecifier != L""); @@ -26,8 +26,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_lmScale = lmScale; m_trainCriterion = trainCriterion; m_oneSilenceClass = oneSilenceClass; - m_objective = 0; - m_posteriors.clear(); + m_numUttsPerMinibatch = numberOfuttsPerMinibatch; + m_needLikelihood = true; + m_currentObj = 0; + m_minibatchIndex = 1; + m_lastCompleteMinibatch.assign(m_numUttsPerMinibatch, 0); if (!kaldi::SplitStringToIntegers(toStr(silencePhoneStr), ":", false, &m_silencePhones)) { @@ -35,13 +38,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { } if (m_trainCriterion != L"mpfe" && m_trainCriterion != L"smbr") { - LogicError("Supported sequence training criterion are: mpfe, smbr.\n"); + LogicError("Supported sequence training criterion: mpfe, smbr.\n"); } - m_derivRead = false; - m_objRead = false; - m_currentUttHasDeriv = false; - m_currentUttID = L""; - m_currentUttLength = 0; } // Destructor. 
@@ -61,50 +59,43 @@ namespace Microsoft { namespace MSR { namespace CNTK { } template - bool KaldiSequenceTrainingIO::HasDerivatives(const wstring& uttID) + bool KaldiSequenceTrainingIO::ComputeDerivative( + const wstring& uttID) { - if (uttID == m_currentUttID && m_currentUttHasDeriv) - { - return true; - } - else - { - return false; - } - } - - template - bool KaldiSequenceTrainingIO::ComputeDerivatives( - const wstring& uttID, const Matrix& logLikelihoodIn) - { - // Checks if we need to move data to CPU. - Matrix logLikelihood(logLikelihoodIn); - if (logLikelihood.GetDeviceId() >= 0) - logLikelihood.TransferFromDeviceToDevice(logLikelihood.GetDeviceId(), CPUDEVICE, true, false, false); + assert(m_uttPool.find(uttID) != m_uttPool.end()); + assert(m_uttPool[uttID].hasDerivative == false); + Matrix& logLikelihood = m_uttPool[uttID].logLikelihood; std::string uttIDStr = msra::asr::toStr(uttID); // Sanity check. if (m_transModel.NumPdfs() != logLikelihood.GetNumRows()) { - RuntimeError("Number of labels in logLikelihood does not match that in the Kaldi model for utterance %S: %d v.s. %d\n", uttID.c_str(), logLikelihood.GetNumRows(), m_transModel.NumPdfs()); + RuntimeError("Number of labels in logLikelihood does not match that" + " in the Kaldi model for utterance %S: %d v.s. %d\n", + uttID.c_str(), logLikelihood.GetNumRows(), + m_transModel.NumPdfs()); } // Reads alignment. if (!m_aliReader->HasKey(uttIDStr)) { - RuntimeError("Alignment not found for utterance %s\n", uttIDStr.c_str()); + RuntimeError("Alignment not found for utterance %s\n", + uttIDStr.c_str()); } const std::vector ali = m_aliReader->Value(uttIDStr); if (ali.size() != logLikelihood.GetNumCols()) { - RuntimeError("Number of frames in logLikelihood does not match that in the alignment for utterance %S: %d v.s. %d\n", uttID.c_str(), logLikelihood.GetNumCols(), ali.size()); + RuntimeError("Number of frames in logLikelihood does not match that" + " in the alignment for utterance %S: %d v.s. 
%d\n", + uttID.c_str(), logLikelihood.GetNumCols(), ali.size()); } // Reads denominator lattice. if (!m_denlatReader->HasKey(uttIDStr)) { - RuntimeError("Denominator lattice not found for utterance %S\n", uttID.c_str()); + RuntimeError("Denominator lattice not found for utterance %S\n", + uttID.c_str()); } kaldi::CompactLattice clat = m_denlatReader->Value(uttIDStr); fst::CreateSuperFinal(&clat); /* One final state with weight One() */ @@ -115,7 +106,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // acoustic scale to 0. if (m_oldAcousticScale != 1.0) { - fst::ScaleLattice(fst::AcousticLatticeScale(m_oldAcousticScale), &lat); + fst::ScaleLattice(fst::AcousticLatticeScale(m_oldAcousticScale), + &lat); } // Topsort lattice. @@ -133,7 +125,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { kaldi::int32 maxTime = kaldi::LatticeStateTimes(lat, &stateTimes); if (maxTime != logLikelihood.GetNumCols()) { - RuntimeError("Number of frames in the logLikelihood does not match that in the denominator lattice for utterance %S\n", uttID.c_str(), logLikelihood.GetNumRows(), maxTime); + RuntimeError("Number of frames in the logLikelihood does not match" + " that in the denominator lattice for utterance %S\n", + uttID.c_str(), logLikelihood.GetNumRows(), maxTime); } // Does lattice acoustic rescoring with the new posteriors from the @@ -143,7 +137,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Second pass acoustic and language model scale. if (m_acousticScale != 1.0 || m_lmScale != 1.0) { - fst::ScaleLattice(fst::LatticeScale(m_lmScale, m_acousticScale), &lat); + fst::ScaleLattice(fst::LatticeScale(m_lmScale, m_acousticScale), + &lat); } // Forward-backward on the lattice. 
@@ -152,39 +147,39 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (m_trainCriterion == L"smbr") { thisObj = kaldi::LatticeForwardBackwardMpeVariants( - m_transModel, m_silencePhones, lat, ali, "smbr", m_oneSilenceClass, &post); + m_transModel, m_silencePhones, lat, + ali, "smbr", m_oneSilenceClass, &post); } else if (m_trainCriterion == L"mpfe") { thisObj = kaldi::LatticeForwardBackwardMpeVariants( - m_transModel, m_silencePhones, lat, ali, "mpfe", m_oneSilenceClass, &post); + m_transModel, m_silencePhones, lat, + ali, "mpfe", m_oneSilenceClass, &post); } - kaldi::ConvertPosteriorToPdfs(m_transModel, post, &m_posteriors); + kaldi::ConvertPosteriorToPdfs(m_transModel, + post, &(m_uttPool[uttID].posterior)); // Uses "expected error rate" instead of "expected accuracy". - m_objective = logLikelihood.GetNumCols() - thisObj; + m_uttPool[uttID].objective = logLikelihood.GetNumCols() - thisObj; - assert(m_posteriors.size() == logLikelihood.GetNumCols()); + assert(m_uttPool[uttID].posterior.size() == logLikelihood.GetNumCols()); - m_derivRead = false; - m_objRead = false; - m_currentUttHasDeriv = true; - m_currentUttID = uttID; - m_currentUttLength = logLikelihood.GetNumCols(); return true; } template void KaldiSequenceTrainingIO::LatticeAcousticRescore( const std::vector& stateTimes, - const Matrix& logLikelihood, kaldi::Lattice* lat) + const Matrix& logLikelihood, kaldi::Lattice* lat) const { - std::vector> timeStateMap(logLikelihood.GetNumCols()); + std::vector> timeStateMap( + logLikelihood.GetNumCols()); size_t num_states = lat->NumStates(); for (size_t s = 0; s < num_states; s++) { - assert(stateTimes[s] >= 0 && stateTimes[s] <= logLikelihood.GetNumCols()); + assert(stateTimes[s] >= 0 + && stateTimes[s] <= logLikelihood.GetNumCols()); if (stateTimes[s] < logLikelihood.GetNumCols()) { timeStateMap[stateTimes[s]].push_back(s); @@ -196,14 +191,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i = 0; i < timeStateMap[t].size(); ++i) { 
kaldi::int32 state = timeStateMap[t][i]; - for (fst::MutableArcIterator aiter(lat, state); !aiter.Done(); aiter.Next()) + for (fst::MutableArcIterator aiter(lat, state); + !aiter.Done(); aiter.Next()) { kaldi::LatticeArc arc = aiter.Value(); kaldi::int32 trans_id = arc.ilabel; if (trans_id != 0) { - kaldi::int32 pdf_id = m_transModel.TransitionIdToPdf(trans_id); - arc.weight.SetValue2(-logLikelihood(pdf_id, t) + arc.weight.Value2()); + kaldi::int32 pdf_id = + m_transModel.TransitionIdToPdf(trans_id); + arc.weight.SetValue2(-logLikelihood(pdf_id, t) + + arc.weight.Value2()); aiter.SetValue(arc); } } @@ -219,97 +217,285 @@ namespace Microsoft { namespace MSR { namespace CNTK { } template - void KaldiSequenceTrainingIO::GetDerivatives(size_t startFrame, - size_t endFrame, - size_t mbSize, - const std::wstring& uttID, - Matrix& derivativesIn) + void KaldiSequenceTrainingIO::ProcessUttInfo( + const std::vector>>& uttInfo, + const Matrix& sentenceBegin, + const std::vector& minibatchPackingFlag, + std::vector>>>* uttInfoInMinibatch) const { - Matrix derivatives(CPUDEVICE); - - // Does some sanity check first. - if (uttID != m_currentUttID) + assert(uttInfoInMinibatch != NULL); + assert(uttInfo.size() == m_numUttsPerMinibatch); + assert(sentenceBegin.GetNumRows() == m_numUttsPerMinibatch); + assert(minibatchPackingFlag.size() == sentenceBegin.GetNumCols()); + uttInfoInMinibatch->clear(); + uttInfoInMinibatch->resize(uttInfo.size()); + for (size_t i = 0; i < uttInfo.size(); ++i) { - RuntimeError("Requested utterance does not matched the utterance that we have computed derivatives for: %S v.s. 
%S\n", uttID.c_str(), m_currentUttID.c_str()); - } - if (!m_currentUttHasDeriv) - { - RuntimeError("Derivatives have not been computed, you have to call KaldiSequenceTrainingIO::ComputeDerivative() before using it.\n"); - } - assert(startFrame >= 0); - assert(endFrame <= m_currentUttLength); - - derivatives.Resize(m_transModel.NumPdfs(), mbSize); - derivatives.SetValue(0); - for (size_t t = startFrame; t < endFrame; ++t) - { - for (size_t i = 0; i < m_posteriors[t].size(); ++i) + size_t startFrameIndexInMinibatch = 0; + size_t numFrames = 0; + for (size_t j = 0; j < sentenceBegin.GetNumCols(); ++j) { - size_t pdf_id = m_posteriors[t][i].first; - assert(pdf_id < m_transModel.NumPdfs()); - derivatives(pdf_id, t - startFrame) -= m_posteriors[t][i].second; /* Flip the sign */ - } - } - - // Checks if we need to move data to GPU. - if (derivativesIn.GetDeviceId() >= 0) - derivatives.TransferFromDeviceToDevice(CPUDEVICE, derivativesIn.GetDeviceId(), true, false, false); - - derivativesIn.SetValue(derivatives); - - // We've used up all the derivatives, reset it. - if (endFrame >= m_currentUttLength) - { - m_derivRead = true; - if (m_objRead) - { - m_currentUttID = L""; - m_currentUttHasDeriv = false; - m_currentUttLength = 0; + if (((size_t)sentenceBegin(i, j) & NO_LABELS) == NO_LABELS) + { + continue; + } + numFrames += 1; + if ((((size_t)sentenceBegin(i, j) & SENTENCE_END) == SENTENCE_END) + || j == sentenceBegin.GetNumCols() - 1) + { + size_t uttIndex = (*uttInfoInMinibatch)[i].size(); + wstring uttID = uttInfo[i][uttIndex].first; + (*uttInfoInMinibatch)[i].push_back( + make_pair(uttID, make_pair(startFrameIndexInMinibatch, numFrames))); + startFrameIndexInMinibatch = j + 1; + numFrames = 0; + } } + assert(uttInfo[i].size() == (*uttInfoInMinibatch)[i].size()); } } + // Suppose we have a, b, c 3 streams, the is the in the + // following format: + // 1: a11 b11 c11 a12 b12 c12... + // 2: a21 b21 c21 a22 b22 c22... + // 3: a31 b31 c31 a32 b32 c32... 
template - void KaldiSequenceTrainingIO::GetObjectives(size_t startFrame, - size_t endFrame, - const std::wstring& uttID, - Matrix& objectivesIn) + bool KaldiSequenceTrainingIO::SetLikelihood( + const std::vector>>& uttInfo, + const Matrix& logLikelihoodIn, + const Matrix& sentenceBegin, + const std::vector& minibatchPackingFlag) { - Matrix objectives(CPUDEVICE); + assert(m_needLikelihood == true); + std::vector>>> uttInfoInMinibatch; + ProcessUttInfo(uttInfo, sentenceBegin, + minibatchPackingFlag, &uttInfoInMinibatch); - // Does some sanity check first. - if (uttID != m_currentUttID) + // Checks if we need to move data to CPU. + Matrix logLikelihood(logLikelihoodIn); + if (logLikelihood.GetDeviceId() >= 0) { - RuntimeError("Requested utterance does not matched the utterance that we have computed objectives for: %S v.s. %S\n", uttID.c_str(), m_currentUttID.c_str()); + logLikelihood.TransferFromDeviceToDevice( + logLikelihood.GetDeviceId(), CPUDEVICE, true, false, false); } - if (!m_currentUttHasDeriv) + + bool minibatchComplete = true; + size_t currentMBSize = minibatchPackingFlag.size(); + for (size_t i = 0; i < uttInfo.size(); ++i) { - RuntimeError("Objectives have not been computed, you have to call KaldiSequenceTrainingIO::ComputeDerivative() before using it.\n"); - } - assert(startFrame >= 0); - assert(endFrame <= m_currentUttLength); - - objectives.Resize(1, 1); - objectives.SetValue(m_objective * static_cast(endFrame - startFrame) / static_cast(m_currentUttLength)); - - // Checks if we need to move data to GPU. - if (objectivesIn.GetDeviceId() >= 0) - objectives.TransferFromDeviceToDevice(CPUDEVICE, objectivesIn.GetDeviceId(), true, false, false); - - objectivesIn.SetValue(objectives); - - // We've used up all the objectives, reset it. 
- if (endFrame >= m_currentUttLength) - { - m_objRead = true; - if (m_derivRead) + assert(uttInfo[i].size() == uttInfoInMinibatch[i].size()); + for (size_t j = 0; j < uttInfo[i].size(); ++j) { - m_currentUttID = L""; - m_currentUttHasDeriv = false; - m_currentUttLength = 0; + wstring uttID = uttInfo[i][j].first; + if (m_uttPool.find(uttID) == m_uttPool.end()) + { + UtteranceDerivativeUnit tmpUttUnit; + tmpUttUnit.hasDerivative = false; + tmpUttUnit.uttLength = uttInfo[i][j].second; + tmpUttUnit.progress = 0; + tmpUttUnit.streamID = i; + tmpUttUnit.logLikelihood.Resize(m_transModel.NumPdfs(), + tmpUttUnit.uttLength); + m_uttPool[uttID] = tmpUttUnit; + } + + // Sets the likelihood and computes derivatives. + assert(m_uttPool.find(uttID) != m_uttPool.end()); + if (m_uttPool[uttID].hasDerivative == false) + { + assert(uttID == uttInfoInMinibatch[i][j].first); + size_t startFrame = uttInfoInMinibatch[i][j].second.first; + size_t numFrames = uttInfoInMinibatch[i][j].second.second; + assert(m_uttPool[uttID].progress + numFrames + <= m_uttPool[uttID].uttLength); + + // Sets the likelihood. + for (size_t k = 0; k < numFrames; ++k) + { + m_uttPool[uttID].logLikelihood.SetColumn( + logLikelihood.ColumnSlice( + (startFrame + k) * m_numUttsPerMinibatch + i, 1), + m_uttPool[uttID].progress + k); + } + + m_uttPool[uttID].progress += numFrames; + if (m_uttPool[uttID].progress == m_uttPool[uttID].uttLength) + { + ComputeDerivative(uttID); + m_uttPool[uttID].hasDerivative = true; + m_uttPool[uttID].progress = 0; + if (startFrame + numFrames == currentMBSize) + { + m_lastCompleteMinibatch[m_uttPool[uttID].streamID] + = m_minibatchIndex; + } + else + { + m_lastCompleteMinibatch[m_uttPool[uttID].streamID] + = m_minibatchIndex - 1; + } + } + } } } + + // Checks if we are ready to provide derivatives. + m_minCompleteMinibatchIndex = *std::min_element( + m_lastCompleteMinibatch.begin(), m_lastCompleteMinibatch.end()); + m_needLikelihood = (m_minCompleteMinibatchIndex >= 1) ? 
false : true; + m_minibatchIndex += 1; + } + + // Suppose we have a, b, c 3 streams, the output derivatives should be in the + // following format: + // 1: a11 b11 c11 a12 b12 c12... + // 2: a21 b21 c21 a22 b22 c22... + // 3: a31 b31 c31 a32 b32 c32... + template + bool KaldiSequenceTrainingIO::GetDerivative( + const std::vector>>& uttInfo, + const Matrix& sentenceBegin, + const std::vector& minibatchPackingFlag, + Matrix* derivativesOut) + { + assert(derivativesOut != NULL); + std::vector>>> uttInfoInMinibatch; + ProcessUttInfo(uttInfo, sentenceBegin, + minibatchPackingFlag, &uttInfoInMinibatch); + + Matrix derivatives(CPUDEVICE); + derivatives.Resize(m_transModel.NumPdfs(), + sentenceBegin.GetNumCols() * sentenceBegin.GetNumRows()); + derivatives.SetValue(0); + + m_currentObj = 0; + for (size_t i = 0; i < uttInfo.size(); ++i) + { + assert(uttInfo[i].size() == uttInfoInMinibatch[i].size()); + for (size_t j = 0; j < uttInfo[i].size(); ++j) + { + wstring uttID = uttInfo[i][j].first; + + // Checks if we have derivatives. + if (m_uttPool.find(uttID) == m_uttPool.end() + || (m_uttPool.find(uttID) != m_uttPool.end() + && m_uttPool[uttID].hasDerivative == false)) + { + RuntimeError("Derivatives are not ready for utterance:" + " %S\n", uttID.c_str()); + } + + // Assign the derivatives. 
+ assert(uttID == uttInfoInMinibatch[i][j].first); + size_t startFrame = uttInfoInMinibatch[i][j].second.first; + size_t startFrameInUtt = m_uttPool[uttID].progress; + size_t numFrames = uttInfoInMinibatch[i][j].second.second; + for (size_t k = 0; k < numFrames; ++k) + { + size_t posStart = startFrameInUtt + k; + for (size_t l = 0; + l < m_uttPool[uttID].posterior[posStart].size(); ++l) + { + size_t pdf_id = + m_uttPool[uttID].posterior[posStart][l].first; + assert(pdf_id < m_transModel.NumPdfs()); + derivatives(pdf_id, + (startFrame + k) * m_numUttsPerMinibatch + i) -= + m_uttPool[uttID].posterior[posStart][l].second; + } + } + m_currentObj += m_uttPool[uttID].objective + * numFrames / m_uttPool[uttID].uttLength; + m_uttPool[uttID].progress += numFrames; + assert(m_uttPool[uttID].progress <= m_uttPool[uttID].uttLength); + if (m_uttPool[uttID].progress == m_uttPool[uttID].uttLength) + { + m_uttPool.erase(uttID); + } + } + } + + // Checks if we need to move data to GPU. + if (derivativesOut->GetDeviceId() >= 0) + { + derivatives.TransferFromDeviceToDevice( + CPUDEVICE, derivativesOut->GetDeviceId(), true, false, false); + } + derivativesOut->SetValue(derivatives); + + // Keeps the utterance information so we can check next time when we + // gives the objectives. + m_currentUttInfo = uttInfo; + + // Checks if we need to read more loglikelihoods. + m_needLikelihood = false; + m_minCompleteMinibatchIndex -= 1; + if (m_minCompleteMinibatchIndex <= 0) + { + m_needLikelihood = true; + m_minibatchIndex = 1; + m_lastCompleteMinibatch.assign(m_numUttsPerMinibatch, 0); + + // Un-do the logLikelihood for partial utterances. 
+ for (auto iter = m_uttPool.begin(); iter != m_uttPool.end(); ++iter) + { + if (iter->second.hasDerivative == false) + { + iter->second.progress = 0; + } + } + } + return true; + } + + template + bool KaldiSequenceTrainingIO::GetObjective( + const std::vector>>& uttInfo, + Matrix* objectivesIn) + { + assert(objectivesIn != NULL); + + // Checks utterance information. + bool match = true; + if (uttInfo.size() == m_currentUttInfo.size()) + { + for (size_t i = 0; i < uttInfo.size(); ++i) + { + if (uttInfo[i].size() != m_currentUttInfo[i].size()) + { + match = false; + break; + } + for (size_t j = 0; j < uttInfo[i].size(); ++j) + { + if (uttInfo[i][j].first != m_currentUttInfo[i][j].first || + uttInfo[i][j].second != m_currentUttInfo[i][j].second) + { + match = false; + break; + } + } + } + } + else + { + match = false; + } + if (!match) + { + RuntimeError("Current objective does not correspond to the" + " minibatch utterance information, perhaps you did not" + " run GetObjective() right after GetDerivatives()?"); + } + + // Sets the objectives... 
+ objectivesIn->Resize(1, 1); + objectivesIn->SetValue(m_currentObj); + + return true; } template class KaldiSequenceTrainingIO; diff --git a/DataReader/Kaldi2Reader/KaldiSequenceTrainingIO.h b/DataReader/Kaldi2Reader/KaldiSequenceTrainingIO.h index dc05e4dba..8be57f2b8 100644 --- a/DataReader/Kaldi2Reader/KaldiSequenceTrainingIO.h +++ b/DataReader/Kaldi2Reader/KaldiSequenceTrainingIO.h @@ -2,6 +2,7 @@ #include "kaldi.h" #include "Matrix.h" +#include "basetypes.h" namespace Microsoft { namespace MSR { namespace CNTK { @@ -12,50 +13,93 @@ class KaldiSequenceTrainingIO { private: bool m_oneSilenceClass; - bool m_currentUttHasDeriv; - bool m_derivRead; - bool m_objRead; + bool m_needLikelihood; + size_t m_numUttsPerMinibatch; wstring m_trainCriterion; - wstring m_currentUttID; ElemType m_oldAcousticScale; ElemType m_acousticScale; ElemType m_lmScale; - ElemType m_objective; std::vector m_silencePhones; - size_t m_currentUttLength; kaldi::TransitionModel m_transModel; - kaldi::Posterior m_posteriors; - kaldi::RandomAccessCompactLatticeReader* m_denlatReader; /*denominator lattices*/ - kaldi::RandomAccessInt32VectorReader* m_aliReader; /*alignment*/ + kaldi::RandomAccessCompactLatticeReader* m_denlatReader; + kaldi::RandomAccessInt32VectorReader* m_aliReader; + + struct UtteranceDerivativeUnit + { + bool hasDerivative; + size_t uttLength; + size_t progress; + size_t streamID; + Matrix logLikelihood; + kaldi::Posterior posterior; + ElemType objective; + + UtteranceDerivativeUnit() : logLikelihood(CPUDEVICE) + { + hasDerivative = false; + uttLength = 0; + progress = 0; + streamID = 0; + } + }; + ElemType m_currentObj; + int m_minCompleteMinibatchIndex; + size_t m_minibatchIndex; + std::vector m_lastCompleteMinibatch; + std::vector>> m_currentUttInfo; + unordered_map m_uttPool; // Rescores the lattice with the lastest posteriors from the neural network. 
- void LatticeAcousticRescore(const std::vector& stateTimes, - const Matrix& outputs, kaldi::Lattice* lat); + void LatticeAcousticRescore( + const std::vector& stateTimes, + const Matrix& outputs, kaldi::Lattice* lat) const; + + // is a vector of vector of the following: + // uttID startFrameIndexInMinibatch numFrames + void ProcessUttInfo( + const std::vector>>& uttInfo, + const Matrix& sentenceBegin, + const std::vector& minibatchPackingFlag, + std::vector>>>* uttInfoInMinibatch) const; + + bool ComputeDerivative(const wstring& uttID); public: // Constructor. - KaldiSequenceTrainingIO(const wstring& denlatRspecifier, const wstring& aliRspecifier, - const wstring& transModelFilename, const wstring& silencePhoneStr, + KaldiSequenceTrainingIO(const wstring& denlatRspecifier, + const wstring& aliRspecifier, + const wstring& transModelFilename, + const wstring& silencePhoneStr, const wstring& trainCriterion, ElemType oldAcousticScale, ElemType acousticScale, ElemType lmScale, - bool oneSilenceClass); + bool oneSilenceClass, + size_t numberOfuttsPerMinibatch); // Destructor. ~KaldiSequenceTrainingIO(); - bool HasDerivatives(const wstring& uttID); + bool NeedLikelihoodToComputeDerivative() const { return m_needLikelihood; } - bool ComputeDerivatives(const wstring& uttID, const Matrix& outputs); + bool SetLikelihood( + const std::vector>>& uttInfo, + const Matrix& outputs, + const Matrix& sentenceBegin, + const std::vector& minibatchPackingFlag); // Gets the computed derivatives for given utterance. - void GetDerivatives(size_t startFrame, size_t endFrame, size_t mbSize, - const std::wstring& uttID, Matrix& derivatives); + bool GetDerivative( + const std::vector>>& uttInfo, + const Matrix& sentenceBegin, + const std::vector& minibatchPackingFlag, + Matrix* derivativesOut); // Gets the computed objectives for given utterance. 
- void GetObjectives(size_t startFrame, size_t endFrame, - const std::wstring& uttID, Matrix& derivatives); + bool GetObjective( + const std::vector>>& uttInfo, + Matrix* objectivesIn); }; }}} diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h index 63ac925aa..c51daa26a 100644 --- a/MachineLearning/CNTK/SGD.h +++ b/MachineLearning/CNTK/SGD.h @@ -1631,13 +1631,17 @@ protected: // Tries to read an utterance and run forward computation on the // whole utterance. assert(trainSetDataReader != NULL); - std::wstring uttID; - if (trainSetDataReader->GetForkedUtterance(uttID, *inputMatrices)) + std::vector>> uttInfo; + Matrix sentenceBoundary; + std::vector minibatchPackingFlag; + while (trainSetDataReader->GetMinibatchCopy(uttInfo, *inputMatrices, + sentenceBoundary, + minibatchPackingFlag)) { UpdateEvalTimeStamps(FeatureNodes); - std::vector* outputNodes = net.OutputNodes(); - if (outputNodes->size() < 1) + std::vector* outputNodes = net.OutputNodes(); + if (outputNodes->size() < 1) { throw std::logic_error("no output node was found."); } @@ -1645,8 +1649,11 @@ protected: net.SetActualMiniBatchSize(actualMBSize); net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter()); trainSetDataReader->SetSentenceSegBatch(net.SentenceBoundary(), net.MinibatchPackingFlags()); - net.Evaluate((*outputNodes)[0]); // Only evaluate the first output - trainSetDataReader->ComputeDerivativeFeatures(uttID, (*outputNodes)[0]->FunctionValues()); + net.Evaluate((*outputNodes)[0]); // Only evaluate the first output + trainSetDataReader->SetNetOutput(uttInfo, + (*outputNodes)[0]->FunctionValues(), + sentenceBoundary, + minibatchPackingFlag); } }