Adding support for parallelized sequence training in Kaldi2Reader

chenguoguo 2015-07-24 16:55:21 +00:00
Parent fc676c579a
Commit 73c6db513d
7 changed files with 783 additions and 332 deletions

View file

@@ -225,20 +225,28 @@ void DataReader<ElemType>::SetRandomSeed(int seed)
}
template<class ElemType>
bool DataReader<ElemType>::GetForkedUtterance(std::wstring& uttID, std::map<std::wstring, Matrix<ElemType>*>& matrices)
bool DataReader<ElemType>::GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
std::map<std::wstring, Matrix<ElemType>*>& matrices,
Matrix<ElemType>& sentenceBegin,
std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
bool ans = false;
for (size_t i = 0; i < m_ioNames.size(); i++)
ans = (m_dataReader[m_ioNames[i]]->GetForkedUtterance(uttID, matrices) || ans);
ans = (m_dataReader[m_ioNames[i]]->GetMinibatchCopy(uttInfo, matrices, sentenceBegin, minibatchPackingFlag) || ans);
return ans;
}
template<class ElemType>
bool DataReader<ElemType>::ComputeDerivativeFeatures(const std::wstring& uttID, const Matrix<ElemType>& outputs)
bool DataReader<ElemType>::SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
bool ans = false;
for (size_t i = 0; i < m_ioNames.size(); i++)
ans = (m_dataReader[m_ioNames[i]]->ComputeDerivativeFeatures(uttID, outputs) || ans);
ans = (m_dataReader[m_ioNames[i]]->SetNetOutput(uttInfo, outputs, sentenceBegin, minibatchPackingFlag) || ans);
return ans;
}

View file

@@ -85,14 +85,27 @@ public:
void SetDoRandomize(bool b){ mDoRandomize = b; }
// Gets utterance before getting the actual minibatch, which will not affect
// getting the minibatches. This can be useful in sequence training.
virtual bool GetForkedUtterance(std::wstring& , std::map<std::wstring, Matrix<ElemType>*>& ) { return false; }
// Gets a copy of the minibatch for the forward computation. This can be
// useful if some of the computation has to happen in the reader.
virtual bool GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& /*uttInfo*/,
std::map<std::wstring, Matrix<ElemType>*>& /*matrices*/,
Matrix<ElemType>& /*sentenceBegin*/,
std::vector<MinibatchPackingFlag>& /*minibatchPackingFlag*/)
{
return false;
}
// Computes certain derivatives given outputs from neural networks, which
// will later be fed to the neural network as features. This can be useful
// in sequence training.
virtual bool ComputeDerivativeFeatures(const std::wstring& , const Matrix<ElemType>& ) { return false; }
// Sets the neural network output to the reader. This can be useful if some
// of the computation has to happen in the reader.
virtual bool SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& /*uttInfo*/,
const Matrix<ElemType>& /*outputs*/,
const Matrix<ElemType>& /*sentenceBegin*/,
const std::vector<MinibatchPackingFlag>& /*minibatchPackingFlag*/)
{
return false;
}
};
// GetReader - get a reader type from the DLL
@@ -193,14 +206,21 @@ public:
virtual bool DataEnd(EndDataType endDataType);
// Gets utterance before getting the actual minibatch, which will not affect
// getting the minibatches. This can be useful in sequence training.
virtual bool GetForkedUtterance(std::wstring& uttID, std::map<std::wstring, Matrix<ElemType>*>& matrices);
// Gets a copy of the minibatch for the forward computation. This can be
// useful if some of the computation has to happen in the reader.
virtual bool GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
std::map<std::wstring, Matrix<ElemType>*>& matrices,
Matrix<ElemType>& sentenceBegin,
std::vector<MinibatchPackingFlag>& minibatchPackingFlag);
// Computes certain derivatives given outputs from neural networks, which
// will later be fed to the neural network as features. This can be useful
// in sequence training.
virtual bool ComputeDerivativeFeatures(const std::wstring& uttID, const Matrix<ElemType>& outputs);
// Sets the neural network output to the reader. This can be useful if some
// of the computation has to happen in the reader.
virtual bool SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag);
void SetSentenceSegBatch(Matrix<ElemType> & sentenceBegin, vector<MinibatchPackingFlag>& minibatchPackingFlag);
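A hedged sketch of the calling protocol behind these two methods, as a trainer would drive them (mirroring the SGD hunk at the end of this commit); reader, net, inputMatrices and outputNode are placeholder names, not part of the diff:

std::vector<std::vector<std::pair<std::wstring, size_t>>> uttInfo;
Matrix<float> sentenceBegin;
std::vector<MinibatchPackingFlag> packingFlag;
// Forward-only passes: the reader hands out copies of upcoming minibatches
// until it has collected enough likelihoods to compute the derivatives.
while (reader.GetMinibatchCopy(uttInfo, inputMatrices, sentenceBegin, packingFlag))
{
net.Evaluate(outputNode); // forward computation only
reader.SetNetOutput(uttInfo, outputNode->FunctionValues(),
sentenceBegin, packingFlag);
}
// Subsequent regular GetMinibatch() calls replay the buffered minibatches,
// now with the seqTrainDeriv/seqTrainObj inputs filled from the derivatives
// and objectives computed inside the reader.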

View file

@@ -49,9 +49,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_frameSource = NULL;
m_lattices = NULL;
m_sequenceTrainingIO = NULL;
m_minibatchBuffer.resize(0);
m_minibatchBufferIndex = 0;
m_minibatchBufferLeftovers = 0;
m_noData = false;
m_convertLabelsToTargets = false;
m_doSeqTrain = false;
m_getMinibatchCopy = false;
if (readerConfig.Exists("legacyMode"))
{
@@ -60,7 +64,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// If <m_framemode> is false, throw away any utterance that is longer
// than the specified <m_maxUtteranceLength>.
m_maxUtteranceLength = readerConfig("maxUtteranceLength", "1500");
m_maxUtteranceLength = readerConfig("maxUtteranceLength", "10000");
// m_truncated:
// If true, truncate utterances to fit the minibatch size. Otherwise
@@ -172,7 +176,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_sequenceTrainingIO = new KaldiSequenceTrainingIO<ElemType>(
denlatRspecifier, aliRspecifier, transModelFilename,
silencePhoneStr, m_seqTrainCriterion, oldAcousticScale,
acousticScale, lmScale, oneSilenceClass);
acousticScale, lmScale,
oneSilenceClass, m_numberOfuttsPerMinibatch);
// Scans the configurations to get "seqTrainDeriv" type input and
// "seqTrainObj" type input. Both are feature nodes, we feed derivatives
@@ -293,6 +298,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
m_featureNameToIdMap[featureNames[i]] = iFeat;
assert(iFeat == m_featureIdToNameMap.size());
m_featureIdToNameMap.push_back(featureNames[i]);
scriptpaths.push_back(new msra::asr::FeatureSection(thisFeature("scpFile"), thisFeature("rx"), thisFeature("featureTransform", "")));
m_featureNameToDimMap[featureNames[i]] = m_featDims[i];
@@ -334,6 +341,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
statelistpaths.push_back(thisLabel("labelMappingFile",L""));
m_labelNameToIdMap[labelNames[i]] = iLabel;
assert(iLabel == m_labelIdToNameMap.size());
m_labelIdToNameMap.push_back(labelNames[i]);
m_labelNameToDimMap[labelNames[i]] = m_labelDims[i];
mlfpaths.clear();
mlfpaths.push_back(thisLabel("mlfFile"));
@@ -599,6 +608,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
m_featureNameToIdMap[featureNames[i]]= iFeat;
assert(iFeat == m_featureIdToNameMap.size());
m_featureIdToNameMap.push_back(featureNames[i]);
scriptpaths.push_back(new msra::asr::FeatureSection(thisFeature("scpFile"), thisFeature("rx"), thisFeature("featureTransform", "")));
m_featureNameToDimMap[featureNames[i]] = realDims[i];
@@ -736,6 +747,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void HTKMLFReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
{
m_mbSize = mbSize;
m_currentMBSize = mbSize;
if (m_trainOrTest)
{
@@ -788,7 +800,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_mbiter = NULL;
}
msra::dbn::minibatchsource* source = m_frameSource;
m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, mbSize, datapasses);
size_t currentMBSize = (m_framemode == true) ? mbSize : 1;
m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, currentMBSize, datapasses);
// Clears feature and label buffer.
if (!m_featuresBufferMultiIO.empty())
@@ -882,7 +895,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// if startFrame = 5, endFrame = 10, then we copy frames 5, 6, 7, 8, 9.
template<class ElemType>
bool HTKMLFReader<ElemType>::PopulateUtteranceInMinibatch(
std::map<std::wstring, Matrix<ElemType>*>& matrices,
const std::map<std::wstring, Matrix<ElemType>*>& matrices,
size_t uttIndex, size_t startFrame,
size_t endFrame, size_t mbSize, size_t mbOffset)
{
@@ -897,15 +910,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
return false;
}
if (m_doSeqTrain && m_numberOfuttsPerMinibatch > 1)
{
LogicError("nbrUttsInEachRecurrentIter has to be 1 in sequence training.\n");
}
size_t numOfFea = m_featuresBufferMultiIO.size();
size_t numOfLabel = m_labelsBufferMultiIO.size();
typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
for (iter = matrices.begin(); iter != matrices.end(); iter++)
for (auto iter = matrices.begin(); iter != matrices.end(); iter++)
{
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{ // Features.
@@ -972,65 +980,41 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
}
else if (m_doSeqTrain)
{
// TODO(GUOGUO): if we are going to allow "m_truncate" for
// sequence training, we will have to modify the
// following -- the following always assume we
// start filling the minibatch from index 0.
// If we do sequence training we have to populate the derivative
// features as well as the objective features. But unlike the
// features and labels, we put them in to <matrices> directly.
// We assume we only process one utterance at a time in the
// current implementation.
assert(uttIndex == 0);
if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainDeriv)
{
wstring uttID = m_uttInfo[uttIndex][0].first;
Matrix<ElemType>& data = *matrices[iter->first];
if (m_sequenceTrainingIO->HasDerivatives(uttID))
m_sequenceTrainingIO->GetDerivatives(startFrame, endFrame, mbSize, uttID, data);
else
{
data.Resize(data.GetNumRows(), mbSize);
data.SetValue(0);
}
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainObj)
{
wstring uttID = m_uttInfo[uttIndex][0].first;
Matrix<ElemType>& data = *matrices[iter->first];
if (m_sequenceTrainingIO->HasDerivatives(uttID))
m_sequenceTrainingIO->GetObjectives(startFrame, endFrame, uttID, data);
else
data.SetValue(0);
}
}
}
return success;
}
template<class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>& matrices)
bool HTKMLFReader<ElemType>::GetOneMinibatchToTrainOrTestDataBuffer(
const std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
bool skip = false;
// On first minibatch, check if we have input for given names.
if (m_checkDictionaryKeys)
{
std::map<std::wstring,size_t>::iterator iter;
for (auto iter = matrices.begin(); iter != matrices.end(); iter++)
{
if (m_nameToTypeMap.find(iter->first) == m_nameToTypeMap.end())
{
throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %S not found in reader - cannot generate input\n", iter->first.c_str()));
throw std::runtime_error(msra::strfun::strprintf(
"minibatch requested for input node %S not found in"
"reader - cannot generate input\n", iter->first.c_str()));
}
}
m_checkDictionaryKeys=false;
}
size_t currentMBSize = m_mbSize;
// If we are doing sequence training, we need to keep the utterance
// information.
if (m_doSeqTrain)
{
m_minibatchUttInfo.assign(m_numberOfuttsPerMinibatch,
std::vector<std::pair<wstring, size_t>>(0));
}
m_currentMBSize = m_mbSize;
do
{
// Checks if we have finished all the utterances.
@@ -1050,28 +1034,28 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// If <m_truncated> is true, <currentMBSize> is <m_mbSize>
// If <m_truncated> is false, <currentMBSize> equals to the longest
// If <m_truncated> is true, <m_currentMBSize> is <m_mbSize>
// If <m_truncated> is false, <m_currentMBSize> equals to the longest
// utterance in the minibatch.
if (!m_truncated)
{
currentMBSize = 0;
m_currentMBSize = 0;
for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++)
{
if (m_currentBufferFrames[i] > currentMBSize)
if (m_currentBufferFrames[i] > m_currentMBSize)
{
currentMBSize = m_currentBufferFrames[i];
m_currentMBSize = m_currentBufferFrames[i];
}
}
}
// We initialize the sentence boundary information before we process
// the utterances.
m_sentenceBegin.Resize(m_numberOfuttsPerMinibatch, currentMBSize);
m_minibatchPackingFlag.resize(currentMBSize);
m_sentenceBegin.Resize(m_numberOfuttsPerMinibatch, m_currentMBSize);
m_minibatchPackingFlag.resize(m_currentMBSize);
for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++)
{
for (size_t j = 0; j < currentMBSize; j++)
for (size_t j = 0; j < m_currentMBSize; j++)
{
m_sentenceBegin.SetValue(i, j, (ElemType) SENTENCE_MIDDLE);
}
@@ -1085,7 +1069,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t startFrame = m_processedFrame[i];
size_t endFrame = 0;
if ((startFrame + currentMBSize) < m_toProcess[i])
if ((startFrame + m_currentMBSize) < m_toProcess[i])
{
// There is only 1 case:
// 1. <m_framemode> is false, and <m_truncated> is true.
@@ -1099,11 +1083,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_minibatchPackingFlag[0] |= MinibatchPackingFlag::UtteranceStart;
}
endFrame = startFrame + currentMBSize;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, currentMBSize);
m_processedFrame[i] += currentMBSize;
endFrame = startFrame + m_currentMBSize;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
m_processedFrame[i] += m_currentMBSize;
}
else if ((startFrame + currentMBSize) == m_toProcess[i])
else if ((startFrame + m_currentMBSize) == m_toProcess[i])
{
// There are 3 cases:
// 1. <m_framemode> is false, and <m_truncated> is true,
@@ -1132,9 +1117,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Now puts the utterance into the minibatch, and loads the
// next one.
endFrame = startFrame + currentMBSize;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, currentMBSize);
m_processedFrame[i] += currentMBSize;
endFrame = startFrame + m_currentMBSize;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
m_processedFrame[i] += m_currentMBSize;
bool reNewSucc = ReNewBufferForMultiIO(i);
}
else
@@ -1151,7 +1137,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Checks if we have reached the end of the minibatch.
if (startFrame == m_toProcess[i])
{
for (size_t k = 0; k < currentMBSize; k++)
for (size_t k = 0; k < m_currentMBSize; k++)
{
m_sentenceBegin.SetValue(i, k, (ElemType) NO_LABELS);
m_minibatchPackingFlag[k] |= MinibatchPackingFlag::NoLabel;
@ -1159,7 +1145,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Populates <NO_LABELS> with real features, the
// following implementation is not efficient...
assert(m_toProcess[i] > 0);
PopulateUtteranceInMinibatch(matrices, i, 0, 1, currentMBSize, k);
PopulateUtteranceInMinibatch(matrices, i, 0, 1, m_currentMBSize, k);
}
continue;
}
@@ -1194,13 +1180,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
endFrame = m_toProcess[i];
size_t currentMBFilled = endFrame - startFrame;
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, currentMBSize);
bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
m_processedFrame[i] += currentMBFilled;
bool reNewSucc = ReNewBufferForMultiIO(i);
// Third, if the next utterance can fit into the current
// minibatch, we also pack the next utterance.
while (reNewSucc && (currentMBFilled + m_toProcess[i] <= currentMBSize))
while (reNewSucc && (currentMBFilled + m_toProcess[i] <= m_currentMBSize))
{
// Sets the utterance boundary.
assert(currentMBFilled + m_toProcess[i] <= m_sentenceBegin.GetNumCols());
@@ -1208,7 +1195,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_minibatchPackingFlag[currentMBFilled] |= MinibatchPackingFlag::UtteranceStart;
m_sentenceBegin.SetValue(i, currentMBFilled + m_toProcess[i] - 1, (ElemType)SENTENCE_END);
m_minibatchPackingFlag[currentMBFilled + m_toProcess[i] - 1] |= MinibatchPackingFlag::UtteranceEnd;
populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_toProcess[i], currentMBSize, currentMBFilled);
populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_toProcess[i], m_currentMBSize, currentMBFilled);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
assert(m_processedFrame[i] == 0);
m_processedFrame[i] = m_toProcess[i];
currentMBFilled += m_toProcess[i];
@@ -1219,9 +1207,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// minibatch is not full.
if (reNewSucc && !m_framemode && m_truncated)
{
populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, currentMBSize - currentMBFilled, currentMBSize, currentMBFilled);
m_processedFrame[i] += currentMBSize - currentMBFilled;
if (currentMBFilled < currentMBSize)
populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_currentMBSize - currentMBFilled, m_currentMBSize, currentMBFilled);
if (m_doSeqTrain && populateSucc) { m_minibatchUttInfo[i].push_back(m_uttInfo[i][0]); }
m_processedFrame[i] += m_currentMBSize - currentMBFilled;
if (currentMBFilled < m_currentMBSize)
{
m_sentenceBegin.SetValue(i, currentMBFilled, (ElemType)SENTENCE_BEGIN);
m_minibatchPackingFlag[currentMBFilled] |= MinibatchPackingFlag::UtteranceStart;
@@ -1229,7 +1218,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
for (size_t k = currentMBFilled; k < currentMBSize; k++)
for (size_t k = currentMBFilled; k < m_currentMBSize; k++)
{
m_sentenceBegin.SetValue(i, k, (ElemType) NO_LABELS);
m_minibatchPackingFlag[k] |= MinibatchPackingFlag::NoLabel;
@@ -1237,29 +1226,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Populates <NO_LABELS> with real features, the
// following implementation is not efficient...
assert(m_toProcess[i] > 0);
PopulateUtteranceInMinibatch(matrices, i, 0, 1, currentMBSize, k);
PopulateUtteranceInMinibatch(matrices, i, 0, 1, m_currentMBSize, k);
}
}
}
}
typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
for (iter = matrices.begin(); iter != matrices.end(); iter++)
{
Matrix<ElemType>& data = *matrices[iter->first];
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
size_t id = m_featureNameToIdMap[iter->first];
size_t dim = m_featureNameToDimMap[iter->first];
data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_featuresBufferMultiIO[id] , matrixFlagNormal);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
size_t id = m_labelNameToIdMap[iter->first];
size_t dim = m_labelNameToDimMap[iter->first];
data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_labelsBufferMultiIO[id], matrixFlagNormal);
}
}
skip=false;
}
while(skip);
@@ -1267,6 +1239,209 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return true;
}
template<class ElemType>
bool HTKMLFReader<ElemType>::ShouldCopyMinibatchFromBuffer()
{
if (m_doSeqTrain)
{
// If <m_getMinibatchCopy> is false, then we should copy data from
// buffer for back-propagation.
if (m_getMinibatchCopy == false && m_minibatchBuffer.size() > 0)
{
m_minibatchBufferIndex = 0;
m_minibatchBufferLeftovers = m_minibatchBuffer.size() - 1; // Will pop one more.
return true;
}
// If <m_getMinibatchCopy> is true, we first have to re-compute
// the likelihood for the frames that are already in the buffer.
if (m_getMinibatchCopy == true && m_minibatchBufferLeftovers > 0)
{
if (m_minibatchBufferLeftovers == m_minibatchBuffer.size())
{
m_minibatchBufferIndex = 0;
}
else
{
m_minibatchBufferIndex += 1;
}
m_minibatchBufferLeftovers -= 1;
return true;
}
}
return false;
}
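The bookkeeping above can be modelled in isolation; a minimal standalone sketch (assumption: the members map one-to-one onto the reader's m_minibatchBuffer, m_minibatchBufferIndex and m_minibatchBufferLeftovers, with int payloads standing in for MinibatchBufferUnit):

#include <cassert>
#include <cstddef>
#include <deque>
// Standalone model of ShouldCopyMinibatchFromBuffer()'s state machine.
struct BufferModel
{
std::deque<int> buffer; // m_minibatchBuffer
std::size_t index = 0; // m_minibatchBufferIndex
std::size_t leftovers = 0; // m_minibatchBufferLeftovers
bool ShouldCopy(bool getMinibatchCopy)
{
if (!getMinibatchCopy && !buffer.empty())
{
index = 0;
leftovers = buffer.size() - 1; // the front entry will be popped
return true;
}
if (getMinibatchCopy && leftovers > 0)
{
index = (leftovers == buffer.size()) ? 0 : index + 1;
leftovers -= 1;
return true;
}
return false;
}
};
int main()
{
BufferModel m;
assert(!m.ShouldCopy(true)); // nothing buffered yet: read fresh minibatches
m.buffer = {10, 11, 12}; // three minibatches buffered by the copy pass
assert(m.ShouldCopy(false) && m.index == 0); // back-propagation: replay front
m.buffer.pop_front(); // done by CopyMinibatchFromBufferToMatrix()
assert(m.ShouldCopy(true) && m.index == 0); // re-compute likelihoods for the
assert(m.ShouldCopy(true) && m.index == 1); // two leftover minibatches
assert(!m.ShouldCopy(true)); // leftovers exhausted: read fresh again
return 0;
}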
template<class ElemType>
void HTKMLFReader<ElemType>::CopyMinibatchToBuffer()
{
MinibatchBufferUnit currentMinibatch;
// Stores variables related to the current minibatch.
currentMinibatch.sentenceBegin.SetValue(m_sentenceBegin);
currentMinibatch.minibatchPackingFlag = m_minibatchPackingFlag;
currentMinibatch.currentMBSize = m_currentMBSize;
currentMinibatch.minibatchUttInfo = m_minibatchUttInfo;
size_t size = m_currentMBSize * m_numberOfuttsPerMinibatch;
// Copies features.
currentMinibatch.features.resize(0);
for (size_t i = 0; i < m_featuresBufferMultiIO.size(); ++i)
{
std::vector<ElemType> tmpFeatures(m_featuresBufferMultiIO[i],
m_featuresBufferMultiIO[i] + size * m_featureNameToDimMap[m_featureIdToNameMap[i]]);
currentMinibatch.features.push_back(tmpFeatures);
}
// Copies labels.
currentMinibatch.labels.resize(0);
for (size_t i = 0; i < m_labelsBufferMultiIO.size(); ++i)
{
std::vector<ElemType> tmpLabels(m_labelsBufferMultiIO[i],
m_labelsBufferMultiIO[i] + size * m_labelNameToDimMap[m_labelIdToNameMap[i]]);
currentMinibatch.labels.push_back(tmpLabels);
}
m_minibatchBuffer.push_back(currentMinibatch);
}
template<class ElemType>
void HTKMLFReader<ElemType>::CopyMinibatchFromBufferToMatrix(
size_t index,
std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
assert(m_minibatchBuffer.size() > index);
// Restores the variables related to the minibatch.
m_sentenceBegin.SetValue(m_minibatchBuffer[index].sentenceBegin);
m_minibatchPackingFlag = m_minibatchBuffer[index].minibatchPackingFlag;
m_currentMBSize = m_minibatchBuffer[index].currentMBSize;
m_minibatchUttInfo = m_minibatchBuffer[index].minibatchUttInfo;
// Copies data to the matrix.
for (auto iter = matrices.begin(); iter != matrices.end(); iter++)
{
Matrix<ElemType>& data = *matrices[iter->first];
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
size_t id = m_featureNameToIdMap[iter->first];
size_t dim = m_featureNameToDimMap[iter->first];
assert(id < m_minibatchBuffer[index].features.size());
data.SetValue(dim,
m_minibatchBuffer[index].features[id].size() / dim,
m_minibatchBuffer[index].features[id].data(),
matrixFlagNormal);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
size_t id = m_labelNameToIdMap[iter->first];
size_t dim = m_labelNameToDimMap[iter->first];
assert(id < m_minibatchBuffer[index].labels.size());
data.SetValue(dim,
m_minibatchBuffer[index].labels[id].size() / dim,
m_minibatchBuffer[index].labels[id].data(),
matrixFlagNormal);
}
else if (m_doSeqTrain && !m_getMinibatchCopy)
{
if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainDeriv)
{
m_sequenceTrainingIO->GetDerivative(
m_minibatchUttInfo, m_sentenceBegin,
m_minibatchPackingFlag, matrices[iter->first]);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainObj)
{
m_sequenceTrainingIO->GetObjective(m_minibatchUttInfo,
matrices[iter->first]);
}
}
}
// If we are not in the minibatch copy mode, then we can remove the
// minibatch from buffer.
if (m_getMinibatchCopy == false)
{
assert(index == 0);
m_minibatchBuffer.pop_front();
}
}
template<class ElemType>
void HTKMLFReader<ElemType>::CopyMinibatchToMatrix(
size_t size,
const vector<ElemType*>& featureBuffer,
const vector<ElemType*>& labelBuffer,
std::map<std::wstring, Matrix<ElemType>*>& matrices) const
{
for (auto iter = matrices.begin(); iter != matrices.end(); iter++)
{
Matrix<ElemType>& data = *matrices[iter->first];
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
size_t id = m_featureNameToIdMap[iter->first];
size_t dim = m_featureNameToDimMap[iter->first];
assert(id < featureBuffer.size());
data.SetValue(dim, size, featureBuffer[id] , matrixFlagNormal);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
size_t id = m_labelNameToIdMap[iter->first];
size_t dim = m_labelNameToDimMap[iter->first];
assert(id < labelBuffer.size());
data.SetValue(dim, size, labelBuffer[id], matrixFlagNormal);
}
else if (m_doSeqTrain)
{
if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainDeriv)
{
data.Resize(data.GetNumRows(), m_currentMBSize);
data.SetValue(0);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::seqTrainObj)
{
data.SetValue(0);
}
}
}
}
template<class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(
std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
// We either copy a new minibatch from buffer or read one from minibatch
// iterator.
bool success = false;
if (ShouldCopyMinibatchFromBuffer())
{
CopyMinibatchFromBufferToMatrix(m_minibatchBufferIndex, matrices);
return true;
}
else
{
success = GetOneMinibatchToTrainOrTestDataBuffer(matrices);
if (success)
{
CopyMinibatchToMatrix(
m_currentMBSize * m_numberOfuttsPerMinibatch,
m_featuresBufferMultiIO, m_labelsBufferMultiIO, matrices);
}
// Checks if we need to move the current minibatch to buffer.
if (success && m_getMinibatchCopy)
{
CopyMinibatchToBuffer();
}
return success;
}
return false;
}
template<class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>& matrices)
{
@@ -1567,82 +1742,60 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (!(*m_mbiter))
m_noData = true;
return true;
}
// Gets a copy of the utterances that correspond to the current minibatch,
// which will be used to do a neural network forward computation.
template<class ElemType>
bool HTKMLFReader<ElemType>::GetForkedUtterance(std::wstring& uttID,
std::map<std::wstring, Matrix<ElemType>*>& matrices)
bool HTKMLFReader<ElemType>::GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
std::map<std::wstring, Matrix<ElemType>*>& matrices,
Matrix<ElemType>& sentenceBegin,
std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
if (!m_doSeqTrain)
// We need to get a "copy" of the minibatch to do the forward
// computation for sequence training.
if (m_doSeqTrain)
{
assert(m_framemode == false);
if (m_sequenceTrainingIO->NeedLikelihoodToComputeDerivative())
{
m_getMinibatchCopy = true;
if (GetMinibatchToTrainOrTest(matrices))
{
sentenceBegin.SetValue(m_sentenceBegin);
minibatchPackingFlag = m_minibatchPackingFlag;
uttInfo = m_minibatchUttInfo;
m_getMinibatchCopy = false;
return true;
}
m_getMinibatchCopy = false;
}
return false;
}
assert(m_framemode == false);
// For the moment we only support single utterance.
if (m_numberOfuttsPerMinibatch != 1)
{
RuntimeError("The current sequence training implementation does not support multiple utterances.\n");
}
// Under our current assumption, we only have one utterance at a time.
uttID = m_uttInfo[0][0].first;
if (!m_sequenceTrainingIO->HasDerivatives(uttID))
{
size_t startFrame = 0;
size_t endFrame = m_uttInfo[0][0].second;
size_t currentMBSize = endFrame - startFrame;
bool populateSucc = PopulateUtteranceInMinibatch(
matrices, 0, startFrame, endFrame, currentMBSize);
if (!populateSucc)
{
return false;
}
// Sets sentence boundary.
m_sentenceBegin.Resize(1, currentMBSize);
m_minibatchPackingFlag.resize(currentMBSize);
for (size_t i = 0; i < currentMBSize; i++)
{
m_sentenceBegin.SetValue(0, i, (ElemType) SENTENCE_MIDDLE);
}
std::fill(m_minibatchPackingFlag.begin(), m_minibatchPackingFlag.end(), MinibatchPackingFlag::None);
m_sentenceBegin.SetValue(0, 0, (ElemType)SENTENCE_BEGIN);
m_sentenceBegin.SetValue(0, m_sentenceBegin.GetNumCols() - 1, (ElemType) SENTENCE_END);
m_minibatchPackingFlag[0] = MinibatchPackingFlag::UtteranceStart;
m_minibatchPackingFlag[m_sentenceBegin.GetNumCols() - 1] = MinibatchPackingFlag::UtteranceEnd;
typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
for (iter = matrices.begin(); iter != matrices.end(); iter++)
{
Matrix<ElemType>& data = *matrices[iter->first];
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
size_t id = m_featureNameToIdMap[iter->first];
size_t dim = m_featureNameToDimMap[iter->first];
data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_featuresBufferMultiIO[id] , matrixFlagNormal);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
size_t id = m_labelNameToIdMap[iter->first];
size_t dim = m_labelNameToDimMap[iter->first];
data.SetValue(dim, currentMBSize * m_numberOfuttsPerMinibatch, m_labelsBufferMultiIO[id], matrixFlagNormal);
}
}
return true;
}
return false;
}
template<class ElemType>
bool HTKMLFReader<ElemType>::ComputeDerivativeFeatures(const std::wstring& uttID,
const Matrix<ElemType>& outputs)
bool HTKMLFReader<ElemType>::SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
return m_sequenceTrainingIO->ComputeDerivatives(uttID, outputs);
// Sets the likelihoods for the utterance, with which we can compute the
// derivatives. Note that the minibatch may only contain partial output
// for the utterance; <m_sequenceTrainingIO> takes care of "pasting"
// the pieces together.
if (m_doSeqTrain)
{
assert(m_framemode == false);
return m_sequenceTrainingIO->SetLikelihood(uttInfo, outputs,
sentenceBegin,
minibatchPackingFlag);
}
return false;
}

View file

@@ -24,14 +24,25 @@ private:
msra::dbn::latticesource* m_lattices;
map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
// Sequence training related. Note that for now we only support single
// utterance in sequence training. But the utterance information holders
// are designed as if they support multiple utterances -- in case we will
// extend this soon.
// Sequence training related members.
struct MinibatchBufferUnit
{
std::vector<std::vector<ElemType>> features;
std::vector<std::vector<ElemType>> labels;
Matrix<ElemType> sentenceBegin;
vector<MinibatchPackingFlag> minibatchPackingFlag;
std::vector<std::vector<std::pair<wstring, size_t>>> minibatchUttInfo;
size_t currentMBSize;
};
bool m_doSeqTrain;
bool m_getMinibatchCopy;
size_t m_minibatchBufferIndex;
size_t m_minibatchBufferLeftovers;
wstring m_seqTrainCriterion;
KaldiSequenceTrainingIO<ElemType>* m_sequenceTrainingIO;
std::deque<MinibatchBufferUnit> m_minibatchBuffer;
std::vector<std::vector<std::pair<wstring, size_t>>> m_uttInfo;
std::vector<std::vector<std::pair<wstring, size_t>>> m_minibatchUttInfo;
vector<bool> m_sentenceEnd;
bool m_readAhead;
@@ -42,6 +53,7 @@ private:
size_t m_numberOfuttsPerMinibatch;
size_t m_actualnumberOfuttsPerMinibatch;
size_t m_mbSize;
size_t m_currentMBSize;
vector<size_t> m_currentBufferFrames;
vector<size_t> m_toProcess;
vector<size_t> m_switchFrame;
@@ -72,6 +84,8 @@ private:
std::map<std::wstring,size_t> m_nameToTypeMap;
std::map<std::wstring,size_t> m_featureNameToDimMap;
std::map<std::wstring,size_t> m_labelNameToDimMap;
std::vector<std::wstring> m_featureIdToNameMap;
std::vector<std::wstring> m_labelIdToNameMap;
// for writing outputs to files (standard single input/output network) - deprecate eventually
bool m_checkDictionaryKeys;
bool m_convertLabelsToTargets;
@@ -89,10 +103,22 @@ private:
void PrepareForSequenceTraining(const ConfigParameters& config);
bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>& matrices);
bool GetOneMinibatchToTrainOrTestDataBuffer(const std::map<std::wstring, Matrix<ElemType>*>& matrices);
bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>& matrices);
bool PopulateUtteranceInMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices, size_t uttIndex, size_t startFrame, size_t endFrame, size_t mbSize, size_t mbOffset = 0);
bool PopulateUtteranceInMinibatch(const std::map<std::wstring, Matrix<ElemType>*>& matrices, size_t uttIndex, size_t startFrame, size_t endFrame, size_t mbSize, size_t mbOffset = 0);
//-void GetCurrentUtteranceInfo(size_t uttIndex, size_t startFrame, size_t endFrame, wstring& uttID, size_t& startFrameInUtt, size_t& endFrameInUtt);
// If we have to read the current minibatch from buffer, return true,
// otherwise return false.
bool ShouldCopyMinibatchFromBuffer();
// Copies the current minibatch to buffer.
void CopyMinibatchToBuffer();
// Copies one minibatch from buffer to matrix.
void CopyMinibatchFromBufferToMatrix(size_t index, std::map<std::wstring, Matrix<ElemType>*>& matrices);
// Copies one minibatch from <m_featuresBufferMultiIO> to matrix.
void CopyMinibatchToMatrix(size_t size, const std::vector<ElemType*>& featureBuffer, const std::vector<ElemType*>& labelBuffer, std::map<std::wstring, Matrix<ElemType>*>& matrices) const;
void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
@@ -157,9 +183,16 @@ public:
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<LabelIdType, LabelType>& labelMapping);
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
virtual bool GetForkedUtterance(std::wstring& uttID, std::map<std::wstring, Matrix<ElemType>*>& matrices);
virtual bool ComputeDerivativeFeatures(const std::wstring& uttID, const Matrix<ElemType>& outputs);
virtual bool GetMinibatchCopy(
std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
std::map<std::wstring, Matrix<ElemType>*>& matrices,
Matrix<ElemType>& sentenceBegin,
vector<MinibatchPackingFlag>& sentenceExistsBeginOrNoLabels);
virtual bool SetNetOutput(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const vector<MinibatchPackingFlag>& sentenceExistsBeginOrNoLabels);
virtual bool DataEnd(EndDataType endDataType);
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);

View file

@@ -11,7 +11,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const wstring& transModelFilename, const wstring& silencePhoneStr,
const wstring& trainCriterion,
ElemType oldAcousticScale, ElemType acousticScale,
ElemType lmScale, bool oneSilenceClass)
ElemType lmScale, bool oneSilenceClass, size_t numberOfuttsPerMinibatch)
{
using namespace msra::asr;
assert(denlatRspecifier != L"");
@@ -26,8 +26,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_lmScale = lmScale;
m_trainCriterion = trainCriterion;
m_oneSilenceClass = oneSilenceClass;
m_objective = 0;
m_posteriors.clear();
m_numUttsPerMinibatch = numberOfuttsPerMinibatch;
m_needLikelihood = true;
m_currentObj = 0;
m_minibatchIndex = 1;
m_lastCompleteMinibatch.assign(m_numUttsPerMinibatch, 0);
if (!kaldi::SplitStringToIntegers(toStr(silencePhoneStr),
":", false, &m_silencePhones))
{
@@ -35,13 +38,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
if (m_trainCriterion != L"mpfe" && m_trainCriterion != L"smbr")
{
LogicError("Supported sequence training criterion are: mpfe, smbr.\n");
LogicError("Supported sequence training criterion: mpfe, smbr.\n");
}
m_derivRead = false;
m_objRead = false;
m_currentUttHasDeriv = false;
m_currentUttID = L"";
m_currentUttLength = 0;
}
// Destructor.
@@ -61,50 +59,43 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
bool KaldiSequenceTrainingIO<ElemType>::HasDerivatives(const wstring& uttID)
bool KaldiSequenceTrainingIO<ElemType>::ComputeDerivative(
const wstring& uttID)
{
if (uttID == m_currentUttID && m_currentUttHasDeriv)
{
return true;
}
else
{
return false;
}
}
template<class ElemType>
bool KaldiSequenceTrainingIO<ElemType>::ComputeDerivatives(
const wstring& uttID, const Matrix<ElemType>& logLikelihoodIn)
{
// Checks if we need to move data to CPU.
Matrix<ElemType> logLikelihood(logLikelihoodIn);
if (logLikelihood.GetDeviceId() >= 0)
logLikelihood.TransferFromDeviceToDevice(logLikelihood.GetDeviceId(), CPUDEVICE, true, false, false);
assert(m_uttPool.find(uttID) != m_uttPool.end());
assert(m_uttPool[uttID].hasDerivative == false);
Matrix<ElemType>& logLikelihood = m_uttPool[uttID].logLikelihood;
std::string uttIDStr = msra::asr::toStr(uttID);
// Sanity check.
if (m_transModel.NumPdfs() != logLikelihood.GetNumRows())
{
RuntimeError("Number of labels in logLikelihood does not match that in the Kaldi model for utterance %S: %d v.s. %d\n", uttID.c_str(), logLikelihood.GetNumRows(), m_transModel.NumPdfs());
RuntimeError("Number of labels in logLikelihood does not match that"
" in the Kaldi model for utterance %S: %d v.s. %d\n",
uttID.c_str(), logLikelihood.GetNumRows(),
m_transModel.NumPdfs());
}
// Reads alignment.
if (!m_aliReader->HasKey(uttIDStr))
{
RuntimeError("Alignment not found for utterance %s\n", uttIDStr.c_str());
RuntimeError("Alignment not found for utterance %s\n",
uttIDStr.c_str());
}
const std::vector<int32> ali = m_aliReader->Value(uttIDStr);
if (ali.size() != logLikelihood.GetNumCols())
{
RuntimeError("Number of frames in logLikelihood does not match that in the alignment for utterance %S: %d v.s. %d\n", uttID.c_str(), logLikelihood.GetNumCols(), ali.size());
RuntimeError("Number of frames in logLikelihood does not match that"
" in the alignment for utterance %S: %d v.s. %d\n",
uttID.c_str(), logLikelihood.GetNumCols(), ali.size());
}
// Reads denominator lattice.
if (!m_denlatReader->HasKey(uttIDStr))
{
RuntimeError("Denominator lattice not found for utterance %S\n", uttID.c_str());
RuntimeError("Denominator lattice not found for utterance %S\n",
uttID.c_str());
}
kaldi::CompactLattice clat = m_denlatReader->Value(uttIDStr);
fst::CreateSuperFinal(&clat); /* One final state with weight One() */
@@ -115,7 +106,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// acoustic scale to 0.
if (m_oldAcousticScale != 1.0)
{
fst::ScaleLattice(fst::AcousticLatticeScale(m_oldAcousticScale), &lat);
fst::ScaleLattice(fst::AcousticLatticeScale(m_oldAcousticScale),
&lat);
}
// Topsort lattice.
@@ -133,7 +125,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
kaldi::int32 maxTime = kaldi::LatticeStateTimes(lat, &stateTimes);
if (maxTime != logLikelihood.GetNumCols())
{
RuntimeError("Number of frames in the logLikelihood does not match that in the denominator lattice for utterance %S\n", uttID.c_str(), logLikelihood.GetNumRows(), maxTime);
RuntimeError("Number of frames in the logLikelihood does not match"
" that in the denominator lattice for utterance %S\n",
uttID.c_str(), logLikelihood.GetNumRows(), maxTime);
}
// Does lattice acoustic rescoring with the new posteriors from the
@@ -143,7 +137,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Second pass acoustic and language model scale.
if (m_acousticScale != 1.0 || m_lmScale != 1.0)
{
fst::ScaleLattice(fst::LatticeScale(m_lmScale, m_acousticScale), &lat);
fst::ScaleLattice(fst::LatticeScale(m_lmScale, m_acousticScale),
&lat);
}
// Forward-backward on the lattice.
@@ -152,39 +147,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_trainCriterion == L"smbr")
{
thisObj = kaldi::LatticeForwardBackwardMpeVariants(
m_transModel, m_silencePhones, lat, ali, "smbr", m_oneSilenceClass, &post);
m_transModel, m_silencePhones, lat,
ali, "smbr", m_oneSilenceClass, &post);
}
else if (m_trainCriterion == L"mpfe")
{
thisObj = kaldi::LatticeForwardBackwardMpeVariants(
m_transModel, m_silencePhones, lat, ali, "mpfe", m_oneSilenceClass, &post);
m_transModel, m_silencePhones, lat,
ali, "mpfe", m_oneSilenceClass, &post);
}
kaldi::ConvertPosteriorToPdfs(m_transModel, post, &m_posteriors);
kaldi::ConvertPosteriorToPdfs(m_transModel,
post, &(m_uttPool[uttID].posterior));
// Uses "expected error rate" instead of "expected accuracy".
m_objective = logLikelihood.GetNumCols() - thisObj;
m_uttPool[uttID].objective = logLikelihood.GetNumCols() - thisObj;
assert(m_posteriors.size() == logLikelihood.GetNumCols());
assert(m_uttPool[uttID].posterior.size() == logLikelihood.GetNumCols());
m_derivRead = false;
m_objRead = false;
m_currentUttHasDeriv = true;
m_currentUttID = uttID;
m_currentUttLength = logLikelihood.GetNumCols();
return true;
}
template<class ElemType>
void KaldiSequenceTrainingIO<ElemType>::LatticeAcousticRescore(
const std::vector<kaldi::int32>& stateTimes,
const Matrix<ElemType>& logLikelihood, kaldi::Lattice* lat)
const Matrix<ElemType>& logLikelihood, kaldi::Lattice* lat) const
{
std::vector<std::vector<kaldi::int32>> timeStateMap(logLikelihood.GetNumCols());
std::vector<std::vector<kaldi::int32>> timeStateMap(
logLikelihood.GetNumCols());
size_t num_states = lat->NumStates();
for (size_t s = 0; s < num_states; s++)
{
assert(stateTimes[s] >= 0 && stateTimes[s] <= logLikelihood.GetNumCols());
assert(stateTimes[s] >= 0
&& stateTimes[s] <= logLikelihood.GetNumCols());
if (stateTimes[s] < logLikelihood.GetNumCols())
{
timeStateMap[stateTimes[s]].push_back(s);
@@ -196,14 +191,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t i = 0; i < timeStateMap[t].size(); ++i)
{
kaldi::int32 state = timeStateMap[t][i];
for (fst::MutableArcIterator<kaldi::Lattice> aiter(lat, state); !aiter.Done(); aiter.Next())
for (fst::MutableArcIterator<kaldi::Lattice> aiter(lat, state);
!aiter.Done(); aiter.Next())
{
kaldi::LatticeArc arc = aiter.Value();
kaldi::int32 trans_id = arc.ilabel;
if (trans_id != 0)
{
kaldi::int32 pdf_id = m_transModel.TransitionIdToPdf(trans_id);
arc.weight.SetValue2(-logLikelihood(pdf_id, t) + arc.weight.Value2());
kaldi::int32 pdf_id =
m_transModel.TransitionIdToPdf(trans_id);
arc.weight.SetValue2(-logLikelihood(pdf_id, t)
+ arc.weight.Value2());
aiter.SetValue(arc);
}
}
@@ -219,97 +217,285 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
void KaldiSequenceTrainingIO<ElemType>::GetDerivatives(size_t startFrame,
size_t endFrame,
size_t mbSize,
const std::wstring& uttID,
Matrix<ElemType>& derivativesIn)
void KaldiSequenceTrainingIO<ElemType>::ProcessUttInfo(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag,
std::vector<std::vector<std::pair<wstring, std::pair<size_t, size_t>>>>* uttInfoInMinibatch) const
{
Matrix<ElemType> derivatives(CPUDEVICE);
// Does some sanity check first.
if (uttID != m_currentUttID)
assert(uttInfoInMinibatch != NULL);
assert(uttInfo.size() == m_numUttsPerMinibatch);
assert(sentenceBegin.GetNumRows() == m_numUttsPerMinibatch);
assert(minibatchPackingFlag.size() == sentenceBegin.GetNumCols());
uttInfoInMinibatch->clear();
uttInfoInMinibatch->resize(uttInfo.size());
for (size_t i = 0; i < uttInfo.size(); ++i)
{
RuntimeError("Requested utterance does not matched the utterance that we have computed derivatives for: %S v.s. %S\n", uttID.c_str(), m_currentUttID.c_str());
}
if (!m_currentUttHasDeriv)
{
RuntimeError("Derivatives have not been computed, you have to call KaldiSequenceTrainingIO::ComputeDerivative() before using it.\n");
}
assert(startFrame >= 0);
assert(endFrame <= m_currentUttLength);
derivatives.Resize(m_transModel.NumPdfs(), mbSize);
derivatives.SetValue(0);
for (size_t t = startFrame; t < endFrame; ++t)
{
for (size_t i = 0; i < m_posteriors[t].size(); ++i)
size_t startFrameIndexInMinibatch = 0;
size_t numFrames = 0;
for (size_t j = 0; j < sentenceBegin.GetNumCols(); ++j)
{
size_t pdf_id = m_posteriors[t][i].first;
assert(pdf_id < m_transModel.NumPdfs());
derivatives(pdf_id, t - startFrame) -= m_posteriors[t][i].second; /* Flip the sign */
}
}
// Checks if we need to move data to GPU.
if (derivativesIn.GetDeviceId() >= 0)
derivatives.TransferFromDeviceToDevice(CPUDEVICE, derivativesIn.GetDeviceId(), true, false, false);
derivativesIn.SetValue(derivatives);
// We've used up all the derivatives, reset it.
if (endFrame >= m_currentUttLength)
{
m_derivRead = true;
if (m_objRead)
{
m_currentUttID = L"";
m_currentUttHasDeriv = false;
m_currentUttLength = 0;
if (((size_t)sentenceBegin(i, j) & NO_LABELS) == NO_LABELS)
{
continue;
}
numFrames += 1;
if ((((size_t)sentenceBegin(i, j) & SENTENCE_END) == SENTENCE_END)
|| j == sentenceBegin.GetNumCols() - 1)
{
size_t uttIndex = (*uttInfoInMinibatch)[i].size();
wstring uttID = uttInfo[i][uttIndex].first;
(*uttInfoInMinibatch)[i].push_back(
make_pair(uttID, make_pair(startFrameIndexInMinibatch, numFrames)));
startFrameIndexInMinibatch = j + 1;
numFrames = 0;
}
}
assert(uttInfo[i].size() == (*uttInfoInMinibatch)[i].size());
}
}
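For concreteness, a hypothetical two-stream example (utterance names and frame counts invented for illustration): if stream 0 carries utterance A for all 10 columns and stream 1 carries B for columns 0-4 followed by C for columns 5-9, the decoded <uttInfoInMinibatch> would be:

#include <cstddef>
#include <string>
#include <utility>
#include <vector>
// Per-stream lists of (uttID, (startFrameIndexInMinibatch, numFrames)).
std::vector<std::vector<std::pair<std::wstring, std::pair<std::size_t, std::size_t>>>>
uttInfoInMinibatch =
{
{ { L"A", { 0, 10 } } }, // stream 0
{ { L"B", { 0, 5 } }, { L"C", { 5, 5 } } }, // stream 1
};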
// Suppose we have 3 streams a, b, c; the <logLikelihoodIn> is in the
// following format:
// 1: a11 b11 c11 a12 b12 c12...
// 2: a21 b21 c21 a22 b22 c22...
// 3: a31 b31 c31 a32 b32 c32...
template<class ElemType>
void KaldiSequenceTrainingIO<ElemType>::GetObjectives(size_t startFrame,
size_t endFrame,
const std::wstring& uttID,
Matrix<ElemType>& objectivesIn)
bool KaldiSequenceTrainingIO<ElemType>::SetLikelihood(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& logLikelihoodIn,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
Matrix<ElemType> objectives(CPUDEVICE);
assert(m_needLikelihood == true);
std::vector<std::vector<
std::pair<wstring, std::pair<size_t, size_t>>>> uttInfoInMinibatch;
ProcessUttInfo(uttInfo, sentenceBegin,
minibatchPackingFlag, &uttInfoInMinibatch);
// Does some sanity check first.
if (uttID != m_currentUttID)
// Checks if we need to move data to CPU.
Matrix<ElemType> logLikelihood(logLikelihoodIn);
if (logLikelihood.GetDeviceId() >= 0)
{
RuntimeError("Requested utterance does not matched the utterance that we have computed objectives for: %S v.s. %S\n", uttID.c_str(), m_currentUttID.c_str());
logLikelihood.TransferFromDeviceToDevice(
logLikelihood.GetDeviceId(), CPUDEVICE, true, false, false);
}
if (!m_currentUttHasDeriv)
bool minibatchComplete = true;
size_t currentMBSize = minibatchPackingFlag.size();
for (size_t i = 0; i < uttInfo.size(); ++i)
{
RuntimeError("Objectives have not been computed, you have to call KaldiSequenceTrainingIO::ComputeDerivative() before using it.\n");
}
assert(startFrame >= 0);
assert(endFrame <= m_currentUttLength);
objectives.Resize(1, 1);
objectives.SetValue(m_objective * static_cast<ElemType>(endFrame - startFrame) / static_cast<ElemType>(m_currentUttLength));
// Checks if we need to move data to GPU.
if (objectivesIn.GetDeviceId() >= 0)
objectives.TransferFromDeviceToDevice(CPUDEVICE, objectivesIn.GetDeviceId(), true, false, false);
objectivesIn.SetValue(objectives);
// We've used up all the objectives, reset it.
if (endFrame >= m_currentUttLength)
{
m_objRead = true;
if (m_derivRead)
assert(uttInfo[i].size() == uttInfoInMinibatch[i].size());
for (size_t j = 0; j < uttInfo[i].size(); ++j)
{
m_currentUttID = L"";
m_currentUttHasDeriv = false;
m_currentUttLength = 0;
wstring uttID = uttInfo[i][j].first;
if (m_uttPool.find(uttID) == m_uttPool.end())
{
UtteranceDerivativeUnit tmpUttUnit;
tmpUttUnit.hasDerivative = false;
tmpUttUnit.uttLength = uttInfo[i][j].second;
tmpUttUnit.progress = 0;
tmpUttUnit.streamID = i;
tmpUttUnit.logLikelihood.Resize(m_transModel.NumPdfs(),
tmpUttUnit.uttLength);
m_uttPool[uttID] = tmpUttUnit;
}
// Sets the likelihood and computes derivatives.
assert(m_uttPool.find(uttID) != m_uttPool.end());
if (m_uttPool[uttID].hasDerivative == false)
{
assert(uttID == uttInfoInMinibatch[i][j].first);
size_t startFrame = uttInfoInMinibatch[i][j].second.first;
size_t numFrames = uttInfoInMinibatch[i][j].second.second;
assert(m_uttPool[uttID].progress + numFrames
<= m_uttPool[uttID].uttLength);
// Sets the likelihood.
for (size_t k = 0; k < numFrames; ++k)
{
m_uttPool[uttID].logLikelihood.SetColumn(
logLikelihood.ColumnSlice(
(startFrame + k) * m_numUttsPerMinibatch + i, 1),
m_uttPool[uttID].progress + k);
}
m_uttPool[uttID].progress += numFrames;
if (m_uttPool[uttID].progress == m_uttPool[uttID].uttLength)
{
ComputeDerivative(uttID);
m_uttPool[uttID].hasDerivative = true;
m_uttPool[uttID].progress = 0;
if (startFrame + numFrames == currentMBSize)
{
m_lastCompleteMinibatch[m_uttPool[uttID].streamID]
= m_minibatchIndex;
}
else
{
m_lastCompleteMinibatch[m_uttPool[uttID].streamID]
= m_minibatchIndex - 1;
}
}
}
}
}
// Checks if we are ready to provide derivatives.
m_minCompleteMinibatchIndex = *std::min_element(
m_lastCompleteMinibatch.begin(), m_lastCompleteMinibatch.end());
m_needLikelihood = (m_minCompleteMinibatchIndex < 1);
m_minibatchIndex += 1;
return true; // the function is declared bool; the likelihoods were accepted
}
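The layout comments in this file imply a simple column arithmetic, used verbatim above as (startFrame + k) * m_numUttsPerMinibatch + i; a standalone check with illustrative names:

#include <cassert>
#include <cstddef>
// Column of frame t of stream i in a minibatch whose columns interleave
// the parallel utterance streams.
std::size_t InterleavedColumn(std::size_t t, std::size_t i, std::size_t numStreams)
{
return t * numStreams + i;
}
int main()
{
// Three streams a, b, c laid out as a1 b1 c1 a2 b2 c2 ...:
assert(InterleavedColumn(0, 1, 3) == 1); // frame 0 of b is column 1
assert(InterleavedColumn(1, 2, 3) == 5); // frame 1 of c is column 5
return 0;
}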
// Suppose we have 3 streams a, b, c; the <derivativesOut> should be in the
// following format:
// 1: a11 b11 c11 a12 b12 c12...
// 2: a21 b21 c21 a22 b22 c22...
// 3: a31 b31 c31 a32 b32 c32...
template<class ElemType>
bool KaldiSequenceTrainingIO<ElemType>::GetDerivative(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag,
Matrix<ElemType>* derivativesOut)
{
assert(derivativesOut != NULL);
std::vector<std::vector<
std::pair<wstring, std::pair<size_t, size_t>>>> uttInfoInMinibatch;
ProcessUttInfo(uttInfo, sentenceBegin,
minibatchPackingFlag, &uttInfoInMinibatch);
Matrix<ElemType> derivatives(CPUDEVICE);
derivatives.Resize(m_transModel.NumPdfs(),
sentenceBegin.GetNumCols() * sentenceBegin.GetNumRows());
derivatives.SetValue(0);
m_currentObj = 0;
for (size_t i = 0; i < uttInfo.size(); ++i)
{
assert(uttInfo[i].size() == uttInfoInMinibatch[i].size());
for (size_t j = 0; j < uttInfo[i].size(); ++j)
{
wstring uttID = uttInfo[i][j].first;
// Checks if we have derivatives.
if (m_uttPool.find(uttID) == m_uttPool.end()
|| m_uttPool[uttID].hasDerivative == false)
{
RuntimeError("Derivatives are not ready for utterance:"
" %S\n", uttID.c_str());
}
// Assign the derivatives.
assert(uttID == uttInfoInMinibatch[i][j].first);
size_t startFrame = uttInfoInMinibatch[i][j].second.first;
size_t startFrameInUtt = m_uttPool[uttID].progress;
size_t numFrames = uttInfoInMinibatch[i][j].second.second;
for (size_t k = 0; k < numFrames; ++k)
{
size_t posStart = startFrameInUtt + k;
for (size_t l = 0;
l < m_uttPool[uttID].posterior[posStart].size(); ++l)
{
size_t pdf_id =
m_uttPool[uttID].posterior[posStart][l].first;
assert(pdf_id < m_transModel.NumPdfs());
derivatives(pdf_id,
(startFrame + k) * m_numUttsPerMinibatch + i) -=
m_uttPool[uttID].posterior[posStart][l].second;
}
}
m_currentObj += m_uttPool[uttID].objective
* numFrames / m_uttPool[uttID].uttLength;
m_uttPool[uttID].progress += numFrames;
assert(m_uttPool[uttID].progress <= m_uttPool[uttID].uttLength);
if (m_uttPool[uttID].progress == m_uttPool[uttID].uttLength)
{
m_uttPool.erase(uttID);
}
}
}
// Checks if we need to move data to GPU.
if (derivativesOut->GetDeviceId() >= 0)
{
derivatives.TransferFromDeviceToDevice(
CPUDEVICE, derivativesOut->GetDeviceId(), true, false, false);
}
derivativesOut->SetValue(derivatives);
// Keeps the utterance information so that we can check it next time
// when we give the objectives.
m_currentUttInfo = uttInfo;
// Checks if we need to read more loglikelihoods.
m_needLikelihood = false;
m_minCompleteMinibatchIndex -= 1;
if (m_minCompleteMinibatchIndex <= 0)
{
m_needLikelihood = true;
m_minibatchIndex = 1;
m_lastCompleteMinibatch.assign(m_numUttsPerMinibatch, 0);
// Undoes the logLikelihood progress for partial utterances.
for (auto iter = m_uttPool.begin(); iter != m_uttPool.end(); ++iter)
{
if (iter->second.hasDerivative == false)
{
iter->second.progress = 0;
}
}
}
return true;
}
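Since a minibatch may cover only part of an utterance, GetDerivative() attributes the objective fractionally by numFrames / uttLength, so the per-minibatch contributions of one utterance sum to its total objective. A worked example with invented numbers:

#include <cassert>
int main()
{
// Illustrative only: an utterance of 100 frames with total objective 4.0
// contributes 4.0 * 25 / 100 = 1.0 to a minibatch covering 25 of its
// frames; four such minibatches together account for the full 4.0.
double objective = 4.0;
double contribution = objective * 25 / 100;
assert(contribution == 1.0);
return 0;
}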
template<class ElemType>
bool KaldiSequenceTrainingIO<ElemType>::GetObjective(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
Matrix<ElemType>* objectivesIn)
{
assert(objectivesIn != NULL);
// Checks utterance information.
bool match = true;
if (uttInfo.size() == m_currentUttInfo.size())
{
for (size_t i = 0; i < uttInfo.size(); ++i)
{
if (uttInfo[i].size() != m_currentUttInfo[i].size())
{
match = false;
break;
}
for (size_t j = 0; j < uttInfo[i].size(); ++j)
{
if (uttInfo[i][j].first != m_currentUttInfo[i][j].first ||
uttInfo[i][j].second != m_currentUttInfo[i][j].second)
{
match = false;
break;
}
}
}
}
else
{
match = false;
}
if (!match)
{
RuntimeError("Current objective does not correspond to the"
" minibatch utterance information, perhaps you did not"
" run GetObjective() right after GetDerivatives()?");
}
// Sets the objectives...
objectivesIn->Resize(1, 1);
objectivesIn->SetValue(m_currentObj);
return true;
}
template class KaldiSequenceTrainingIO<float>;

View file

@@ -2,6 +2,7 @@
#include "kaldi.h"
#include "Matrix.h"
#include "basetypes.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@@ -12,50 +13,93 @@ class KaldiSequenceTrainingIO
{
private:
bool m_oneSilenceClass;
bool m_currentUttHasDeriv;
bool m_derivRead;
bool m_objRead;
bool m_needLikelihood;
size_t m_numUttsPerMinibatch;
wstring m_trainCriterion;
wstring m_currentUttID;
ElemType m_oldAcousticScale;
ElemType m_acousticScale;
ElemType m_lmScale;
ElemType m_objective;
std::vector<kaldi::int32> m_silencePhones;
size_t m_currentUttLength;
kaldi::TransitionModel m_transModel;
kaldi::Posterior m_posteriors;
kaldi::RandomAccessCompactLatticeReader* m_denlatReader; /*denominator lattices*/
kaldi::RandomAccessInt32VectorReader* m_aliReader; /*alignment*/
kaldi::RandomAccessCompactLatticeReader* m_denlatReader;
kaldi::RandomAccessInt32VectorReader* m_aliReader;
struct UtteranceDerivativeUnit
{
bool hasDerivative;
size_t uttLength;
size_t progress;
size_t streamID;
Matrix<ElemType> logLikelihood;
kaldi::Posterior posterior;
ElemType objective;
UtteranceDerivativeUnit() : logLikelihood(CPUDEVICE)
{
hasDerivative = false;
uttLength = 0;
progress = 0;
streamID = 0;
}
};
ElemType m_currentObj;
int m_minCompleteMinibatchIndex;
size_t m_minibatchIndex;
std::vector<size_t> m_lastCompleteMinibatch;
std::vector<std::vector<std::pair<wstring, size_t>>> m_currentUttInfo;
unordered_map<wstring, UtteranceDerivativeUnit> m_uttPool;
// Rescores the lattice with the latest posteriors from the neural network.
void LatticeAcousticRescore(const std::vector<kaldi::int32>& stateTimes,
const Matrix<ElemType>& outputs, kaldi::Lattice* lat);
void LatticeAcousticRescore(
const std::vector<kaldi::int32>& stateTimes,
const Matrix<ElemType>& outputs, kaldi::Lattice* lat) const;
// <uttInfoInMinibatch> is a vector of vector of the following:
// uttID startFrameIndexInMinibatch numFrames
void ProcessUttInfo(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag,
std::vector<std::vector<std::pair<
wstring, std::pair<size_t, size_t>>>>* uttInfoInMinibatch) const;
bool ComputeDerivative(const wstring& uttID);
public:
// Constructor.
KaldiSequenceTrainingIO(const wstring& denlatRspecifier, const wstring& aliRspecifier,
const wstring& transModelFilename, const wstring& silencePhoneStr,
KaldiSequenceTrainingIO(const wstring& denlatRspecifier,
const wstring& aliRspecifier,
const wstring& transModelFilename,
const wstring& silencePhoneStr,
const wstring& trainCriterion,
ElemType oldAcousticScale,
ElemType acousticScale,
ElemType lmScale,
bool oneSilenceClass);
bool oneSilenceClass,
size_t numberOfuttsPerMinibatch);
// Destructor.
~KaldiSequenceTrainingIO();
bool HasDerivatives(const wstring& uttID);
bool NeedLikelihoodToComputeDerivative() const { return m_needLikelihood; }
bool ComputeDerivatives(const wstring& uttID, const Matrix<ElemType>& outputs);
bool SetLikelihood(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& outputs,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag);
// Gets the computed derivatives for given utterance.
void GetDerivatives(size_t startFrame, size_t endFrame, size_t mbSize,
const std::wstring& uttID, Matrix<ElemType>& derivatives);
bool GetDerivative(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
const Matrix<ElemType>& sentenceBegin,
const std::vector<MinibatchPackingFlag>& minibatchPackingFlag,
Matrix<ElemType>* derivativesOut);
// Gets the computed objectives for given utterance.
void GetObjectives(size_t startFrame, size_t endFrame,
const std::wstring& uttID, Matrix<ElemType>& derivatives);
bool GetObjective(
const std::vector<std::vector<std::pair<wstring, size_t>>>& uttInfo,
Matrix<ElemType>* objectivesIn);
};
}}}

View file

@@ -1631,13 +1631,17 @@ protected:
// Tries to read an utterance and run forward computation on the
// whole utterance.
assert(trainSetDataReader != NULL);
std::wstring uttID;
if (trainSetDataReader->GetForkedUtterance(uttID, *inputMatrices))
std::vector<std::vector<std::pair<wstring, size_t>>> uttInfo;
Matrix<ElemType> sentenceBoundary;
std::vector<MinibatchPackingFlag> minibatchPackingFlag;
while (trainSetDataReader->GetMinibatchCopy(uttInfo, *inputMatrices,
sentenceBoundary,
minibatchPackingFlag))
{
UpdateEvalTimeStamps(FeatureNodes);
std::vector<ComputationNodePtr>* outputNodes = net.OutputNodes();
if (outputNodes->size() < 1)
{
throw std::logic_error("no output node was found.");
}
@@ -1645,8 +1649,11 @@ protected:
net.SetActualMiniBatchSize(actualMBSize);
net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter());
trainSetDataReader->SetSentenceSegBatch(net.SentenceBoundary(), net.MinibatchPackingFlags());
net.Evaluate((*outputNodes)[0]); // Only evaluate the first output
trainSetDataReader->ComputeDerivativeFeatures(uttID, (*outputNodes)[0]->FunctionValues());
trainSetDataReader->SetNetOutput(uttInfo,
(*outputNodes)[0]->FunctionValues(),
sentenceBoundary,
minibatchPackingFlag);
}
}