Merge branch 'master' of https://git01.codeplex.com/cntk into linux-gcc
Commit 9a90db4f91
@ -301,39 +301,95 @@ public:
|
|||
std::vector<size_t> numclasses; // number of output classes as found in the label file (diagnostics)
|
||||
_totalframes = 0;
|
||||
wstring key;
|
||||
std::vector<size_t>framesaccum;
|
||||
size_t numutts=0;
|
||||
|
||||
std::vector<bool>uttisvalid; // boolean flag to check that utterance is valid. valid means number of
|
||||
//frames is consistent across all feature and label streams
|
||||
std::vector<size_t>uttduration; // track utterance durations to determine utterance validity
|
||||
|
||||
std::vector<size_t> classidsbegin;
|
||||
if (!lattices.empty())
|
||||
{
|
||||
LogicError("lattices not supported in utterancereadermulti");
|
||||
}
|
||||
|
||||
allchunks = std::vector<std::vector<utterancechunkdata>>(infiles.size(), std::vector<utterancechunkdata>());
|
||||
featdim = std::vector<size_t>(infiles.size(), 0);
|
||||
sampperiod = std::vector<unsigned int>(infiles.size(), 0);
|
||||
featkind = std::vector<string>(infiles.size(), "");
|
||||
|
||||
numclasses = std::vector<size_t>(labels.size(), 0);
|
||||
counts = std::vector<std::vector<size_t>>(labels.size(), std::vector<size_t>());
|
||||
|
||||
foreach_index (i, labels)
|
||||
{
|
||||
//classids.push_back(biggrowablevector<CLASSIDTYPE>());
|
||||
classids.push_back(unique_ptr<biggrowablevector<CLASSIDTYPE>>(new biggrowablevector<CLASSIDTYPE>()));
|
||||
numclasses.push_back(0);
|
||||
counts.push_back(std::vector<size_t>());
|
||||
//std::pair<std::vector<wstring>,std::vector<wstring>> latticetocs;
|
||||
//std::unordered_map<std::string,size_t> modelsymmap;
|
||||
//lattices.push_back(shared_ptr<latticesource>(new latticesource(latticetocs, modelsymmap)));
|
||||
|
||||
}
|
||||
foreach_index(i, infiles){
|
||||
allchunks.push_back(std::vector<utterancechunkdata>());
|
||||
featdim.push_back(0); // initialize
|
||||
sampperiod.push_back(0);
|
||||
featkind.push_back("");
|
||||
|
||||
|
||||
// first check consistency across feature streams
|
||||
// We'll go through the SCP files for each stream to make sure the duration is consistent
|
||||
// If not, we'll plan to ignore the utterance, and inform the user
|
||||
// m indexes the feature stream
|
||||
// i indexes the files within a stream, i.e. in the SCP file)
|
||||
foreach_index(m, infiles){
|
||||
if (m == 0){
|
||||
numutts = infiles[m].size();
|
||||
uttisvalid = std::vector<bool>(numutts, true);
|
||||
uttduration = std::vector<size_t>(numutts, 0);
|
||||
}
|
||||
else if (infiles[m].size()!=numutts)
|
||||
throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances");
|
||||
|
||||
foreach_index(i, infiles[m]){
|
||||
utterancedesc utterance(msra::asr::htkfeatreader::parsedpath(infiles[m][i]), 0); //mseltzer - is this foolproof for multiio? is classids always non-empty?
|
||||
const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
|
||||
// we need at least 2 frames for boundary markers to work
|
||||
if (uttframes < 2)
|
||||
throw std::runtime_error("minibatchutterancesource: utterances < 2 frames not supported");
|
||||
if (uttframes > frameref::maxframesperutterance)
|
||||
{
|
||||
fprintf(stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S\n", i, uttframes, frameref::maxframesperutterance, key.c_str());
|
||||
uttduration[i] = 0;
|
||||
uttisvalid[i] = false;
|
||||
}
|
||||
else{
|
||||
if (m == 0){
|
||||
uttduration[i] = uttframes;
|
||||
uttisvalid[i] = true;
|
||||
}
|
||||
else if (uttduration[i] != uttframes){
|
||||
fprintf(stderr, "minibatchutterancesource: skipping %d-th file due to inconsistency in duration in different feature streams (%d vs %d frames)\n", i, uttduration[i], uttframes);
|
||||
uttduration[i] = 0;
|
||||
uttisvalid[i] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
size_t invalidutts=0;
|
||||
foreach_index(i, uttisvalid){
|
||||
if (!uttisvalid[i])
|
||||
invalidutts++;
|
||||
}
|
||||
if (invalidutts > uttisvalid.size() / 2)
|
||||
throw std::runtime_error("minibatchutterancesource: too many files with inconsistent durations, assuming broken configuration\n");
|
||||
else if (invalidutts>0)
|
||||
fprintf(stderr, "Found inconsistent durations across feature streams in %d out of %d files\n", invalidutts, uttisvalid.size());
|
||||
|
||||
|
||||
// now process the features and labels
|
||||
size_t utterancesetsize = 0;
|
||||
foreach_index (m, infiles)
|
||||
{
|
||||
utteranceset.clear();
|
||||
if (m==0)
|
||||
numutts = infiles[m].size();
|
||||
else
|
||||
if (infiles[m].size()!=numutts)
|
||||
throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances");
|
||||
//if (m==0)
|
||||
// numutts = infiles[m].size();
|
||||
//else
|
||||
// if (infiles[m].size()!=numutts)
|
||||
// throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances\n");
|
||||
if (m==0)
|
||||
classidsbegin.clear();
|
||||
|
||||
|
@ -344,16 +400,19 @@ public:
|
|||
if (m == 0 && !labels.empty())
|
||||
classidsbegin.push_back(classids[0]->size());
|
||||
|
||||
if (uttisvalid[i]){
|
||||
utterancedesc utterance (msra::asr::htkfeatreader::parsedpath (infiles[m][i]), labels.empty() ? 0 : classidsbegin[i] ); //mseltzer - is this foolproof for multiio? is classids always non-empty?
|
||||
const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
|
||||
// we need at least 2 frames for boundary markers to work
|
||||
if (uttframes < 2)
|
||||
throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
|
||||
if (uttframes > frameref::maxframesperutterance)
|
||||
{
|
||||
fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
|
||||
continue;
|
||||
}
|
||||
assert(uttframes == uttduration[i]); // ensure nothing funky happened
|
||||
// already performed these checks above
|
||||
// we need at least 2 frames for boundary markers to work
|
||||
//if (uttframes < 2)
|
||||
// throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
|
||||
//if (uttframes > frameref::maxframesperutterance)
|
||||
//{
|
||||
// fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
|
||||
// continue;
|
||||
//}
|
||||
|
||||
// check whether we have the ref transcript
|
||||
//auto labelsiter = labels[0].end();
|
||||
|
@ -375,9 +434,11 @@ public:
|
|||
if (nolat++ < 5)
|
||||
fprintf (stderr, " [no lattice for %S]", key.c_str());
|
||||
// skip if either one is missing
|
||||
if (lacksmlf || lackslat)
|
||||
if (lacksmlf || lackslat){
|
||||
uttisvalid[i] = false;
|
||||
continue; // skip this utterance at all
|
||||
}
|
||||
}
|
||||
// push the label sequence into classids[], since we already looked it up
|
||||
// TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.
|
||||
|
||||
|
@ -389,7 +450,6 @@ public:
|
|||
//if (!labels.empty() && labelsiter != labels[0].end())
|
||||
{
|
||||
// first verify that all the label files have the proper duration
|
||||
bool durationmatch = true;
|
||||
foreach_index(j, labels)
|
||||
{
|
||||
const auto & labseq = labels[j].find(key)->second;
|
||||
|
@ -399,14 +459,15 @@ public:
|
|||
{
|
||||
fprintf(stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
|
||||
nomlf++;
|
||||
durationmatch = false;
|
||||
break; // continue; // skip this utterance at all
|
||||
uttisvalid[i] = false;
|
||||
//continue; // skip this utterance at all
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (durationmatch){
|
||||
if (uttisvalid[i])
|
||||
{
|
||||
utteranceset.push_back(std::move(utterance));
|
||||
_totalframes += uttframes;
|
||||
framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
|
||||
// then parse each mlf if the durations are consistent
|
||||
foreach_index(j, labels)
|
||||
{
|
||||
|
@ -437,14 +498,22 @@ public:
|
|||
}
|
||||
}
|
||||
else{
|
||||
assert(classids.empty());
|
||||
assert(classids.empty() && labels.empty());
|
||||
utteranceset.push_back(std::move(utterance));
|
||||
_totalframes += uttframes;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(uttframes==framesaccum[i]); //ensure that number of frames is consistent in each input feature "stream"
|
||||
else
|
||||
{
|
||||
utteranceset.push_back(std::move(utterance));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (m == 0)
|
||||
utterancesetsize = utteranceset.size();
|
||||
else
|
||||
assert(utteranceset.size() == utterancesetsize);
|
||||
|
||||
fprintf (stderr, "feature set %d: %d frames in %d out of %d utterances\n", m, _totalframes, utteranceset.size(),infiles[m].size());
|
||||
|
||||
if (!labels.empty()){
|
||||
|
|
|
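Aside for readers skimming the hunks above: the new reader code makes a first pass over every feature stream, records each utterance's frame count from the first stream as the reference, flags any utterance whose count disagrees in a later stream, and gives up entirely if more than half of the utterances end up flagged. A minimal standalone sketch of that pass is shown below; it is not CNTK code, the per-stream frame-count layout and the function name are assumptions for illustration, and the frameref::maxframesperutterance clamp from the real code is omitted.

// Sketch only: "streams" holds one frame count per utterance, per feature stream.
#include <cstdio>
#include <stdexcept>
#include <vector>

static std::vector<bool> ValidateDurations(const std::vector<std::vector<size_t>> &streams)
{
    const size_t numutts = streams.at(0).size();
    std::vector<bool>   uttisvalid(numutts, true);
    std::vector<size_t> uttduration(numutts, 0);

    for (size_t m = 0; m < streams.size(); m++)
    {
        if (streams[m].size() != numutts)
            throw std::runtime_error("all feature streams must have the same number of utterances");
        for (size_t i = 0; i < numutts; i++)
        {
            if (m == 0)
                uttduration[i] = streams[m][i];              // first stream is the reference
            else if (uttisvalid[i] && uttduration[i] != streams[m][i])
            {
                fprintf(stderr, "skipping utterance %zu: %zu vs %zu frames\n",
                        i, uttduration[i], streams[m][i]);
                uttisvalid[i] = false;                       // drop it from every stream
            }
        }
    }

    size_t invalid = 0;
    for (bool v : uttisvalid)
        if (!v)
            invalid++;
    if (invalid > numutts / 2)
        throw std::runtime_error("too many inconsistent utterances, assuming broken configuration");
    return uttisvalid;
}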
@ -323,39 +323,92 @@ public:
|
|||
std::vector<size_t> numclasses; // number of output classes as found in the label file (diagnostics)
|
||||
_totalframes = 0;
|
||||
wstring key;
|
||||
std::vector<size_t>framesaccum;
|
||||
size_t numutts=0;
|
||||
|
||||
std::vector<bool>uttisvalid; // boolean flag to check that utterance is valid. valid means number of
|
||||
//frames is consistent across all feature and label streams
|
||||
std::vector<size_t>uttduration; // track utterance durations to determine utterance validity
|
||||
|
||||
std::vector<size_t> classidsbegin;
|
||||
if (!lattices.empty())
|
||||
{
|
||||
LogicError("lattices not supported in utterancereadermulti");
|
||||
}
|
||||
|
||||
allchunks = std::vector<std::vector<utterancechunkdata>>(infiles.size(), std::vector<utterancechunkdata>());
|
||||
featdim = std::vector<size_t>(infiles.size(), 0);
|
||||
sampperiod = std::vector<unsigned int>(infiles.size(), 0);
|
||||
featkind = std::vector<string>(infiles.size(), "");
|
||||
numclasses = std::vector<size_t>(labels.size(), 0);
|
||||
counts = std::vector<std::vector<size_t>>(labels.size(), std::vector<size_t>());
|
||||
foreach_index (i, labels)
|
||||
{
|
||||
//classids.push_back(biggrowablevector<CLASSIDTYPE>());
|
||||
classids.push_back(unique_ptr<biggrowablevector<CLASSIDTYPE>>(new biggrowablevector<CLASSIDTYPE>()));
|
||||
numclasses.push_back(0);
|
||||
counts.push_back(std::vector<size_t>());
|
||||
//std::pair<std::vector<wstring>,std::vector<wstring>> latticetocs;
|
||||
//std::unordered_map<std::string,size_t> modelsymmap;
|
||||
//lattices.push_back(shared_ptr<latticesource>(new latticesource(latticetocs, modelsymmap)));
|
||||
|
||||
}
|
||||
foreach_index(i, infiles){
|
||||
allchunks.push_back(std::vector<utterancechunkdata>());
|
||||
featdim.push_back(0); // initialize
|
||||
sampperiod.push_back(0);
|
||||
featkind.push_back("");
|
||||
|
||||
|
||||
// first check consistency across feature streams
|
||||
// We'll go through the SCP files for each stream to make sure the duration is consistent
|
||||
// If not, we'll plan to ignore the utterance, and inform the user
|
||||
foreach_index(m, infiles){
|
||||
if (m == 0){
|
||||
numutts = infiles[m].size();
|
||||
uttisvalid = std::vector<bool>(numutts, true);
|
||||
uttduration = std::vector<size_t>(numutts, 0);
|
||||
}
|
||||
else if (infiles[m].size()!=numutts)
|
||||
throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances");
|
||||
|
||||
foreach_index(i, infiles[m]){
|
||||
utterancedesc utterance(msra::asr::htkfeatreader::parsedpath(infiles[m][i]), 0); //mseltzer - is this foolproof for multiio? is classids always non-empty?
|
||||
const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
|
||||
// we need at least 2 frames for boundary markers to work
|
||||
if (uttframes < 2)
|
||||
throw std::runtime_error("minibatchutterancesource: utterances < 2 frames not supported");
|
||||
if (uttframes > frameref::maxframesperutterance)
|
||||
{
|
||||
fprintf(stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S\n", i, uttframes, frameref::maxframesperutterance, key.c_str());
|
||||
uttduration[i] = 0;
|
||||
uttisvalid[i] = false;
|
||||
}
|
||||
else{
|
||||
if (m == 0){
|
||||
uttduration[i] = uttframes;
|
||||
uttisvalid[i] = true;
|
||||
}
|
||||
else if (uttduration[i] != uttframes){
|
||||
fprintf(stderr, "minibatchutterancesource: skipping %d-th file due to inconsistency in duration in different feature streams (%d vs %d frames)\n", i, uttduration[i], uttframes);
|
||||
uttduration[i] = 0;
|
||||
uttisvalid[i] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
size_t invalidutts=0;
|
||||
foreach_index(i, uttisvalid){
|
||||
if (!uttisvalid[i])
|
||||
invalidutts++;
|
||||
}
|
||||
if (invalidutts > uttisvalid.size() / 2)
|
||||
throw std::runtime_error("minibatchutterancesource: too many files with inconsistent durations, assuming broken configuration\n");
|
||||
else if (invalidutts>0)
|
||||
fprintf(stderr, "Found inconsistent durations across feature streams in %d out of %d files\n", invalidutts, uttisvalid.size());
|
||||
|
||||
|
||||
// now process the features and labels
|
||||
size_t utterancesetsize = 0;
|
||||
foreach_index (m, infiles)
|
||||
{
|
||||
utteranceset.clear();
|
||||
if (m==0)
|
||||
numutts = infiles[m].size();
|
||||
else
|
||||
if (infiles[m].size()!=numutts)
|
||||
throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances");
|
||||
//if (m==0)
|
||||
// numutts = infiles[m].size();
|
||||
//else
|
||||
// if (infiles[m].size()!=numutts)
|
||||
// throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances\n");
|
||||
if (m==0)
|
||||
classidsbegin.clear();
|
||||
|
||||
|
@ -366,16 +419,19 @@ public:
|
|||
if (m == 0 && !labels.empty())
|
||||
classidsbegin.push_back(classids[0]->size());
|
||||
|
||||
if (uttisvalid[i]){
|
||||
utterancedesc utterance (msra::asr::htkfeatreader::parsedpath (infiles[m][i]), labels.empty() ? 0 : classidsbegin[i] ); //mseltzer - is this foolproof for multiio? is classids always non-empty?
|
||||
const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
|
||||
// we need at least 2 frames for boundary markers to work
|
||||
if (uttframes < 2)
|
||||
throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
|
||||
if (uttframes > frameref::maxframesperutterance)
|
||||
{
|
||||
fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%zu frames) because it exceeds max. frames (%zu) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
|
||||
continue;
|
||||
}
|
||||
assert(uttframes == uttduration[i]); // ensure nothing funky happened
|
||||
// already performed these checks above
|
||||
// we need at least 2 frames for boundary markers to work
|
||||
//if (uttframes < 2)
|
||||
// throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
|
||||
//if (uttframes > frameref::maxframesperutterance)
|
||||
//{
|
||||
// fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
|
||||
// continue;
|
||||
//}
|
||||
|
||||
// check whether we have the ref transcript
|
||||
//auto labelsiter = labels[0].end();
|
||||
|
@ -397,9 +453,11 @@ public:
|
|||
if (nolat++ < 5)
|
||||
fprintf (stderr, " [no lattice for %S]", key.c_str());
|
||||
// skip if either one is missing
|
||||
if (lacksmlf || lackslat)
|
||||
continue; // skip this utterance at all
|
||||
}
|
||||
if (lacksmlf || lackslat){
|
||||
uttisvalid[i] = false;
|
||||
continue; // skip this utterance at all
|
||||
}
|
||||
}
|
||||
// push the label sequence into classids[], since we already looked it up
|
||||
// TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.
|
||||
|
||||
|
@ -411,7 +469,6 @@ public:
|
|||
//if (!labels.empty() && labelsiter != labels[0].end())
|
||||
{
|
||||
// first verify that all the label files have the proper duration
|
||||
bool durationmatch = true;
|
||||
foreach_index (j, labels)
|
||||
{
|
||||
const auto & labseq = labels[j].find(key)->second;
|
||||
|
@ -421,14 +478,14 @@ public:
|
|||
{
|
||||
fprintf (stderr, " [duration mismatch (%zu in label vs. %zu in feat file), skipping %S]", labframes, uttframes, key.c_str());
|
||||
nomlf++;
|
||||
durationmatch = false;
|
||||
uttisvalid[i] = false;
|
||||
break; // continue; // skip this utterance at all
|
||||
}
|
||||
}
|
||||
if (durationmatch){
|
||||
if (uttisvalid[i])
|
||||
{
|
||||
utteranceset.push_back(std::move(utterance));
|
||||
_totalframes += uttframes;
|
||||
framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
|
||||
// then parse each mlf if the durations are consistent
|
||||
foreach_index(j, labels)
|
||||
{
|
||||
|
@ -461,14 +518,22 @@ public:
|
|||
}
|
||||
}
|
||||
else{
|
||||
assert(classids.empty());
|
||||
assert(classids.empty() && labels.empty());
|
||||
utteranceset.push_back(std::move(utterance));
|
||||
_totalframes += uttframes;
|
||||
}
|
||||
}
|
||||
else
|
||||
else
|
||||
{
|
||||
assert(uttframes==framesaccum[i]); //ensure that number of frames is consistent in each input feature "stream"
|
||||
utteranceset.push_back(std::move(utterance));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (m == 0)
|
||||
utterancesetsize = utteranceset.size();
|
||||
else
|
||||
assert(utteranceset.size() == utterancesetsize);
|
||||
|
||||
fprintf (stderr, "feature set %d: %zu frames in %zu out of %zu utterances\n", m, _totalframes, utteranceset.size(),infiles[m].size());
|
||||
|
||||
if (!labels.empty()){
|
||||
|
|
|
@ -316,43 +316,98 @@ public:
|
|||
std::vector<size_t> numclasses; // number of output classes as found in the label file (diagnostics)
|
||||
_totalframes = 0;
|
||||
wstring key;
|
||||
std::vector<size_t>framesaccum;
|
||||
size_t numutts=0;
|
||||
|
||||
std::vector<bool>uttisvalid; // boolean flag to check that utterance is valid. valid means number of
|
||||
//frames is consistent across all feature and label streams
|
||||
std::vector<size_t>uttduration; // track utterance durations to determine utterance validity
|
||||
|
||||
std::vector<size_t> classidsbegin;
|
||||
if (!lattices.empty())
|
||||
{
|
||||
LogicError("lattices not supported in utterancereadermulti");
|
||||
}
|
||||
|
||||
allchunks = std::vector<std::vector<utterancechunkdata>>(infiles.size(), std::vector<utterancechunkdata>());
|
||||
featdim = std::vector<size_t>(infiles.size(), 0);
|
||||
sampperiod = std::vector<unsigned int>(infiles.size(), 0);
|
||||
featkind = std::vector<string>(infiles.size(), "");
|
||||
|
||||
numclasses = std::vector<size_t>(labels.size(), 0);
|
||||
counts = std::vector<std::vector<size_t>>(labels.size(), std::vector<size_t>());
|
||||
foreach_index (i, labels)
|
||||
{
|
||||
//classids.push_back(biggrowablevector<CLASSIDTYPE>());
|
||||
classids.push_back(unique_ptr<biggrowablevector<CLASSIDTYPE>>(new biggrowablevector<CLASSIDTYPE>()));
|
||||
numclasses.push_back(0);
|
||||
counts.push_back(std::vector<size_t>());
|
||||
//std::pair<std::vector<wstring>,std::vector<wstring>> latticetocs;
|
||||
//std::unordered_map<std::string,size_t> modelsymmap;
|
||||
//lattices.push_back(shared_ptr<latticesource>(new latticesource(latticetocs, modelsymmap)));
|
||||
|
||||
}
|
||||
foreach_index(i, infiles){
|
||||
allchunks.push_back(std::vector<utterancechunkdata>());
|
||||
featdim.push_back(0); // initialize
|
||||
sampperiod.push_back(0);
|
||||
featkind.push_back("");
|
||||
|
||||
|
||||
// first check consistency across feature streams
|
||||
// We'll go through the SCP files for each stream to make sure the duration is consistent
|
||||
// If not, we'll plan to ignore the utterance, and inform the user
|
||||
// m indexes the feature stream
|
||||
// i indexes the files within a stream, i.e. in the SCP file)
|
||||
foreach_index(m, infiles){
|
||||
if (m == 0){
|
||||
numutts = infiles[m].size();
|
||||
uttisvalid = std::vector<bool>(numutts, true);
|
||||
uttduration = std::vector<size_t>(numutts, 0);
|
||||
}
|
||||
else if (infiles[m].size()!=numutts)
|
||||
throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances");
|
||||
|
||||
foreach_index(i, infiles[m]){
|
||||
utterancedesc utterance(msra::asr::htkfeatreader::parsedpath(infiles[m][i],featuresections[m]), 0); //mseltzer - is this foolproof for multiio? is classids always non-empty?
|
||||
const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
|
||||
// we need at least 2 frames for boundary markers to work
|
||||
if (uttframes < 2)
|
||||
throw std::runtime_error("minibatchutterancesource: utterances < 2 frames not supported");
|
||||
if (uttframes > frameref::maxframesperutterance)
|
||||
{
|
||||
fprintf(stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
|
||||
uttduration[i] = 0;
|
||||
uttisvalid[i] = false;
|
||||
}
|
||||
else{
|
||||
if (m == 0){
|
||||
uttduration[i] = uttframes;
|
||||
uttisvalid[i] = true;
|
||||
}
|
||||
else if (uttduration[i] != uttframes){
|
||||
fprintf(stderr, "minibatchutterancesource: skipping %d-th file due to inconsistency in duration in different feature streams (%d vs %d frames)", i, uttduration[i], uttframes);
|
||||
uttduration[i] = 0;
|
||||
uttisvalid[i] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
size_t invalidutts=0;
|
||||
foreach_index(i, uttisvalid){
|
||||
if (!uttisvalid[i])
|
||||
invalidutts++;
|
||||
}
|
||||
if (invalidutts > uttisvalid.size() / 2)
|
||||
throw std::runtime_error("minibatchutterancesource: too many files not found in with inconsistent durations, assuming broken configuration\n");
|
||||
else if (invalidutts>0)
|
||||
fprintf(stderr, "Found inconsistent durations across feature streams in %d out of %d files.", invalidutts, uttisvalid.size());
|
||||
|
||||
|
||||
// now process the features and labels
|
||||
size_t utterancesetsize = 0;
|
||||
foreach_index (m, infiles)
|
||||
{
|
||||
utteranceset.clear();
|
||||
if (m==0)
|
||||
numutts = infiles[m].size();
|
||||
else
|
||||
if (infiles[m].size()!=numutts)
|
||||
throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances");
|
||||
//if (m==0)
|
||||
// numutts = infiles[m].size();
|
||||
//else
|
||||
// if (infiles[m].size()!=numutts)
|
||||
// throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances\n");
|
||||
if (m==0)
|
||||
classidsbegin.clear();
|
||||
|
||||
size_t uttRealNum = 0;
|
||||
foreach_index (i, infiles[m])
|
||||
{
|
||||
if (i % (infiles[m].size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); }
|
||||
|
@ -360,16 +415,20 @@ public:
|
|||
if (m == 0 && !labels.empty())
|
||||
classidsbegin.push_back(classids[0]->size());
|
||||
|
||||
if (uttisvalid[i]){
|
||||
utterancedesc utterance (msra::asr::htkfeatreader::parsedpath (infiles[m][i], featuresections[m]), labels.empty() ? 0 : classidsbegin[i] ); //mseltzer - is this foolproof for multiio? is classids always non-empty?
|
||||
const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
|
||||
// we need at least 2 frames for boundary markers to work
|
||||
if (uttframes < 2)
|
||||
throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
|
||||
if (uttframes > frameref::maxframesperutterance)
|
||||
{
|
||||
fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%zu frames) because it exceeds max. frames (%zu) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
|
||||
continue;
|
||||
}
|
||||
assert(uttframes == uttduration[i]); // ensure nothing funky happened
|
||||
|
||||
// already performed these checks above
|
||||
// we need at least 2 frames for boundary markers to work
|
||||
//if (uttframes < 2)
|
||||
// throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
|
||||
//if (uttframes > frameref::maxframesperutterance)
|
||||
//{
|
||||
// fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
|
||||
// continue;
|
||||
//}
|
||||
|
||||
// check whether we have the ref transcript
|
||||
//auto labelsiter = labels[0].end();
|
||||
|
@ -391,8 +450,10 @@ public:
|
|||
if (nolat++ < 5)
|
||||
fprintf (stderr, " [no lattice for %S]", key.c_str());
|
||||
// skip if either one is missing
|
||||
if (lacksmlf || lackslat)
|
||||
if (lacksmlf || lackslat){
|
||||
uttisvalid[i] = false;
|
||||
continue; // skip this utterance at all
|
||||
}
|
||||
}
|
||||
// push the label sequence into classids[], since we already looked it up
|
||||
// TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.
|
||||
|
@ -405,7 +466,6 @@ public:
|
|||
//if (!labels.empty() && labelsiter != labels[0].end())
|
||||
{
|
||||
// first verify that all the label files have the proper duration
|
||||
bool durationmatch = true;
|
||||
foreach_index (j, labels)
|
||||
{
|
||||
const auto & labseq = labels[j].find(key)->second;
|
||||
|
@ -415,14 +475,14 @@ public:
|
|||
{
|
||||
fprintf (stderr, " [duration mismatch (%zu in label vs. %zu in feat file), skipping %S]", labframes, uttframes, key.c_str());
|
||||
nomlf++;
|
||||
durationmatch = false;
|
||||
uttisvalid[i] = false;
|
||||
break; // continue; // skip this utterance at all
|
||||
}
|
||||
}
|
||||
if (durationmatch){
|
||||
if (uttisvalid[i])
|
||||
{
|
||||
utteranceset.push_back(std::move(utterance));
|
||||
_totalframes += uttframes;
|
||||
framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
|
||||
// then parse each mlf if the durations are consistent
|
||||
foreach_index(j, labels)
|
||||
{
|
||||
|
@ -455,16 +515,21 @@ public:
|
|||
}
|
||||
}
|
||||
else{
|
||||
assert(classids.empty());
|
||||
assert(classids.empty() && labels.empty());
|
||||
utteranceset.push_back(std::move(utterance));
|
||||
_totalframes += uttframes;
|
||||
}
|
||||
}
|
||||
else
|
||||
else
|
||||
{
|
||||
assert(uttframes==framesaccum[uttRealNum]); //ensure that number of frames is consistent in each input feature "stream"
|
||||
uttRealNum++;
|
||||
|
||||
utteranceset.push_back(std::move(utterance));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (m == 0)
|
||||
utterancesetsize = utteranceset.size();
|
||||
else
|
||||
assert(utteranceset.size() == utterancesetsize);
|
||||
fprintf (stderr, "feature set %d: %zu frames in %zu out of %zu utterances\n", m, _totalframes, utteranceset.size(),infiles[m].size());
|
||||
|
||||
if (!labels.empty()){
|
||||
|
|
|
@ -171,9 +171,7 @@
|
|||
<ClInclude Include="ComputationNetworkHelper.h" />
|
||||
<ClInclude Include="ComputationNode.h" />
|
||||
<ClInclude Include="ConvolutionalNodes.h" />
|
||||
<ClInclude Include="DecoderNode.h" />
|
||||
<ClInclude Include="MinibatchFetcher.h" />
|
||||
<ClInclude Include="MinibatchPrefetcher.h" />
|
||||
<ClInclude Include="DecoderNode.h" />
|
||||
<ClInclude Include="EvaluationCriterionNodes.h" />
|
||||
<ClInclude Include="IComputationNetBuilder.h" />
|
||||
<ClInclude Include="IExecutionEngine.h" />
|
||||
|
@ -221,4 +219,4 @@
|
|||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets" />
|
||||
</Project>
|
||||
</Project>
|
|
@ -74,12 +74,6 @@
|
|||
<ClInclude Include="IExecutionEngine.h">
|
||||
<Filter>Execution Engine</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="MinibatchFetcher.h">
|
||||
<Filter>Network</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="MinibatchPrefetcher.h">
|
||||
<Filter>Network</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="ModelEditLanguage.h">
|
||||
<Filter>Model Editing</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -1,51 +0,0 @@
|
|||
//
|
||||
// <copyright file="MinibatchFetcher.h" company="Microsoft">
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// </copyright>
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include "ComputationNetwork.h"
|
||||
#include "DataReader.h"
|
||||
#include "TimerUtility.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// This base class represent the old, sequential way of fetching a single minibatch of input data.
|
||||
// Essentially, it simply calls GetMinibatch on the reader.
|
||||
template<class ElemType>
|
||||
class MinibatchFetcher
|
||||
{
|
||||
public:
|
||||
MinibatchFetcher(IDataReader<ElemType>* trainSetDataReader,
|
||||
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
|
||||
Matrix<ElemType>* sentenceBegin,
|
||||
vector<MinibatchPackingFlag>* sentenceExistsBeginOrNoLabels)
|
||||
:
|
||||
m_reader(trainSetDataReader),
|
||||
m_inputMatrices(inputMatrices),
|
||||
m_sentenceBegin(sentenceBegin),
|
||||
m_sentenceExistsBeginOrNoLabels(sentenceExistsBeginOrNoLabels)
|
||||
{
|
||||
assert((m_sentenceBegin != nullptr) && (m_sentenceExistsBeginOrNoLabels != nullptr));
|
||||
}
|
||||
|
||||
// This virtual dtor is necessary to allow invocation of derived dtors, which have some required synchronization points
|
||||
virtual ~MinibatchFetcher() {}
|
||||
|
||||
virtual bool GetMinibatch()
|
||||
{
|
||||
bool retVal = m_reader->GetMinibatch(*m_inputMatrices);
|
||||
m_reader->SetSentenceSegBatch(*m_sentenceBegin, *m_sentenceExistsBeginOrNoLabels);
|
||||
|
||||
return retVal;
|
||||
}
|
||||
|
||||
protected:
|
||||
IDataReader<ElemType>* m_reader;
|
||||
std::map<std::wstring, Matrix<ElemType>*>* m_inputMatrices;
|
||||
Matrix<ElemType>* m_sentenceBegin;
|
||||
vector<MinibatchPackingFlag>* m_sentenceExistsBeginOrNoLabels;
|
||||
};
|
||||
|
||||
}}}
|
|
@ -1,220 +0,0 @@
|
|||
//
|
||||
// <copyright file="MinibatchFetcher.h" company="Microsoft">
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// </copyright>
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include "ComputationNetwork.h"
|
||||
#include "DataReader.h"
|
||||
#include "MinibatchFetcher.h"
|
||||
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <atomic>
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// This derived class is an implementation of a prefetcher for minibatches. It contains a simple producer-consumer synchronization
|
||||
// between reader and compute. It creates a separate thread for the reader and it allows a single compute to execute concurrently
|
||||
// with a single read-ahead of a minibatch. This ensures that compute always has input data to work on, and is not blocked on
|
||||
// reads off the disk, nor transfers of memory from host to device in the GPU case.
|
||||
template<class ElemType>
|
||||
class MinibatchPrefetcher : public MinibatchFetcher<ElemType>
|
||||
{
|
||||
public:
|
||||
using MinibatchFetcher<ElemType>::m_sentenceBegin;
|
||||
using MinibatchFetcher<ElemType>::m_sentenceExistsBeginOrNoLabels;
|
||||
|
||||
MinibatchPrefetcher(IDataReader<ElemType>* trainSetDataReader,
|
||||
std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
|
||||
Matrix<ElemType>* sentenceBegin,
|
||||
vector<MinibatchPackingFlag>* sentenceExistsBeginOrNoLabels) :
|
||||
MinibatchFetcher<ElemType>(trainSetDataReader, inputMatrices, sentenceBegin, sentenceExistsBeginOrNoLabels),
|
||||
m_prefetchSentenceBegin(nullptr),
|
||||
m_prefetchSentenceExistsBeginOrNoLabels(nullptr),
|
||||
m_isEpochReadingDone(false),
|
||||
m_minibatchReady(false),
|
||||
m_isTerminating(false)
|
||||
{
|
||||
m_deviceId = this->m_inputMatrices->begin()->second->GetDeviceId();
|
||||
|
||||
for (auto iter = this->m_inputMatrices->begin(); iter != this->m_inputMatrices->end(); iter++)
|
||||
{
|
||||
assert(m_deviceId == iter->second->GetDeviceId());
|
||||
m_prefetchInput[iter->first] = new Matrix<ElemType>(iter->second->GetNumRows(),
|
||||
iter->second->GetNumCols(),
|
||||
iter->second->GetDeviceId(),
|
||||
iter->second->GetMatrixType(),
|
||||
iter->second->GetFormat());
|
||||
}
|
||||
|
||||
if (sentenceBegin != nullptr)
|
||||
{
|
||||
m_prefetchSentenceBegin = new Matrix<ElemType>(sentenceBegin->GetNumRows(),
|
||||
sentenceBegin->GetNumCols(),
|
||||
sentenceBegin->GetDeviceId(),
|
||||
sentenceBegin->GetMatrixType(),
|
||||
sentenceBegin->GetFormat());
|
||||
}
|
||||
|
||||
if (sentenceExistsBeginOrNoLabels != nullptr)
|
||||
{
|
||||
m_prefetchSentenceExistsBeginOrNoLabels = new vector<MinibatchPackingFlag>();
|
||||
}
|
||||
|
||||
// Launch a worker thread
|
||||
m_prefetchThread = std::thread([this]() { this->PrefetchWorker(); });
|
||||
}
|
||||
|
||||
virtual ~MinibatchPrefetcher()
|
||||
{
|
||||
// Send a signal to the worker thread that we are in shutdown mode
|
||||
m_isTerminating = true;
|
||||
|
||||
// Make sure that worker thread is unblocked because we are about to wait to join with it. If
|
||||
// worker thread is in the middle of reading, let it finish so that we can safely grab the lock.
|
||||
if (!m_isEpochReadingDone)
|
||||
{
|
||||
fprintf(stderr, "Exiting minibatch loop before reading all the data, waiting to sync with the prefetch thread...\n");
|
||||
m_cv.notify_one();
|
||||
}
|
||||
|
||||
m_prefetchThread.join();
|
||||
|
||||
// Clean up prefetch matrix inputs
|
||||
for (auto iter = m_prefetchInput.begin(); iter != m_prefetchInput.end(); iter++)
|
||||
{
|
||||
delete iter->second;
|
||||
}
|
||||
|
||||
delete m_prefetchSentenceBegin;
|
||||
delete m_prefetchSentenceExistsBeginOrNoLabels;
|
||||
}
|
||||
|
||||
virtual bool GetMinibatch()
|
||||
{
|
||||
bool hasMoreEpochReading = false;
|
||||
|
||||
// Wait until minibatch is ready to be consumed
|
||||
{
|
||||
std::unique_lock<std::mutex> mutexLock(m_mutex);
|
||||
m_cv.wait(mutexLock, [this] { return this->m_minibatchReady == true; });
|
||||
|
||||
// This function now owns the lock
|
||||
|
||||
// m_isTerminating is set on this same thread, but only in destructor
|
||||
assert(!m_isTerminating);
|
||||
|
||||
if (!m_isEpochReadingDone)
|
||||
{
|
||||
// Record an event after all computation for the previous minibatch has been scheduled
|
||||
// ensuring that this event can safely be observed after all compute has finished.
|
||||
Matrix<ElemType>::RecordComputeSyncPoint(m_deviceId);
|
||||
|
||||
// Swap the input matrices to make use of data that has already been read.
|
||||
// This should be as simple as "m_prefetchInput.swap(m_inputMatrices)", but unfortunately
|
||||
// underlying Matrix<ElemType> pointers are cached, so we need to dig deeper to do a swap.
|
||||
for (auto iter = this->m_inputMatrices->begin(); iter != this->m_inputMatrices->end(); iter++)
|
||||
{
|
||||
assert(m_deviceId == iter->second->GetDeviceId());
|
||||
std::swap(*(iter->second), *m_prefetchInput[iter->first]);
|
||||
}
|
||||
|
||||
if (m_sentenceBegin != nullptr)
|
||||
{
|
||||
assert(m_sentenceBegin->GetDeviceId() == m_prefetchSentenceBegin->GetDeviceId());
|
||||
std::swap(*m_sentenceBegin, *m_prefetchSentenceBegin);
|
||||
}
|
||||
|
||||
if (m_sentenceExistsBeginOrNoLabels != nullptr)
|
||||
{
|
||||
std::swap(*m_sentenceExistsBeginOrNoLabels, *m_prefetchSentenceExistsBeginOrNoLabels);
|
||||
}
|
||||
|
||||
hasMoreEpochReading = true;
|
||||
}
|
||||
|
||||
// Announce to worker thread to fetch another batch.
|
||||
m_minibatchReady = false;
|
||||
}
|
||||
m_cv.notify_one();
|
||||
|
||||
return hasMoreEpochReading;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void PrefetchWorker()
|
||||
{
|
||||
Matrix<ElemType>::EnableConcurrentRead(m_deviceId);
|
||||
|
||||
while (!m_isEpochReadingDone)
|
||||
{
|
||||
// Wait until prefetch is requested
|
||||
std::unique_lock<std::mutex> mutexLock(m_mutex);
|
||||
m_cv.wait(mutexLock, [this] { return (!this->m_minibatchReady || this->m_isTerminating); });
|
||||
|
||||
// We now own the lock
|
||||
|
||||
// If the main thread has an early exit due to break or exception, it
|
||||
// will initiate a shutdown and it will wait for this thread to complete.
|
||||
// Thus, we need to check for that condition before proceeding.
|
||||
m_isEpochReadingDone = m_isTerminating ? true : PrefetchOneMiniBatch();
|
||||
|
||||
// Signal to main thread that minibatch is ready to be consumed
|
||||
m_minibatchReady = true;
|
||||
|
||||
// Manual unlocking is done before notifying, to avoid waking up
|
||||
// the waiting thread only to block again (see notify_one for details)
|
||||
mutexLock.unlock();
|
||||
m_cv.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
bool PrefetchOneMiniBatch()
|
||||
{
|
||||
// This function must be called while holding a lock
|
||||
|
||||
// Schedule a wait event on the read stream that ensures that nothing can be further
|
||||
// scheduled on that stream until dependent compute event has been observed.
|
||||
// Please note that first two calls will be special cases:
|
||||
//
|
||||
// 1) First mini-batch is fetched before RecordComputeSyncPoint() is ever called
|
||||
// 2) Second mini-batch is fetched depending on RecordComputeSyncPoint() reported before
|
||||
// scheduling any actual work on the compute thread
|
||||
//
|
||||
// Dependency chain looks like this (F = fetch, C = compute):
|
||||
//
|
||||
// F1 -> C1 -> F3 (fetch #3 depends on compute #1 completing, which depended on fetch #1 completing)
|
||||
// F2 -> C2 -> F4
|
||||
// F3 -> C3 -> F5
|
||||
//
|
||||
// It is fetch #3 that *must* observe the event that happened between computes #1 and #2
|
||||
// before proceeding to read into the buffer that was used by compute #1.
|
||||
Matrix<ElemType>::SyncComputeBeforeRead(m_deviceId);
|
||||
|
||||
// Get the next minibatch and wait for it to be available on the device
|
||||
bool isDone = !this->m_reader->GetMinibatch(m_prefetchInput);
|
||||
this->m_reader->SetSentenceSegBatch(*m_prefetchSentenceBegin, *m_prefetchSentenceExistsBeginOrNoLabels);
|
||||
|
||||
Matrix<ElemType>::SyncPendingRead(m_deviceId);
|
||||
|
||||
return isDone;
|
||||
}
|
||||
|
||||
// @TODO: We need to add support for a larger number of prefetch buffers, larger than 1
|
||||
std::map<std::wstring, Matrix<ElemType>*> m_prefetchInput;
|
||||
Matrix<ElemType>* m_prefetchSentenceBegin;
|
||||
vector<MinibatchPackingFlag>* m_prefetchSentenceExistsBeginOrNoLabels;
|
||||
std::thread m_prefetchThread;
|
||||
std::mutex m_mutex;
|
||||
std::condition_variable m_cv;
|
||||
DEVICEID_TYPE m_deviceId;
|
||||
std::atomic<bool> m_isEpochReadingDone;
|
||||
std::atomic<bool> m_minibatchReady;
|
||||
std::atomic<bool> m_isTerminating;
|
||||
};
|
||||
|
||||
}}}
|
|
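The MinibatchPrefetcher deleted above is a single-slot producer/consumer hand-off between a reader thread and the compute thread: the worker fills a spare set of buffers while compute runs, and GetMinibatch swaps those buffers in and wakes the worker to read ahead again. A stripped-down sketch of that hand-off follows, independent of CNTK's Matrix and reader types; Batch and the loadNext callback are placeholders, not CNTK names.

#include <condition_variable>
#include <functional>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>

struct Batch { std::vector<float> data; };   // stands in for the real minibatch buffers

class Prefetcher
{
public:
    explicit Prefetcher(std::function<bool(Batch&)> loadNext) : m_loadNext(std::move(loadNext))
    {
        m_worker = std::thread([this] { Worker(); });
    }

    ~Prefetcher()
    {
        {
            std::lock_guard<std::mutex> guard(m_mutex);
            m_terminating = true;            // tell the worker we are shutting down
        }
        m_cv.notify_one();
        m_worker.join();
    }

    // Compute thread: wait for the read-ahead batch, swap it in, then request the next one.
    bool GetMinibatch(Batch &out)
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait(lock, [this] { return m_ready; });
        const bool hasMore = !m_done;
        if (hasMore)
            std::swap(out, m_prefetched);    // double-buffer swap instead of a copy
        m_ready = false;                     // ask the worker for the next read-ahead
        lock.unlock();
        m_cv.notify_one();
        return hasMore;
    }

private:
    // Reader thread: refill the spare buffer whenever the compute thread has consumed the last one.
    void Worker()
    {
        while (true)
        {
            std::unique_lock<std::mutex> lock(m_mutex);
            m_cv.wait(lock, [this] { return !m_ready || m_terminating; });
            if (m_terminating || m_done)
            {
                m_ready = true;              // unblock a consumer that may still be waiting
                lock.unlock();
                m_cv.notify_one();
                return;
            }
            m_done = !m_loadNext(m_prefetched);  // read the next batch while compute runs
            m_ready = true;
            lock.unlock();
            m_cv.notify_one();
        }
    }

    std::function<bool(Batch&)> m_loadNext;
    Batch m_prefetched;
    std::thread m_worker;
    std::mutex m_mutex;
    std::condition_variable m_cv;
    bool m_ready = false;
    bool m_done = false;
    bool m_terminating = false;
};

Swapping buffers rather than copying is the point of the design: the compute thread never waits on disk or host-to-device transfer, it only waits on the condition variable when the reader has fallen behind.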
@ -19,8 +19,6 @@
|
|||
#include <random>
|
||||
#include "TimerUtility.h"
|
||||
#include "Profiler.h"
|
||||
#include "MinibatchFetcher.h"
|
||||
#include "MinibatchPrefetcher.h"
|
||||
|
||||
#ifdef MPI_SUPPORT
|
||||
#include "mpi.h"
|
||||
|
@ -201,9 +199,6 @@ public:
|
|||
size_t numMBsToShowResult = configSGD("numMBsToShowResult", "10");
|
||||
size_t numMBsToCUDAProfile = configSGD("numMBsToCUDAProfile", "0");
|
||||
|
||||
// Whether it is OK for read to happen on a separate thread while compute is happening
|
||||
bool doPrefetchTrainingData = configSGD("prefetchTrainingData", "true");
|
||||
|
||||
bool keepCheckPointFiles = configSGD("keepCheckPointFiles", "false");
|
||||
|
||||
bool gradientClippingWithTruncation = configSGD("gradientClippingWithTruncation", "true");
|
||||
|
@ -269,8 +264,7 @@ public:
|
|||
gradientCheckSigDigit, validateAfterModelReloading, rpi,
|
||||
learnRateAdjustInterval, UsingAllDataForPreComputedNode,
|
||||
needAveMultiplier, L2RegWeight, L1RegWeight,
|
||||
autoAdjustMinibatch, minibatchSizeTuningFrequency, minibatchSizeTuningMax,
|
||||
doPrefetchTrainingData);
|
||||
autoAdjustMinibatch, minibatchSizeTuningFrequency, minibatchSizeTuningMax);
|
||||
}
|
||||
|
||||
//autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample
|
||||
|
@ -316,8 +310,7 @@ public:
|
|||
const ElemType L1RegWeight = 0,
|
||||
const bool autoAdjustMinibatch = false,
|
||||
const size_t minibatchSizeTuningFrequency = 1,
|
||||
const size_t minibatchSizeTuningMax = 1048576,
|
||||
bool doPrefetchTrainingData = true)
|
||||
const size_t minibatchSizeTuningMax = 1048576)
|
||||
{
|
||||
m_numPrevLearnRates = numPrevLearnRates;
|
||||
m_prevChosenMinibatchSize = 0;
|
||||
|
@ -477,7 +470,6 @@ public:
|
|||
m_doGradientCheck = doGradientCheck;
|
||||
m_gradientCheckSigDigit = gradientCheckSigDigit;
|
||||
m_validateAfterModelReloading = validateAfterModelReloading;
|
||||
m_doPrefetchTrainingData = doPrefetchTrainingData;
|
||||
|
||||
msra::files::make_intermediate_dirs(m_modelPath);
|
||||
}
|
||||
|
@ -865,9 +857,6 @@ protected:
|
|||
break;
|
||||
}
|
||||
|
||||
#ifdef MPI_SUPPORT
|
||||
INT32 mySamples = (INT32)
|
||||
#endif
|
||||
size_t chosenMinibatchSize;
|
||||
size_t actualMinibatchSize;
|
||||
|
||||
|
@ -910,20 +899,23 @@ protected:
|
|||
fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n",
|
||||
i + 1, learnRatePerSample, MomentumPerMB(m_momentumPerSample[i], actualMinibatchSize));
|
||||
|
||||
#ifdef MPI_SUPPORT
|
||||
INT32 mySamples = (INT32)
|
||||
#endif
|
||||
TrainOneEpoch(net,
|
||||
refNet,
|
||||
refNode,
|
||||
i,
|
||||
m_epochSize,
|
||||
trainSetDataReader,
|
||||
learnRatePerSample,
|
||||
chosenMinibatchSize,
|
||||
FeatureNodes,
|
||||
labelNodes,
|
||||
criterionNodes,
|
||||
evaluationNodes,
|
||||
inputMatrices,
|
||||
learnableNodes, smoothedGradients,
|
||||
refNet,
|
||||
refNode,
|
||||
i,
|
||||
m_epochSize,
|
||||
trainSetDataReader,
|
||||
learnRatePerSample,
|
||||
chosenMinibatchSize,
|
||||
FeatureNodes,
|
||||
labelNodes,
|
||||
criterionNodes,
|
||||
evaluationNodes,
|
||||
inputMatrices,
|
||||
learnableNodes, smoothedGradients,
|
||||
epochCriterion, epochEvalErrors, totalSamplesSeen);
|
||||
|
||||
timer.Stop();
|
||||
|
@ -1708,22 +1700,15 @@ protected:
|
|||
trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize);
|
||||
|
||||
AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, FeatureNodes, inputMatrices);
|
||||
std::unique_ptr<MinibatchFetcher<ElemType>> mbFetcher(
|
||||
m_doPrefetchTrainingData ?
|
||||
new MinibatchPrefetcher<ElemType>(trainSetDataReader, inputMatrices, &(net.SentenceBoundary()), &(net.MinibatchPackingFlags())) :
|
||||
new MinibatchFetcher<ElemType>(trainSetDataReader, inputMatrices, &(net.SentenceBoundary()), &(net.MinibatchPackingFlags())));
|
||||
|
||||
fprintf(stderr, "\nStarting minibatch loop, prefetching is: %s\n", m_doPrefetchTrainingData ? "ENABLED" : "DISABLED");
|
||||
|
||||
Timer timer;
|
||||
timer.Start();
|
||||
|
||||
while (mbFetcher->GetMinibatch())
|
||||
while (trainSetDataReader->GetMinibatch(*inputMatrices))
|
||||
{
|
||||
#ifdef MPI_SUPPORT
|
||||
DecimateMinibatch(inputMatrices);
|
||||
#endif
|
||||
|
||||
UpdateEvalTimeStamps(FeatureNodes);
|
||||
UpdateEvalTimeStamps(labelNodes);
|
||||
|
||||
|
@ -1735,6 +1720,7 @@ protected:
|
|||
|
||||
net.SetActualMiniBatchSize(actualMBSize);
|
||||
net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter());
|
||||
trainSetDataReader->SetSentenceSegBatch(net.SentenceBoundary(), net.MinibatchPackingFlags());
|
||||
|
||||
#ifndef EVALDLL
|
||||
if (m_doGradientCheck && GradientCheck(net, criterionNodes, learnableNodes, 0) == false)
|
||||
|
@ -1843,6 +1829,7 @@ protected:
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
timer.Restart();
|
||||
totalEpochSamples += actualMBSize;
|
||||
totalSamplesSeen += actualMBSize;
|
||||
|
@ -2401,7 +2388,7 @@ protected:
|
|||
bool m_needAveMultiplier;
|
||||
ElemType m_L2RegWeight;
|
||||
ElemType m_L1RegWeight;
|
||||
bool m_doPrefetchTrainingData;
|
||||
|
||||
};
|
||||
template class SGD<float>;
|
||||
template class SGD<double>;
|
||||
|
|
|
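For context on the SGD changes above: the removed wiring picks between a plain fetcher and the prefetching one at runtime, behind a common base class and the prefetchTrainingData config flag. A minimal sketch of that selection pattern is below; the class and function names are placeholders, not CNTK's.

#include <cstdio>
#include <memory>

// Common interface so the training loop does not care which strategy is active.
struct Fetcher
{
    virtual ~Fetcher() {}
    virtual bool GetMinibatch() = 0;   // returns false when the epoch is exhausted
};

struct SequentialFetcher : Fetcher
{
    bool GetMinibatch() override { /* call the reader directly */ return false; }
};

struct PrefetchingFetcher : Fetcher
{
    bool GetMinibatch() override { /* swap in the batch a worker thread read ahead */ return false; }
};

std::unique_ptr<Fetcher> MakeFetcher(bool doPrefetch)
{
    if (doPrefetch)
        return std::unique_ptr<Fetcher>(new PrefetchingFetcher());
    return std::unique_ptr<Fetcher>(new SequentialFetcher());
}

int main()
{
    auto fetcher = MakeFetcher(/*doPrefetch=*/true);
    while (fetcher->GetMinibatch())
        ;   // train on the minibatch
    std::printf("epoch done\n");
}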
@ -69,6 +69,7 @@
|
|||
<SDLCheck>true</SDLCheck>
|
||||
<AdditionalIncludeDirectories>..\Common\include;..\Math\Math;"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"</AdditionalIncludeDirectories>
|
||||
<TreatWarningAsError>true</TreatWarningAsError>
|
||||
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Windows</SubSystem>
|
||||
|
@ -93,7 +94,7 @@
|
|||
<SDLCheck>true</SDLCheck>
|
||||
<AdditionalIncludeDirectories>..\Common\include; ..\Math\Math; "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"</AdditionalIncludeDirectories>
|
||||
<OpenMPSupport>false</OpenMPSupport>
|
||||
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions>/d2Zi+ /bigobj %(AdditionalOptions)</AdditionalOptions>
|
||||
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
|
||||
<TreatWarningAsError>true</TreatWarningAsError>
|
||||
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
|
||||
|
|
|
@ -47,8 +47,6 @@ __declspec (thread)
|
|||
static
|
||||
#endif
|
||||
cudaStream_t t_stream = cudaStreamDefault;
|
||||
cudaStream_t t_readAheadStream = nullptr;
|
||||
cudaEvent_t t_computeEvent = nullptr;
|
||||
|
||||
#define DEFAULT_THREAD_PER_DIM 16
|
||||
|
||||
|
@ -66,6 +64,7 @@ cudaStream_t MATH_API GetStream()
|
|||
return t_stream;
|
||||
}
|
||||
|
||||
|
||||
void CURAND_CALL(curandStatus x)
|
||||
{
|
||||
if (x != CURAND_STATUS_SUCCESS)
|
||||
|
@ -161,51 +160,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
return cuHandle;
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void GPUMatrix<ElemType>::RecordComputeSyncPoint()
|
||||
{
|
||||
if (t_computeEvent == nullptr)
|
||||
{
|
||||
CUDA_CALL(cudaEventCreate(&t_computeEvent));
|
||||
}
|
||||
CUDA_CALL(cudaEventRecord(t_computeEvent, t_stream));
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void GPUMatrix<ElemType>::SyncComputeBeforeRead()
|
||||
{
|
||||
assert(t_readAheadStream != nullptr);
|
||||
if (t_computeEvent != nullptr)
|
||||
{
|
||||
CUDA_CALL(cudaStreamWaitEvent(t_readAheadStream, t_computeEvent, 0 /*flags must be 0*/));
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: We are leaking t_readAheadStream, call cudaStreamDestroy
|
||||
// Not a big issue since it will be cleaned up on process shutdown
|
||||
template<class ElemType>
|
||||
void GPUMatrix<ElemType>::EnableConcurrentRead(DEVICEID_TYPE devId)
|
||||
{
|
||||
CUDA_CALL(cudaSetDevice(devId));
|
||||
if (t_readAheadStream == nullptr)
|
||||
{
|
||||
CUDA_CALL(cudaStreamCreateWithFlags(&t_readAheadStream, cudaStreamNonBlocking));
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void GPUMatrix<ElemType>::SyncPendingRead()
|
||||
{
|
||||
assert(t_readAheadStream != nullptr);
|
||||
CUDA_CALL(cudaStreamSynchronize(t_readAheadStream));
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void GPUMatrix<ElemType>::SyncPendingCompute()
|
||||
{
|
||||
CUDA_CALL(cudaStreamSynchronize(t_stream));
|
||||
}
|
||||
|
||||
// GetBestGPUDeviceId - Get the best GPU DeviceId, based on cuda information
|
||||
// TODO: should be replaced by BestGpu class instead, it's much better
|
||||
template<class ElemType>
|
||||
|
@ -1101,17 +1055,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
{
|
||||
if (!(matrixFlags&matrixFormatRowMajor))
|
||||
{
|
||||
if (t_readAheadStream == nullptr)
|
||||
{
|
||||
CUDA_CALL(cudaMemcpy(m_pArray, pArray, sizeof(ElemType)*GetNumElements(),
|
||||
(matrixFlags&matrixFlagSetValueOnDevice) ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice));
|
||||
}
|
||||
else
|
||||
{
|
||||
// We are calling async version of the memcpy API to do the copy on a separate stream so that it can overlap with compute.
|
||||
CUDA_CALL(cudaMemcpyAsync(m_pArray, pArray, sizeof(ElemType)*GetNumElements(),
|
||||
(matrixFlags&matrixFlagSetValueOnDevice) ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, t_readAheadStream));
|
||||
}
|
||||
CUDA_CALL(cudaMemcpy(m_pArray, pArray, sizeof(ElemType)*GetNumElements(),
|
||||
(matrixFlags&matrixFlagSetValueOnDevice)?cudaMemcpyDeviceToDevice:cudaMemcpyHostToDevice));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
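The GPUMatrix helpers removed in the hunks above follow a standard CUDA overlap recipe: record an event on the compute stream, make a second non-blocking stream wait on that event before it reuses a device buffer, issue the host-to-device copy with cudaMemcpyAsync on that stream, and synchronize the stream before compute consumes the data. The sketch below shows that recipe with plain CUDA runtime calls; error handling is omitted and these free functions only mirror the intent of the deleted members, they are not CNTK's API.

#include <cuda_runtime.h>

// Compute thread: mark the point on the compute stream that the next copy must not overtake.
void RecordComputeSyncPoint(cudaEvent_t computeDone, cudaStream_t computeStream)
{
    cudaEventRecord(computeDone, computeStream);
}

// Read-ahead thread: the copy may not start until the recorded compute has finished,
// then it runs asynchronously and overlaps with whatever compute is scheduled next.
void BeginAsyncRead(float *devDst, const float *hostSrc, size_t bytes,
                    cudaStream_t readStream, cudaEvent_t computeDone)
{
    cudaStreamWaitEvent(readStream, computeDone, 0);
    cudaMemcpyAsync(devDst, hostSrc, bytes, cudaMemcpyHostToDevice, readStream);
}

// Called right before compute consumes the freshly copied buffer.
void FinishAsyncRead(cudaStream_t readStream)
{
    cudaStreamSynchronize(readStream);
}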
@ -105,11 +105,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
~GPUMatrix(void);
|
||||
|
||||
static int GetBestGPUDeviceId();
|
||||
static void RecordComputeSyncPoint();
|
||||
static void SyncComputeBeforeRead();
|
||||
static void SyncPendingRead();
|
||||
static void SyncPendingCompute();
|
||||
static void EnableConcurrentRead(DEVICEID_TYPE devId);
|
||||
int GetComputeDeviceId() const;
|
||||
DEVICEID_TYPE PrepareDevice(DEVICEID_TYPE deviceId = -1) const;
|
||||
|
||||
|
|
|
@ -4444,56 +4444,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
return (DEVICEID_TYPE)GPUMatrix<ElemType>::GetBestGPUDeviceId();
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void Matrix<ElemType>::RecordComputeSyncPoint(DEVICEID_TYPE devId)
|
||||
{
|
||||
// This function is necessary and has any effect only on GPU
|
||||
if (devId >= 0)
|
||||
{
|
||||
GPUMatrix<ElemType>::RecordComputeSyncPoint();
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void Matrix<ElemType>::SyncComputeBeforeRead(DEVICEID_TYPE devId)
|
||||
{
|
||||
// This function is necessary and has any effect only on GPU
|
||||
if (devId >= 0)
|
||||
{
|
||||
GPUMatrix<ElemType>::SyncComputeBeforeRead();
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void Matrix<ElemType>::SyncPendingRead(DEVICEID_TYPE devId)
|
||||
{
|
||||
// This function is necessary and has any effect only on GPU
|
||||
if (devId >= 0)
|
||||
{
|
||||
GPUMatrix<ElemType>::SyncPendingRead();
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void Matrix<ElemType>::SyncPendingCompute(DEVICEID_TYPE devId)
|
||||
{
|
||||
// This function is necessary and has any effect only on GPU
|
||||
if (devId >= 0)
|
||||
{
|
||||
GPUMatrix<ElemType>::SyncPendingCompute();
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void Matrix<ElemType>::EnableConcurrentRead(DEVICEID_TYPE devId)
|
||||
{
|
||||
// This function is necessary and has any effect only on GPU
|
||||
if (devId >= 0)
|
||||
{
|
||||
GPUMatrix<ElemType>::EnableConcurrentRead(devId);
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
ElemType Matrix<ElemType>::Exp10(ElemType num)
|
||||
{
|
||||
|
|
|
@ -344,26 +344,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
public:
|
||||
static DEVICEID_TYPE GetBestGPUDeviceId(); //{ return GPUMatrix<ElemType>::GetBestGPUDeviceId();}
|
||||
|
||||
// This API records an event, in the case of GPU computation, that happens between two compute iterations
|
||||
// (it's a compute delimiter between two minibatch iterations)
|
||||
static void RecordComputeSyncPoint(DEVICEID_TYPE devId);
|
||||
|
||||
// This API ensures, in the case of GPU computation, that all compute is flushed before read decides to modify
|
||||
// buffers, and potentially invalidate computation.
|
||||
static void SyncComputeBeforeRead(DEVICEID_TYPE devId);
|
||||
|
||||
// This API ensures, in the case of GPU computation, that all async reads are finished before notifying compute
|
||||
// that the read buffers are ready on the device.
|
||||
static void SyncPendingRead(DEVICEID_TYPE devId);
|
||||
|
||||
// This API ensures, in the case of GPU computation, that all compute is flushed before transferring the criterion
|
||||
// back to the host. This is a workaround for contention between two memcpy calls, one host-to-device and one
|
||||
// device-to-host, which are for some reason getting serialized and cause big delays in compute.
|
||||
static void SyncPendingCompute(DEVICEID_TYPE devId);
|
||||
|
||||
// This API ensures, in the case of GPU computation, creates a separate stream for reading data into GPU buffer.
|
||||
static void EnableConcurrentRead(DEVICEID_TYPE devId);
|
||||
|
||||
//static BLAS functions
|
||||
|
||||
// singular value decomposition of A as A = U*SIGMA*VT
|
||||
|
|
|
@ -403,16 +403,6 @@ namespace Microsoft {
|
|||
return -1; // CPU
|
||||
}
|
||||
|
||||
template<class ElemType> void GPUMatrix<ElemType>::RecordComputeSyncPoint() { }
|
||||
|
||||
template<class ElemType> void GPUMatrix<ElemType>::SyncComputeBeforeRead() { }
|
||||
|
||||
template<class ElemType> void GPUMatrix<ElemType>::SyncPendingRead() { }
|
||||
|
||||
template<class ElemType> void GPUMatrix<ElemType>::SyncPendingCompute() { }
|
||||
|
||||
template<class ElemType> void GPUMatrix<ElemType>::EnableConcurrentRead(DEVICEID_TYPE devId) { }
|
||||
|
||||
// PrepareDevice - Setup the correct cuda context for an operation
|
||||
// deviceId - the device on which the operation will take place
|
||||
// defaults to -1, which means use matrices current device
|
||||
|
|