Merge branch 'master' of https://git01.codeplex.com/cntk into bmitra/MatrixMorphSupplements

2015-07-02 07:35:17 +10:00 · 2015-07-02 07:35:17 +10:00 · aae4d9c240
--- a/DataReader/HTKMLFReader/HTKMLFReader.cpp
+++ b/DataReader/HTKMLFReader/HTKMLFReader.cpp
--- a/DataReader/HTKMLFReader/latticearchive.h
+++ b/DataReader/HTKMLFReader/latticearchive.h
@ -934,16 +934,17 @@ public:
                    if (!isendworkaround.empty() && isendworkaround[k])       // secondary criterion to detect ends in broken lattices
                    {
                        k--;    // don't advance, since nothing to advance over
-                        goto skipscores;
                    }
-                    // this is a regular token: update it in-place
-                    auto & ai = uniquededgedatatokens[k];
-                    if (ai.unit >= idmap.size())
-                        throw std::runtime_error ("fread: broken-file heuristics failed");
-                    ai.updateunit (idmap);      // updates itself
-                    if (!ai.last)
-                        continue;
-                skipscores:
+                    else
+                    {
+                        // this is a regular token: update it in-place
+                        auto & ai = uniquededgedatatokens[k];
+                        if (ai.unit >= idmap.size())
+                            throw std::runtime_error ("fread: broken-file heuristics failed");
+                        ai.updateunit (idmap);      // updates itself
+                        if (!ai.last)
+                            continue;
+                    }
                    // if last then skip over the lm and ac scores
                    k += skipscoretokens;
                    uniquealignments++;
--- a/DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
+++ b/DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
@ -57,17 +57,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        m_truncated = readerConfig("Truncated", "false");
        m_convertLabelsToTargets = false;

-        m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
+        ConfigArray numberOfuttsPerMinibatchForAllEpochs = readerConfig("nbruttsineachrecurrentiter", "1");
+        m_numberOfuttsPerMinibatchForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;

-        if (m_numberOfuttsPerMinibatch < 1)
+        for (int i = 0; i < m_numberOfuttsPerMinibatchForAllEpochs.size(); i++)
        {
-            LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
+            m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[i];
+            if (m_numberOfuttsPerMinibatch < 1)
+            {
+                LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
+            }
+
+            if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
+            {
+                LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
+            }
        }

-        if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
-        {
-            LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
-        }
+        m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[0];

        m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
        m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
@ -129,7 +136,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
            RuntimeError("network needs at least 1 input and 1 output specified!");
        }
-            
+
        //load data for all real-valued inputs (features)
        foreach_index(i, featureNames)
        {
@ -158,7 +165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            // update m_featDims to reflect the total input dimension (featDim x contextWindow), not the native feature dimension
            // that is what the lower level feature readers expect
            m_featDims[i] = m_featDims[i] * (1 + numContextLeft[i] + numContextRight[i]); 
-            
+
            string type = thisFeature("type","Real");
            if (type=="Real"){
                m_nameToTypeMap[featureNames[i]] = InputOutputTypes::real;
@ -272,6 +279,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // get the read method, defaults to "blockRandomize" other option is "rollingWindow"
        std::string readMethod(readerConfig("readMethod","blockRandomize"));

+        if (readMethod == "blockRandomize" && randomize == randomizeNone)
+        {
+            fprintf(stderr, "WARNING: Randomize cannot be set to None when readMethod is set to blockRandomize. Change it Auto");
+            randomize = randomizeAuto;
+        }
+
+
        // see if they want to use readAhead
        //m_readAhead = readerConfig("readAhead", "false");

@ -298,6 +312,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                if (n!=numFiles)
                    throw std::runtime_error (msra::strfun::strprintf ("number of files in each scriptfile inconsistent (%d vs. %d)", numFiles,n));

+            /* 
+               do "..." expansion if SCP uses relative path names
+               "..." in the SCP means full path is the same as the SCP file
+               for example, if scp file is "//aaa/bbb/ccc/ddd.scp"
+               and contains entry like 
+               .../file1.feat
+               .../file2.feat
+               etc.
+               the features will be read from
+            //aaa/bbb/ccc/file1.feat
+            //aaa/bbb/ccc/file2.feat
+            etc. 
+            This works well if you store the scp file with the features but 
+            do not want different scp files every time you move or create new features
+            */
+            wstring scpdircached;
+            for (auto & entry : filelist)
+                ExpandDotDotDot(entry, scriptpath, scpdircached);
+
            infilesmulti.push_back(filelist);
        }
 #ifdef _WIN32
@ -346,8 +379,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        //std::vector<std::wstring> pagepath;
        foreach_index(i, mlfpathsmulti)
        {
+            const map<string,size_t>* wordmap = NULL;
+#ifdef WIN32
+            wordmap = unigram ? &unigramsymbols : (map<string,size_t>*) NULL;
+#endif
            msra::asr::htkmlfreader<msra::asr::htkmlfentry,msra::lattices::lattice::htkmlfwordsequence>  
-                labels(mlfpathsmulti[i], restrictmlftokeys, statelistpaths[i], /*unigram ? &unigramsymbols :*/(map<string,size_t>*)  NULL, (map<string,size_t>*) NULL, htktimetoframe);      // label MLF
+                labels(mlfpathsmulti[i], restrictmlftokeys, statelistpaths[i], wordmap, (map<string,size_t>*) NULL, htktimetoframe);      // label MLF
            // get the temp file name for the page file
            labelsmulti.push_back(labels);
        }
@ -362,6 +399,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

            // now get the frame source. This has better randomization and doesn't create temp files
            m_frameSource = new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, framemode);
+            m_frameSource->setverbosity(verbosity);
            //m_frameSource = new msra::dbn::minibatchutterancesource(infilesmulti[0], labelsmulti[0], m_featDims[0], m_labelDims[0], numContextLeft[0], numContextRight[0], randomize, *m_lattices, m_latticeMap, framemode);

        }
@ -540,7 +578,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        m_fileEvalSource = new msra::dbn::FileEvalSource(realDims, numContextLeft, numContextRight, evalchunksize);
    }

-    
+

    // destructor - virtual so it gets called properly 
    template<class ElemType>
@ -599,6 +637,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    void HTKMLFReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
    {
        m_mbSize = mbSize;
+        m_numberOfuttsPerMinibatch = m_numberOfuttsPerMinibatchForAllEpochs[epoch];
+
+        m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
+        m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
+        m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
+        m_toProcess.assign(m_numberOfuttsPerMinibatch, 0);
+        m_switchFrame.assign(m_numberOfuttsPerMinibatch, 0);

        if (m_trainOrTest)
        {
@ -649,18 +694,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        delete m_mbiter;
        msra::dbn::minibatchsource* source = m_frameSource;
        /*if (m_readAhead)
-        {
-            if (m_readAheadSource == NULL)
-            {
-                m_readAheadSource = new msra::dbn::minibatchreadaheadsource (*source, requestedEpochSamples);
-            }
-            else if (m_readAheadSource->epochsize() != requestedEpochSamples)
-            {
-                delete m_readAheadSource;
-                m_readAheadSource = new msra::dbn::minibatchreadaheadsource (*source, requestedEpochSamples);
-            }
-            source = m_readAheadSource;
-        }*/
+          {
+          if (m_readAheadSource == NULL)
+          {
+          m_readAheadSource = new msra::dbn::minibatchreadaheadsource (*source, requestedEpochSamples);
+          }
+          else if (m_readAheadSource->epochsize() != requestedEpochSamples)
+          {
+          delete m_readAheadSource;
+          m_readAheadSource = new msra::dbn::minibatchreadaheadsource (*source, requestedEpochSamples);
+          }
+          source = m_readAheadSource;
+          }*/
        m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, mbSize, datapasses);
        if (!m_featuresBufferMultiIO.empty())
        {
@ -698,7 +743,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                    delete[] m_featuresBufferMultiUtt[u];
                    m_featuresBufferMultiUtt[u] = NULL;
                    m_featuresBufferAllocatedMultiUtt[u] = 0;
-    }
+                }
                if (m_labelsBufferMultiUtt[u] != NULL)
                {
                    delete[] m_labelsBufferMultiUtt[u];
@ -761,7 +806,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            for (auto iter=matrices.begin();iter!=matrices.end();iter++)
            {
                if (m_nameToTypeMap.find(iter->first)==m_nameToTypeMap.end())
-                    throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %S not found in reader - cannot generate input\n",iter->first.c_str()));
+                    throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ls not found in reader - cannot generate input\n",iter->first.c_str()));

            }
            m_checkDictionaryKeys=false;
@ -771,144 +816,144 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
            if (m_truncated == false)
            {
-            if (!(*m_mbiter))
-                return false;
+                if (!(*m_mbiter))
+                    return false;

-            // now, access all features and and labels by iterating over map of "matrices"
-            typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
-            for (iter = matrices.begin();iter!=matrices.end(); iter++)
-            {
-                // dereference matrix that corresponds to key (input/output name) and 
-                // populate based on whether its a feature or a label
-                Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
-
-                if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
+                // now, access all features and and labels by iterating over map of "matrices"
+                typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
+                for (iter = matrices.begin();iter!=matrices.end(); iter++)
                {
+                    // dereference matrix that corresponds to key (input/output name) and 
+                    // populate based on whether its a feature or a label
+                    Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels

-                    id = m_featureNameToIdMap[iter->first];
-                    dim = m_featureNameToDimMap[iter->first];
-                    const msra::dbn::matrixstripe feat = m_mbiter->frames(id);
-                    const size_t actualmbsize = feat.cols();   // it may still return less if at end of sweep TODO: this check probably only needs to happen once
-                    assert (actualmbsize == m_mbiter->currentmbframes());
-                    skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize);
-
-                    // check to see if we got the number of frames we requested
-                    if (!skip)
+                    if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
                    {
-                        // copy the features over to our array type
-                        assert(feat.rows()==dim); // check feature dimension matches what's expected

-                        if (m_featuresBufferMultiIO[id]==NULL)
-                        {
-                            m_featuresBufferMultiIO[id] = new ElemType[feat.rows()*feat.cols()];
-                            m_featuresBufferAllocatedMultiIO[id] = feat.rows()*feat.cols();
-                        }
-                        else if (m_featuresBufferAllocatedMultiIO[id]<feat.rows()*feat.cols()) //buffer size changed. can be partial minibatch
-                        {
-                            delete[] m_featuresBufferMultiIO[id];
-                            m_featuresBufferMultiIO[id] = new ElemType[feat.rows()*feat.cols()];
-                            m_featuresBufferAllocatedMultiIO[id] = feat.rows()*feat.cols();
-                        }
-                        // shouldn't need this since we fill up the entire buffer below
-                        //memset(m_featuresBufferMultiIO[id],0,sizeof(ElemType)*feat.rows()*feat.cols());
+                        id = m_featureNameToIdMap[iter->first];
+                        dim = m_featureNameToDimMap[iter->first];
+                        const msra::dbn::matrixstripe feat = m_mbiter->frames(id);
+                        const size_t actualmbsize = feat.cols();   // it may still return less if at end of sweep TODO: this check probably only needs to happen once
+                        assert (actualmbsize == m_mbiter->currentmbframes());
+                        skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize);

-                        if (sizeof(ElemType) == sizeof(float))
+                        // check to see if we got the number of frames we requested
+                        if (!skip)
                        {
-                            for (int j=0; j < feat.cols(); j++) // column major, so iterate columns
+                            // copy the features over to our array type
+                            assert(feat.rows()==dim); // check feature dimension matches what's expected
+
+                            if (m_featuresBufferMultiIO[id]==NULL)
                            {
-                                // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
-                                memcpy_s(&m_featuresBufferMultiIO[id][j*feat.rows()],sizeof(ElemType)*feat.rows(),&feat(0,j),sizeof(ElemType)*feat.rows());
+                                m_featuresBufferMultiIO[id] = new ElemType[feat.rows()*feat.cols()];
+                                m_featuresBufferAllocatedMultiIO[id] = feat.rows()*feat.cols();
                            }
-                        }
-                        else
-                        {
-                            for (int j=0; j < feat.cols(); j++) // column major, so iterate columns in outside loop
+                            else if (m_featuresBufferAllocatedMultiIO[id]<feat.rows()*feat.cols()) //buffer size changed. can be partial minibatch
                            {
-                                for (int i = 0; i < feat.rows(); i++)
+                                delete[] m_featuresBufferMultiIO[id];
+                                m_featuresBufferMultiIO[id] = new ElemType[feat.rows()*feat.cols()];
+                                m_featuresBufferAllocatedMultiIO[id] = feat.rows()*feat.cols();
+                            }
+                            // shouldn't need this since we fill up the entire buffer below
+                            //memset(m_featuresBufferMultiIO[id],0,sizeof(ElemType)*feat.rows()*feat.cols());
+
+                            if (sizeof(ElemType) == sizeof(float))
+                            {
+                                for (int j=0; j < feat.cols(); j++) // column major, so iterate columns
                                {
-                                    m_featuresBufferMultiIO[id][j*feat.rows()+i] = feat(i,j);
+                                    // copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
+                                    memcpy_s(&m_featuresBufferMultiIO[id][j*feat.rows()],sizeof(ElemType)*feat.rows(),&feat(0,j),sizeof(ElemType)*feat.rows());
                                }
                            }
+                            else
+                            {
+                                for (int j=0; j < feat.cols(); j++) // column major, so iterate columns in outside loop
+                                {
+                                    for (int i = 0; i < feat.rows(); i++)
+                                    {
+                                        m_featuresBufferMultiIO[id][j*feat.rows()+i] = feat(i,j);
+                                    }
+                                }
+                            }
+                            data.SetValue(feat.rows(), feat.cols(), m_featuresBufferMultiIO[id],matrixFlagNormal);
                        }
-                        data.SetValue(feat.rows(), feat.cols(), m_featuresBufferMultiIO[id],matrixFlagNormal);
                    }
-                }
-                else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
-                {
-                    id = m_labelNameToIdMap[iter->first];
-                    dim = m_labelNameToDimMap[iter->first];
-                    const vector<size_t> & uids = m_mbiter->labels(id);
-
-                    // need skip logic here too in case labels are first in map not features
-                    const size_t actualmbsize = uids.size();   // it may still return less if at end of sweep TODO: this check probably only needs to happen once
-                    assert (actualmbsize == m_mbiter->currentmbframes());
-                    skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize);
-
-                    if (!skip)
+                    else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
                    {
-                        // copy the labels over to array type
-                        //data.Resize(udims[id], uids.size());
-                        //data.SetValue((ElemType)0);
+                        id = m_labelNameToIdMap[iter->first];
+                        dim = m_labelNameToDimMap[iter->first];
+                        const vector<size_t> & uids = m_mbiter->labels(id);

-                        // loop through the columns and set one value to 1
-                        // in the future we want to use a sparse matrix here
-                        //for (int i = 0; i < uids.size(); i++)
-                        //{
-                        //    assert(uids[i] <udims[id]);
-                        //    data(uids[i], i) = (ElemType)1;
-                        //}
+                        // need skip logic here too in case labels are first in map not features
+                        const size_t actualmbsize = uids.size();   // it may still return less if at end of sweep TODO: this check probably only needs to happen once
+                        assert (actualmbsize == m_mbiter->currentmbframes());
+                        skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize);

-                        if (m_labelsBufferMultiIO[id]==NULL)
+                        if (!skip)
                        {
-                            m_labelsBufferMultiIO[id] = new ElemType[dim*uids.size()];
-                            m_labelsBufferAllocatedMultiIO[id] = dim*uids.size();
-                        }
-                        else if (m_labelsBufferAllocatedMultiIO[id]<dim*uids.size())
-                        {
-                            delete[] m_labelsBufferMultiIO[id];
-                            m_labelsBufferMultiIO[id] = new ElemType[dim*uids.size()];
-                            m_labelsBufferAllocatedMultiIO[id] = dim*uids.size();
-                        }
-                        memset(m_labelsBufferMultiIO[id],0,sizeof(ElemType)*dim*uids.size());                
+                            // copy the labels over to array type
+                            //data.Resize(udims[id], uids.size());
+                            //data.SetValue((ElemType)0);

-
-                        if (m_convertLabelsToTargetsMultiIO[id])
-                        {
-                            size_t labelDim = m_labelToTargetMapMultiIO[id].size();
-                            for (int i = 0; i < uids.size(); i++)
-                            {
-                                assert(uids[i] < labelDim); labelDim;
-                                size_t labelId = uids[i];
-                                for (int j = 0; j < dim; j++)
-                                {
-                                    m_labelsBufferMultiIO[id][i*dim + j] = m_labelToTargetMapMultiIO[id][labelId][j];
-                                }
-                            }
-                        }
-                        else
-                        {
                            // loop through the columns and set one value to 1
                            // in the future we want to use a sparse matrix here
-                            for (int i = 0; i < uids.size(); i++)
+                            //for (int i = 0; i < uids.size(); i++)
+                            //{
+                            //    assert(uids[i] <udims[id]);
+                            //    data(uids[i], i) = (ElemType)1;
+                            //}
+
+                            if (m_labelsBufferMultiIO[id]==NULL)
                            {
-                                assert(uids[i] < dim);
-                                //labels(uids[i], i) = (ElemType)1;
-                                m_labelsBufferMultiIO[id][i*dim+uids[i]]=(ElemType)1;
+                                m_labelsBufferMultiIO[id] = new ElemType[dim*uids.size()];
+                                m_labelsBufferAllocatedMultiIO[id] = dim*uids.size();
                            }
+                            else if (m_labelsBufferAllocatedMultiIO[id]<dim*uids.size())
+                            {
+                                delete[] m_labelsBufferMultiIO[id];
+                                m_labelsBufferMultiIO[id] = new ElemType[dim*uids.size()];
+                                m_labelsBufferAllocatedMultiIO[id] = dim*uids.size();
+                            }
+                            memset(m_labelsBufferMultiIO[id],0,sizeof(ElemType)*dim*uids.size());                
+
+
+                            if (m_convertLabelsToTargetsMultiIO[id])
+                            {
+                                size_t labelDim = m_labelToTargetMapMultiIO[id].size();
+                                for (int i = 0; i < uids.size(); i++)
+                                {
+                                    assert(uids[i] < labelDim); labelDim;
+                                    size_t labelId = uids[i];
+                                    for (int j = 0; j < dim; j++)
+                                    {
+                                        m_labelsBufferMultiIO[id][i*dim + j] = m_labelToTargetMapMultiIO[id][labelId][j];
+                                    }
+                                }
+                            }
+                            else
+                            {
+                                // loop through the columns and set one value to 1
+                                // in the future we want to use a sparse matrix here
+                                for (int i = 0; i < uids.size(); i++)
+                                {
+                                    assert(uids[i] < dim);
+                                    //labels(uids[i], i) = (ElemType)1;
+                                    m_labelsBufferMultiIO[id][i*dim+uids[i]]=(ElemType)1;
+                                }
+                            }
+
+
+                            data.SetValue(dim,uids.size(),m_labelsBufferMultiIO[id],matrixFlagNormal);
                        }
-
-
-                        data.SetValue(dim,uids.size(),m_labelsBufferMultiIO[id],matrixFlagNormal);
                    }
-                }
-                else{
-                    //default:
-                    throw runtime_error(msra::strfun::strprintf("GetMinibatchMultiIO:: unknown InputOutputType for %S\n",(iter->first).c_str()));
-                }
+                    else{
+                        //default:
+                        throw runtime_error(msra::strfun::strprintf("GetMinibatchMultiIO:: unknown InputOutputType for %S\n",(iter->first).c_str()));
+                    }

-            }
-            // advance to the next minibatch
-            (*m_mbiter)++;
+                }
+                // advance to the next minibatch
+                (*m_mbiter)++;
            }
            else
            {
@ -1184,17 +1229,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            {
                if (matrices.find(iter->first)==matrices.end())
                {
-                    fprintf(stderr,"GetMinibatchToWrite: feature node %S specified in reader not found in the network\n",iter->first.c_str());
+                    fprintf(stderr,"GetMinibatchToWrite: feature node %ls specified in reader not found in the network\n",iter->first.c_str());
                    throw std::runtime_error("GetMinibatchToWrite: feature node specified in reader not found in the network.");
                }
            }
            /*
-            for (auto iter=matrices.begin();iter!=matrices.end();iter++)
-            {
-                if (m_featureNameToIdMap.find(iter->first)==m_featureNameToIdMap.end())
-                    throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ws not found in reader - cannot generate input\n",iter->first.c_str()));
-            }
-            */
+           for (auto iter=matrices.begin();iter!=matrices.end();iter++)
+           {
+               if (m_featureNameToIdMap.find(iter->first)==m_featureNameToIdMap.end())
+                   throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ls not found in reader - cannot generate input\n",iter->first.c_str()));
+           }
+           */
            m_checkDictionaryKeys=false;
        }

@ -1329,7 +1374,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            m_labelsStartIndexMultiUtt[id+i*numOfLabel] = totalLabelsNum;
            totalLabelsNum = m_labelsStartIndexMultiUtt[id+i*numOfLabel] + dim * actualmbsizeOri;
        }
-        
+
        if (m_labelsBufferMultiUtt[i]==NULL)
        {
            m_labelsBufferMultiUtt[i] = new ElemType[totalLabelsNum];
@ -1383,7 +1428,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                }
            }
        }
-        
+
        for (auto it = m_labelNameToIdMap.begin(); it != m_labelNameToIdMap.end(); ++it) 
        {
            size_t id = m_labelNameToIdMap[it->first];
@ -1425,8 +1470,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        return true;    
    }
-    
-    
+
+


    // GetLabelMapping - Gets the label mapping from integer to type in file 
@ -1451,7 +1496,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    {
        if (labelListFile==L"")
            throw std::runtime_error("HTKMLFReader::ReadLabelToTargetMappingFile(): cannot read labelToTargetMappingFile without a labelMappingFile!");
-        
+
        vector<std::wstring> labelList;
        size_t count, numLabels;
        count=0;
@ -1573,7 +1618,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        }
    }
+    // ExpandDotDotDot - replace the first "..." in an SCP entry with the directory of the SCP file
+    // featPath - [in,out] one entry read from the SCP file; rewritten in place only if it contains "..."
+    // scpPath - path of the SCP file the entry was read from
+    // scpDirCached - [in,out] cache of the directory part of scpPath; callers pass the same
+    //                (initially empty) string for every entry of one SCP file
+    template<class ElemType>
+    void HTKMLFReader<ElemType>::ExpandDotDotDot(wstring & featPath, const wstring & scpPath, wstring & scpDirCached) 
+    {
+        // An empty cache means "not computed yet". A directory that is genuinely empty
+        // (SCP file directly under the root, e.g. "/ddd.scp") is simply recomputed on each call.
+        if (scpDirCached.empty()) 
+        {
+            const wstring delim = L"/\\";
+            const auto lastDelim = scpPath.find_last_of(delim);
+            if (lastDelim == wstring::npos)
+            {
+                // No directory given: the SCP file lives in the current directory.
+                // Use "." so that ".../x" expands to "./x" rather than to "/x" (filesystem root).
+                scpDirCached = L".";
+            }
+            else
+            {
+                // Keep everything before the last path delimiter (the delimiter itself is dropped,
+                // since the entry supplies its own delimiter right after the "...").
+                scpDirCached = scpPath.substr(0, lastDelim);
+            }
+        }
+
+        // Only the first occurrence of "..." is expanded; entries without it are left untouched.
+        const size_t pos = featPath.find(L"...");
+        if (pos != wstring::npos)
+            featPath = featPath.substr(0, pos) + scpDirCached + featPath.substr(pos + 3);
+    }
+

    template class HTKMLFReader<float>;
    template class HTKMLFReader<double>;
-    }}}
+}}}
--- a/DataReader/HTKMLFReader_linux/HTKMLFReader.h
+++ b/DataReader/HTKMLFReader_linux/HTKMLFReader.h
@ -6,6 +6,7 @@
 // HTKMLFReader.h - Include file for the MTK and MLF format of features and samples 
 #pragma once
 #include "DataReader.h"
+#include "commandArgUtil.h" // for intargvector

 namespace Microsoft { namespace MSR { namespace CNTK {

@ -13,6 +14,9 @@ template<class ElemType>
 class HTKMLFReader : public IDataReader<ElemType>
 {
 private:
+    const static size_t m_htkRandomizeAuto = 0;
+    const static size_t m_htkRandomizeDisable = (size_t)-1;
+
    msra::dbn::minibatchiterator* m_mbiter;
    msra::dbn::minibatchsource* m_frameSource;
    //msra::dbn::minibatchreadaheadsource* m_readAheadSource;
@ -24,6 +28,7 @@ private:
    bool m_readAhead;
    bool m_truncated;
    vector<size_t> m_processedFrame;
+    intargvector m_numberOfuttsPerMinibatchForAllEpochs;
    size_t m_numberOfuttsPerMinibatch;
    size_t m_actualnumberOfuttsPerMinibatch;
    size_t m_mbSize;
@ -86,6 +91,7 @@ private:

    
    size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
+    void ExpandDotDotDot(wstring & featPath, const wstring & scpPath, wstring & scpDirCached);
    enum InputOutputTypes
    {
        real,
--- a/DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp
+++ b/DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp
@ -13,15 +13,6 @@
 //#ifndef __unix__
 #include "ssematrix.h"
 //#endif
-//#include "latticearchive.h"             // for reading HTK phoneme lattices (MMI training)
-//#include "simplesenonehmm.h"            // for MMI scoring
-//#include "msra_mgram.h"                 // for unigram scores of ground-truth path in sequence training
-
-//#include "rollingwindowsource.h"        // minibatch sources
-//#include "utterancesource.h"
-//#include "readaheadsource.h"
-//#include "chunkevalsource.h"
-//#include "minibatchiterator.h"

 #define DATAWRITER_EXPORTS  // creating the exports here
 #include "DataWriter.h"
--- a/DataReader/HTKMLFReader_linux/chunkevalsource.h
+++ b/DataReader/HTKMLFReader_linux/chunkevalsource.h
@ -341,8 +341,20 @@ namespace msra { namespace dbn {
                const size_t framesInBlock = framesMulti[i].size();
                feat[i].resize(vdims[i], framesInBlock);   // input features for whole utt (col vectors)
                // augment the features
+                size_t leftextent, rightextent;
+                // choose the context extents: fall back to augmentationextent() when no explicit left/right context is set
+                if (leftcontext[i] == 0 && rightcontext[i] == 0)
+                {
+                    leftextent = rightextent = augmentationextent(framesMulti[i][0].size(), vdims[i]);
+                }
+                else
+                {
+                    leftextent = leftcontext[i];
+                    rightextent = rightcontext[i];
+                }
+
                //msra::dbn::augmentneighbors(framesMulti[i], boundaryFlags, 0, leftcontext[i], rightcontext[i],)
-                msra::dbn::augmentneighbors (framesMulti[i], boundaryFlags, leftcontext[i], rightcontext[i], 0, framesInBlock, feat[i]);
+                msra::dbn::augmentneighbors (framesMulti[i], boundaryFlags, leftextent, rightextent, 0, framesInBlock, feat[i]);
            }
            minibatchReady=true;
        }
--- a/DataReader/HTKMLFReader_linux/fileutil.cpp
+++ b/DataReader/HTKMLFReader_linux/fileutil.cpp
@ -242,6 +242,30 @@ void fflushOrDie (FILE * f)
 // ----------------------------------------------------------------------------
 size_t filesize (FILE * f)
 {
+#ifdef WIN32
+    size_t curPos = _ftelli64 (f);
+    if (curPos == -1L)
+    {
+    RuntimeError ("error determining file position: %s", strerror (errno));
+    }
+    int rc = _fseeki64 (f, 0, SEEK_END);
+    if (rc != 0)
+    {
+    RuntimeError ("error seeking to end of file: %s", strerror (errno));
+    }
+    size_t len = _ftelli64 (f);
+    if (len == -1L)
+    {
+    RuntimeError ("error determining file position: %s", strerror (errno));
+    }
+    rc = _fseeki64 (f, curPos, SEEK_SET);
+    if (rc != 0)
+    {
+    RuntimeError ("error resetting file position: %s", strerror (errno));
+    }
+    return len;
+#else
+    // linux version 
    long curPos = ftell (f);
    if (curPos == -1L)
    {
--- a/DataReader/HTKMLFReader_linux/htkfeatio.h
+++ b/DataReader/HTKMLFReader_linux/htkfeatio.h
@ -230,7 +230,7 @@ public:
    // We write to a tmp file first to ensure we don't leave broken files that would confuse make mode.
    template<class MATRIX> static void write (const wstring & path, const string & kindstr, unsigned int period, const MATRIX & feat)
    {
-        wstring tmppath = path + L""; // tmp path for make-mode compliant
+        wstring tmppath = path + L"$$"; // tmp path for make-mode compliant
        unlinkOrDie (path);             // delete if old file is already there
        // write it out
        size_t featdim = feat.rows();
@ -613,7 +613,7 @@ public:
 struct htkmlfentry
 {
    unsigned int firstframe;    // range [firstframe,firstframe+numframes)
-    unsigned short numframes;
+    unsigned int numframes;
    //unsigned short classid;     // numeric state id
    unsigned int classid;     // numeric state id - mseltzer changed from ushort to uint for untied cd phones > 2^16
    
@ -624,7 +624,7 @@ private:
        if (te < ts) throw std::runtime_error ("htkmlfentry: end time below start time??");
        // save
        firstframe = (unsigned int) ts;
-        numframes = (unsigned short) (te - ts);
+        numframes = (unsigned int) (te - ts);
        classid = (unsigned int) uid;
        // check for numeric overflow
        if (firstframe != ts || firstframe + numframes != te || classid != uid)
--- a/DataReader/HTKMLFReader_linux/latticearchive.h
+++ b/DataReader/HTKMLFReader_linux/latticearchive.h
@ -933,20 +933,20 @@ public:
                const size_t skipscoretokens = info.hasacscores ? 2 : 1;
                for (size_t k = skipscoretokens; k < uniquededgedatatokens.size(); k++)
                {
-                    auto & ai = uniquededgedatatokens[k];
                    if (!isendworkaround.empty() && isendworkaround[k])       // secondary criterion to detect ends in broken lattices
                    {
                        k--;    // don't advance, since nothing to advance over
-                        goto skipscores;
                    }
-                    // this is a regular token: update it in-place
-                    
-                    if (ai.unit >= idmap.size())
-                        throw std::runtime_error ("fread: broken-file heuristics failed");
-                    ai.updateunit (idmap);      // updates itself
-                    if (!ai.last)
-                        continue;
-                skipscores:
+                    else
+                    {
+                        // this is a regular token: update it in-place
+                        auto & ai = uniquededgedatatokens[k];
+                        if (ai.unit >= idmap.size())
+                            throw std::runtime_error ("fread: broken-file heuristics failed");
+                        ai.updateunit (idmap);      // updates itself
+                        if (!ai.last)
+                            continue;
+                    }
                    // if last then skip over the lm and ac scores
                    k += skipscoretokens;
                    uniquealignments++;
--- a/DataReader/HTKMLFReader_linux/stdafx.h
+++ b/DataReader/HTKMLFReader_linux/stdafx.h
@ -10,6 +10,7 @@

 #pragma once

+#include "Platform.h"
 #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms

 #ifndef __unix__
--- a/DataReader/HTKMLFReader_linux/utterancesource.h
+++ b/DataReader/HTKMLFReader_linux/utterancesource.h
@ -749,7 +749,8 @@ private:
        if (!chunkdata.isinram())
            return;       // already out

-        fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
+        if (verbosity)
+            fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
                 k, randomizedchunks[k].globalts, randomizedchunks[k].globalte()-1, chunksinram-1);
        chunkdata.releasedata();
        chunksinram--;
@ -768,7 +769,8 @@ private:
        if (chunkdata.isinram())
            return false;

-        fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
+        if (verbosity)
+            fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
        msra::util::attempt (5, [&]()   // (reading from network)
        {
            chunkdata.requiredata (featkind, featdim, sampperiod, this->lattices);
@ -858,7 +860,8 @@ public:
            transcripts.clear();

            // return these utterances
-            fprintf (stderr, "getbatch: getting utterances %zu..%zu (%zu frames out of %zu requested) in sweep %zu\n", spos, epos -1, mbframes, framesrequested, sweep);
+            if (verbosity > 0)
+                fprintf (stderr, "getbatch: getting utterances %zu..%zu (%zu frames out of %zu requested) in sweep %zu\n", spos, epos -1, mbframes, framesrequested, sweep);
            size_t tspos = 0;   // relative start of utterance 'pos' within the returned minibatch
            for (size_t pos = spos; pos < epos; pos++)
            {
@ -922,7 +925,8 @@ public:
            const size_t lastchunk = chunkforframepos (globalte-1);
            const size_t windowbegin = randomizedchunks[firstchunk].windowbegin;
            const size_t windowend = randomizedchunks[lastchunk].windowend;
-            fprintf (stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n",
+            if (verbosity > 0)
+                fprintf (stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n",
                     globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
            // release all data outside, and page in all data inside
            for (size_t k = 0; k < windowbegin; k++)
--- a/DataReader/HTKMLFReader_linux/utterancesourcemulti.h
+++ b/DataReader/HTKMLFReader_linux/utterancesourcemulti.h
@ -117,7 +117,7 @@ class minibatchutterancesourcemulti : public minibatchsource
        }
        // page in data for this chunk
        // We pass in the feature info variables by ref which will be filled lazily upon first read
-        void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource) const
+        void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const
        {

            if (numutterances() == 0)
@ -148,7 +148,8 @@ class minibatchutterancesourcemulti : public minibatchsource
                        latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols());
                }
                //fprintf (stderr, "\n");
-                fprintf (stderr, "requiredata: %zu utterances read\n", utteranceset.size());
+                if (verbosity)
+                    fprintf (stderr, "requiredata: %zu utterances read\n", utteranceset.size());
            }
            catch (...)
            {
@ -403,15 +404,14 @@ public:
                // TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.
    
                // OK, utterance has all we need --remember it
-                utteranceset.push_back (std::move (utterance));

                if (m==0)
                {
-                    _totalframes += uttframes;
-                    framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
                    if (!labels.empty() && !lacksmlf)
                    //if (!labels.empty() && labelsiter != labels[0].end())
                    {
+                        // first verify that all the label files have the proper duration
+                        bool durationmatch = true;
                        foreach_index (j, labels)
                        {
                            const auto & labseq = labels[j].find(key)->second;
@ -421,31 +421,43 @@ public:
                            {
                                fprintf (stderr, " [duration mismatch (%zu in label vs. %zu in feat file), skipping %S]", labframes, uttframes, key.c_str());
                                nomlf++;
-                                continue;   // skip this utterance at all
+                                durationmatch = false;
+                                break; // duration mismatch: skip this utterance entirely
                            }
-                            // expand classid sequence into flat array
-                            foreach_index (i, labseq)
+                        }
+                        if (durationmatch){
+                            utteranceset.push_back(std::move(utterance));
+                            _totalframes += uttframes;
+                            framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
+                            // then parse each mlf if the durations are consistent
+                            foreach_index(j, labels)
                            {
-                                const auto & e = labseq[i];
-                                if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
-                                    throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
-                                if (e.classid >= udim[j])
+                                const auto & labseq = labels[j].find(key)->second;
+                        
+                                // expand classid sequence into flat array
+                                foreach_index (i, labseq)
                                {
-                                    throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id exceeds model output dimension"));
+                                    const auto & e = labseq[i];
+                                    if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
+                                        throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
+                                    if (e.classid >= udim[j])
+                                    {
+                                        throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id exceeds model output dimension"));
+                                    }
+                                    if (e.classid != (CLASSIDTYPE) e.classid)
+                                        throw std::runtime_error ("CLASSIDTYPE has too few bits");
+                                    for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
+                                        classids[j]->push_back ((CLASSIDTYPE) e.classid);
+                                    numclasses[j] = max (numclasses[j], (size_t)(1u + e.classid));
+                                    counts[j].resize (numclasses[j], 0);
+                                    counts[j][e.classid] += e.numframes;
                                }
-                                if (e.classid != (CLASSIDTYPE) e.classid)
-                                    throw std::runtime_error ("CLASSIDTYPE has too few bits");
-                                for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
-                                    classids[j]->push_back ((CLASSIDTYPE) e.classid);
-                                numclasses[j] = max (numclasses[j], (size_t)(1u + e.classid));
-                                counts[j].resize (numclasses[j], 0);
-                                counts[j][e.classid] += e.numframes;
+                                classids[j]->push_back ((CLASSIDTYPE) -1);  // append a boundary marker for checking
+    
+                                if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
+                                    throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
+                                assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
                            }
-                            classids[j]->push_back ((CLASSIDTYPE) -1);  // append a boundary marker marker for checking
-
-                            if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
-                                throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
-                            assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
                        }
                    }
                    else{
@ -474,7 +486,7 @@ public:
            }
            if (nomlf + nolat > 0)
            {
-                fprintf (stderr, "minibatchutterancesource: out of %zu files, %zu files not found in label set and %zu have no lattice\n", infiles.size(), nomlf, nolat);
+                fprintf (stderr, "minibatchutterancesource: out of %zu files, %zu files not found in label set and %zu have no lattice\n", infiles[0].size(), nomlf, nolat);
                if (nomlf + nolat > infiles[m].size() / 2)
                    throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
            }
@ -600,7 +612,8 @@ private:
            return sweep;

        currentsweep = sweep;
-        fprintf (stderr, "lazyrandomization: re-randomizing for sweep %zu in %s mode\n", currentsweep, framemode ? "frame" : "utterance");
+        if (verbosity>0)
+            fprintf (stderr, "lazyrandomization: re-randomizing for sweep %zu in %s mode\n", currentsweep, framemode ? "frame" : "utterance");

        const size_t sweepts = sweep * _totalframes;     // first global frame index for this sweep

@ -912,8 +925,9 @@ private:
            auto & chunkdata = randomizedchunks[m][k].getchunkdata();
            if (chunkdata.isinram())
            {
-                fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
-                     k, randomizedchunks[m][k].globalts, randomizedchunks[m][k].globalte()-1, chunksinram-1);
+                if (verbosity)
+                    fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
+                         k, randomizedchunks[m][k].globalts, randomizedchunks[m][k].globalte()-1, chunksinram-1);
                chunkdata.releasedata();
                numreleased++;
            }
@ -957,10 +971,11 @@ private:
            {
                auto & chunk = randomizedchunks[m][chunkindex];
                auto & chunkdata = chunk.getchunkdata();
-                fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
+                if (verbosity)
+                    fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
                msra::util::attempt (5, [&]()   // (reading from network)
                {
-                    chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices);
+                    chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity);
                });
            }
            chunksinram++;
@ -1069,7 +1084,8 @@ public:
                }
            }
            // return these utterances
-            fprintf (stderr, "getbatch: getting utterances %zu..%zu (%zu frames out of %zu requested) in sweep %zu\n", spos, epos -1, mbframes, framesrequested, sweep);
+            if (verbosity > 0)
+                fprintf (stderr, "getbatch: getting utterances %zu..%zu (%zu frames out of %zu requested) in sweep %zu\n", spos, epos -1, mbframes, framesrequested, sweep);
            size_t tspos = 0;   // relative start of utterance 'pos' within the returned minibatch
            for (size_t pos = spos; pos < epos; pos++)
            {
@ -1147,7 +1163,8 @@ public:
            const size_t lastchunk = chunkforframepos (globalte-1);
            const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin;
            const size_t windowend = randomizedchunks[0][lastchunk].windowend;
-            fprintf (stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n",
+            if (verbosity > 0)
+                fprintf (stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n",
                     globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
            // release all data outside, and page in all data inside
            for (size_t k = 0; k < windowbegin; k++)
--- a/DataReader/KaldiReader/HTKMLFReader.cpp
+++ b/DataReader/KaldiReader/HTKMLFReader.cpp
@ -25,6 +25,8 @@
 #include "minibatchiterator.h"
 #define DATAREADER_EXPORTS  // creating the exports here
 #include "DataReader.h"
+
+#include "commandArgUtil.h"
 #include "HTKMLFReader.h"
 #ifdef LEAKDETECT
 #include <vld.h> // for memory leak detection
--- a/DataReader/KaldiReader/HTKMLFWriter.cpp
+++ b/DataReader/KaldiReader/HTKMLFWriter.cpp
@ -27,6 +27,8 @@
 #define DATAWRITER_EXPORTS  // creating the exports here
 #include "DataWriter.h"
 #include "HTKMLFWriter.h"
+
+#include "commandArgUtil.h"
 #ifdef LEAKDETECT
 #include <vld.h> // for memory leak detection
 #endif
--- a/MachineLearning/CNTK/Profiler.cpp
+++ b/MachineLearning/CNTK/Profiler.cpp
@ -20,8 +20,6 @@ Profiler::Profiler(int numSamples)
   :m_numSamples(numSamples),
    m_isProfilingActive(false)
 {
-    if (m_numSamples > 0)
-        Start();
 }

 Profiler::~Profiler()
@ -45,6 +43,11 @@ void Profiler::NextSample()
        if (--m_numSamples == 0)
            Stop();
    }
+    else
+    {
+        if (m_numSamples > 0)
+            Start();
+    }
 }

 void Profiler::Stop()
--- a/MachineLearning/CNTK/SimpleNetworkBuilder.cpp
+++ b/MachineLearning/CNTK/SimpleNetworkBuilder.cpp
@ -1276,7 +1276,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            /// the label is a dense matrix. each element is the word index
            label = m_net->CreateInputNode(L"labels", 2 * (this->nce_noises + 1), mbSize);

-            bias = m_net->CreateLearnableParameter(L"BiasVector", m_layerSizes[m_layerSizes.size() - 1], 1);
+            bias = m_net->CreateLearnableParameter(L"BiasVector", 1, m_layerSizes[m_layerSizes.size() - 1]);
            bias->FunctionValues().SetValue((ElemType)-std::log(m_layerSizes[m_layerSizes.size() - 1]));
            //m_net->InitLearnableParameters(bias, m_uniformInit, randomSeed++, std::log(m_layerSizes[m_layerSizes.size() - 1])* m_initValueScale);
            //clslogpostprob = m_net->Times(clsweight, input, L"ClassPostProb");
--- a/MachineLearning/CNTK/TrainingCriterionNodes.h
+++ b/MachineLearning/CNTK/TrainingCriterionNodes.h
@ -958,10 +958,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            {
                // evaluation uses softmax
                m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false);
+                /*
 #pragma omp parallel for
                for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
                for (int j = 0; j < Inputs(3)->FunctionValues().GetNumRows(); j++)
                    m_logSoftmax(i, j) += Inputs(3)->FunctionValues()(j, 0);
+                */
+                m_logSoftmax += Inputs(3)->FunctionValues().Transpose();
                m_logSoftmax.InplaceLogSoftmax(false);
                FunctionValues().Resize(1, 1);
                FunctionValues().SetValue(0);
--- a/Makefile_kaldi.gpu
+++ b/Makefile_kaldi.gpu
@ -85,7 +85,7 @@ MATH_SRC = Math/Math/Matrix.cpp Math/Math/GPUMatrix.cu Math/Math/GPUMatrixCUDAKe
 		   Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp #Math/Math/InstantiateTemplates.cu
 CN_SRC =  MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
          MachineLearning/CNTK/ModelEditLanguage.cpp \
-          MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
+          MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp  MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTKEval/CNTKEval.cpp
 BINARYREADER_SRC = #DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
 HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
 KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
--- a/Math/Math/CPUMatrix.cpp
+++ b/Math/Math/CPUMatrix.cpp
@ -3908,7 +3908,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            for (int sample_id = 0; sample_id < sample_size; sample_id++)
            {
                int sample =(int) (*this)(2 * sample_id, instance_id);
-                c(sample, 0) -= tmp(sample_id, instance_id);
+                c(0, sample) -= tmp(sample_id, instance_id);
            }
        }
        return *this;
@ -3941,7 +3941,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        for (int sample_id = 0; sample_id < sample_size; sample_id++)
        {
            int sample =(int) (*this)(2 * sample_id, instance_id);
-            double score = bias(sample, 0);
+            double score = bias(0, sample);
            for (int dim = 0; dim < b.GetNumRows(); dim++)
                score += a(dim, instance_id)* b(dim, sample);
            double sample_prob = -(*this)(2 * sample_id + 1, instance_id);
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@ -1870,17 +1870,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-
+        //a: dim * minibatch
+        //b: dim * |vocab|
        int p = 512;
-        int width = a.GetNumCols();
+        int width = a.GetNumRows(); //dimension of hidden vector
+        //int width = a.GetNumCols(); original setup, considering column-major
+        //
        while (p / 2 > width) p = p / 2;

        _computeNceOutput<ElemType> << <this->GetNumElements() / 2, p >> >(
-            this->GetArray(),
-            m_numRows / 2,
+            this->GetArray(), 
            sampleCount,
+            m_numRows / 2,
            my_a.GetArray(),//a
-            a.GetNumCols(),
+            a.GetNumRows(),
            my_b.GetArray(),//b
            my_bias.GetArray(),
            tmp.GetArray());//tmp
@ -1891,8 +1894,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // summing up objective must be done in one block
        _assignNoiseContrastiveEstimation<ElemType> << <1, p >> >(
            this->GetArray(),
-            m_numRows,
-            sampleCount, my_a.GetArray(),
+            sampleCount,
+            m_numRows / 2,
+             my_a.GetArray(),
            a.GetNumCols(),
            my_b.GetArray(),
            tmp.GetArray(),
@ -1900,7 +1904,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        _computeNceError<ElemType> << <1, p >> >(
            this->GetArray(),
-            m_numRows,
+            m_numRows / 2,
            tmp.GetNumCols(),
            tmp.GetArray());

@ -1919,20 +1923,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        int p = 512;
-        int width = a.GetNumCols();
+        int width = a.GetNumRows();
        while (p / 2 > width) p = p / 2;
-        
+
        _assignNceDerivative<ElemType> << <m_nz, p >> >(
            GetArray(),
-            m_numRows,
            tmp.GetNumCols(),
+            m_numRows / 2,
            my_a.GetArray(),
-            a.GetNumCols(),
+            a.GetNumRows(),
            my_b.GetArray(),
            tmp.GetArray(),
            c.GetArray(),
            inputIndex);
-           
+
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@ -2836,15 +2836,15 @@ __global__ void _computeNceOutput(

    for (int i = start; i < end; i++)
    {
-        int colIndex = (int)col[2 * i];
-        int rowIndex = i / sampleCount;
+        int wid = (int)col[2 * i];
+        int batchid = i / sampleCount;

        int loadPerThread = (numCols_a + blockDim.x - 1) / blockDim.x;
        int tstart = loadPerThread * threadIdx.x;
        int tend = min(numCols_a, loadPerThread * (threadIdx.x + 1));

        for (int j = tstart; j < tend; j++)
-            partials[threadIdx.x] = a[IDX2C(rowIndex, j, numRows)] * b[IDX2C(j, colIndex, numCols_a)];
+            partials[threadIdx.x] = a[IDX2C(j, batchid, numCols_a)] * b[IDX2C(j, wid, numCols_a)];

        __syncthreads();

@ -3262,19 +3262,23 @@ __global__ void _assignNceDerivative(
            for (int j = tstart; j < tend; j++)
            {
                ElemType val = er * b[IDX2C(j, colIndex, width)];
-                atomicAdd(c + IDX2C(rowIndex, j, numRows), val);
+                atomicAdd(c + IDX2C(j, rowIndex, width), val);
                //c[IDX2C(rowIndex, j, numRows)] += val;
            }
        }
-        else // weight
+        else if (inputIndex == 2) // weight
        {
            for (int j = tstart; j < tend; j++)
            {
-                ElemType val = er * a[IDX2C(rowIndex, j, numRows)];
+                ElemType val = er * a[IDX2C(j, rowIndex, width)];
                atomicAdd(c + IDX2C(j, colIndex, width), val);
                //c[IDX2C(j, colIndex, width)] += val;
            }
        }
+        else //bias vector
+        {
+            c[colIndex] += er;
+        }
    }
 }

--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@ -3645,9 +3645,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != this->GetDeviceId())
            NOT_IMPLEMENTED;

-        //if (a.GetMatrixType() == MatrixType::DENSE)
-        //    NOT_IMPLEMENTED;
-
        this->Resize(1, 1);

        if (this->GetDeviceId() < 0)