Made gcc happy again (mostly missing headers or wrong declaration order).

Adapted the Makefile to the new paths; it does not yet build Network and SGD as separate libs.
2015-09-06 09:20:28 -07:00 · 2015-09-06 09:20:28 -07:00 · 54a6b1d2ec
--- a/Common/Include/File.h
+++ b/Common/Include/File.h
@ -4,6 +4,8 @@
 // </copyright>
 //
 #pragma once
 #include "Basics.h"
 #include <stdio.h>
 #include <string>
 #include <vector>
@ -16,6 +18,7 @@
 #endif
 #include "fileutil.h"   // for f{ge,pu}t{,Text}()
 #include <fstream>      // for LoadMatrixFromTextFile() --TODO: change to using this File class
 #include <sstream>
 namespace Microsoft{ namespace MSR { namespace CNTK {
--- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
@ -109,7 +109,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            size_t numRows = 0;
            size_t numCols = 0;
            auto array = File::LoadMatrixFromTextFile<ElemType>(msra::strfun::utf8(initFromFilePath), numRows, numCols); // TODO: change pathname to wstring
-            FunctionValues().SetValue(numRows, numCols, array.data(), matrixFlagNormal, GetDeviceId());
+            FunctionValues().SetValue(numRows, numCols, array.data(), matrixFlagNormal, m_deviceId);
        }
        virtual const std::wstring OperationName() const {return TypeName();}
--- a/MachineLearning/CNTKSGDLib/SGD.cpp
+++ b/MachineLearning/CNTKSGDLib/SGD.cpp
@ -12,174 +12,260 @@ extern Microsoft::MSR::CNTK::MPIWrapper *g_mpi;
 namespace Microsoft { namespace MSR { namespace CNTK {
-template<class ElemType>
+    template<class ElemType>
-void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*>& mb, int numProcessor, int myID)
+    void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*>& mb, int numProcessor, int myID)
 {
    int rank = myID;
    int procs = numProcessor;
    size_t rv = 0;
    if (procs > 1)
    {
-        for (auto it = mb.begin(); it != mb.end(); ++it)
+        int rank = myID;
        int procs = numProcessor;
        size_t rv = 0;
        if (procs > 1)
        {
-            MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
+            for (auto it = mb.begin(); it != mb.end(); ++it)
            size_t nCols = mat.GetNumCols();
            size_t col_start = (nCols * rank) / procs;
            size_t col_end = (nCols * (rank + 1)) / procs;
            if (col_end > nCols)
            {
-                // this shouldn't happen
+                MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
-                col_end = nCols;
+                size_t nCols = mat.GetNumCols();
-            }
+                size_t col_start = (nCols * rank) / procs;
-
+                size_t col_end = (nCols * (rank + 1)) / procs;
-            if (col_end == col_start)
+                if (col_end > nCols)
            {
                MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
                mat.SetValue(tmp);
            }
            else
            {
                MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
                mat.SetValue(tmp);
            }
            if (rv == 0)
            {
                rv = mat.GetNumCols();
            }
            else
            {
                if (rv != mat.GetNumCols())
                {
-                    throw std::logic_error("Uneven number of columns among inputs.");
+                    // this shouldn't happen
                    col_end = nCols;
                }
                if (col_end == col_start)
                {
                    MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
                    mat.SetValue(tmp);
                }
                else
                {
                    MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
                    mat.SetValue(tmp);
                }
                if (rv == 0)
                {
                    rv = mat.GetNumCols();
                }
                else
                {
                    if (rv != mat.GetNumCols())
                    {
                        throw std::logic_error("Uneven number of columns among inputs.");
                    }
                }
            }
        }
    }
 }
-template<class ElemType> 
+    template<class ElemType> 
-size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*> &mb,  /* (input) matrix to be decimated */
+    size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*> &mb,  /* (input) matrix to be decimated */
-                                      int rank, int numprocs,                                    /* (input) rank info */
+                                          int rank, int numprocs,                                    /* (input) rank info */
-                                      size_t& nSlices,                                           /* (input/output): on input, # parallel sentence total , on output, # paralel sentence in this node  */
+                                          size_t& nSlices,                                           /* (input/output): on input, # parallel sentence total , on output, # paralel sentence in this node  */
-                                      Matrix<float>& SentenceBoundary,                           /* (output) nSlices X nMBsize matrix */
+                                          Matrix<float>& SentenceBoundary,                           /* (output) nSlices X nMBsize matrix */
-                                      vector<MinibatchPackingFlag>& PackingFlags,                /* (output) 1 X nMBsize vector  */
+                                          vector<MinibatchPackingFlag>& PackingFlags,                /* (output) 1 X nMBsize vector  */
-                                      IDataReader<ElemType>* trainDataReader)                    /* (input)  to have access to reader */
+                                          IDataReader<ElemType>* trainDataReader)                    /* (input)  to have access to reader */
 {
    // For RNN, a input Matrix is organized in the following way: 
    //   | x_t^1  x_t^2 ... x_t^N |  .... | x_{t+T-1}^1 ... x_{t+T-1}^N | 
    //   |<----   block 1    ---->|  .... |<------  block T       ----->| 
    // N is the nSlice (input)
    // The decimation here is to split each block to individual GPUs 
    // So After decimation 
    //   | x_t^{st} ... x_t^{en-1}|  .... | x_{t+T-1}^{st} ... x_{t+T-1}^{en-1} | 
    // Each block now has nSlice/nProcs 
    // 
    // Correspondingly, the SentenceBoundary and PackingFlags will be revised 
    trainDataReader->SetSentenceSegBatch(SentenceBoundary, PackingFlags);
    size_t rv = 0;
    size_t nOrigParallelUtts = nSlices;
    static bool warned = false;
    if (numprocs > 1)
    {
-        // decide new parallel utterances 
+        // For RNN, a input Matrix is organized in the following way: 
-        size_t sent_start = 0;
+        //   | x_t^1  x_t^2 ... x_t^N |  .... | x_{t+T-1}^1 ... x_{t+T-1}^N | 
-        size_t sent_end = 0;
+        //   |<----   block 1    ---->|  .... |<------  block T       ----->| 
-        if (nOrigParallelUtts % numprocs != 0)
+        // N is the nSlice (input)
        // The decimation here is to split each block to individual GPUs 
        // So After decimation 
        //   | x_t^{st} ... x_t^{en-1}|  .... | x_{t+T-1}^{st} ... x_{t+T-1}^{en-1} | 
        // Each block now has nSlice/nProcs 
        // 
        // Correspondingly, the SentenceBoundary and PackingFlags will be revised 
        trainDataReader->SetSentenceSegBatch(SentenceBoundary, PackingFlags);
        size_t rv = 0;
        size_t nOrigParallelUtts = nSlices;
        static bool warned = false;
        if (numprocs > 1)
        {
-            if (!warned)
+            // decide new parallel utterances 
            size_t sent_start = 0;
            size_t sent_end = 0;
            if (nOrigParallelUtts % numprocs != 0)
            {
-                /* give a warning of potential bandwidth wasting */
+                if (!warned)
-                fprintf(stderr, "WARNING: %d GPUs are used in model averaging, but the number of parallel utterances are %d, a potential training speed degradation.\n",
+                {
-                        (int)g_mpi->NumNodesInUse(), (int)nOrigParallelUtts);
+                    /* give a warning of potential bandwidth wasting */
-                warned = true;
+                    fprintf(stderr, "WARNING: %d GPUs are used in model averaging, but the number of parallel utterances are %d, a potential training speed degradation.\n",
-            }
+                            (int)g_mpi->NumNodesInUse(), (int)nOrigParallelUtts);
-            if (rank == numprocs - 1)
+                    warned = true;
-            {
+                }
-                nSlices = nOrigParallelUtts - (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);
+                if (rank == numprocs - 1)
-                sent_start = (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);
+                {
-                sent_end = nOrigParallelUtts;
+                    nSlices = nOrigParallelUtts - (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);
                    sent_start = (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);
                    sent_end = nOrigParallelUtts;
                }
                else
                {
                    nSlices = nOrigParallelUtts / numprocs + 1;
                    sent_start = nSlices * rank;
                    sent_end = nSlices * (rank + 1);
                    if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts;
                }
            }
            else
            {
-                nSlices = nOrigParallelUtts / numprocs + 1;
+                nSlices = nOrigParallelUtts / numprocs;
-                sent_start = nSlices * rank;
+                sent_start = rank*nSlices;
-                sent_end = nSlices * (rank + 1);
+                sent_end = (rank + 1)*nSlices;
                if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts;
            }
            // decimate data 
            for (auto it = mb.begin(); it != mb.end(); ++it)
            {
                MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
                size_t nCols = mat.GetNumCols();
                if (nCols % nOrigParallelUtts != 0)
                {
                    // this should not happen for DNN, RNN with truncated BPTT, not sure about other special stuff ... 
                    RuntimeError("ERROR: minibatch size %d, but with %d parallel utterances\n", nCols, nOrigParallelUtts);
                }
                size_t nBlocks = nCols / nOrigParallelUtts;
                // for RNN, nBlocks is the size of truncated BPTT
                if (sent_end == sent_start)
                {
                    // should never happen, print debug info
                    RuntimeError("ERROR: in DecimateMinibatch, col_st=col_en=%d, nCol=%d, nBlock=%d, nParaUtts=%d, nGPU=%d\n",
                        (int)sent_start, (int)nCols, (int)nBlocks, (int)nOrigParallelUtts, (int)numprocs);
                }
                MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), nSlices*nBlocks, mat.GetPreferredDeviceId(), mat.GetMatrixType());
                // do the column slice for each block 
                for (size_t iblock = 0; iblock < nBlocks; iblock++)
                {
                    tmp.SetColumnSlice(mat.ColumnSlice(nOrigParallelUtts*iblock + sent_start, nSlices),
                        iblock*nSlices, nSlices);
                }
                mat.SetValue(tmp);
                // assert the cols are even among nodes 
                if (0 == rv)
                {
                    rv = mat.GetNumCols();
                }
                else
                {
                    if (rv != mat.GetNumCols())
                        throw std::logic_error("Uneven number of columns among inputs.");
                }
            }
            // revise sentence boundary and packing flags
            Matrix<float>  newBoundary(CPUDEVICE); // TODO: change Matrix<float> to a typedef
            size_t nMBSize = PackingFlags.size(); 
            newBoundary.Resize(nSlices, nMBSize);
            newBoundary.AssignRowSliceValuesOf(SentenceBoundary, sent_start, nSlices);
            fill(PackingFlags.begin(), PackingFlags.end(), MinibatchPackingFlag::None);
            for (size_t nt = 0; nt < nMBSize; nt++)
            {
                for (size_t ns = 0; ns < nSlices; ns++)
                {
                    if (newBoundary(ns, nt) == SEQUENCE_START)
                        PackingFlags[nt] |= MinibatchPackingFlag::SequenceStart;
                    if (newBoundary(ns, nt) == SEQUENCE_END)
                        PackingFlags[nt] |= MinibatchPackingFlag::SequenceEnd;
                }
            }
        }
        return rv; 
    }
    static AdaptationRegType ParseAdaptationRegType(wstring s)
    {
        msra::strfun::tolower_ascii(s);
        if (s == L"" || s == L"none")
        {
            return AdaptationRegType::None;
        }
        else if (s == L"kl" || s == L"klreg")
        {
            return AdaptationRegType::KL;
        }
        else
        {
-            nSlices = nOrigParallelUtts / numprocs;
+            throw std::invalid_argument(
-            sent_start = rank*nSlices;
+                "ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are "
-            sent_end = (rank + 1)*nSlices;
+                "(None | KL)");
            if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts;
        }
        // decimate data 
        for (auto it = mb.begin(); it != mb.end(); ++it)
        {
            MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
            size_t nCols = mat.GetNumCols();
            if (nCols % nOrigParallelUtts != 0)
            {
                // this should not happen for DNN, RNN with truncated BPTT, not sure about other special stuff ... 
                RuntimeError("ERROR: minibatch size %d, but with %d parallel utterances\n", nCols, nOrigParallelUtts);
            }
            size_t nBlocks = nCols / nOrigParallelUtts;
            // for RNN, nBlocks is the size of truncated BPTT
            if (sent_end == sent_start)
            {
                // should never happen, print debug info
                RuntimeError("ERROR: in DecimateMinibatch, col_st=col_en=%d, nCol=%d, nBlock=%d, nParaUtts=%d, nGPU=%d\n",
                    (int)sent_start, (int)nCols, (int)nBlocks, (int)nOrigParallelUtts, (int)numprocs);
            }
            MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), nSlices*nBlocks, mat.GetPreferredDeviceId(), mat.GetMatrixType());
            // do the column slice for each block 
            for (size_t iblock = 0; iblock < nBlocks; iblock++)
            {
                tmp.SetColumnSlice(mat.ColumnSlice(nOrigParallelUtts*iblock + sent_start, nSlices),
                    iblock*nSlices, nSlices);
            }
            mat.SetValue(tmp);
            // assert the cols are even among nodes 
            if (0 == rv)
            {
                rv = mat.GetNumCols();
            }
            else
            {
                if (rv != mat.GetNumCols())
                    throw std::logic_error("Uneven number of columns among inputs.");
            }
        }
        // revise sentence boundary and packing flags
        Matrix<float>  newBoundary(CPUDEVICE); // TODO: change Matrix<float> to a typedef
        size_t nMBSize = PackingFlags.size(); 
        newBoundary.Resize(nSlices, nMBSize);
        newBoundary.AssignRowSliceValuesOf(SentenceBoundary, sent_start, nSlices);
        fill(PackingFlags.begin(), PackingFlags.end(), MinibatchPackingFlag::None);
        for (size_t nt = 0; nt < nMBSize; nt++)
        {
            for (size_t ns = 0; ns < nSlices; ns++)
            {
                if (newBoundary(ns, nt) == SEQUENCE_START)
                    PackingFlags[nt] |= MinibatchPackingFlag::SequenceStart;
                if (newBoundary(ns, nt) == SEQUENCE_END)
                    PackingFlags[nt] |= MinibatchPackingFlag::SequenceEnd;
            }
        }
    }
-    return rv; 
+    static GradientsUpdateType ParseGradUpdateType(wstring s)
-}
+    {
        msra::strfun::tolower_ascii(s);
        if (s == L"" || s == L"none" || s == L"normal" || s == L"simple")
        {
            return GradientsUpdateType::None;
        }
        else if (s == L"adagrad")
        {
            return GradientsUpdateType::AdaGrad;
        }
        else if (s == L"rmsprop")
        {
            return GradientsUpdateType::RmsProp;
        }
        else
        {
            throw std::invalid_argument(
                "ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are "
                "(None | AdaGrad | RmsProp )");
        }
    }
    // Translates the parallelization setting string into a ParallelizationMethod.
    // s is taken by value and handed to msra::strfun::tolower_ascii() before matching
    // (presumably lower-cases it in place so the match is case-insensitive -- confirm against the helper).
    //   "" or "none"        -> ParallelizationMethod::None
    //   "dataparallelsgd"   -> ParallelizationMethod::DataParallelSGD
    //   "modelaveragingsgd" -> ParallelizationMethod::ModelAveragingSGD
    // Any other value throws std::invalid_argument.
    static ParallelizationMethod ParseParallelizationMethod(wstring s)
    {
        msra::strfun::tolower_ascii(s);

        if (s.empty() || s == L"none")
            return ParallelizationMethod::None;

        if (s == L"dataparallelsgd")
            return ParallelizationMethod::DataParallelSGD;

        if (s == L"modelaveragingsgd")
            return ParallelizationMethod::ModelAveragingSGD;

        // unrecognized spelling: report it as a bad argument
        throw std::invalid_argument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (None | DataParallelSGD | ModelAveragingSGD)");
    }
    // Translates the learning-rate search setting string into a LearningRateSearchAlgorithm.
    // s is taken by value and handed to msra::strfun::tolower_ascii() before matching
    // (presumably lower-cases it in place so the match is case-insensitive -- confirm against the helper).
    //   "false" or "none"                               -> LearningRateSearchAlgorithm::None
    //   "searchbeforeepoch", "beforeepoch" or "before"  -> LearningRateSearchAlgorithm::SearchBeforeEpoch
    //   "adjustafterepoch", "afterepoch" or "after"     -> LearningRateSearchAlgorithm::AdjustAfterEpoch
    // Any other value (including the empty string) throws std::invalid_argument.
    static LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s)
    {
        msra::strfun::tolower_ascii(s);

        const bool searchDisabled = (s == L"false" || s == L"none");
        if (searchDisabled)
            return LearningRateSearchAlgorithm::None;

        const bool searchBefore = (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before");
        if (searchBefore)
            return LearningRateSearchAlgorithm::SearchBeforeEpoch;

        const bool adjustAfter = (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after");
        if (adjustAfter)
            return LearningRateSearchAlgorithm::AdjustAfterEpoch;

        // unrecognized spelling: report it as a bad argument
        throw std::invalid_argument(
            "autoAdjustLR: Invalid learning rate search type. Valid values are "
            "(None | SearchBeforeEpoch | AdjustAfterEpoch)");
    }
    template<class ElemType>
    SGD<ElemType>::SGD(const ConfigParameters& configSGD)
@ -594,7 +680,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
    void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
               IDataReader<ElemType>* trainSetDataReader,
               IDataReader<ElemType>* validationSetDataReader,
-               const DEVICEID_TYPE deviceID, const bool makeMode = true)
+               const DEVICEID_TYPE deviceID, const bool makeMode)
    {
        if (origModelFileName == L"" || trainSetDataReader == nullptr)
            InvalidArgument("origModel and trainSetDataReader should not be null.");
@ -644,7 +730,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
    template<class ElemType>
    void SGD<ElemType>::SequenceTrain(IComputationNetBuilder<ElemType>* netBuilder, wstring origModelFileName,
                       IDataReader<ElemType>* trainSetDataReader, IDataReader<ElemType>* validationSetDataReader,
-                       const DEVICEID_TYPE deviceID, const bool makeMode = true)
+                       const DEVICEID_TYPE deviceID, const bool makeMode)
    {
        if (netBuilder == nullptr || origModelFileName == L"" || trainSetDataReader == nullptr)
            InvalidArgument("netBuilder, origModel and trainSetDataReader should not be null.");
@ -711,11 +797,16 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
        }
    }
    // Returns momentumPerSample raised to the power minibatchSize, i.e. the momentum
    // factor that results from applying the per-sample factor once per sample of a minibatch.
    static double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
    {
        // pow() works on doubles; make the integer-to-double conversion of the exponent explicit
        const double numSamples = static_cast<double>(minibatchSize);
        return pow(momentumPerSample, numSamples);
    }
    template<class ElemType>
    void SGD<ElemType>::Train(IComputationNetBuilder<ElemType>* netBuilder,
               IDataReader<ElemType>* trainSetDataReader,
               IDataReader<ElemType>* validationSetDataReader,
-               const bool makeMode = true)
+               const bool makeMode)
    {
        if (netBuilder == nullptr || trainSetDataReader == nullptr)
            InvalidArgument("netBuilder and trainSetDataReader should not be null.\n");
@ -1449,7 +1540,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
                                         /*out*/ double& epochCriterion,
                                         /*out*/ std::vector<double>& epochEvalErrors,
                                         /*out*/ size_t& totalSamplesSeen,
-                                         std::string prefixMsg = "")
+                                         std::string prefixMsg)
    {
        TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize,
                      trainSetDataReader, learnRatePerSample, minibatchSize, featureNodes,
@ -1763,7 +1854,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
                         /*out*/ double& epochCriterion,
                         /*out*/ std::vector<double>& epochEvalErrors,
                         /*out*/ size_t& totalSamplesSeen,
-                         std::string prefixMsg = "")
+                         std::string prefixMsg)
    {
        // Since we are getting timing resolution of under microsecond we use double precision
        // to ensure that we have enough digits to represent small time measurements.
@ -2511,7 +2602,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
    }
    template<class ElemType>
-    wstring SGD<ElemType>::GetModelNameForEpoch(const int epoch, bool bLastModel = false)
+    wstring SGD<ElemType>::GetModelNameForEpoch(const int epoch, bool bLastModel)
    {
        int epoch1Base = epoch + 1;
        if (epoch1Base == m_maxEpochs || bLastModel)
@ -2557,108 +2648,6 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
        return firstEpoch;
    }
    // Translates the adaptation-regularization setting string into an AdaptationRegType.
    // s is taken by value and handed to msra::strfun::tolower_ascii() before matching
    // (presumably lower-cases it in place so the match is case-insensitive -- confirm against the helper).
    //   "" or "none"    -> AdaptationRegType::None
    //   "kl" or "klreg" -> AdaptationRegType::KL
    // Any other value throws std::invalid_argument.
    static AdaptationRegType ParseAdaptationRegType(wstring s)
    {
        msra::strfun::tolower_ascii(s);

        if (s.empty() || s == L"none")
            return AdaptationRegType::None;

        if (s == L"kl" || s == L"klreg")
            return AdaptationRegType::KL;

        // unrecognized spelling: report it as a bad argument
        throw std::invalid_argument(
            "ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are "
            "(None | KL)");
    }
    // Translates the gradient-update setting string into a GradientsUpdateType.
    // s is taken by value and handed to msra::strfun::tolower_ascii() before matching
    // (presumably lower-cases it in place so the match is case-insensitive -- confirm against the helper).
    //   "", "none", "normal" or "simple" -> GradientsUpdateType::None
    //   "adagrad"                        -> GradientsUpdateType::AdaGrad
    //   "rmsprop"                        -> GradientsUpdateType::RmsProp
    // Any other value throws std::invalid_argument.
    static GradientsUpdateType ParseGradUpdateType(wstring s)
    {
        msra::strfun::tolower_ascii(s);

        const bool plainUpdate = s.empty() || s == L"none" || s == L"normal" || s == L"simple";
        if (plainUpdate)
            return GradientsUpdateType::None;

        if (s == L"adagrad")
            return GradientsUpdateType::AdaGrad;

        if (s == L"rmsprop")
            return GradientsUpdateType::RmsProp;

        // unrecognized spelling: report it as a bad argument
        throw std::invalid_argument(
            "ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are "
            "(None | AdaGrad | RmsProp )");
    }
    // Maps the textual parallelization setting onto the ParallelizationMethod enumeration.
    // The by-value copy of s goes through msra::strfun::tolower_ascii() first
    // (presumably lower-casing it in place -- confirm against the helper), then is compared
    // against the accepted spellings: "" / "none", "dataparallelsgd", "modelaveragingsgd".
    // Throws std::invalid_argument for anything else.
    static ParallelizationMethod ParseParallelizationMethod(wstring s)
    {
        msra::strfun::tolower_ascii(s);

        const bool noParallelization = s.empty() || (s == L"none");
        if (noParallelization)
            return ParallelizationMethod::None;

        if (s == L"dataparallelsgd")
            return ParallelizationMethod::DataParallelSGD;

        if (s == L"modelaveragingsgd")
            return ParallelizationMethod::ModelAveragingSGD;

        // no accepted spelling matched
        throw std::invalid_argument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (None | DataParallelSGD | ModelAveragingSGD)");
    }
    // Maps the textual learning-rate search setting onto the LearningRateSearchAlgorithm enumeration.
    // The by-value copy of s goes through msra::strfun::tolower_ascii() first
    // (presumably lower-casing it in place -- confirm against the helper), then is compared
    // against the accepted spellings listed in the branches below.
    // Note that the empty string is not accepted here; it falls through to the exception.
    // Throws std::invalid_argument for anything that does not match.
    static LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s)
    {
        msra::strfun::tolower_ascii(s);

        if (s == L"false" || s == L"none")
            return LearningRateSearchAlgorithm::None;

        if (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before")
            return LearningRateSearchAlgorithm::SearchBeforeEpoch;

        if (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after")
            return LearningRateSearchAlgorithm::AdjustAfterEpoch;

        // no accepted spelling matched
        throw std::invalid_argument(
            "autoAdjustLR: Invalid learning rate search type. Valid values are "
            "(None | SearchBeforeEpoch | AdjustAfterEpoch)");
    }
    //GradientsUpdateType GradUpdateType() const
    //{
    //    return m_gradType.mType;
    //}
    //
    //double GradientUpdateNoiseStd() const
    //{
    //    return m_gradType.mGaussianNoiseInjectStd;
    //}
    // Computes momentumPerSample ^ minibatchSize: the per-sample momentum factor
    // compounded over every sample of one minibatch.
    static double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
    {
        // the exponent is converted to double explicitly before calling pow()
        const double exponent = static_cast<double>(minibatchSize);
        const double perMinibatch = pow(momentumPerSample, exponent);
        return perMinibatch;
    }
 // public:
 #define EPSILON 1e-5
    template<class ElemType>
--- a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h
+++ b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h
@ -12,6 +12,7 @@
 #include <fstream>
 #include <queue>
 #include "Basics.h"
 #include "Helpers.h"    // for foreach_column() macro
 #include "fileutil.h"
 #include "DataReader.h"
 #include "DataWriter.h"
--- a/Makefile
+++ b/Makefile
@ -50,7 +50,7 @@ endif
 # The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
 CXX = mpic++
-INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK BrainScript
+INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK MachineLearning/CNTKComputationNetworkLib MachineLearning/CNTKSGDLib BrainScript
 CPPFLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K
 CXXFLAGS:= -msse3 -std=c++0x -std=c++11 -fopenmp -fpermissive -fPIC -Werror
 LIBPATH:=
@ -355,15 +355,17 @@ endif
 CNTK_SRC =\
 	MachineLearning/CNTK/CNTK.cpp \
 	MachineLearning/CNTK/ComputationNode.cpp \
 	MachineLearning/CNTK/ModelEditLanguage.cpp \
 	MachineLearning/CNTK/NetworkDescriptionLanguage.cpp \
 	MachineLearning/CNTK/Profiler.cpp \
 	MachineLearning/CNTK/ComputationNetwork.cpp \
 	MachineLearning/CNTK/ComputationNetworkBuilder.cpp \
 	MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
 	MachineLearning/CNTK/SynchronousExecutionEngine.cpp \
 	MachineLearning/CNTK/tests.cpp \
 	MachineLearning/CNTKComputationNetworkLib/ComputationNode.cpp \
 	MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp \
 	MachineLearning/CNTKComputationNetworkLib/ComputationNetworkBuilder.cpp \
 	MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp \
 	MachineLearning/CNTKSGDLib/Profiler.cpp \
 	MachineLearning/CNTKSGDLib/SGD.cpp \
 	MachineLearning/CNTKEval/CNTKEval.cpp \
 	BrainScript/BrainScriptEvaluator.cpp \
 	BrainScript/BrainScriptParser.cpp \
--- a/Math/Math/Helpers.h
+++ b/Math/Math/Helpers.h
@ -3,7 +3,10 @@
 //     Copyright (c) Microsoft Corporation.  All rights reserved.
 // </copyright>
 //
 //helpful macros
 // TODO: the file's name is too general to be included from outside; MathHelpers.h?
 //iterators
 #pragma once
 #undef foreach_row
--- a/Math/Math/Matrix.h
+++ b/Math/Math/Matrix.h
@ -19,6 +19,7 @@
 #include "Basics.h"
 #include "File.h"
 #include "CommonMatrix.h"
 #include <limits.h>
 // This class is exported from the Math.dll
 namespace Microsoft { namespace MSR { namespace CNTK {