Made gcc happy again (mostly missing headers or wrong declaration order);

Makefile adapted to new paths, but Network and SGD are not yet built as separate libs
2015-09-06 09:20:28 -07:00 · 2015-09-06 09:20:28 -07:00 · 9aecb5649d
--- a/Common/Include/File.h
+++ b/Common/Include/File.h
@ -4,6 +4,8 @@
 // </copyright>
 //
 #pragma once
+
+#include "Basics.h"
 #include <stdio.h>
 #include <string>
 #include <vector>
@ -16,6 +18,7 @@
 #endif
 #include "fileutil.h"   // for f{ge,pu}t{,Text}()
 #include <fstream>      // for LoadMatrixFromTextFile() --TODO: change to using this File class
+#include <sstream>

 namespace Microsoft{ namespace MSR { namespace CNTK {

--- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
@ -109,7 +109,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            size_t numRows = 0;
            size_t numCols = 0;
            auto array = File::LoadMatrixFromTextFile<ElemType>(msra::strfun::utf8(initFromFilePath), numRows, numCols); // TODO: change pathname to wstring
-            FunctionValues().SetValue(numRows, numCols, array.data(), matrixFlagNormal, GetDeviceId());
+            FunctionValues().SetValue(numRows, numCols, array.data(), matrixFlagNormal, m_deviceId);
        }

        virtual const std::wstring OperationName() const {return TypeName();}
--- a/MachineLearning/CNTKSGDLib/SGD.cpp
+++ b/MachineLearning/CNTKSGDLib/SGD.cpp
@ -12,174 +12,260 @@ extern Microsoft::MSR::CNTK::MPIWrapper *g_mpi;

 namespace Microsoft { namespace MSR { namespace CNTK {

-template<class ElemType>
-void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*>& mb, int numProcessor, int myID)
-{
-    int rank = myID;
-    int procs = numProcessor;
-
-    size_t rv = 0;
-    if (procs > 1)
+    template<class ElemType>    // Splits a minibatch across workers: each matrix in mb is overwritten with this worker's range of columns.
+    void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*>& mb, int numProcessor, int myID)    // mb: matrices by name (modified in place); myID: this worker's index among numProcessor workers
     {
-        for (auto it = mb.begin(); it != mb.end(); ++it)
+        int rank = myID;
+        int procs = numProcessor;
+
+        size_t rv = 0;    // column count of the first decimated matrix; 0 means "nothing recorded yet"
+        if (procs > 1)    // with a single worker, mb is left untouched
         {
-            MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
-            size_t nCols = mat.GetNumCols();
-            size_t col_start = (nCols * rank) / procs;
-            size_t col_end = (nCols * (rank + 1)) / procs;
-            if (col_end > nCols)
+            for (auto it = mb.begin(); it != mb.end(); ++it)
             {
-                // this shouldn't happen
-                col_end = nCols;
-            }
-
-            if (col_end == col_start)
-            {
-                MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
-                mat.SetValue(tmp);
-            }
-            else
-            {
-                MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
-                mat.SetValue(tmp);
-            }
-
-            if (rv == 0)
-            {
-                rv = mat.GetNumCols();
-            }
-            else
-            {
-                if (rv != mat.GetNumCols())
+                MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
+                size_t nCols = mat.GetNumCols();
+                size_t col_start = (nCols * rank) / procs;          // this worker's share is the half-open column range [col_start, col_end)
+                size_t col_end = (nCols * (rank + 1)) / procs;
+                if (col_end > nCols)
                 {
-                    throw std::logic_error("Uneven number of columns among inputs.");
+                    // this shouldn't happen
+                    col_end = nCols;
+                }
+
+                if (col_end == col_start)    // empty share: replace mat with a matrix that has 0 columns
+                {
+                    MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
+                    mat.SetValue(tmp);
+                }
+                else
+                {
+                    MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);    // presumably (first column, column count) -- confirm against Matrix::ColumnSlice
+                    mat.SetValue(tmp);
+                }
+
+                if (rv == 0)    // first matrix (or every earlier one ended up with 0 columns): record the count
+                {
+                    rv = mat.GetNumCols();
+                }
+                else
+                {
+                    if (rv != mat.GetNumCols())    // later matrices must match the recorded column count
+                    {
+                        throw std::logic_error("Uneven number of columns among inputs.");
+                    }
                 }
             }
         }
     }
-}

-template<class ElemType> 
-size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*> &mb,  /* (input) matrix to be decimated */
-                                      int rank, int numprocs,                                    /* (input) rank info */
-                                      size_t& nSlices,                                           /* (input/output): on input, # parallel sentence total , on output, # paralel sentence in this node  */
-                                      Matrix<float>& SentenceBoundary,                           /* (output) nSlices X nMBsize matrix */
-                                      vector<MinibatchPackingFlag>& PackingFlags,                /* (output) 1 X nMBsize vector  */
-                                      IDataReader<ElemType>* trainDataReader)                    /* (input)  to have access to reader */
-{
-    // For RNN, a input Matrix is organized in the following way: 
-    //   | x_t^1  x_t^2 ... x_t^N |  .... | x_{t+T-1}^1 ... x_{t+T-1}^N | 
-    //   |<----   block 1    ---->|  .... |<------  block T       ----->| 
-    // N is the nSlice (input)
-    // The decimation here is to split each block to individual GPUs 
-    // So After decimation 
-    //   | x_t^{st} ... x_t^{en-1}|  .... | x_{t+T-1}^{st} ... x_{t+T-1}^{en-1} | 
-    // Each block now has nSlice/nProcs 
-    // 
-    // Correspondingly, the SentenceBoundary and PackingFlags will be revised 
-    trainDataReader->SetSentenceSegBatch(SentenceBoundary, PackingFlags);
-
-    size_t rv = 0;
-    size_t nOrigParallelUtts = nSlices;
-    static bool warned = false;
-    if (numprocs > 1)
+    template<class ElemType>    // Keeps only this rank's parallel utterances in every matrix of mb and rebuilds PackingFlags accordingly.
+    size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*> &mb,  /* (input/output) matrices to be decimated, overwritten in place */
+                                          int rank, int numprocs,                                    /* (input) rank info */
+                                          size_t& nSlices,                                           /* (input/output): on input, total # of parallel sentences; on output, # of parallel sentences in this node */
+                                          Matrix<float>& SentenceBoundary,                           /* (output) nSlices X nMBsize matrix */
+                                          vector<MinibatchPackingFlag>& PackingFlags,                /* (output) 1 X nMBsize vector  */
+                                          IDataReader<ElemType>* trainDataReader)                    /* (input)  to have access to reader */
     {
-        // decide new parallel utterances 
-        size_t sent_start = 0;
-        size_t sent_end = 0;
-        if (nOrigParallelUtts % numprocs != 0)
+        // For RNN, an input matrix is organized in the following way: 
+        //   | x_t^1  x_t^2 ... x_t^N |  .... | x_{t+T-1}^1 ... x_{t+T-1}^N | 
+        //   |<----   block 1    ---->|  .... |<------  block T       ----->| 
+        // N is the nSlice (input)
+        // The decimation here is to split each block to individual GPUs 
+        // So after decimation 
+        //   | x_t^{st} ... x_t^{en-1}|  .... | x_{t+T-1}^{st} ... x_{t+T-1}^{en-1} | 
+        // Each block now has nSlice/nProcs 
+        // 
+        // Correspondingly, the SentenceBoundary and PackingFlags will be revised 
+        trainDataReader->SetSentenceSegBatch(SentenceBoundary, PackingFlags);
+
+        size_t rv = 0;                               // column count after decimation; stays 0 when numprocs <= 1
+        size_t nOrigParallelUtts = nSlices;
+        static bool warned = false;                  // the uneven-split warning below is printed only once
+        if (numprocs > 1)
         {
-            if (!warned)
+            // decide new parallel utterances 
+            size_t sent_start = 0;
+            size_t sent_end = 0;
+            if (nOrigParallelUtts % numprocs != 0)
             {
-                /* give a warning of potential bandwidth wasting */
-                fprintf(stderr, "WARNING: %d GPUs are used in model averaging, but the number of parallel utterances are %d, a potential training speed degradation.\n",
-                        (int)g_mpi->NumNodesInUse(), (int)nOrigParallelUtts);
-                warned = true;
-            }
-            if (rank == numprocs - 1)
-            {
-                nSlices = nOrigParallelUtts - (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);
-                sent_start = (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);
-                sent_end = nOrigParallelUtts;
+                if (!warned)
+                {
+                    /* give a warning of potential bandwidth wasting */
+                    fprintf(stderr, "WARNING: %d GPUs are used in model averaging, but the number of parallel utterances are %d, a potential training speed degradation.\n",
+                            (int)g_mpi->NumNodesInUse(), (int)nOrigParallelUtts);
+                    warned = true;
+                }
+                if (rank == numprocs - 1)            // the last rank takes whatever the other ranks leave over
+                {
+                    nSlices = nOrigParallelUtts - (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);    // NOTE(review): the product can exceed nOrigParallelUtts (e.g. 5 utterances on 4 ranks), wrapping this unsigned subtraction -- confirm callers rule that out
+                    sent_start = (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);
+                    sent_end = nOrigParallelUtts;
+                }
+                else                                 // every other rank takes floor(N / numprocs) + 1 utterances
+                {
+                    nSlices = nOrigParallelUtts / numprocs + 1;
+                    sent_start = nSlices * rank;
+                    sent_end = nSlices * (rank + 1);
+                    if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts;
+                }
             }
             else
             {
-                nSlices = nOrigParallelUtts / numprocs + 1;
-                sent_start = nSlices * rank;
-                sent_end = nSlices * (rank + 1);
+                nSlices = nOrigParallelUtts / numprocs;    // even split: every rank gets the same number of utterances
+                sent_start = rank*nSlices;
+                sent_end = (rank + 1)*nSlices;
                 if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts;
             }
+            // decimate data 
+            for (auto it = mb.begin(); it != mb.end(); ++it)
+            {
+                MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
+                size_t nCols = mat.GetNumCols();
+
+                if (nCols % nOrigParallelUtts != 0)
+                {
+                    // this should not happen for DNN, RNN with truncated BPTT, not sure about other special stuff ... 
+                    RuntimeError("ERROR: minibatch size %d, but with %d parallel utterances\n", (int)nCols, (int)nOrigParallelUtts);    // size_t must be cast to match %d, as in the call below
+                }
+                size_t nBlocks = nCols / nOrigParallelUtts;
+                // for RNN, nBlocks is the size of truncated BPTT
+                if (sent_end == sent_start)
+                {
+                    // should never happen, print debug info
+                    RuntimeError("ERROR: in DecimateMinibatch, col_st=col_en=%d, nCol=%d, nBlock=%d, nParaUtts=%d, nGPU=%d\n",
+                        (int)sent_start, (int)nCols, (int)nBlocks, (int)nOrigParallelUtts, (int)numprocs);
+                }
+
+                MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), nSlices*nBlocks, mat.GetPreferredDeviceId(), mat.GetMatrixType());
+
+                // do the column slice for each block: copy this rank's nSlices columns of block iblock into tmp
+                for (size_t iblock = 0; iblock < nBlocks; iblock++)
+                {
+                    tmp.SetColumnSlice(mat.ColumnSlice(nOrigParallelUtts*iblock + sent_start, nSlices),
+                        iblock*nSlices, nSlices);
+                }
+                mat.SetValue(tmp);
+
+                // assert the cols are even among nodes 
+                if (0 == rv)
+                {
+                    rv = mat.GetNumCols();
+                }
+                else
+                {
+                    if (rv != mat.GetNumCols())
+                        throw std::logic_error("Uneven number of columns among inputs.");
+                }
+            }
+            // revise sentence boundary and packing flags
+            Matrix<float>  newBoundary(CPUDEVICE); // TODO: change Matrix<float> to a typedef
+            size_t nMBSize = PackingFlags.size(); 
+            newBoundary.Resize(nSlices, nMBSize);
+            newBoundary.AssignRowSliceValuesOf(SentenceBoundary, sent_start, nSlices);    // NOTE(review): newBoundary is never copied back into SentenceBoundary in this function -- confirm that is intended
+            fill(PackingFlags.begin(), PackingFlags.end(), MinibatchPackingFlag::None);
+            for (size_t nt = 0; nt < nMBSize; nt++)    // rebuild each time step's flags from this rank's rows only
+            {
+                for (size_t ns = 0; ns < nSlices; ns++)
+                {
+                    if (newBoundary(ns, nt) == SEQUENCE_START)
+                        PackingFlags[nt] |= MinibatchPackingFlag::SequenceStart;
+                    if (newBoundary(ns, nt) == SEQUENCE_END)
+                        PackingFlags[nt] |= MinibatchPackingFlag::SequenceEnd;
+                }
+            }
+       
+ 
+        }
+
+        return rv;    // number of columns each matrix has after decimation (0 if nothing was decimated)
+    }
+
+    static AdaptationRegType ParseAdaptationRegType(wstring s)    // Maps a configuration string to an AdaptationRegType; throws std::invalid_argument for unknown names.
+    {
+        msra::strfun::tolower_ascii(s);    // presumably lower-cases s in place, so the comparisons below ignore ASCII case
+        if (s == L"" || s == L"none")    // an empty string also selects None
+        {
+            return AdaptationRegType::None;
+        }
+        else if (s == L"kl" || s == L"klreg")    // "klreg" is accepted as an alias of "kl"
+        {
+            return AdaptationRegType::KL;
         }
         else
         {
-            nSlices = nOrigParallelUtts / numprocs;
-            sent_start = rank*nSlices;
-            sent_end = (rank + 1)*nSlices;
-            if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts;
+            throw std::invalid_argument(    // any other value is rejected
+                "ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are "
+                "(None | KL)");
         }
-        // decimate data 
-        for (auto it = mb.begin(); it != mb.end(); ++it)
-        {
-            MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
-            size_t nCols = mat.GetNumCols();
-
-            if (nCols % nOrigParallelUtts != 0)
-            {
-                // this should not happen for DNN, RNN with truncated BPTT, not sure about other special stuff ... 
-                RuntimeError("ERROR: minibatch size %d, but with %d parallel utterances\n", nCols, nOrigParallelUtts);
-            }
-            size_t nBlocks = nCols / nOrigParallelUtts;
-            // for RNN, nBlocks is the size of truncated BPTT
-            if (sent_end == sent_start)
-            {
-                // should never happen, print debug info
-                RuntimeError("ERROR: in DecimateMinibatch, col_st=col_en=%d, nCol=%d, nBlock=%d, nParaUtts=%d, nGPU=%d\n",
-                    (int)sent_start, (int)nCols, (int)nBlocks, (int)nOrigParallelUtts, (int)numprocs);
-            }
-
-            MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), nSlices*nBlocks, mat.GetPreferredDeviceId(), mat.GetMatrixType());
-
-            // do the column slice for each block 
-            for (size_t iblock = 0; iblock < nBlocks; iblock++)
-            {
-                tmp.SetColumnSlice(mat.ColumnSlice(nOrigParallelUtts*iblock + sent_start, nSlices),
-                    iblock*nSlices, nSlices);
-            }
-            mat.SetValue(tmp);
-
-            // assert the cols are even among nodes 
-            if (0 == rv)
-            {
-                rv = mat.GetNumCols();
-            }
-            else
-            {
-                if (rv != mat.GetNumCols())
-                    throw std::logic_error("Uneven number of columns among inputs.");
-            }
-        }
-        // revise sentence boundary and packing flags
-        Matrix<float>  newBoundary(CPUDEVICE); // TODO: change Matrix<float> to a typedef
-        size_t nMBSize = PackingFlags.size(); 
-        newBoundary.Resize(nSlices, nMBSize);
-        newBoundary.AssignRowSliceValuesOf(SentenceBoundary, sent_start, nSlices);
-        fill(PackingFlags.begin(), PackingFlags.end(), MinibatchPackingFlag::None);
-        for (size_t nt = 0; nt < nMBSize; nt++)
-        {
-            for (size_t ns = 0; ns < nSlices; ns++)
-            {
-                if (newBoundary(ns, nt) == SEQUENCE_START)
-                    PackingFlags[nt] |= MinibatchPackingFlag::SequenceStart;
-                if (newBoundary(ns, nt) == SEQUENCE_END)
-                    PackingFlags[nt] |= MinibatchPackingFlag::SequenceEnd;
-            }
-        }
-       
- 
     }

-    return rv; 
-}
+    static GradientsUpdateType ParseGradUpdateType(wstring s)    // Maps a configuration string to a GradientsUpdateType; throws std::invalid_argument for unknown names.
+    {
+        msra::strfun::tolower_ascii(s);    // presumably lower-cases s in place, so the comparisons below ignore ASCII case
+        if (s == L"" || s == L"none" || s == L"normal" || s == L"simple")    // empty, "normal" and "simple" are aliases of "none"
+        {
+            return GradientsUpdateType::None;
+        }
+        else if (s == L"adagrad")
+        {
+            return GradientsUpdateType::AdaGrad;
+        }
+        else if (s == L"rmsprop")
+        {
+            return GradientsUpdateType::RmsProp;
+        }
+        else
+        {
+            throw std::invalid_argument(    // any other value is rejected
+                "ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are "
+                "(None | AdaGrad | RmsProp )");
+        }
+    }
+
+    static ParallelizationMethod ParseParallelizationMethod(wstring s)    // Maps a configuration string to a ParallelizationMethod; throws std::invalid_argument for unknown names.
+    {
+        msra::strfun::tolower_ascii(s);    // presumably lower-cases s in place, so the comparisons below ignore ASCII case
+        if ((s == L"") || (s == L"none"))    // an empty string also selects None
+        {
+            return ParallelizationMethod::None;
+        }
+        else if (s == L"dataparallelsgd")
+        {
+            return ParallelizationMethod::DataParallelSGD;
+        }
+        else if (s == L"modelaveragingsgd")
+        {
+            return ParallelizationMethod::ModelAveragingSGD;
+        }
+        else
+        {
+            throw std::invalid_argument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (None | DataParallelSGD | ModelAveragingSGD)");    // any other value is rejected
+        }
+    }
+
+    static LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s)    // Maps a configuration string to a LearningRateSearchAlgorithm; throws std::invalid_argument for unknown names.
+    {
+        msra::strfun::tolower_ascii(s);    // presumably lower-cases s in place, so the comparisons below ignore ASCII case
+        if (s == L"false" || s == L"none")    // "false" is an alias of "none"; unlike the sibling parsers, an empty string is NOT accepted here
+        {
+            return LearningRateSearchAlgorithm::None;
+        }
+        else if (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before")    // three spellings of the same option
+        {
+            return LearningRateSearchAlgorithm::SearchBeforeEpoch;
+        }
+        else if (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after")    // three spellings of the same option
+        {
+            return LearningRateSearchAlgorithm::AdjustAfterEpoch;
+        }
+        else
+        {
+            throw std::invalid_argument(    // any other value (including the empty string) is rejected
+                "autoAdjustLR: Invalid learning rate search type. Valid values are "
+                "(None | SearchBeforeEpoch | AdjustAfterEpoch)");
+        }
+    }

    template<class ElemType>
    SGD<ElemType>::SGD(const ConfigParameters& configSGD)
@ -594,7 +680,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
    void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
               IDataReader<ElemType>* trainSetDataReader,
               IDataReader<ElemType>* validationSetDataReader,
-               const DEVICEID_TYPE deviceID, const bool makeMode = true)
+               const DEVICEID_TYPE deviceID, const bool makeMode)
    {
        if (origModelFileName == L"" || trainSetDataReader == nullptr)
            InvalidArgument("origModel and trainSetDataReader should not be null.");
@ -644,7 +730,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
    template<class ElemType>
    void SGD<ElemType>::SequenceTrain(IComputationNetBuilder<ElemType>* netBuilder, wstring origModelFileName,
                       IDataReader<ElemType>* trainSetDataReader, IDataReader<ElemType>* validationSetDataReader,
-                       const DEVICEID_TYPE deviceID, const bool makeMode = true)
+                       const DEVICEID_TYPE deviceID, const bool makeMode)
    {
        if (netBuilder == nullptr || origModelFileName == L"" || trainSetDataReader == nullptr)
            InvalidArgument("netBuilder, origModel and trainSetDataReader should not be null.");
@ -711,11 +797,16 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
        }
    }

+    static double MomentumPerMB(double momentumPerSample, size_t minibatchSize)    // Converts a per-sample momentum into the momentum for a whole minibatch.
+    {
+        return pow(momentumPerSample, minibatchSize);    // momentumPerSample raised to the power minibatchSize
+    }
+
    template<class ElemType>
    void SGD<ElemType>::Train(IComputationNetBuilder<ElemType>* netBuilder,
               IDataReader<ElemType>* trainSetDataReader,
               IDataReader<ElemType>* validationSetDataReader,
-               const bool makeMode = true)
+               const bool makeMode)
    {
        if (netBuilder == nullptr || trainSetDataReader == nullptr)
            InvalidArgument("netBuilder and trainSetDataReader should not be null.\n");
@ -1449,7 +1540,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
                                         /*out*/ double& epochCriterion,
                                         /*out*/ std::vector<double>& epochEvalErrors,
                                         /*out*/ size_t& totalSamplesSeen,
-                                         std::string prefixMsg = "")
+                                         std::string prefixMsg)
    {
        TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize,
                      trainSetDataReader, learnRatePerSample, minibatchSize, featureNodes,
@ -1763,7 +1854,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
                         /*out*/ double& epochCriterion,
                         /*out*/ std::vector<double>& epochEvalErrors,
                         /*out*/ size_t& totalSamplesSeen,
-                         std::string prefixMsg = "")
+                         std::string prefixMsg)
    {
        // Since we are getting timing resolution of under microsecond we use double precision
        // to ensure that we have enough digits to represent small time measurements.
@ -2511,7 +2602,7 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
    }

    template<class ElemType>
-    wstring SGD<ElemType>::GetModelNameForEpoch(const int epoch, bool bLastModel = false)
+    wstring SGD<ElemType>::GetModelNameForEpoch(const int epoch, bool bLastModel)
    {
        int epoch1Base = epoch + 1;
        if (epoch1Base == m_maxEpochs || bLastModel)
@ -2557,108 +2648,6 @@ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<E
        return firstEpoch;
    }

-    static AdaptationRegType ParseAdaptationRegType(wstring s)
-    {
-        msra::strfun::tolower_ascii(s);
-        if (s == L"" || s == L"none")
-        {
-            return AdaptationRegType::None;
-        }
-        else if (s == L"kl" || s == L"klreg")
-        {
-            return AdaptationRegType::KL;
-        }
-        else
-        {
-            throw std::invalid_argument(
-                "ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are "
-                "(None | KL)");
-        }
-    }
-
-    static GradientsUpdateType ParseGradUpdateType(wstring s)
-    {
-        msra::strfun::tolower_ascii(s);
-        if (s == L"" || s == L"none" || s == L"normal" || s == L"simple")
-        {
-            return GradientsUpdateType::None;
-        }
-        else if (s == L"adagrad")
-        {
-            return GradientsUpdateType::AdaGrad;
-        }
-        else if (s == L"rmsprop")
-        {
-            return GradientsUpdateType::RmsProp;
-        }
-        else
-        {
-            throw std::invalid_argument(
-                "ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are "
-                "(None | AdaGrad | RmsProp )");
-        }
-    }
-
-    static ParallelizationMethod ParseParallelizationMethod(wstring s)
-    {
-        msra::strfun::tolower_ascii(s);
-        if ((s == L"") || (s == L"none"))
-        {
-            return ParallelizationMethod::None;
-        }
-        else if (s == L"dataparallelsgd")
-        {
-            return ParallelizationMethod::DataParallelSGD;
-        }
-        else if (s == L"modelaveragingsgd")
-        {
-            return ParallelizationMethod::ModelAveragingSGD;
-        }
-        else
-        {
-            throw std::invalid_argument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (None | DataParallelSGD | ModelAveragingSGD)");
-        }
-    }
-
-    static LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s)
-    {
-        msra::strfun::tolower_ascii(s);
-        if (s == L"false" || s == L"none")
-        {
-            return LearningRateSearchAlgorithm::None;
-        }
-        else if (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before")
-        {
-            return LearningRateSearchAlgorithm::SearchBeforeEpoch;
-        }
-        else if (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after")
-        {
-            return LearningRateSearchAlgorithm::AdjustAfterEpoch;
-        }
-        else {
-            throw std::invalid_argument(
-                "autoAdjustLR: Invalid learning rate search type. Valid values are "
-                "(None | SearchBeforeEpoch | AdjustAfterEpoch)");
-        }
-    }
-
-    //GradientsUpdateType GradUpdateType() const
-    //{
-    //    return m_gradType.mType;
-    //}
-    //
-    //double GradientUpdateNoiseStd() const
-    //{
-    //    return m_gradType.mGaussianNoiseInjectStd;
-    //}
-
-    static double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
-    {
-        return pow(momentumPerSample, minibatchSize);
-    }
-
-// public:
-
 #define EPSILON 1e-5

    template<class ElemType>
--- a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h
+++ b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h
@ -12,6 +12,7 @@
 #include <fstream>
 #include <queue>
 #include "Basics.h"
+#include "Helpers.h"    // for foreach_column() macro
 #include "fileutil.h"
 #include "DataReader.h"
 #include "DataWriter.h"
--- a/12
+++ b/12
@ -50,7 +50,7 @@ endif
 # The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
 CXX = mpic++

-INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK BrainScript
+INCLUDEPATH:= Common/Include Math/Math MachineLearning/CNTK MachineLearning/CNTKComputationNetworkLib MachineLearning/CNTKSGDLib BrainScript
 CPPFLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K
 CXXFLAGS:= -msse3 -std=c++0x -std=c++11 -fopenmp -fpermissive -fPIC -Werror
 LIBPATH:=
@ -355,15 +355,17 @@ endif

 CNTK_SRC =\
 	MachineLearning/CNTK/CNTK.cpp \
-	MachineLearning/CNTK/ComputationNode.cpp \
 	MachineLearning/CNTK/ModelEditLanguage.cpp \
 	MachineLearning/CNTK/NetworkDescriptionLanguage.cpp \
-	MachineLearning/CNTK/Profiler.cpp \
-	MachineLearning/CNTK/ComputationNetwork.cpp \
-	MachineLearning/CNTK/ComputationNetworkBuilder.cpp \
 	MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
 	MachineLearning/CNTK/SynchronousExecutionEngine.cpp \
 	MachineLearning/CNTK/tests.cpp \
+	MachineLearning/CNTKComputationNetworkLib/ComputationNode.cpp \
+	MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp \
+	MachineLearning/CNTKComputationNetworkLib/ComputationNetworkBuilder.cpp \
+	MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp \
+	MachineLearning/CNTKSGDLib/Profiler.cpp \
+	MachineLearning/CNTKSGDLib/SGD.cpp \
 	MachineLearning/CNTKEval/CNTKEval.cpp \
 	BrainScript/BrainScriptEvaluator.cpp \
 	BrainScript/BrainScriptParser.cpp \
--- a/Math/Math/Helpers.h
+++ b/Math/Math/Helpers.h
@ -3,7 +3,10 @@
 //     Copyright (c) Microsoft Corporation.  All rights reserved.
 // </copyright>
 //
+
 //helpful macros
+// TODO: the file's name is too general to be included from outside; MathHelpers.h?
+
 //iterators
 #pragma once
 #undef foreach_row
--- a/Math/Math/Matrix.h
+++ b/Math/Math/Matrix.h
@ -19,6 +19,7 @@
 #include "Basics.h"
 #include "File.h"
 #include "CommonMatrix.h"
+#include <limits.h>

 // This class is exported from the Math.dll
 namespace Microsoft { namespace MSR { namespace CNTK {