Conflicts:
	.gitignore
	Common/Include/fileutil.h
This commit is contained in:
ascii991218 2015-01-17 21:40:37 +08:00
Parent b5c41e6516 459ff2ded1
Commit 0443dc818d
28 changed files with 5202 additions and 148 deletions

.gitignore vendored
View File

@@ -166,3 +166,4 @@ bin/
LOG
*.log
core
*.lyx#

View File

@@ -31,6 +31,12 @@
#endif//__WINDOWS__
#include <stdio.h>
#ifdef MPI_SUPPORT
#include "mpi.h"
#endif
extern int myRank;
extern int numProcs;
// ---------------------------------------------------------------------------
// BestGpu class
// ---------------------------------------------------------------------------
@@ -86,6 +92,7 @@ public:
void Init();
void SetAllowedDevices(const std::vector<int>& devices); // only allow certain GPUs
bool DeviceAllowed(int device);
void DisallowDevice(int device) { m_allowedDevices &= ~(1 << device); } // clear this device's bit in the allowed-GPU bitmask
void AllowAll(); // reset to allow all GPUs (no allowed list)
bool UseMultiple(); // using multiple GPUs?
int GetDevice(BestGpuFlags flags = bestGpuNormal); // get a single device
@@ -120,8 +127,39 @@ DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config)
}
if (!_stricmp(val.c_str(), "Auto"))
{
#ifdef MPI_SUPPORT
// make sure deviceId is unique among processes on the same machine
g_bestGpu->AllowAll();
std::string MyName(getenv("COMPUTERNAME"));
for (int i = 0; i < numProcs; i++)
{
DEVICEID_TYPE yourDeviceId = deviceId;
if (myRank == i)
{
std::vector<int> devices = g_bestGpu->GetDevices(1);
deviceId = yourDeviceId = (DEVICEID_TYPE)devices[0];
}
MPI_Bcast(&yourDeviceId, 1, MPI_INT, i, MPI_COMM_WORLD);
{
INT32 YourSize = (INT32)MyName.length();
MPI_Bcast(&YourSize,1,MPI_INT,i,MPI_COMM_WORLD);
vector<char> YourName(YourSize+1);
if (myRank == i)
copy(MyName.begin(), MyName.end(), YourName.begin());
MPI_Bcast(YourName.data(), YourSize + 1, MPI_CHAR, i, MPI_COMM_WORLD);
if (myRank != i)
{
if (!_strcmpi(MyName.data(), YourName.data()))
{
g_bestGpu->DisallowDevice(yourDeviceId);
}
}
}
}
#else
std::vector<int> devices = g_bestGpu->GetDevices(1);
deviceId = (DEVICEID_TYPE)devices[0];
#endif
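// The loop above serializes device selection across ranks: on its turn, rank i
// picks the best remaining GPU, broadcasts the chosen deviceId and its
// COMPUTERNAME to every rank, and each other rank running on the same host
// removes that device from its allowed set, so processes sharing a machine
// never end up on the same GPU.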
}
else if (!_stricmp(val.c_str(), "All"))
{
@@ -466,6 +504,9 @@ void BestGpu::QueryNvmlData()
}
}
if (curPd == NULL)
continue;
// Get the memory usage, will only work for TCC drivers
result = nvmlDeviceGetMemoryInfo(device, &memory);
if (NVML_SUCCESS != result)

View File

@@ -379,6 +379,8 @@ template <> const wchar_t* GetFormatString(float);
template <> const wchar_t* GetFormatString(double);
template <> const wchar_t* GetFormatString(size_t);
template <> const wchar_t* GetFormatString(long long);
template <> const wchar_t* GetFormatString(const char*);
template <> const wchar_t* GetFormatString(const wchar_t*);
// GetScanFormatString - get the format string for a particular type
template <typename T>

View File

@@ -78,8 +78,10 @@ template <> const wchar_t* GetFormatString(unsigned int) {return L" %u";}
//template <> const wchar_t* GetFormatString(unsigned long) {return L" %lu";}
template <> const wchar_t* GetFormatString(float) {return L" %.9g";}
template <> const wchar_t* GetFormatString(double) {return L" %.17g";}
template <> const wchar_t* GetFormatString(size_t) { return L" %llu"; }
template <> const wchar_t* GetFormatString(size_t) {return L" %llu";}
template <> const wchar_t* GetFormatString(long long) {return L" %lli";}
template <> const wchar_t* GetFormatString(const char*) {return L" %hs";}
template <> const wchar_t* GetFormatString(const wchar_t*) {return L" %ls";}
// ----------------------------------------------------------------------------
// fgetText() specializations for fwscanf differences: get a value from a text file

View File

@@ -50,6 +50,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_convertLabelsToTargets = false;
m_numberOfuttsPerMinibatch = readerConfig("nbruttsineachrecurrentiter", "1");
if (m_numberOfuttsPerMinibatch < 1)
{
LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.");
}
if (!m_truncated && m_numberOfuttsPerMinibatch != 1)
{
LogicError("nbrUttsInEachRecurrentIter has to be 1 if Truncated is set to false.");
}
m_actualnumberOfuttsPerMinibatch = m_numberOfuttsPerMinibatch;
m_sentenceEnd.assign(m_numberOfuttsPerMinibatch, true);
m_processedFrame.assign(m_numberOfuttsPerMinibatch, 0);
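// Illustrative reader settings that satisfy the checks above (hypothetical values):
//   Truncated = true
//   nbruttsineachrecurrentiter = 4
// With Truncated = false, only nbruttsineachrecurrentiter = 1 is accepted.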

View File

@@ -64,6 +64,7 @@ template<class ElemType>
void CNTKEval<ElemType>::LoadModel(const std::wstring& modelFileName)
{
DEVICEID_TYPE deviceId = DeviceFromConfig(m_config);
fprintf(stderr, "DeviceID=%d\n", (int)deviceId);
if (m_net != NULL)
delete m_net;
m_net = new ComputationNetwork<ElemType>(deviceId);

View File

@@ -118,9 +118,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
//dump all nodes in the network to file
void DumpAllNodesToFile(const bool printValues, const std::wstring outputFile)
void DumpAllNodesToFile(const bool printValues, const std::wstring outputFile, const bool validateBeforeDump = true)
{
ValidateNetwork(); //some internal values in the nodes are computed during validation
if (validateBeforeDump)
ValidateNetwork(); //some internal values in the nodes are computed during validation
File fstream(outputFile, FileOptions::fileOptionsText | FileOptions::fileOptionsWrite);
@@ -1745,8 +1746,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
for (ComputationNodePtr node : FinalCriterionNodes())
{
PrintComputationTree(node, false);
if(!allowFragment) FormRecurentLoops(node);
PrintComputationTree(node, false);
size_t actualMBSize = this->GetActualMBSize();
this->SetActualMiniBatchSize(actualMBSize);
ValidateNetwork(node);
@@ -1759,8 +1760,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// now output nodes
if (OutputNodes().size() > 0)
{
for (ComputationNodePtr node : OutputNodes())
ValidateNetwork(node);
for (ComputationNodePtr node : OutputNodes())
{
if (!allowFragment) FormRecurentLoops(node);
ValidateNetwork(node);
}
}
else if (!allowFragment)
{
@@ -1769,8 +1773,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// now evaluation nodes
if (EvaluationNodes().size() > 0)
{
for (ComputationNodePtr node : EvaluationNodes())
ValidateNetwork(node);
for (ComputationNodePtr node : EvaluationNodes())
{
if (!allowFragment) FormRecurentLoops(node);
ValidateNetwork(node);
}
}
}
@@ -2039,6 +2046,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::vector<ComputationNodePtr> sourceLoopNodes;
getStrongSCC(rootNode);
std::list<ComputationNodePtr>& nodes = GetEvalOrder(rootNode, sourceLoopNodes);
std::list<ComputationNodePtr> nodesForGrad;
/// for debugging purposes
for (auto iter = m_recurrentInfo.begin(); iter != m_recurrentInfo.end(); iter++)
@@ -2080,7 +2088,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (auto iter = m_recurrentInfo.begin(); iter != m_recurrentInfo.end(); iter++)
{
// sort the recurrent nodes in their ascending name, which is the same as visiting nodes in G^R
if ((*iter).m_recurrentNodes.size() > 1 && (*iter).m_recurrentNodesForForward.size() == 0)
(*iter).m_recurrentNodesForForward.clear();
if ((*iter).m_recurrentNodes.size() > 1)
{
std::list<ComputationNodePtr> result;
std::unordered_set<ComputationNodePtr> visited;
@@ -2112,7 +2121,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
result.pop_front();
}
(*iter).m_recurrentNodes = (*iter).m_recurrentNodesForForward;
}
}
@@ -2124,12 +2133,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::list<ComputationNodePtr> noRecurrentNodes;
noRecurrentNodes = rootNode->ReshuffleNodes(recurrentNodes);
ReorderLoops(nodes, recurrentNodes, noRecurrentNodes);
nodes.sort(IsSmaller);
ReorderLoops(nodes, recurrentNodes, noRecurrentNodes);
m_cacheEvalOrders[rootNode] = nodes;
nodesForGrad = nodes;
nodesForGrad.reverse();
m_cacheGradientCalcOrders[rootNode] = nodesForGrad;
#ifdef DISPLAY_DEBUG
fprintf(stderr, "Reordered nodes\n");
@@ -2149,13 +2161,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::list<ComputationNodePtr> vTmp;
std::list<ComputationNodePtr> vRecurrentTmp;
int prevId = -1;
//int prevId = -1;
vector<bool> accessed;
accessed.assign(m_recurrentInfo.size(),false);
for (auto nodeIter=nodes.begin(); nodeIter != nodes.end(); nodeIter++)
{
int iId = FindInRecurrentLoop(*nodeIter);
if (iId >= 0)
{
if (prevId != iId && vRecurrentTmp.size() > 0)
if (! accessed[iId])
{
newList.insert(newList.end(), m_recurrentInfo[iId].m_recurrentNodes.begin(), m_recurrentInfo[iId].m_recurrentNodes.end());
accessed[iId] = true;
}
/*if (prevId != iId && vRecurrentTmp.size() > 0)
{
newList.insert(newList.end(), vRecurrentTmp.begin(), vRecurrentTmp.end());
vRecurrentTmp.clear();
@@ -2169,11 +2190,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
vRecurrentTmp.push_back(*nodeIter);
prevId = iId;
prevId = iId;*/
}
else
{
vTmp.push_back(*nodeIter);
//vTmp.push_back(*nodeIter);
newList.push_back(*nodeIter);
}
}

View File

@@ -4743,14 +4743,18 @@ protected: \
virtual void EvaluateThisNode(const size_t timeIdxInSeq)
{
Matrix<ElemType> sliceInput0Value = Inputs(0)->FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
Matrix<ElemType> sliceOutputValue = m_functionValues.ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
Matrix<ElemType> sliceOutputValue = Matrix<ElemType>();
Matrix<ElemType> sliceMask = Matrix<ElemType>();
if(m_dropoutRate > 0)
{
m_maskOfDropout.Resize(m_functionValues.GetNumRows(), m_functionValues.GetNumCols());
FunctionValues().Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols());
m_maskOfDropout.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols());
sliceMask = m_maskOfDropout.ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
}
}
sliceOutputValue = FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
EvaluateThisNodeS(m_dropoutRate, m_randomSeed, sliceOutputValue, sliceMask, sliceInput0Value);
}

View File

@@ -45,10 +45,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
IExecutionEngine<ElemType>* executionEngine,
const std::wstring& networkConfig,
const std::string& configParams,
const std::wstring& dumpFileName,
DEVICEID_TYPE deviceId=AUTOPLACEMATRIX)
{
m_executionEngine=executionEngine;
m_networkConfig=networkConfig;
m_dumpFileName = dumpFileName;
m_initialConfig=configParams;
m_deviceId=deviceId;
m_net=&(executionEngine->GetComputationNetwork());
@@ -69,6 +71,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
ConfigParameters newConfig;
ConfigValue networkConfig = config("networkDescription","");
ConfigValue dumpFileName = config("dumpFileName", "");
DEVICEID_TYPE deviceId = DeviceFromConfig(config);
unsigned long randomSeedOffset = config("randomSeedOffset","0");
auto executionEngine = new SynchronousExecutionEngine<ElemType>(deviceId, randomSeedOffset);
@@ -142,7 +145,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
Init(executionEngine, networkConfig, newConfig, deviceId);
Init(executionEngine, networkConfig, newConfig, dumpFileName, deviceId);
}
virtual ~NDLBuilder()
@@ -196,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_script.FileParse(fileContents);
NDLUtil<ElemType> ndlUtil(m_net);
ndlUtil.ProcessNDLScript(&m_script, ndlPassAll, nullptr, true);
ndlUtil.ProcessNDLScript(&m_script, ndlPassAll, nullptr, true, m_dumpFileName);
}
// SetFromConfig - Set the NDL script from a configuration string value
@@ -222,6 +225,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNetwork<ElemType>* m_net;
IExecutionEngine<ElemType>* m_executionEngine;
std::wstring m_networkConfig;
std::wstring m_dumpFileName;
std::string m_initialConfig;
DEVICEID_TYPE m_deviceId;

View File

@@ -88,7 +88,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// skipThrough - [in/out] for iterative processing, a pointer to an array of NDLNode*, one for each pass
// the pointer will be updated to last node processed for that pass, can be NULL if all node processing is desired
// fullValidate - validate as a complete network? (false if this might be a snippet of a full network)
void ProcessNDLScript(NDLScript<ElemType>* script, NDLPass ndlPassUntil=ndlPassAll, NDLNode<ElemType>** skipThrough=nullptr, bool fullValidate = false)
void ProcessNDLScript(NDLScript<ElemType>* script, NDLPass ndlPassUntil = ndlPassAll, NDLNode<ElemType>** skipThrough = nullptr, bool fullValidate = false, const std::wstring& dumpFileName = L"")
{
// if we don't have a script yet, don't bother
if (script == nullptr)
@@ -104,7 +104,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (NDLPass ndlPass=ndlPassInitial;ndlPass <= ndlPassUntil;++ndlPass)
{
NDLNode<ElemType>* skipThroughNode = skipThrough?*skipThrough:nullptr;
lastNode = ProcessPassNDLScript(script, ndlPass, skipThroughNode, fullValidate);
lastNode = ProcessPassNDLScript(script, ndlPass, skipThroughNode, fullValidate, dumpFileName);
if (skipThrough)
{
*skipThrough = lastNode;
@@ -119,13 +119,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// skipThrough - for iterative processing, skip through this node in the script (used for in-line MEL processing)
// fullValidate - validate as a complete network? (false if this might be a snippet of a full network)
// returns: last NDL node processed
NDLNode<ElemType>* ProcessPassNDLScript(NDLScript<ElemType>* script, NDLPass ndlPass, NDLNode<ElemType>* skipThrough=nullptr, bool fullValidate = false)
NDLNode<ElemType>* ProcessPassNDLScript(NDLScript<ElemType>* script, NDLPass ndlPass, NDLNode<ElemType>* skipThrough = nullptr, bool fullValidate = false, const std::wstring& dumpFileName = L"")
{
if (ndlPass == ndlPassFinal)
{
// make sure to clear the caches so we pick up the new nodes
m_net->ClearCaches();
// validate the network
if (dumpFileName != L"")
m_net->DumpAllNodesToFile(false, dumpFileName, false);
m_net->ValidateNetwork(!fullValidate);
}
SynchronousNodeEvaluator<ElemType> ndlEvaluator(*m_net);

View File

@@ -31,15 +31,33 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*> &mb)
{
size_t rv = 0;
if ( numProcs > 1 ) for (auto it = mb.begin(); it != mb.end(); ++it)
{
MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
size_t nCols = mat.GetNumCols();
size_t col_start = (nCols * myRank)/ numProcs;
size_t col_start = (nCols * myRank) / numProcs;
size_t col_end = (nCols*(myRank + 1)) / numProcs;
if (col_end > nCols) col_end = nCols; // this shouldn't happen
MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
mat.SetValue(tmp);
if (col_end == col_start)
{
MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
mat.SetValue(tmp);
}
else
{
MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
mat.SetValue(tmp);
}
if (0 == rv)
{
rv = mat.GetNumCols();
}
else
{
if (rv != mat.GetNumCols())
throw std::logic_error("Uneven number of columns among inputs.");
}
}
}
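// Worked example of the column split above: with nCols = 10 and numProcs = 4,
// integer division assigns rank 0 -> [0,2), rank 1 -> [2,5), rank 2 -> [5,7),
// rank 3 -> [7,10), so every column goes to exactly one rank and slice widths
// differ by at most one.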
@@ -537,9 +555,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
net.BuildPTaskGraph();
}
for (int i=int(startEpoch); i<int(m_maxEpochs); i++)
for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
{
auto t_start_epoch = clock();
auto t_start_epoch = clock();
// set other information to inputMatrices that can contain information
// used for class-based LM for clustering information
@@ -547,24 +565,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//set dropout rate
SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
//learning rate adjustment
if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
{
learnRatePerSample = m_learningRatesPerSample[i];
{
learnRatePerSample = m_learningRatesPerSample[i];
setMomentum(m_momentumInputPerMB[i]);
}
else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
}
else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
{
ElemType largestPrevLearnRatePerSample = prevLearnRates[0];
for (int j=1; j<m_numPrevLearnRates; j++)
for (int j = 1; j < m_numPrevLearnRates; j++)
{
largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]);
}
//return a reasonable learning rate based on the initial mbsize
learnRatePerSample = SearchLearnRateBeforeEpoch(net, refNet, refNode, i, learnRatePerSample, trainSetDataReader, FeatureNodes,
labelNodes,criterionNodes,evaluationNodes, inputMatrices,learnableNodes,smoothedGradients, learnRateInitialized, largestPrevLearnRatePerSample);
labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, learnRateInitialized, largestPrevLearnRatePerSample);
prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; //save per sample learn rate to support changeable mbsize
}
@@ -573,18 +591,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (learnRatePerSample < m_minLearnRate)
{
fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i+1, learnRatePerSample, m_minLearnRate);
fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate);
if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None)
net.SaveToFile(m_modelPath);
break;
}
TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample,FeatureNodes,labelNodes,
criterionNodes,evaluationNodes,inputMatrices, learnableNodes,smoothedGradients,
#ifdef MPI_SUPPORT
INT32 mySamples = (INT32)
#endif
TrainOneEpoch(net, refNet, refNode, i, m_epochSize, trainSetDataReader, learnRatePerSample, FeatureNodes, labelNodes,
criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients,
epochCriterion, epochEvalErrors, totalSamplesSeen);
auto t_end_epoch = clock();
ElemType epochTime = ElemType(1.0)*(t_end_epoch-t_start_epoch)/(CLOCKS_PER_SEC);
ElemType epochTime = ElemType(1.0)*(t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
fprintf(stderr, "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
if (epochEvalErrors.size() == 1)
@@ -604,21 +625,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#ifdef MPI_SUPPORT
// model reduction and averaging
if ( numProcs > 0 )
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
if (numProcs > 0)
{
ComputationNodePtr node = (*nodeIter);
Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->FunctionValues();
ElemType *px = mat.CopyToArray();
size_t nx = mat.GetNumElements();
vector<ElemType> py = vector<ElemType>(nx, ElemType(0));
// TODO: Replace this with the reduction-shuffle-dance
MPI_Reduce(px, &(py[0]), (int)nx, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (myRank == 0)
transform(py.begin(), py.end(), py.begin(), [](ElemType&val)->ElemType{return val / (ElemType)numProcs; });
MPI_Bcast(&(py[0]), nx, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, 0, MPI_COMM_WORLD);
mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), &(py[0]));
delete px;
ElemType factor; // weight for the parameter of my model
{
// compute total minibatch size
INT32 allSamples = 0;
MPI_Allreduce(&mySamples, &allSamples, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
if (allSamples == 0) allSamples = 1;
factor = (ElemType)mySamples / (ElemType)allSamples;
}
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
{
ComputationNodePtr node = (*nodeIter);
Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->FunctionValues();
// weight model by relative size of minibatch samples (and number of processors, for averaging)
ElemType *px = mat.CopyToArray();
size_t nx = mat.GetNumElements();
transform(px, px + nx, px, [factor](ElemType&val)->ElemType{return val * factor; });
// TODO: Replace default Allreduce with the reduction-shuffle-dance
vector<ElemType> py = vector<ElemType>(nx, ElemType(0));
MPI_Allreduce(px, &(py[0]), (int)nx, sizeof(ElemType) == 4 ? MPI_FLOAT : MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), &(py[0]));
delete[] px; // CopyToArray() allocates with new[], so use array delete
}
}
#endif
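// Numeric sketch of the weighting above: if two ranks processed 300 and 100
// samples, allSamples = 400, so factor is 0.75 and 0.25 respectively; the
// MPI_Allreduce of the pre-scaled parameters then leaves every rank holding
// the sample-weighted average of the models, with no separate broadcast step.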
@@ -932,7 +966,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, smoothedGradients, prevCriterion);
}
void TrainOneEpoch(ComputationNetwork<ElemType>& net, ComputationNetwork<ElemType>& refNet, const ComputationNodePtr refNode,
size_t TrainOneEpoch(ComputationNetwork<ElemType>& net, ComputationNetwork<ElemType>& refNet, const ComputationNodePtr refNode,
const int epochNumber, const size_t epochSize,
IDataReader<ElemType>* trainSetDataReader, const ElemType learnRatePerSample,
const std::vector<ComputationNodePtr>& FeatureNodes,
@@ -1006,6 +1040,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
UpdateEvalTimeStamps(labelNodes);
size_t actualMBSize = net.GetActualMBSize();
if (0 == actualMBSize)
continue;
net.SetActualMiniBatchSize(actualMBSize);
net.SetActualNbrSlicesInEachRecIter(trainSetDataReader->NumberSlicesInEachRecurrentIter());
@@ -1151,6 +1187,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
epochEvalErrors[i] = (const ElemType)localEpochEvalErrors(0,i);
}
}
return totalEpochSamples;
}
public:
// UpdateWeightsS - static version of UpdateWeights()

View File

@@ -200,12 +200,6 @@
</CustomBuildStep>
</ItemDefinitionGroup>
<ItemGroup>
<Text Include="config.txt">
<DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
<DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
</Text>
<Text Include="DefaultMacros.txt" />
<Text Include="modelEditor.txt" />
<Text Include="modelEditorFromScratch.txt" />

View File

@@ -146,9 +146,6 @@
<Text Include="modelEditorFromScratch.txt">
<Filter>Model Editing</Filter>
</Text>
<Text Include="config.txt">
<Filter>Main</Filter>
</Text>
<Text Include="DefaultMacros.txt">
<Filter>Main</Filter>
</Text>

View File

@@ -95,6 +95,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_numRows = 0;
m_numCols = 0;
m_elemSizeAllocated = 0;
m_compIndexSize = 0;
m_externalBuffer = false;
m_computeDevice = CPUDEVICE;
m_nz = 0;
@@ -181,11 +182,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::logic_error("CPUSparseMatrix: unsupported SetValue() call.");
}
if(m_elemSizeAllocated < m_nz +1) {
throw std::logic_error("CPUSparseMatrix: allocated size is too small.");
if(m_elemSizeAllocated < m_nz +1) //automatic resize
{
Resize(m_numRows, m_numCols, m_nz + 100); //allocate 100 more elements and keep existing values
}
if(rIdx < 0 || rIdx >= m_numRows) {
if(rIdx < 0 || rIdx >= m_numRows)
{
throw std::logic_error("CPUSparseMatrix: SetValue() invalid row id");
}
@@ -228,43 +231,62 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t size)
void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve, const bool growOnly, const bool keepExistingValues)
{
m_nz = 0;
m_colIdx = -1;
size_t newCompIndexSize = (numCols > numRows ? numCols : numRows) + 1;
bool reallocate = (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly) || m_compIndexSize < newCompIndexSize);
m_numRows = numRows;
m_numCols = numCols;
if(m_elemSizeAllocated < size)
m_numCols = numCols;
if (reallocate)
{
m_elemSizeAllocated = size;
if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
{
if(m_pArray != NULL)
ElemType *pArray = new ElemType[numNZElemToReserve];
size_t *unCompIndex = new size_t[numNZElemToReserve];
size_t *compIndex = new size_t[newCompIndexSize];
if (keepExistingValues && m_nz > 0)
{
memcpy(pArray, m_pArray, sizeof(ElemType)*m_nz);
memcpy(unCompIndex, m_unCompIndex, sizeof(size_t)*m_nz);
memcpy(compIndex, m_compIndex, sizeof(size_t)*m_compIndexSize);
}
if (m_pArray != NULL)
delete[] m_pArray;
if(m_unCompIndex != NULL)
if (m_unCompIndex != NULL)
delete[] m_unCompIndex;
if(m_compIndex != NULL)
delete[] m_compIndex;
//int len = m_format == MatrixFormat::matrixFormatSparseCSC ? numCols : numRows;
size_t len = numCols > numRows ? numCols : numRows;
m_pArray = new ElemType[size];
m_unCompIndex = new size_t[size];
m_compIndex = new size_t[len+1];
}
if (m_compIndex != NULL)
delete[] m_compIndex;
m_pArray = pArray;
m_unCompIndex = unCompIndex;
m_compIndex = compIndex;
}
else if(m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
if(m_blockVal != NULL)
ElemType *blockVal = new ElemType[numNZElemToReserve];
size_t *blockIds = new size_t[newCompIndexSize];
if (keepExistingValues && m_elemSizeAllocated > 0)
{
memcpy(blockVal, m_blockVal, sizeof(ElemType)*m_elemSizeAllocated);
memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize);
}
if (m_blockVal != NULL)
delete[] m_blockVal;
if(m_blockIds != NULL)
delete[] m_blockIds;
size_t max = numCols > numRows ? numCols : numRows;
m_blockVal = new ElemType[size];
m_blockIds = new size_t[max];
m_blockVal = blockVal;
m_blockIds = blockIds;
}
m_elemSizeAllocated = numNZElemToReserve;
m_compIndexSize = newCompIndexSize;
}
}
@@ -274,6 +296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_nz = 0;
m_colIdx = -1;
m_compIndexSize = 0;
m_blockSize = 0;
}

View File

@@ -86,7 +86,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int GetComputeDeviceId() const {return -1;}
void Resize(const size_t numRows, const size_t numCols, size_t size = 0);
void Resize(const size_t numRows, const size_t numCols, size_t numNZElemToReserve = 0, const bool growOnly = true, const bool keepExistingValues = true);
void Reset();
public:
@@ -133,6 +133,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
private:
int m_colIdx; //used to SetValue()
size_t m_compIndexSize;
//non-zero values are stored in m_pArray
size_t *m_unCompIndex; //row/col ids in CSC/CSR format
size_t *m_compIndex; //begin ids of col/row in CSC/CSR format

View File

@@ -85,6 +85,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void SetOwnBuffer(bool own) {m_externalBuffer = !own;}
wchar_t* GetMatrixName() const { return m_matrixName; }
size_t NzCount() const {return m_nz;}
void SetNzCount(const size_t nz) { m_nz = nz; }
size_t GetSizeAllocated() const {return m_elemSizeAllocated; }
void SetMatrixName(const wchar_t* s)
{

View File

@@ -130,6 +130,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
deepCopy.PrepareDevice();
Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.m_nz, deepCopy.m_format);
m_nz = deepCopy.m_nz;
CUDACALL(cudaMemcpy(NzValues(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
CUDACALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
CUDACALL(cudaMemcpy(SecondaryIndexLocation(), deepCopy.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));
@@ -199,6 +200,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//we need to do conversion because CPUSparseMatrix uses size_t for indexes while GPUSparseMatrix uses int
GPUSPARSE_INDEX_TYPE *h_CSRRow, *h_Col;
cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumNZElements());
cpuSparseMatrix.SetNzCount(GetNumNZElements());
PrepareDevice();
h_CSRRow = new GPUSPARSE_INDEX_TYPE[m_numRows + 1];
@@ -219,6 +221,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//we need to do conversion because CPUSparseMatrix uses size_t for indexes while GPUSparseMatrix uses int
GPUSPARSE_INDEX_TYPE *h_CSCCol, *h_Row;
cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumNZElements());
cpuSparseMatrix.SetNzCount(GetNumNZElements());
PrepareDevice();
h_CSCCol = new GPUSPARSE_INDEX_TYPE[m_numCols + 1];
@@ -322,6 +325,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
outMatrix.ChangeDeviceTo(GetComputeDeviceId());
outMatrix.Resize(m_numRows, m_numCols, m_nz,newFormat);
outMatrix.SetNzCount(m_nz);
if (oldFormat == matrixFormatSparseCSR && newFormat == matrixFormatSparseCSC)
{
@@ -475,6 +479,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CUDACALL(cudaEventDestroy(done));
Resize(numRows, numCols, nnzTotalDevHostPtr, matrixFormat);
SetNzCount(nnzTotalDevHostPtr);
CUDACALL(cudaEventCreate(&done));
@@ -605,6 +610,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void GPUSparseMatrix<ElemType>::ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a, const bool growOnly /*= true*/)
{
Resize(a.m_numRows, a.m_numCols, a.m_nz, a.m_format, growOnly);
SetNzCount(a.m_nz);
CUDACALL(cudaMemcpy(MajorIndexLocation(), a.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
CUDACALL(cudaMemcpy(SecondaryIndexLocation(), a.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));
@@ -630,30 +636,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ)
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly)
{
Resize(numRows, numCols, numNZ, GetFormat(), true);
Resize(numRows, numCols, numNZElemToReserve, GetFormat(), growOnly);
}
//WARNING: When memory is reallocated, existing information will be lost; the workaround is to allocate enough memory from the start.
//TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
template<class ElemType>
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat, const bool growOnly /*= true*/)
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/)
{
m_numRows = numRows;
m_numCols = numCols;
m_nz = numNZ;
if (matrixFormat == MatrixFormat::matrixFormatSparseCSC || matrixFormat == MatrixFormat::matrixFormatSparseCSR)
{
bool reallocate = (m_totalBufferSizeAllocated < BufferSizeNeeded() || (!growOnly && m_totalBufferSizeAllocated > BufferSizeNeeded()));
size_t bufferSizeNeeded = BufferSizeNeeded(numNZElemToReserve);
bool reallocate = (m_totalBufferSizeAllocated < bufferSizeNeeded || (!growOnly && m_totalBufferSizeAllocated > bufferSizeNeeded));
if (reallocate)
{
if (!OwnBuffer())
throw logic_error("Cannot Resize since the buffer is managed externally.");
m_totalBufferSizeAllocated = BufferSizeNeeded();
m_elemSizeAllocated = numNZ;
if (m_pArray != nullptr)
CUDACALL(cudaFree(m_pArray));
if (m_block2Id != nullptr)
@@ -663,21 +668,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrepareDevice();
CUDACALL(cudaMalloc((void **)&m_pArray, m_totalBufferSizeAllocated));
CUDACALL(cudaMalloc((void **)&m_pArray, bufferSizeNeeded));
CUDACALL(cudaMalloc((void **)&m_block2Id, sizeof(size_t)*(numCols * 2)));
CUDACALL(cudaMalloc((void **)&m_block2UniqId, sizeof(size_t)*(numCols * 2)));
m_totalBufferSizeAllocated = bufferSizeNeeded;
m_elemSizeAllocated = numNZElemToReserve;
}
}
else if (matrixFormat == MatrixFormat::matrixFormatSparseBlockCol || matrixFormat == MatrixFormat::matrixFormatSparseBlockRow)
{
if (m_blockVal != nullptr)
CUDACALL(cudaFree(m_blockVal));
if (m_blockIds != nullptr)
CUDACALL(cudaFree(m_blockIds));
PrepareDevice();
CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZ));
int max = numCols > numRows ? numCols : numRows;
CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*max));
if (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly))
{
if (m_blockVal != nullptr)
CUDACALL(cudaFree(m_blockVal));
if (m_blockIds != nullptr)
CUDACALL(cudaFree(m_blockIds));
PrepareDevice();
CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZElemToReserve));
int max = numCols > numRows ? numCols : numRows;
CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*max));
m_elemSizeAllocated = numNZElemToReserve;
}
}
else
NOT_IMPLEMENTED;
@@ -701,6 +714,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_format = matrixFormatSparseCSR;
Resize(numRows, numCols, nz);
SetNzCount(nz);
cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
CUDACALL(cudaMemcpy(RowLocation(), h_CSRRow, RowSize(), kind));
@@ -741,6 +755,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SetComputeDeviceId(devId);
m_format = matrixFormatSparseCSC;
Resize(numRows, numCols, nz);
SetNzCount(nz);
cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
CUDACALL(cudaMemcpy(RowLocation(), h_Row, RowSize(), kind));
@@ -792,6 +807,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_format = matrixFormatSparseCSC;
Resize(m_numRows, m_numCols, labelSize);
SetNzCount(labelSize);
m_expandedSize = expandedSize;
m_blockSize = blockSize;
@@ -1320,6 +1336,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// now we know the number of Non-zeros in the result set, set the output size
c.Resize(m, n, nnzC);
c.m_nz = nnzC;
CUDACALL(cudaMemcpy(c.SecondaryIndexLocation(),csrRowPtrC,c.SecondaryIndexSize(),cudaMemcpyDeviceToDevice));
// if we allocated the buffer, free it here
@@ -1805,6 +1823,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrepareDevice();
GPUSparseMatrix c(GetFormat(), GetComputeDeviceId());
c.Resize(n, m, nnz, GetFormat());
c.m_nz = nnz;
cusparseHandle_t cusparseHandle = 0;
CUSPARSECALL(cusparseCreate(&cusparseHandle));
@@ -2283,6 +2302,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
NOT_IMPLEMENTED;
us.Resize(rownum, colnum, nz);
us.SetNzCount(nz);
if (nz > 0)
{

View File

@@ -77,7 +77,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t MajorIndexSize() const { return sizeof(GPUSPARSE_INDEX_TYPE)*MajorIndexCount(); } // actual number of major index bytes in use
GPUSPARSE_INDEX_TYPE* SecondaryIndexLocation() const { return MajorIndexLocation() + m_elemSizeAllocated; } //this is the compressed index, col/row in CSC/CSR format
size_t SecondaryIndexCount() const
size_t SecondaryIndexCount(const size_t numNZ) const
{
if (m_format&matrixFormatCompressed)
{
@@ -86,12 +86,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return cnt;
}
else
return m_nz; // COO format
return numNZ; // COO format
}
size_t SecondaryIndexCount() const
{
return SecondaryIndexCount(m_nz);
}
// get size for compressed index
size_t SecondaryIndexSize() const { return (SecondaryIndexCount())*sizeof(GPUSPARSE_INDEX_TYPE); }
size_t BufferSizeNeeded() const { return NzSize() + MajorIndexSize() + SecondaryIndexSize(); }
size_t BufferSizeNeeded(const size_t numNZ) const
{ return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
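// Rough sizing sketch (assuming CSC format, 4-byte elements and 4-byte
// GPUSPARSE_INDEX_TYPE): reserving numNZ = 1000 nonzeros in a 50-column matrix
// needs 1000*4 (values) + 1000*4 (row indices) + 51*4 (column offsets) bytes.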
size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
ElemType* BufferPointer() const;
@@ -107,8 +116,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void SetValue(const GPUMatrix<ElemType>& denseMatrix);
void ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a, const bool growOnly = true);
void Resize(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat, const bool growOnly = true); //matrix format will affect the size to allocate
void Resize(const size_t numRows, const size_t numCols, const size_t numNZ);
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly = true); //matrix format will affect the size to allocate
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly = true);
GPUSparseMatrix<ElemType> Transpose() const;
void InplaceTranspose();

View File

@@ -925,7 +925,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void Matrix<ElemType>::SetValue(const size_t rIdx, const size_t cIdx, ElemType val)
{
DISPATCH_MATRIX_ON_FLAG(this,
DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(this,
this,
(*m_CPUMatrix)(rIdx, cIdx) = val,
NOT_IMPLEMENTED,
@@ -1150,26 +1150,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly /*=true*/)
void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve /*=0*/, bool growOnly /*=true*/)
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->Resize(numRows,numCols,growOnly),
m_GPUMatrix->Resize(numRows,numCols,growOnly),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED
);
}
template<class ElemType>
void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t allocatedSize)
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
m_CPUSparseMatrix->Resize(numRows,numCols, allocatedSize),
m_GPUSparseMatrix->Resize(numRows,numCols, allocatedSize)
m_CPUSparseMatrix->Resize(numRows, numCols, numNZElemToReserve, growOnly),
m_GPUSparseMatrix->Resize(numRows, numCols, numNZElemToReserve, growOnly)
);
}
@@ -3069,11 +3057,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
delete m_CPUSparseMatrix;
m_CPUSparseMatrix = NULL;
SetDataLocation(GPU, DENSE);
SetDataLocation(GPU, SPARSE);
}
else
{
SetDataLocation(BOTH, DENSE);
SetDataLocation(BOTH, SPARSE);
}
}
else //from GPU

View File

@@ -112,8 +112,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN);
void Reshape(const size_t numRows, const size_t numCols);
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); //by default we only reallocate if need to grow
void Resize(const size_t numRows, const size_t numCols, const size_t allocatedSize); //for sparse matrix
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 0, bool growOnly = true); //by default we only reallocate if need to grow
size_t GetAllocatedSize() const;
void Reset(); //reset for sparse matrix

View File

@@ -73,7 +73,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType> void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat, const bool growOnly = true) {}//matrix format will affect the size to allocate
template<class ElemType> void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ) {}
template<class ElemType> void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZ, const bool growOnly = true) {}
template<class ElemType> GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::CopyToDenseMatrix() const
{

File diff suppressed because it is too large. Load Diff

Binary file not shown.

View File

@@ -115,8 +115,8 @@ Jie Gao, Avner May, Baolin Peng, Andreas Stolcke, Malcolm Slaney
\end_layout
\begin_layout Date
MSR-TR-2014-112 (DRAFT v0.3: Nov.
23, 2014)
MSR-TR-2014-112 (DRAFT v0.4: Jan.
4, 2015)
\end_layout
\begin_layout Standard

View File

@@ -3265,7 +3265,7 @@ status open
\begin_layout Plain Layout
Delay(m, [delayTime=1, defaultPastValue=0.1])
Delay(rows, [cols], m, [delayTime=1, defaultPastValue=0.1])
\end_layout
\end_inset
@@ -3273,6 +3273,18 @@ Delay(m, [delayTime=1, defaultPastValue=0.1])
\end_layout
\begin_layout Itemize
rows - the number of rows in the delay node (and in the input matrix).
This parameter is needed because under some loopy conditions the dimensions
cannot be automatically inferred from the input matrix.
\end_layout
\begin_layout Itemize
cols - the number of columns in the delay node (and in the input matrix).
This parameter is optional since it will be set based on the minibatch
size during training and testing.
\end_layout
\begin_layout Itemize
m - input matrix to be delayed.
Each column is a sample.
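For illustration of the extended form above (the node name and the row dimension are invented for the example): pastH = Delay(512, h, delayTime=1, defaultPastValue=0.1) fixes the row dimension at 512 so it need not be inferred inside a loop, while cols is omitted and picked up from the minibatch size.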

View File

@@ -1638,8 +1638,21 @@ loadBestModel
\end_layout
\begin_layout Itemize
learnRateAdjustInterval: determine the frequency of applying the learning
rate adjustment check.
\emph on
learnRateAdjustInterval
\begin_inset Index idx
status open
\begin_layout Plain Layout
learnRateAdjustInterval
\end_layout
\end_inset
\emph default
: determine the frequency of applying the learning rate adjustment check.
Default is 1 epoch.
If this value is set to a value larger than 1 the learning rate adjustment
will be based on the average criterion computed from the last learnRateAdjustIn
@@ -1776,9 +1789,113 @@ gradUpdateType
: gradient update type.
Valid values are None (default, no special treatment to the gradient),
AdaGrad, and RmsProp.
When gradUpdateType equals to RmsProp, you can control the behavior of
the gradient update using following parameters:
\end_layout
\begin_deeper
\begin_layout Itemize
\emph on
rms_wgt_inc
\emph default
\begin_inset Index idx
status open
\begin_layout Plain Layout
rms_wgt_inc
\end_layout
\end_inset
: multiplicative increment of the learning rate scale.
Default is 1.2.
\end_layout
\begin_layout Itemize
\emph on
rms_wgt_dec
\emph default
\begin_inset Index idx
status open
\begin_layout Plain Layout
rms_wgt_dec
\end_layout
\end_inset
: multiplicative decrement of the learning rate scale.
Default is 0.75.
\end_layout
\begin_layout Itemize
\emph on
rms_wgt_max
\emph default
\begin_inset Index idx
status open
\begin_layout Plain Layout
rms_wgt_max
\end_layout
\end_inset
: maximum learning rate scale allowed.
A value closer to 1 makes the learning rate adjustment more stable but
slower.
Default is 10.
\end_layout
\begin_layout Itemize
\emph on
rms_wgt_min
\emph default
\begin_inset Index idx
status open
\begin_layout Plain Layout
rms_wgt_min
\end_layout
\end_inset
: minimum learning rate scale allowed.
A value closer to 1 makes the learning rate adjustment more stable but
slower.
Default is 0.1.
\end_layout
\begin_layout Itemize
\emph on
rms_gamma
\emph default
\begin_inset Index idx
status open
\begin_layout Plain Layout
rms_gamma
\end_layout
\end_inset
: smoothing factor used to estimate the moving average of the variance.
The smaller the value, the quicker it forgets the past information.
Default is 0.99.
\end_layout
\end_deeper
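Put together, a plausible SGD configuration fragment using these settings (the values shown are the documented defaults; the block syntax follows the usual CNTK configuration style) is:

SGD = [
    gradUpdateType = RmsProp
    rms_wgt_inc = 1.2
    rms_wgt_dec = 0.75
    rms_wgt_max = 10
    rms_wgt_min = 0.1
    rms_gamma = 0.99
]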
\begin_layout Itemize
\emph on
@@ -4366,6 +4483,60 @@ minibatchSize
– the minibatch size to use when creating the label mapping file.
\end_layout
\begin_layout Section
ConvertDBN Command
\begin_inset Index idx
status open
\begin_layout Plain Layout
ConvertDBN Command
\end_layout
\end_inset
\end_layout
\begin_layout Standard
This command is used to convert a model generated by Microsoft's dbn.exe
tool to a CNTK model.
It is useful when you want to compare the performance of the two tools
(dbn.exe only supports simple fully connected deep neural networks), port
existing models trained with dbn.exe to CNTK, or use the RBM pre-training
that is available in dbn.exe but not yet in CNTK.
The related parameters are
\end_layout
\begin_layout Itemize
modelPath
\begin_inset Index idx
status open
\begin_layout Plain Layout
modelPath
\end_layout
\end_inset
– the full path of the generated CNTK model.
\end_layout
\begin_layout Itemize
dbnModelPath
\begin_inset Index idx
status open
\begin_layout Plain Layout
dbnModelPath
\end_layout
\end_inset
– the full path of the model to be converted.
\end_layout
\begin_layout Section
Additional Top-Level Configurations
\end_layout

View File

@@ -116,10 +116,10 @@ At the center of the CNTK is the ComputationNetwork class, which manages
the life span of computation nodes comprising the network and all the functions
operating at the network level such as forward computations and gradient
calculations.
To build a computational network you need to use one of the ComputationNetBuild
er classes that implement the IComputationNetBuilder interface.
To build a computational network you need to use one of the computational
network builder classes that implement the IComputationNetBuilder interface.
These classes include SimpleNetworkBuilder that supports building simple
layer-by-layer fully connected networks,
layer-by-layer fully connected networks and
\begin_inset Index idx
status open
@@ -149,8 +149,7 @@ LSTM
\end_inset
) neural networks.
It also includes NDLNetworkBuilder that can build neural network, using
) RNNs, as well as NDLNetworkBuilder that builds neural networks, using
any computation node we have described in Section
\begin_inset CommandInset ref
LatexCommand ref
@@ -181,7 +180,7 @@ IDataReader
\end_inset
is an interface for loading data and its transcriptions.
Different data file format requires different data readers.
Different data file formats require different data readers.
CNTK already implements the UCIFastReader and the BinaryReader that reads
in UCI data in either text or binary format, the HTKMLFReader that reads
in HTK/MLF speech data, the SequenceReader that is designed for language

View File

@@ -159,7 +159,7 @@ key "Variable-Component-Deep-Neural-Network:2014"
Conventionally, one needs to design the network, derive the derivatives
needed to optimize the network, implement the algorithm, and then run the
experiments.
These steps are error pronoe and time consuming.
These steps are error prone and time consuming.
With CNTK, however in many cases, you only need to write a simple configuration
file.
The rest of this chapter describes the configuration file needed to implement
@@ -819,7 +819,7 @@ status open
\begin_layout Plain Layout
cn.exe config=Simple.config
cn.exe configFile=Simple.config
\end_layout
\end_inset
@@ -838,7 +838,7 @@ status open
\begin_layout Plain Layout
cn config=Simple.config
cn configFile=Simple.config
\end_layout
\end_inset