diff --git a/MachineLearning/CNTK/CNTK.cpp b/MachineLearning/CNTK/CNTK.cpp
index 1e1aed59e..41428acc5 100644
--- a/MachineLearning/CNTK/CNTK.cpp
+++ b/MachineLearning/CNTK/CNTK.cpp
@@ -1657,7 +1657,7 @@ void PrintBuiltInfo()
     fprintf(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
 #endif 
 #ifdef _CUB_PATH_
-    fprintf(stderr, "\t\tCUDA_PATH: %s\n", _CUB_PATH_);
+    fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
 #endif 
 #ifdef _GIT_EXIST
     fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index 4ea6f5a83..deaf37484 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -294,7 +294,7 @@ public:
         m_deviceId = EnforceOneGPUOnly(m_deviceId);      // see EnforceOneGPUOnly() for comment on what this is
     }
 
-    DEVICEID_TYPE GetDeviceId() { return m_deviceId; }
+    DEVICEID_TYPE GetDeviceId() const { return m_deviceId; }
 
     unsigned long GetRandomSeedOffset() { return m_randomSeedOffset; }
     void SetRandomSeedOffset(unsigned long value) { m_randomSeedOffset = value; }
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
index 2296f3e4c..4dfd11bee 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@@ -119,6 +119,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         virtual ~IComputationNode() { }
     };
 
+    // =======================================================================
+    //  This provides an interface for stateful nodes (e.g., DelayedValueNodeBase) and the definition of their state.
+    //  The interface allows a node's state to be exported to, and imported from, elsewhere.
+    //  It is needed for the sub-minibatch implementation.
+    // =======================================================================
+
+    class INodeState: public std::enable_shared_from_this<INodeState>   // opaque base class of an exported node state; concrete state types derive from it
+    {
+    public:
+        virtual ~INodeState() {}   // virtual, so a derived state can be destroyed through an INodeState pointer
+    };
+
+    struct /*interface*/ IStateFulNode   // implemented by nodes whose state can be exported and imported
+    {
+        typedef std::shared_ptr<INodeState> NodeStatePtr;
+        virtual NodeStatePtr ExportState() = 0;                                 // hands out the node's current state
+        virtual void ImportState(const NodeStatePtr& pImportedState) = 0;      // takes over a state object, presumably one obtained earlier from ExportState()
+    };
+
     // =======================================================================
     // ComputationNetworkOwnedNodeState -- class to collect ComputationNode members that are really owned by ComputationNetwork
     // These members are only to be set, changed, and read by ComputationNetwork code.
@@ -1475,6 +1494,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
     };
 
+
+
     // =======================================================================
     // helper macro to ease access to base members in presence of C++ two-phase name lookup
     // =======================================================================
diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
index c900963e0..20d850cbf 100644
--- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
@@ -24,6 +24,51 @@
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
+    // -----------------------------------------------------------------------
+    // DelayedValueNodeState -- the state of a delay node that is exported to others (saved for the next minibatch).
+    // It holds a copy of the node's delayed MBLayout and, optionally, the cached activity itself.
+    // -----------------------------------------------------------------------
+    template<class ElemType>
+    class DelayedValueNodeState: public INodeState
+    {
+        public:
+            // creates an empty state: a 0 x 0 matrix on 'deviceID', no layout, IsEmpty() == true
+            DelayedValueNodeState(int deviceID) :
+                m_cachedActivity((size_t)0, (size_t)0, deviceID), m_delayedActivationMBLayout(nullptr), m_isEmpty(true)
+            { }
+            // keeps a private copy of 'pMBLayout' (a fresh MBLayout object is created to hold it)
+            void CacheDelayedMBLayout(const MBLayoutPtr& pMBLayout)
+            {
+                m_delayedActivationMBLayout = make_shared<MBLayout>();
+                m_delayedActivationMBLayout->CopyFrom(pMBLayout);
+            }
+            // stores 'cachedActivity' via SetValue() and marks the state as non-empty
+            void CacheState(const Matrix<ElemType>& cachedActivity)
+            {
+                m_cachedActivity.SetValue(cachedActivity);
+                m_isEmpty = false;
+            }
+            // copies the cached layout into the layout object that 'pMBLayout' points to ('pMBLayout' must be non-null)
+            void ExportDelayedMBLayout(MBLayoutPtr& pMBLayout) const
+            {
+                pMBLayout->CopyFrom(m_delayedActivationMBLayout);
+            }
+            // true until CacheState() has been called
+            bool IsEmpty() const { return m_isEmpty; }
+            // the matrix stored by CacheState(); still the initial 0 x 0 matrix while IsEmpty()
+            const Matrix<ElemType>& ExportCachedActivity() const { return m_cachedActivity; }
+
+        protected:
+            Matrix<ElemType>    m_cachedActivity; // 1 column per parallel sequence
+            // MBLayoutPtr         m_shiftedMBLayout;
+            // Currently, we only support saving state for m_timeStep == 1
+            // there is no need for this m_shiftedMBLayout if m_timeStep == 1
+            MBLayoutPtr         m_delayedActivationMBLayout;
+            bool                m_isEmpty;      // in some cases
+            // (e.g., at the boundary of sentence end or begin/full utterance mode), we don't need to store state (but we do need to know m_delayedActivationMBLayout)
+    };
+    
+
     // -----------------------------------------------------------------------
     // DelayedValueNodeBase (input) -- abstract base class for PastValueNode and FutureValueNode to hold all shared code
     // The two differ in the step direction, some loop directions, and sequence-boundary flags.
@@ -31,9 +76,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
     // TODO: 'direction' is really too general. signOfTimeOffset?
     template<class ElemType, int direction/*-1 for Past/left-to-right or +1 for Future/right-to-left*/, MinibatchPackingFlags SequenceStart_or_End/*-Start or -End*/>
-    class DelayedValueNodeBase : public ComputationNode<ElemType>, public ILateAttachingNode, public NumInputs<1>
+    class DelayedValueNodeBase : public ComputationNode<ElemType>, public
+                                 ILateAttachingNode, public IStateFulNode,  public NumInputs<1>
     {
         typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
+        typedef std::shared_ptr<DelayedValueNodeState<ElemType>> DelayedNodeStatePtr; 
         static const std::wstring TypeName() { return L"DelayedValue"; }
     private:
         void Init(size_t row_size, size_t col_size, ElemType initialActivationValue = (ElemType)DEFAULT_HIDDEN_ACTIVATION)
@@ -352,6 +399,129 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             }
         }
 
+        //========================================
+        // implement the IStateFulNode interface
+        //========================================
+
+        // Exports the state to carry across a (sub-)minibatch switch: always a copy of m_delayedActivationMBLayout, plus the
+        // boundary columns of m_delayedActivation unless every parallel sequence is at a boundary. Requires m_timeStep == 1.
+        virtual NodeStatePtr ExportState() override
+        {
+            NodeStatePtr pExportedState;
+            size_t nT = m_pMBLayout->GetNumTimeSteps();
+            size_t nU = m_pMBLayout->GetNumParallelSequences();
+            int dir = direction;
+            if (m_timeStep != 1)
+            {
+                // not supported yet; give the user a hint
+                RuntimeError("Currently importing/exporting state info for timeStep>1 is not supported. Contact erw@microsoft.com for more detail");
+            }
+            if (dir == -1) // we look into past
+            {
+                bool   allAtBoundary = true;
+                // if the current last frames are all sentence end or no feature, there is no need to carry over state info
+                if (m_pMBLayout->Is(nT-1, MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature))
+                {
+                    for (size_t u = 0; u < nU; u++)
+                    {
+                        if (!m_pMBLayout->Is(u, nT - 1, MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature))
+                        {
+                            allAtBoundary = false;
+                            break;
+                        }
+                    }
+                }
+                else
+                {
+                    allAtBoundary = false;
+                }
+                if (allAtBoundary)
+                {
+                    auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
+                    pState->CacheDelayedMBLayout(m_delayedActivationMBLayout);
+                    pExportedState = pState; // return an empty one: layout only (previously built and then dropped, unlike the dir == 1 branch)
+                }
+                else
+                {
+                    auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
+                    // carry over the last nU columns of the delayed activation
+                    pState->CacheState(m_delayedActivation.ColumnSlice((nT - 1)*nU, nU));
+                    pState->CacheDelayedMBLayout(m_delayedActivationMBLayout);
+                    pExportedState = pState;
+                }
+            }
+            if (dir == 1) // we look into future
+            {
+                // if the first frames are all sentence start or no feature, there is no need to carry over state info
+                bool allAtBoundary = true;
+                if (m_pMBLayout->Is(0, MinibatchPackingFlags::NoFeature | MinibatchPackingFlags::SequenceStart))
+                {
+                    for (size_t u = 0; u < nU; u++)
+                    {
+                        if (!m_pMBLayout->Is(u, 0, MinibatchPackingFlags::SequenceStart | MinibatchPackingFlags::NoFeature))
+                        {
+                            allAtBoundary = false;
+                            break;
+                        }
+                    }
+                }
+                else
+                {
+                    allAtBoundary = false; // same as in the dir == -1 branch: if the frame-level test fails, state must be carried
+                }
+                if (allAtBoundary)
+                {
+                    auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
+                    pState->CacheDelayedMBLayout(m_delayedActivationMBLayout);
+                    pExportedState = pState;
+                }
+                else
+                {
+                    auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
+                    pState->CacheState(m_delayedActivation.ColumnSlice((nT-1)*nU, nU)); // NOTE(review): caches the last nU columns, while ImportState() restores dir == 1 into the first nU columns -- confirm
+                    pState->CacheDelayedMBLayout(m_delayedActivationMBLayout);
+                    pExportedState = pState;
+                }
+            }
+            if (dir != -1 && dir != 1)
+            {
+                RuntimeError("Unrecognized direction in DelayedValueNodeBase");
+            }
+            return pExportedState;
+        }
+        // Restores a state produced by ExportState(): always copies the exported layout into m_delayedActivationMBLayout and,
+        // unless the state is empty, writes the cached activity into the boundary columns of m_delayedActivation.
+        virtual void ImportState(const NodeStatePtr& pImportedState) override
+        {
+            DelayedNodeStatePtr pState = dynamic_pointer_cast<DelayedValueNodeState<ElemType>> (pImportedState);
+
+            if (!pState)
+                RuntimeError("Expecting DelayValueNodeState after down casting");
+
+            pState->ExportDelayedMBLayout(m_delayedActivationMBLayout);  // pState copies its layout to m_delayedActivationMBLayout
+            if (pState->IsEmpty())
+            {
+                return;
+            }
+
+            const Matrix<ElemType>& delayedActivation = pState->ExportCachedActivity();
+            size_t nT = m_delayedActivationMBLayout->GetNumTimeSteps();
+            size_t nU = m_delayedActivationMBLayout->GetNumParallelSequences();
+
+            int dir = direction;
+            if (dir == -1) // looking backward: the state goes into the last nU columns
+            {
+                m_delayedActivation.SetColumnSlice(delayedActivation, (nT - 1)*nU, nU);
+            }
+            if (dir == 1)  // looking forward: the state goes into the first nU columns
+            {
+                m_delayedActivation.SetColumnSlice(delayedActivation, 0, nU);
+            }
+            if (dir != -1 && dir != 1)   // was 'dir == 1', which raised this error for every valid forward-looking import
+            {   // 'direction' is a template argument, so this can only trigger for an invalid instantiation
+                RuntimeError("Unrecognized direction in DelayedValueNodeBase");
+            }
+        }
     protected:
 
         ElemType m_initialActivationValue;          // starting value for hidden activation vector at boundary
diff --git a/MachineLearning/CNTKSGDLib/DataReaderHelpers.h b/MachineLearning/CNTKSGDLib/DataReaderHelpers.h
index a10acb927..17ea06071 100644
--- a/MachineLearning/CNTKSGDLib/DataReaderHelpers.h
+++ b/MachineLearning/CNTKSGDLib/DataReaderHelpers.h
@@ -10,6 +10,8 @@
 #include <map>
 #include "TrainingCriterionNodes.h"
 
+//#define SMB_DEBUG
+
 namespace Microsoft { namespace MSR { namespace CNTK {
 
     /*static*/ struct DataReaderHelpers
@@ -166,4 +168,404 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
     };
 
+    // SubminibatchDispatcher -- helper for the sub-minibatch implementation
+    // A sub-minibatch is a part of a minibatch; splitting helps to compute large minibatches that cannot be loaded into GPU memory in one forward-backward computation
+    // The usage would be :
+    //        SubminibatchDispatcher<ElemType> sbhelper;  sbhelper.Init(net, learnableNodes, criterionNodes, evaluationNodes);
+    //        for (;;)
+    //        {
+    //            size_t nsb = sbhelper.GetMinibatchIntoCache(...);
+    //            for (size_t i = 0; i < nsb; i++)
+    //            {
+    //                sbhelper.GetSubMinibatchToNet(i);
+    //                net.Evaluate(criterionNodes[0]);
+    //                sbhelper.DoneWithCurrentSubMinibatch(i);
+    //            }
+    //            sbhelper.DoneWithCurrentMinibatch();
+    //            UpdateWeights(...);
+    //        }
+
+    template<class ElemType>
+    class SubminibatchDispatcher
+    {
+    private: 
+        typedef            std::vector<shared_ptr<const msra::dbn::latticesource::latticepair>>         Lattice; 
+        typedef            std::vector<size_t>                                                          Uid; 
+        typedef            std::vector<size_t>                                                          ExtrauttMap;
+        // raw-pointer flavours of the above, and the name -> matrix map used for inputs and gradients
+        typedef            std::vector<shared_ptr<const msra::dbn::latticesource::latticepair>>*        LatticePtr;
+        typedef            std::vector<size_t>*                                                         UidPtr;
+        typedef            std::vector<size_t>*                                                         ExtrauttMapPtr;
+        typedef            std::map<std::wstring, Matrix<ElemType>*>                                    Matrices; 
+
+
+        // member variables serving as cache space; the three raw cache pointers and the matrices in m_inputMatricesCache are deleted by the destructor
+        Matrices                                            m_inputMatricesCache; 
+        MBLayoutPtr                                         m_MBLayoutCache;
+        LatticePtr                                          m_LatticeCache; 
+        UidPtr                                              m_uidCache; 
+        ExtrauttMapPtr                                      m_extrauttmapCache;
+        shared_ptr<Matrix<ElemType>>                        m_NetCriterionAccumulator; 
+        shared_ptr<Matrix<ElemType>>                        m_NetEvaluationAccumulator; 
+        std::map<wstring, vector<shared_ptr<INodeState>>>   m_NetStates;            // m_NetStates[node][i] caches the state of the i-th subminibatch of node
+
+        // gradient accumulated over the subminibatches, keyed by learnable-node name; the matrices are deleted by the destructor
+        Matrices                                            m_CachedGraident; 
+        // we also need to remember where in the net to put things back
+        MBLayoutPtr                                         m_NetMBLayoutPtr;
+        std::map<wstring, shared_ptr<ComputationNode<ElemType>>>    m_LearnableNodePtr;  // learnable nodes by name, so that we can accumulate the gradient once a sub-minibatch is done
+        // the net's input matrices, followed by lattice-related pointers
+        Matrices                                            m_NetInputMatrixPtr;
+        LatticePtr                                          m_NetLatticePtr; 
+        UidPtr                                              m_NetUidPtr;
+        ExtrauttMapPtr                                      m_NetExtrauttMapPtr;
+        // NOTE(review): the three lattice-related m_Net* pointers above are not used by any method of this class yet
+        
+
+        size_t                                              m_numParallelSequences; // number of parallel sequences in the cached matrix and MBLayout 
+        size_t                                              m_numSubminibatches;    // how many subminibatches we are going to use ? 
+
+        std::vector<shared_ptr<ComputationNode<ElemType>>>                 m_NetCriterionNodes; 
+        std::vector<shared_ptr<ComputationNode<ElemType>>>                 m_NetEvaluationNodes; 
+        std::map<wstring, shared_ptr<IStateFulNode>>                       m_NetStatefulNodes;      // we need to Export/Import states of stateful nodes when we switch subminibatches 
+
+    private:
+
+        // Walks the evaluation order of 'root' and records, keyed by node name, every node that implements IStateFulNode.
+        void EnumerateStatefulNodeWithRoot(ComputationNetwork& net, ComputationNodeBasePtr root,  std::map<wstring, shared_ptr<IStateFulNode>>& statefulnode)
+        {
+            std::list<ComputationNodeBasePtr> nodesInEvalOrder = net.GetEvalOrder(root, false);
+            for (auto& node : nodesInEvalOrder)
+            {
+                wstring nodeName = node->GetName();
+                // a name that is already recorded (e.g. reached from an earlier root) keeps its existing entry
+                if (statefulnode.count(nodeName) > 0) continue;
+                shared_ptr<IStateFulNode> pStateful = dynamic_pointer_cast<IStateFulNode>(node);
+                if (pStateful)
+                    statefulnode[nodeName] = pStateful;
+            }
+        }
+        // Returns the stateful nodes found below any of the given criterion and evaluation roots, keyed by node name.
+        std::map<wstring, shared_ptr<IStateFulNode>> EnumerateStatefulNode(ComputationNetwork& net,
+            const std::vector<ComputationNodeBasePtr>& criterionNode,
+            const std::vector<ComputationNodeBasePtr>& evaluationNode)
+        {
+            std::map<wstring, shared_ptr<IStateFulNode>> collected;
+            // criterion roots are visited before evaluation roots
+            for (size_t i = 0; i < criterionNode.size(); i++)
+                EnumerateStatefulNodeWithRoot(net, criterionNode[i], collected);
+
+            for (size_t i = 0; i < evaluationNode.size(); i++)
+                EnumerateStatefulNodeWithRoot(net, evaluationNode[i], collected);
+
+            return collected;
+        }
+
+    public:
+        SubminibatchDispatcher() :   // every raw pointer starts out null and every counter at zero; Init() allocates the caches and accumulators
+            m_MBLayoutCache(nullptr), m_LatticeCache(nullptr), m_uidCache(nullptr), m_extrauttmapCache(nullptr), m_NetLatticePtr(nullptr), m_NetUidPtr(nullptr), m_NetExtrauttMapPtr(nullptr), m_numParallelSequences(0), m_numSubminibatches(0)
+        { }
+
+        // Allocates the layout cache and the accumulators and records the nodes this dispatcher works on.
+        // Stateful nodes are looked up below the criterion and evaluation nodes; each gets an (initially empty) list of cached states.
+        void Init(ComputationNetworkPtr & net,
+            const std::list<ComputationNodeBasePtr>& learnableNodes,
+            const std::vector<ComputationNodeBasePtr>& criterionNodes,
+            const std::vector<ComputationNodeBasePtr>& evaluationNodes)
+        {
+            m_MBLayoutCache = make_shared<MBLayout>();
+
+            // accumulators: a 1 x 1 matrix for the criterion, one column per evaluation node; both start at zero
+            m_NetCriterionAccumulator = make_shared<Matrix<ElemType>>(1, 1, net->GetDeviceId());
+            m_NetEvaluationAccumulator = make_shared<Matrix<ElemType>>(1, evaluationNodes.size(), net->GetDeviceId());
+            m_NetCriterionAccumulator->SetValue((ElemType)0);
+            m_NetEvaluationAccumulator->SetValue((ElemType)0);
+
+            // remember the learnable nodes by name
+            for (auto pNode : learnableNodes)
+                m_LearnableNodePtr[pNode->NodeName()] = dynamic_pointer_cast<ComputationNode<ElemType>>(pNode);
+
+            // remember the criterion and evaluation nodes in the order given
+            for (auto& pNode : criterionNodes)
+                m_NetCriterionNodes.push_back(dynamic_pointer_cast<ComputationNode<ElemType>>(pNode));
+            for (auto& pNode : evaluationNodes)
+                m_NetEvaluationNodes.push_back(dynamic_pointer_cast<ComputationNode<ElemType>>(pNode));
+
+            // enumerate all the nodes, find nodes that have state
+            m_NetStatefulNodes = EnumerateStatefulNode(*net, criterionNodes, evaluationNodes);
+            // every stateful node starts with an empty list of cached states
+            for (auto entry : m_NetStatefulNodes)
+            {
+                wstring nodeName = entry.first;
+                m_NetStates[nodeName] = vector<shared_ptr<INodeState>>();
+            }
+        }
+
+        // Releases everything this object owns through raw pointers.
+        ~SubminibatchDispatcher()
+        {
+            // TODO: remove these by using shared_ptr
+            // cached copies of the input matrices
+            for (auto& cached : m_inputMatricesCache)
+                delete cached.second;
+
+            // accumulated gradients
+            for (auto& cached : m_CachedGraident)
+                delete cached.second;
+
+            // lattice-related caches (deleting a null pointer is a no-op)
+            delete m_extrauttmapCache;
+            delete m_uidCache;
+            delete m_LatticeCache;
+        }
+        // Caches the minibatch currently held in 'inputMatrices' and in the net's MBLayout, and prepares the per-node buffers.
+        // Returns the number of subminibatches that will actually be used: min(requestedSubminibatches, #parallel sequences).
+        // NOTE: 'trainSetDataReader' is not used here; the data is taken from 'inputMatrices' as passed in by the caller.
+        size_t  GetMinibatchIntoCache(   IDataReader<ElemType>& trainSetDataReader,
+                                        ComputationNetwork& net,
+                                        std::map<std::wstring, Matrix<ElemType>*> & inputMatrices,
+                                        size_t requestedSubminibatches)
+        {
+            // first, remember interface to the net
+            m_NetMBLayoutPtr = net.GetMBLayoutPtr();
+            m_NetInputMatrixPtr = inputMatrices;
+            // second, copy the current minibatch into the cache
+            // 1. for each key, allocate the specific matrix on device (first time) or overwrite the cached copy
+            for (const auto& pa : inputMatrices)
+            {
+                const wstring& name = pa.first;
+                Matrix<ElemType>* M = pa.second;
+                if (m_inputMatricesCache.find(name) == m_inputMatricesCache.end())
+                {
+                    m_inputMatricesCache[name] = new Matrix<ElemType>(*M, M->GetDeviceId()); // deep copy from M
+                }
+                else
+                {
+                    m_inputMatricesCache[name]->SetValue(*M);
+                }
+            }
+            // 2. MBlayout
+            m_MBLayoutCache->CopyFrom(net.GetMBLayoutPtr());
+            size_t nParallelSequences = m_MBLayoutCache->GetNumParallelSequences();
+
+            // make sure there is a first criterion node before looking at it
+            if (!m_NetCriterionNodes.empty() && m_NetCriterionNodes[0] != nullptr && (m_NetCriterionNodes[0]->OperationName() == L"SequenceWithSoftmax"))
+            {
+                NOT_IMPLEMENTED;
+                // TODO: implement this for Sequence training !!!
+            }
+
+            // subminibatches are cut at the parallel sequence level;
+            // if #requested subminibatch is larger than #parallel sequence,
+            // we cannot split further; instead, each parallel sequence becomes a subminibatch
+            size_t actualnumSubminibatches = requestedSubminibatches > nParallelSequences ? nParallelSequences : requestedSubminibatches;
+            // 3. third, allocate space for accumulated gradient
+            for (auto& n: m_LearnableNodePtr)
+            {
+                auto node = n.second;
+                if (node->IsParameterUpdateRequired())
+                {
+                    wstring nodeName = node->GetName();
+                    // bind by reference: a by-value 'auto' would copy-construct the whole matrix just to read its dimensions
+                    auto& funvalue = node->FunctionValues();   // gradient may not be allocated when this function is first called
+                    size_t nrow = funvalue.GetNumRows();
+                    size_t ncol = funvalue.GetNumCols();
+                    if (m_CachedGraident.find(nodeName) == m_CachedGraident.end())
+                    {
+                        // not allocated yet
+                        m_CachedGraident[nodeName] = new Matrix<ElemType>(nrow, ncol, funvalue.GetDeviceId());
+                        m_CachedGraident[nodeName]->SetValue((ElemType)0);
+                    }
+                }
+            }
+            // 4. for stateful node
+            for (const auto& x : m_NetStatefulNodes)
+            {
+                const wstring& name = x.first;
+                if (m_NetStates[name].size() < actualnumSubminibatches)
+                {
+                    // normally only the first minibatch gets here; growing (never shrinking) also keeps the [iSubminibatch] accesses in range if a later minibatch has more subminibatches
+                    m_NetStates[name].resize(actualnumSubminibatches);
+                }
+            }
+            return (m_numSubminibatches = actualnumSubminibatches);
+        }
+        
+        // Loads subminibatch 'iSubminibatch' of the cached minibatch into the network: input matrices, MBLayout,
+        // and (where one has been saved) the state of each stateful node.
+        void GetSubMinibatchToNet(size_t iSubminibatch)
+        {
+            Matrices decimatedMatrices; 
+            MBLayoutPtr decimatedLayout;
+            DataReaderHelpers::DecimateMinibatch(m_inputMatricesCache, decimatedMatrices, m_MBLayoutCache, decimatedLayout, m_numSubminibatches, iSubminibatch); 
+            //  NOTE: decimatedMatrices must be released by the caller, i.e. here
+            //  copy each decimated matrix into the net's input matrix of the same name, then free it
+            for (auto& x : decimatedMatrices)
+            {
+                wstring name = x.first;               
+                m_NetInputMatrixPtr[name]->SetValue(*x.second);
+                delete x.second;    // deleted only after SetValue() returns (author's note: SetValue calls cuda memcpy, a blocking call, so this is safe)
+                x.second = nullptr;
+            }
+
+            m_NetMBLayoutPtr->CopyFrom(decimatedLayout); 
+
+            for (auto& x : m_NetStatefulNodes)
+            {
+                wstring name = x.first; 
+                shared_ptr<IStateFulNode>   pNode = x.second; 
+                if (m_NetStates[name][iSubminibatch])   // nothing is imported while no state has been saved for this slot
+                    pNode->ImportState(m_NetStates[name][iSubminibatch]);
+            }
+        }
+        // Folds the results of subminibatch 'iSubminibatch' into the accumulators and saves the state of every stateful node.
+        // TODO: encapsulate it into a destructor !!!   Note: Cannot throw exceptions in destructor.
+        void DoneWithCurrentSubMinibatch(size_t iSubminibatch)
+        {
+            // accumulate gradient here, then zero the node's gradient
+            for (auto x : m_CachedGraident)
+            {
+                wstring nodename = x.first; 
+                if (m_LearnableNodePtr.find(nodename) == m_LearnableNodePtr.end())
+                {
+                    RuntimeError("ERROR: in DoneWithCurrentSubMinibatch: node %ls not found in LeanrableNode", nodename.c_str());
+                }
+                shared_ptr<ComputationNode<ElemType>> pNode = m_LearnableNodePtr[nodename];
+                m_CachedGraident[nodename]->operator+=(pNode->GradientValues()); 
+                pNode->GradientValues().SetValue((ElemType)0);
+            }
+            // accumulate criterion value (presumably adds element (0,0) of the node's value into the accumulator), then zero the node's value
+            Matrix<ElemType>::AddElementToElement(
+                    m_NetCriterionNodes[0]->FunctionValues() , 0, 0,
+                    *m_NetCriterionAccumulator, 0, 0
+                    ); 
+            m_NetCriterionNodes[0]->FunctionValues().SetValue((ElemType)0);
+            // accumulate evaluation value: node i goes into column i of the accumulator
+            for (size_t i = 0; i < m_NetEvaluationNodes.size(); i++)
+            {
+                Matrix<ElemType>::AddElementToElement( 
+                    m_NetEvaluationNodes[i]->FunctionValues(), 0, 0,
+                    *m_NetEvaluationAccumulator,  0, i
+                    );
+                m_NetEvaluationNodes[i]->FunctionValues().SetValue((ElemType)0);
+            }
+            // Export node state; it is imported again by GetSubMinibatchToNet() for the same subminibatch index
+            for (auto& x : m_NetStatefulNodes)
+            {
+                wstring name = x.first; 
+                m_NetStates[name][iSubminibatch] = x.second->ExportState();
+            }
+        }
+        // Finishes a minibatch: hands the accumulated gradients, criterion and evaluation values back to the network nodes,
+        // restores the net's MBLayout from the cache, and resets all accumulators to zero.
+        void DoneWithCurrentMinibatch()
+        {
+            for (auto& x : m_CachedGraident)
+            {
+                wstring name = x.first;
+                Matrix<ElemType>* accumulategrad = x.second;
+                if (m_LearnableNodePtr.find(name) == m_LearnableNodePtr.end())
+                {
+                    // should never happen, remove this code later
+                    RuntimeError("ERROR: in DoneWithCurrentMinibatch: node %ls not found in LearnableNode", name.c_str());
+                }
+                m_LearnableNodePtr[name]->GradientValues().SetValue(*accumulategrad);
+                x.second->SetValue((ElemType)0);
+            }
+            // also revert net.m_MBLayoutPtr
+            m_NetMBLayoutPtr->CopyFrom(m_MBLayoutCache);
+            // the node's value was zeroed after every subminibatch, so adding the accumulator leaves the accumulated total in the node
+            Matrix<ElemType>::AddElementToElement(
+                *m_NetCriterionAccumulator, 0, 0,
+                m_NetCriterionNodes[0]->FunctionValues(), 0, 0
+            );
+            m_NetCriterionAccumulator->SetValue((ElemType)0);
+
+            for (size_t i = 0; i < m_NetEvaluationNodes.size(); i++)
+            {
+                // same for evaluation node i, whose total is kept in column i of the accumulator
+                Matrix<ElemType>::AddElementToElement(
+                    *m_NetEvaluationAccumulator, 0, i,
+                    m_NetEvaluationNodes[i]->FunctionValues(), 0, 0
+                );
+            }
+            m_NetEvaluationAccumulator->SetValue((ElemType)0);
+        }
+
+#ifdef SMB_DEBUG
+
+        // Debug helper: writes 'mat' to a text file, one matrix row per line (the copied array is indexed column-major).
+        // The template parameters are named so that they do not redeclare the enclosing class's ElemType, which standard C++ forbids.
+        template<class MatrixType, class ValueType>
+        void WriteMatrix(const MatrixType& mat, string filename)
+        {
+            ValueType* pArray = mat.CopyToArray();
+            size_t nRows = mat.GetNumRows();
+            size_t nCols = mat.GetNumCols();
+            FILE* fp = fopenOrDie(filename, "w");
+            for (size_t r = 0; r < nRows; r++)
+            {
+                for (size_t c = 0; c < nCols; c++)
+                    fprintf(fp, "%.9f ", pArray[nRows*c + r]);
+                fprintf(fp, "\n");
+            }
+            fcloseOrDie(fp);
+            delete[]pArray;
+        }
+        void WriteMBLayout(MBLayoutPtr pMBLayout, wstring filename)   // debug helper: writes the flags of 'pMBLayout' as integers to a text file
+        {
+            size_t nT = pMBLayout->GetNumTimeSteps();
+            size_t nU = pMBLayout->GetNumParallelSequences();
+
+            FILE* fp = fopenOrDie(filename, L"w");
+            for (size_t u = 0; u < nU; u++)   // one output line per parallel sequence
+            {
+                for (size_t t = 0; t < nT; t++)   // one tab-terminated entry per time step
+                {
+                    MinibatchPackingFlags flag = pMBLayout->Get(u, t);
+                    fprintf(fp, "%d\t", (int)flag);
+                }
+                fprintf(fp, "\n");
+            }
+            fcloseOrDie(fp);
+        }
+        void WriteInputMatriceAndMBLayout(size_t mbID, size_t smbID)   // debug helper: dumps the net's "features" input (if present) and the net's MBLayout
+        {
+            wstring node = L"features";
+            wstring filename = msra::strfun::wstrprintf(L"tmp/%ls.%d.%d", node.c_str(), (int)mbID, (int)smbID);   // %ls is the portable wide-string conversion; ids are cast because %d expects an int, not a size_t
+            if (m_NetInputMatrixPtr.find(node) != m_NetInputMatrixPtr.end())
+            {
+                WriteMatrix<Matrix<ElemType>, ElemType>(*m_NetInputMatrixPtr[node], msra::strfun::wcstombs(filename));
+            }
+            wstring fn = msra::strfun::wstrprintf(L"tmp/Layout.%d.%d", (int)mbID, (int)smbID);
+            WriteMBLayout(m_NetMBLayoutPtr, fn);
+        }
+        void WriteInputMatriceAndMBLayout(Matrices m, MBLayoutPtr pMBLayout, size_t mbID)   // debug helper: dumps the "features" entry of 'm' (if present) and 'pMBLayout'
+        {
+            wstring filename = msra::strfun::wstrprintf(L"tmp/features.%d", (int)mbID);   // cast: %d expects an int, not a size_t
+            wstring fn       = msra::strfun::wstrprintf(L"tmp/layout.%d", (int)mbID);
+            if (m.find(L"features") != m.end())
+            {
+                WriteMatrix<Matrix<ElemType>, ElemType>(*m[L"features"], msra::strfun::wcstombs(filename));
+            }
+            WriteMBLayout(pMBLayout, fn);
+        }
+
+        void WriteGradient(size_t mbID)   // debug helper: dumps the accumulated gradient of the hard-coded node "LSTMoutput1.bias", if there is one
+        {
+            wstring node = L"LSTMoutput1.bias";
+            wstring filename = msra::strfun::wstrprintf(L"%ls.%d", L"tmp/gradient", (int)mbID);   // %ls for the wide string; cast because %d expects an int, not a size_t
+            if (m_CachedGraident.find(node) != m_CachedGraident.end())
+            {
+                WriteMatrix<Matrix<ElemType>, ElemType>(*m_CachedGraident[node], msra::strfun::wcstombs(filename));
+            }
+        }
+
+        void WriteGradient(const Matrix<ElemType>& mat, wstring fn)   // debug helper: writes 'mat' to the file named 'fn' via WriteMatrix()
+        {
+            WriteMatrix<Matrix<ElemType>, ElemType>(mat, msra::strfun::wcstombs(fn));
+        }
+#endif // SMB_DEBUG
+    };
+
 }}}
diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp
index cb033cb9a..c9053e23d 100644
--- a/MachineLearning/CNTKSGDLib/SGD.cpp
+++ b/MachineLearning/CNTKSGDLib/SGD.cpp
@@ -110,6 +110,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         //       truncated = truncation length
         m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector<int>{ 256 })));
         m_truncated = configSGD(L"truncated", false);
+        m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", ConfigRecordType::Array(intargvector(vector < int > {0})));
 
         // the number of samples in each epoch (0 means, use all the samples in each epoch).
         m_epochSize = configSGD(L"epochSize", (size_t)0);
@@ -1697,6 +1698,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             refNet->StartEvaluateMinibatchLoop(refNode);
         }
 
+        SubminibatchDispatcher<ElemType> smbDisplatcher; 
+        size_t samplesInRAM = m_maxSamplesInRAM[epochNumber]; 
+        // convert it to SubminibatchRequested 
+        size_t numSubminibatchRequested = 0; 
+        if (samplesInRAM > 0)   // if samplesInRAM = 0 , we will not use subminibatch dispatcher
+        {
+            size_t nParallelSequences = trainSetDataReader->GetNumParallelSequences(); 
+            size_t estimatedMBSize = tunedMBSize * nParallelSequences;   // estimated number of samples in one minibatch
+            numSubminibatchRequested = (estimatedMBSize + samplesInRAM - 1) / samplesInRAM;   // integer ceiling; dividing two size_t values first truncated, so std::ceil never rounded up
+        }
+        if (numSubminibatchRequested > 1) // only use subminibatch dispatcher if more than 1 subminibatch is required 
+        {
+            smbDisplatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes);
+        }
+        size_t actualNumSubminibatch=0;
+
         // Attemps to compute the error signal for the whole utterance, which will
         // be fed to the neural network as features. Currently it is a workaround
         // for the two-forward-pass sequence and ctc training, which allows
@@ -1710,10 +1727,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)",
                     (int)g_mpi->CurrentNodeRank(), (int)g_mpi->NumNodesInUse(), (int)m_numGradientBits);
         }
-
         if (useDistributedMBReading)
         {
-            fprintf(stderr, ", Distributed reading is ENABLED");
+            fprintf(stderr, ", distributed reading is ENABLED");
+        }
+        if (numSubminibatchRequested > 0)
+        {
+            fprintf(stderr, ", with %d Max Samples in RAM", (int)samplesInRAM);
         }
         fprintf(stderr, ".\n");
 
@@ -1735,6 +1755,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
             nSamplesSinceLastModelSync += actualMBSize;
 
+            if (numSubminibatchRequested > 1)   // must match the condition under which smbDisplatcher.Init() was called; a request of exactly 1 leaves the dispatcher uninitialized
+            {
+                actualNumSubminibatch = smbDisplatcher.GetMinibatchIntoCache(*trainSetDataReader, *net, *inputMatrices, numSubminibatchRequested); 
+            }
+            else
+            {
+                actualNumSubminibatch = 0;   // 0 selects the plain whole-minibatch ForwardBackward path further down
+            }
+            
             // node data was changed
             // TODO: move this to that function as well--just tired to pass everything as arguments
             // TODO: We should do this right after the GetMinibatch() call, since that's where these changed.
@@ -1772,26 +1801,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
                 //compute eval node first since when gradient is computed the forward function values
                 //may be changed and need to be recomputed when gradient and function value share the same matrix
-                net->Evaluate(evaluationNodes);
+                if (actualNumSubminibatch > 0)
+                {
+                    for (size_t ismb = 0; ismb < actualNumSubminibatch; ismb++)
+                    {
+                        smbDisplatcher.GetSubMinibatchToNet(ismb);
+#ifdef SMB_DEBUG
+                        //smbhelper.WriteInputMatriceAndMBLayout(numMBsRun, ismb);
+#endif 
+                        ComputationNetwork::UpdateEvalTimeStamps(featureNodes); 
+                        ComputationNetwork::UpdateEvalTimeStamps(labelNodes);
+                        ForwardBackward(*net, evaluationNodes, criterionNodes[0], learnRatePerSample > 0.01 * m_minLearnRate); 
+                        smbDisplatcher.DoneWithCurrentSubMinibatch(ismb); 
+                    }
+#ifdef SMB_DEBUG
+                    //smbhelper.WriteGradient(numMBsRun);
+#endif 
+                    smbDisplatcher.DoneWithCurrentMinibatch(); 
 
-                // only compute gradient when learning rate is large enough
-                if (learnRatePerSample > m_minLearnRate * 0.01)
-                {
-                    // use only the first criterion. Is there any possibility to use more?
-                    // ==============================
-                    // forward prop, back-prop  --this is where the magic happens baby, what we have all be waiting for!
-                    // ==============================
-                    net->ComputeGradient<ElemType>(criterionNodes[0]);
-                    // TODO: we should split Evaluate() out from ComputeGradient(), then call them ForwardProp() and BackProp(), for clarity
                 }
-                else
+                else 
                 {
-                    // use only the first criterion. Is there any possibility to use more?
-                    // ==============================
-                    // forward prop
-                    // ==============================
-                    net->Evaluate(criterionNodes[0]);
+                    ForwardBackward(*net, evaluationNodes, criterionNodes[0], learnRatePerSample > 0.01 * m_minLearnRate);
                 }
+
             } // if (actualMBSize > 0)
 
             // Some labels may be missing (e.g. forced alignment failed, or being gaps due to packing parallel sequences).
diff --git a/MachineLearning/CNTKSGDLib/SGD.h b/MachineLearning/CNTKSGDLib/SGD.h
index ee268e1ec..18efd9246 100644
--- a/MachineLearning/CNTKSGDLib/SGD.h
+++ b/MachineLearning/CNTKSGDLib/SGD.h
@@ -150,6 +150,14 @@ protected:
     //         We really should only read it in SGD and pass it ourselves on to the Reader, instead of it being a Reader parameter.
     // BUGBUG: If m_truncated, then m_mbSize is interpreted as truncation length; the actual MB size is a combination of that and the #parallel sequences specified in the reader.
     // TODO: do not specify 'Truncated' but 'TruncatedLength', set m_truncated so given, and let m_mbSize control how many #parallel sequences the reader is allowed to pack into an MB.
+    intargvector m_maxSamplesInRAM; 
+    // This is related to the sub-minibatch implementation.
+    // maxSamplesInRAM denotes how many samples are used in one forward-backward pass on the net.
+    // Due to GPU memory limitations, it is sometimes not possible to hold m_mbSize samples in RAM.
+    // To mitigate this issue, we adopt the sub-minibatch implementation, where
+    // each minibatch of m_mbSize[epoch] samples is divided into a few sub-minibatches, each of size no more than m_maxSamplesInRAM[epoch];
+    // a forward-backward pass is performed for each sub-minibatch; a model update is performed after each minibatch.
+
 
     // the number of samples in each epoch (0 means, use all the samples in each epoch).
     size_t m_epochSize;
@@ -485,6 +493,28 @@ protected:
 
 private:
     int SGDTrace(FILE *__restrict __stream, const char *__restrict __format, ...);
+    void ForwardBackward(ComputationNetwork& net, const std::vector<ComputationNodeBasePtr>& evalNodes,  shared_ptr<ComputationNodeBase> criterionNode, bool dobackpropogate=true)
+    {   // One pass over 'net': evaluates evalNodes, then runs ComputeGradient() on criterionNode if dobackpropogate is set, otherwise only Evaluate() on it.
+        net.Evaluate(evalNodes);
+        // callers in SGD.cpp pass dobackpropogate = (learnRatePerSample > 0.01 * m_minLearnRate), i.e. gradients are only computed when the learning rate is large enough
+        if (dobackpropogate)
+        {
+            // criterionNode is the single criterion chosen by the caller (SGD.cpp passes criterionNodes[0]). Is there any possibility to use more?
+            // ==============================
+            // forward prop, back-prop  --this is where the magic happens baby, what we have all be waiting for!
+            // ==============================
+            net.ComputeGradient<ElemType>(criterionNode);
+            // TODO: we should split Evaluate() out from ComputeGradient(), then call them ForwardProp() and BackProp(), for clarity
+        }
+        else
+        {
+            // no gradient requested: only evaluate the criterion node
+            // ==============================
+            // forward prop
+            // ==============================
+            net.Evaluate(criterionNode);
+        }
+    }
 };
 
 }}}
diff --git a/Math/CNTKMathTest/CPUMatrixUnitTests.cpp b/Math/CNTKMathTest/CPUMatrixUnitTests.cpp
new file mode 100644
index 000000000..5c581efe1
--- /dev/null
+++ b/Math/CNTKMathTest/CPUMatrixUnitTests.cpp
@@ -0,0 +1,712 @@
+//
+// <copyright file="CPUMatrixUnitTests.cpp" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+#include "stdafx.h"
+#include "CppUnitTest.h"
+#include "..\Math\CPUMatrix.h"
+#define DEBUG_FLAG 1
+using namespace Microsoft::MSR::CNTK;
+
+#pragma warning (disable: 4305)
+
+using namespace Microsoft::VisualStudio::CppUnitTestFramework;
+
+namespace CNTKMathTest
+{    
+    TEST_CLASS(CPUMatrixUnitTest)
+    {
+        //typedef CPUSingleMatrix Matrix;  
+        typedef CPUDoubleMatrix Matrix;  
+
+    public:
+        static void DebugPrint(FILE* gpuDebugFile, Matrix M, const char* str, const bool colwiseVec = true)
+        {   // Debug helper: writes the label str to gpuDebugFile, then (only when colwiseVec is true) the elements of M truncated to int, with a line break after every GetNumCols() values.
+            fprintf(gpuDebugFile, "\n %s\n", str);
+            const size_t matNumCol = M.GetNumCols();
+            const size_t elemNum = M.GetNumElements();
+            Matrix M1 = M.Transpose();
+            double* pArray = M1.GetArray();
+            if (colwiseVec)
+            {
+                for (size_t i = 0; i < elemNum; i++)
+                {
+                    // walks the transposed copy's buffer; NOTE(review): presumably storage is column-major, so this prints M row by row -- confirm
+                    fprintf(gpuDebugFile, "%3d ", (int)pArray[i]);
+                    if ( (i+1)% matNumCol == 0)
+                        fprintf(gpuDebugFile, "\n");
+                }
+            }
+            //const size_t matNumRow = M.GetNumRows();
+            //for (int i = 0; i < matNumRow; i++)
+            //{
+            //    for (int j = 0; j < matNumCol; j++)
+            //    {
+            //        fprintf(gpuDebugFile, "%3d ", M(i,j));
+            //        //if ( (j+1)% matNumCol == 0)
+            //    }
+            //    fprintf(gpuDebugFile, "\n");
+            //}
+        }    
+        TEST_METHOD(CPUMatrixConsturctors)
+        {   // Covers default construction + Resize, sized construction, construction from a raw buffer (default and row-major layout), copy construction and copy initialization.
+            Matrix M0;
+            Assert::IsTrue(M0.IsEmpty());
+
+            M0.Resize(2,3);
+            Assert::IsFalse(M0.IsEmpty());
+            Assert::AreEqual<size_t>(2,M0.GetNumRows());
+            Assert::AreEqual<size_t>(3,M0.GetNumCols());
+            Assert::AreEqual<size_t>(6,M0.GetNumElements());
+
+            M0(0,0) = 1; M0(1,2) = 2;
+            Assert::IsTrue(M0(0,0) == 1);
+            Assert::IsTrue(M0(1,2) == 2);
+
+            Matrix M1(12,53);
+            Assert::AreEqual<size_t>(12,M1.GetNumRows());
+            Assert::AreEqual<size_t>(53,M1.GetNumCols());   
+
+            // Stack buffers replace the former new[] arrays, which were never delete[]d. NOTE(review): assumes the constructor does not take ownership of the pointer -- confirm.
+            float fArray[6];
+            fArray[0] = 1; fArray[1] = 2; fArray[2] = 3; 
+            fArray[3] = 4; fArray[4] = 5; fArray[5] = 6; 
+            CPUMatrix<float> M2(2, 3, fArray, matrixFlagNormal);
+            Assert::AreEqual<float>(M2(0,0), 1);
+            Assert::AreEqual<float>(M2(0,1), 3);
+            Assert::AreEqual<float>(M2(0,2), 5);
+            Assert::AreEqual<float>(M2(1,0), 2);
+            Assert::AreEqual<float>(M2(1,1), 4);
+            Assert::AreEqual<float>(M2(1,2), 6);
+
+            double dArray[6];
+            dArray[0] = 1; dArray[1] = 2; dArray[2] = 3; 
+            dArray[3] = 4; dArray[4] = 5; dArray[5] = 6; 
+            CPUMatrix<double> M3(2, 3, dArray, matrixFormatRowMajor);
+            Assert::AreEqual<double>(M3(0,0), 1);
+            Assert::AreEqual<double>(M3(0,1), 2);
+            Assert::AreEqual<double>(M3(0,2), 3);
+            Assert::AreEqual<double>(M3(1,0), 4);
+            Assert::AreEqual<double>(M3(1,1), 5);
+            Assert::AreEqual<double>(M3(1,2), 6);
+
+            Matrix M4(M0);
+            Assert::IsTrue(M4.IsEqualTo(M0));
+
+            Matrix M5 = M0;
+            Assert::IsTrue(M5.IsEqualTo(M0));
+        }
+
+        TEST_METHOD(CPUMatrixAddAndSub)
+        {   // Exercises +, -, += and -= with matrix, scalar, column-vector, row-vector and 1x1 operands, plus AssignDifferenceOf.
+            Matrix M0(2,3);
+            M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3;
+            M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6;
+
+            Matrix M1(2,3);
+            M1(0,0) = 11; M1(0,1) = 12; M1(0,2) = 13;
+            M1(1,0) = 14; M1(1,1) = 15; M1(1,2) = 16;
+
+            Matrix M2(2,3);
+            M2(0,0) = 12; M2(0,1) = 14; M2(0,2) = 16;
+            M2(1,0) = 18; M2(1,1) = 20; M2(1,2) = 22;
+
+            Matrix MC(2,1);
+            MC(0,0) = 10; 
+            MC(1,0) = 10; 
+
+            Matrix MR(1,3);
+            MR(0,0) = 10; MR(0,1) = 10; MR(0,2) = 10; 
+
+            Matrix MS(1,1);
+            MS(0,0) = 10; 
+            // Fixtures: M1 = M0 + 10 and M2 = M0 + M1 element-wise; MC, MR and MS are a column, a row and a 1x1 matrix all holding 10.
+            Matrix M3 = M2 - M0;
+            Assert::IsTrue(M3.IsEqualTo(M1)); 
+
+            M3 += M0;
+            Assert::IsTrue(M3.IsEqualTo(M2)); 
+
+            M3 = M0 + 10;
+            Assert::IsTrue(M3.IsEqualTo(M1));  
+
+            M3 -= 10;
+            Assert::IsTrue(M3.IsEqualTo(M0));  
+
+            M3 = M1 + M0;
+            Assert::IsTrue(M3.IsEqualTo(M2));  
+
+            M3 -= M0;
+            Assert::IsTrue(M3.IsEqualTo(M1));  
+
+            M3 = M1 - 10;
+            Assert::IsTrue(M3.IsEqualTo(M0));  
+
+            M3 += 10;
+            Assert::IsTrue(M3.IsEqualTo(M1));  
+
+            M3 -= MC;
+            Assert::IsTrue(M3.IsEqualTo(M0));  
+
+            M3 += MC;
+            Assert::IsTrue(M3.IsEqualTo(M1));  
+
+            M3 -= MR;
+            Assert::IsTrue(M3.IsEqualTo(M0));  
+
+            M3 += MR;
+            Assert::IsTrue(M3.IsEqualTo(M1));  
+
+            M3.AssignDifferenceOf(M3, MS);
+            Assert::IsTrue(M3.IsEqualTo(M0));  
+        }
+
+        TEST_METHOD(CPUMatrixMultiAndDiv)
+        {   // Exercises matrix product, scalar * and /, their compound forms, and MultiplyAndWeightedAdd with and without transposed operands.
+            Matrix M0(2,3);
+            M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3;
+            M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6;
+
+            Matrix M00(2,3);
+            M00(0,0) = 10; M00(0,1) = 20; M00(0,2) = 30;
+            M00(1,0) = 40; M00(1,1) = 50; M00(1,2) = 60;
+
+            Matrix M1(2,3);
+            M1.Reshape(3,2);
+            M1(0,0) = 11; M1(0,1) = 15; 
+            M1(1,0) = 14; M1(1,1) = 13; 
+            M1(2,0) = 12; M1(2,1) = 16; 
+
+            Matrix M2(2,2);
+            M2(0,0) = 75; M2(0,1) = 89; 
+            M2(1,0) = 186; M2(1,1) = 221; 
+            // M2 holds the hand-computed product M0 * M1
+            Matrix M3 = M0 * M1;
+            Assert::IsTrue(M3.IsEqualTo(M2));  
+
+            M3 = M0 * 10;
+            Assert::IsTrue(M3.IsEqualTo(M00));  
+
+            M3 = M3 / 10;
+            Assert::IsTrue(M3.IsEqualTo(M0));  
+
+            M3 *= 10;
+            Assert::IsTrue(M3.IsEqualTo(M00));  
+
+            M3 /= 10;
+            Assert::IsTrue(M3.IsEqualTo(M0));  
+
+            Matrix::MultiplyAndWeightedAdd(1, M0, false, M1, false, 0, M3);
+            Assert::IsTrue(M3.IsEqualTo(M2));  
+            // from here on M1 is viewed as 2x3 and used transposed; M2 is overwritten with each new expected result
+            M1.Reshape(2,3);
+            Matrix::MultiplyAndWeightedAdd(1, M0, false, M1, true, 0, M3);
+            M2(0,0) = 74; M2(0,1) = 92; 
+            M2(1,0) = 182; M2(1,1) = 227; 
+            Assert::IsTrue(M3.IsEqualTo(M2));  
+            // the expected values below equal 10 * (previous product) + 2 * (previous M3), e.g. 10*74 + 2*74 = 888
+            Matrix::MultiplyAndWeightedAdd(10, M0, false, M1, true, 2, M3);
+            M2(0,0) = 888; M2(0,1) = 1104; 
+            M2(1,0) = 2184; M2(1,1) = 2724; 
+            Assert::IsTrue(M3.IsEqualTo(M2));  
+
+            Matrix::MultiplyAndWeightedAdd(1, M0, true, M1, false, 0, M3);
+            M2.Resize(3,3);
+            M2(0,0) = 67; M2(0,1) = 72; M2(0,2) = 77; 
+            M2(1,0) = 92; M2(1,1) = 99; M2(1,2) = 106; 
+            M2(2,0) = 117; M2(2,1) = 126; M2(2,2) = 135; 
+            Assert::IsTrue(M3.IsEqualTo(M2));  
+        }
+
+        TEST_METHOD(CPUMatrixElementOps)
+        {   // Checks element-wise operations (product, power, inverse, division, sigmoid, tanh, log-softmax, hardmax, sqrt, exp, log, truncation, cosine, negative sine) against hand-entered expected values.
+            Matrix M0(2,3);
+            M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3;
+            M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6;
+
+            Matrix M00(2,3);
+            M00(0,0) = 1.0; M00(0,1) = 1/2.0; M00(0,2) = 1/3.0;
+            M00(1,0) = 1/4.0; M00(1,1) = 1/5.0; M00(1,2) = 1/6.0;
+
+            Matrix M1(2,3);
+            M1(0,0) = 1; M1(0,1) = 1; M1(0,2) = 1;
+            M1(1,0) = 1; M1(1,1) = 1; M1(1,2) = 1;
+            // M00 is the element-wise reciprocal of M0, M1 is all ones; M2 is reused below as the expected result of each operation
+            Matrix M3;
+            M3.AssignElementProductOf(M0, M00);
+            Assert::IsTrue(M3.IsEqualTo(M1, 0.0001)); 
+
+            M3 = M0 ^ 4;
+            Matrix M2(2,3);
+            M2(0,0) = 1; M2(0,1) = 16; M2(0,2) = 81;
+            M2(1,0) = 256; M2(1,1) = 625; M2(1,2) = 1296;
+            Assert::IsTrue(M3.IsEqualTo(M2)); 
+
+            M3.SetValue(M0);
+            M3 ^= 4;
+            Assert::IsTrue(M3.IsEqualTo(M2)); 
+
+            M3.SetValue(M0);
+            M3.ElementMultiplyWith(M00);
+            Assert::IsTrue(M3.IsEqualTo(M1)); 
+
+            M3.SetValue(M0);
+            M3.ElementInverse();
+            Assert::IsTrue(M3.IsEqualTo(M00)); 
+
+            M2(0,0) = 0.7311; M2(0,1) = 0.8808; M2(0,2) = 0.9526;
+            M2(1,0) = 0.9820; M2(1,1) = 0.9933; M2(1,2) = 0.9975;
+            M3.AssignElementDivisionOf(M2, M0);
+            M2.ElementMultiplyWith(M00);
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+
+            M3.SetValue(M0);
+            M3.InplaceSigmoid();
+            M2(0,0) = 0.7311; M2(0,1) = 0.8808; M2(0,2) = 0.9526;
+            M2(1,0) = 0.9820; M2(1,1) = 0.9933; M2(1,2) = 0.9975;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+           
+            M3.SetValue(M0);
+            M3.InplaceTanh();
+            M2(0,0) = 0.7616; M2(0,1) = 0.9640; M2(0,2) = 0.9951;
+            M2(1,0) = 0.9993; M2(1,1) = 0.9999; M2(1,2) = 1.0000;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+           
+            M3.SetValue(M0);
+            M3.InplaceLogSoftmax(true);
+            M3.InplaceExp();
+            M2(0,0) = 0.0474; M2(0,1) = 0.0474; M2(0,2) = 0.0474;
+            M2(1,0) = 0.9526; M2(1,1) = 0.9526; M2(1,2) = 0.9526;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+           
+            M3.SetValue(M0);
+            M3.InplaceLogSoftmax(false);
+            M3.InplaceExp();
+            M2(0,0) = 0.0900; M2(0,1) = 0.2447; M2(0,2) = 0.6652;
+            M2(1,0) = 0.0900; M2(1,1) = 0.2447; M2(1,2) = 0.6652;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+
+            M3.SetValue(M0);
+            M3.InplaceHardmax(true);
+            M2(0, 0) = 0.0; M2(0, 1) = 0.0; M2(0, 2) = 0.0;
+            M2(1, 0) = 1.0; M2(1, 1) = 1.0; M2(1, 2) = 1.0;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001));
+
+            M3.SetValue(M0);
+            M3.InplaceHardmax(false);
+            M2(0, 0) = 0.0; M2(0, 1) = 0.0; M2(0, 2) = 1.0;
+            M2(1, 0) = 0.0; M2(1, 1) = 0.0; M2(1, 2) = 1.0;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001));
+
+            M3.SetValue(M0);
+            M3.InplaceSqrt();
+            M2(0,0) = 1; M2(0,1) = 1.4142; M2(0,2) = 1.7321;
+            M2(1,0) = 2; M2(1,1) = 2.2361; M2(1,2) = 2.4495;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+           
+            M3.SetValue(M0);
+            M3.InplaceExp();
+            M2(0,0) = 2.7183; M2(0,1) = 7.3891; M2(0,2) = 20.0855;
+            M2(1,0) = 54.5982; M2(1,1) = 148.4132; M2(1,2) = 403.4288;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+            // repeat of the InplaceExp check above; it also leaves M3 = exp(M0) for the InplaceLog round trip that follows
+            M3.SetValue(M0);
+            M3.InplaceExp();
+            M2(0,0) = 2.7183; M2(0,1) = 7.3891; M2(0,2) = 20.0855;
+            M2(1,0) = 54.5982; M2(1,1) = 148.4132; M2(1,2) = 403.4288;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+           
+            M3.InplaceLog();
+            Assert::IsTrue(M3.IsEqualTo(M0, 0.0001)); 
+
+            M3.SetValue(M0);
+            M3.InplaceTruncateBottom(2);
+            M2(0,0) = 2; M2(0,1) = 2; M2(0,2) = 3;
+            M2(1,0) = 4; M2(1,1) = 5; M2(1,2) = 6;
+            Assert::IsTrue(M3.IsEqualTo(M2)); 
+
+            M3.SetValue(M0);
+            M3.InplaceTruncateTop(4);
+            M2(0,0) = 1; M2(0,1) = 2; M2(0,2) = 3;
+            M2(1,0) = 4; M2(1,1) = 4; M2(1,2) = 4;
+            Assert::IsTrue(M3.IsEqualTo(M2)); 
+
+            double pi = 3.14159265358979323846264338327950288419716939937510;
+            // trigonometric checks use the angles 0, pi/2, pi, 3pi/2, 2pi and 5pi/2
+            Matrix M_Trig(2,3);
+            M_Trig(0,0) = 0; M_Trig(0,1) = pi/2.0; M_Trig(0,2) = pi;
+            M_Trig(1,0) = 3.0*pi/2.0; M_Trig(1,1) = 2.0*pi; M_Trig(1,2) = 5.0*pi/2.0;
+
+            Matrix M_Cos(2,3);
+            M_Cos.SetValue(M_Trig);
+
+            Matrix M_Cos_expected(2,3);
+            M_Cos_expected(0,0) = 1; M_Cos_expected(0,1) = 0; M_Cos_expected(0,2) = -1;
+            M_Cos_expected(1,0) = 0; M_Cos_expected(1,1) = 1; M_Cos_expected(1,2) =  0;
+
+            M_Cos.InplaceCosine();
+            Assert::IsTrue(M_Cos.IsEqualTo(M_Cos_expected, 0.0001)); 
+
+            M_Cos.SetValue(M_Trig);
+            M_Cos.AssignCosineOf(M_Trig);
+            Assert::IsTrue(M_Cos.IsEqualTo(M_Cos_expected, 0.0001)); 
+
+            Matrix M_NegSine(2,3);
+            M_NegSine.SetValue(M_Trig);
+
+            Matrix M_NegSine_expected(2,3);
+            M_NegSine_expected(0,0) = 0; M_NegSine_expected(0,1) = -1; M_NegSine_expected(0,2) =  0;
+            M_NegSine_expected(1,0) = 1; M_NegSine_expected(1,1) =  0; M_NegSine_expected(1,2) = -1;
+
+            M_NegSine.InplaceNegativeSine();
+            Assert::IsTrue(M_NegSine.IsEqualTo(M_NegSine_expected, 0.0001)); 
+
+            M_NegSine.SetValue(M_Trig);
+            M_NegSine.AssignNegativeSineOf(M_Trig);
+            Assert::IsTrue(M_NegSine.IsEqualTo(M_NegSine_expected, 0.0001));
+        }
+
+        TEST_METHOD(CPUMatrixNorms)
+        {   // Checks VectorNorm1/2/Inf per column (flag true) and per row (flag false), FrobeniusNorm, MatrixNormInf, and the values returned by VectorMax/VectorMin.
+            Matrix M0(2,3);
+            M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3;
+            M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6;
+
+            Matrix M3;
+            M0.VectorNorm1(M3, true);
+            Matrix M2(1, 3);
+            M2(0,0) = 5; M2(0,1) = 7; M2(0,2) = 9;
+            Assert::IsTrue(M3.IsEqualTo(M2)); 
+
+            M0.VectorNorm1(M3, false);
+            M2.Resize(2,1);
+            M2(0,0) = 6;
+            M2(1,0) = 15;
+            Assert::IsTrue(M3.IsEqualTo(M2)); 
+
+            M0.VectorNorm2(M3, true);
+            M2.Resize(1, 3);
+            M2(0,0) = 4.1231; M2(0,1) = 5.3852; M2(0,2) = 6.7082;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+
+            M0.VectorNorm2(M3, false);
+            M2.Resize(2,1);
+            M2(0,0) = 3.7417;
+            M2(1,0) = 8.7750;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+
+            M0.VectorNormInf(M3, true);
+            M2.Resize(1, 3);
+            M2(0,0) = 4; M2(0,1) = 5; M2(0,2) = 6;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+
+            M0.VectorNormInf(M3, false);
+            M2.Resize(2,1);
+            M2(0,0) = 3;
+            M2(1,0) = 6;
+            Assert::IsTrue(M3.IsEqualTo(M2)); 
+
+            Assert::IsTrue(abs(M0.FrobeniusNorm() - 9.5394) < 0.0001);
+            Assert::IsTrue(abs(M0.MatrixNormInf() - 6) < 0.0001);
+            // VectorMax/VectorMin: only the second output (M3) is asserted; the first output (M1) is not checked here
+            Matrix M1;
+            M0.VectorMax(M1, M3, true);
+            M2.Resize(1, 3);
+            M2(0,0) = 4; M2(0,1) = 5; M2(0,2) = 6;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+
+            M0.VectorMax(M1, M3, false);
+            M2.Resize(2,1);
+            M2(0,0) = 3;
+            M2(1,0) = 6;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+
+            M0.VectorMin(M1, M3, true);
+            M2.Resize(1, 3);
+            M2(0,0) = 1; M2(0,1) = 2; M2(0,2) = 3;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); 
+
+            M0.VectorMin(M1, M3, false);
+            M2.Resize(2,1);
+            M2(0,0) = 1;
+            M2(1,0) = 4;
+            Assert::IsTrue(M3.IsEqualTo(M2, 0.0001));            
+        }
+
+        TEST_METHOD(CPUMatrixSetValues)
+        {   // Checks SetDiagonalValue (scalar and vector forms) and that SetUniformRandomValue stays inside [-0.01, 0.01).
+            Matrix M0(3,3);
+            M0(0,0) = 10; M0(1,1) = 10; M0(2,2) = 10;
+
+            Matrix M1(3,3);
+            M1.SetDiagonalValue(10);
+            Assert::IsTrue(M1.IsEqualTo(M0, 0.0001)); 
+
+            Matrix M2(3,1);
+            M2(0,0) = 10; M2(1,0) = 10; M2(2,0) = 10;
+            M1.SetDiagonalValue(M2);
+            Assert::IsTrue(M1.IsEqualTo(M0, 0.0001)); 
+
+            M1.SetUniformRandomValue(-0.01, 0.01);
+            for (size_t i=0; i<M1.GetNumRows(); i++)       // size_t indices: no signed/unsigned comparison against the size_t bounds
+                for (size_t j=0; j<M1.GetNumCols(); j++)
+                    Assert::IsTrue(M1(i,j) >= -0.01 && M1(i,j) < 0.01);
+            // smoke test only: the Gaussian fill is called but its result is not asserted
+            M1.SetGaussianRandomValue(0, 0.01);
+        }
+
+        TEST_METHOD(CPUMatrixTranspose)
+        {   // Checks Transpose() and AssignTransposeOf() in both directions between a 2x3 matrix and its 3x2 transpose.
+            Matrix M0(2,3);
+            M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3;
+            M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6;
+
+            Matrix M1(3,2);
+            M1(0,0) = 1; M1(0,1) = 4; 
+            M1(1,0) = 2; M1(1,1) = 5;
+            M1(2,0) = 3; M1(2,1) = 6;
+            // M1 is the hand-written transpose of M0
+            Matrix M2 = M0.Transpose();
+            Assert::IsTrue(M2.IsEqualTo(M1, 0.0001)); 
+           
+            M2.AssignTransposeOf(M1);
+            Assert::IsTrue(M2.IsEqualTo(M0, 0.0001)); 
+        }
+
+        TEST_METHOD(CPUMatrixColumnSlice)
+        {   // Checks ColumnSlice() values, then that a column-by-column MultiplyAndAdd through slices matches one whole-matrix MultiplyAndAdd.
+            Matrix M0(2,3);
+            M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3;
+            M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6;
+
+            Matrix M1(2,2);
+            M1(0,0) = 1; M1(0,1) = 2;
+            M1(1,0) = 4; M1(1,1) = 5;
+
+            Matrix M2 = M0.ColumnSlice(0,2);
+            Assert::IsTrue(M2.IsEqualTo(M1, 0.0001)); 
+
+            M1(0,0) = 2; M1(0,1) = 3;
+            M1(1,0) = 5; M1(1,1) = 6;
+
+            M2 = M0.ColumnSlice(1,2);
+            Assert::IsTrue(M2.IsEqualTo(M1, 0.0001)); 
+
+            size_t k=100, n=20, m=50;
+
+            Matrix AG(k,n);
+            AG.SetUniformRandomValue(-1,1);
+
+            Matrix BG(n,m);
+            BG.SetUniformRandomValue(-1,1);
+
+            Matrix CG(k,m);
+            CG.SetUniformRandomValue(-1,1);
+            Matrix DG(k,m);
+            DG.SetValue(CG);
+            // reference result: DG is updated in a single whole-matrix call
+            Matrix::MultiplyAndAdd(AG, false, BG, false, DG);
+            // the final comparison only holds if writing into col_CG updates CG itself, i.e. if the slice refers to CG's storage
+            for (size_t i=0; i<m; i++)   // size_t index: no signed/unsigned comparison against m
+            {
+                Matrix col_BG = BG.ColumnSlice(i,1);
+                Matrix col_CG = CG.ColumnSlice(i,1);
+                Matrix::MultiplyAndAdd(AG, false, col_BG, false, col_CG);
+            }
+            Assert::IsTrue(CG.IsEqualTo(DG, 0.0001)); 
+        }
+
+        TEST_METHOD(CPUKhatriRaoProduct)
+        {   // Checks AssignKhatriRaoProductOf for a 3x4 and a 2x4 input against a hand-entered 6x4 result.
+            Matrix A(3,4);
+            A(0,0) = 0.8147; A(0,1) = 0.9134; A(0,2) = 0.2785; A(0,3) = 0.9649;
+            A(1,0) = 0.9058; A(1,1) = 0.6324; A(1,2) = 0.5469; A(1,3) = 0.1576;
+            A(2,0) = 0.1270; A(2,1) = 0.0975; A(2,2) = 0.9575; A(2,3) = 0.9706;
+
+            Matrix B(2,4);
+            B(0,0) = 0.9572; B(0,1) = 0.8003; B(0,2) = 0.4218; B(0,3) = 0.7922;
+            B(1,0) = 0.4854; B(1,1) = 0.1419; B(1,2) = 0.9157; B(1,3) = 0.9595;
+            // expected values follow D(i + 3*j, c) = A(i,c) * B(j,c), rounded to four decimals
+            Matrix D(6,4);
+            D(0,0) = 0.7798; D(0,1) = 0.7310; D(0,2) = 0.1175; D(0,3) = 0.7644;
+            D(1,0) = 0.8670; D(1,1) = 0.5061; D(1,2) = 0.2307; D(1,3) = 0.1249;
+            D(2,0) = 0.1215; D(2,1) = 0.0781; D(2,2) = 0.4038; D(2,3) = 0.7689;
+            D(3,0) = 0.3954; D(3,1) = 0.1296; D(3,2) = 0.2550; D(3,3) = 0.9258;
+            D(4,0) = 0.4396; D(4,1) = 0.0897; D(4,2) = 0.5008; D(4,3) = 0.1512;
+            D(5,0) = 0.0616; D(5,1) = 0.0138; D(5,2) = 0.8768; D(5,3) = 0.9313;
+
+            Matrix C;
+            C.AssignKhatriRaoProductOf(A, B);
+            Assert::IsTrue(C.IsEqualTo(D, 0.0001)); 
+
+        }
+
+        TEST_METHOD(CPUAddColumnReshapeProductOf)
+        {   // Checks AddColumnReshapeProductOf with the flag false (expected D0) and true (expected D1), accumulating into a zeroed 2x2 matrix each time.
+            Matrix A(6,2);
+            A(0,0) = 0.6557; A(0,1) = 0.7431; 
+            A(1,0) = 0.0357; A(1,1) = 0.3922; 
+            A(2,0) = 0.8491; A(2,1) = 0.6555; 
+            A(3,0) = 0.9340; A(3,1) = 0.1712; 
+            A(4,0) = 0.6787; A(4,1) = 0.7060; 
+            A(5,0) = 0.7577; A(5,1) = 0.0318; 
+
+            Matrix B(3,2);
+            B(0,0) = 0.2769; B(0,1) = 0.8235; 
+            B(1,0) = 0.0462; B(1,1) = 0.6948; 
+            B(2,0) = 0.0971; B(2,1) = 0.3171; 
+
+            Matrix D0(2,2);
+            D0(0,0) = 0.2867; D0(0,1) = 1.2913; 
+            D0(1,0) = 0.1266; D0(1,1) = 0.4520; 
+
+            Matrix D1(2,2);
+            D1(0,0) = 0.2657; D1(0,1) = 1.0923; 
+            D1(1,0) = 0.3636; D1(1,1) = 0.6416; 
+            // C is reset to zero before each call because the operation adds into its target
+            Matrix C(2,2);
+            C.SetValue(0);
+            C.AddColumnReshapeProductOf(A, B, false);
+            Assert::IsTrue(C.IsEqualTo(D0, 0.0001)); 
+
+            C.SetValue(0);
+            C.AddColumnReshapeProductOf(A, B, true);
+            Assert::IsTrue(C.IsEqualTo(D1, 0.0001)); 
+        }
+
+        TEST_METHOD(CPUMatrixRowSliceAndStack)
+        {   // Checks AssignRowSliceValuesOf, AddToRowSliceValuesOf and AddWithRowSliceValuesOf; the row-stack part is compiled out with #if 0.
+            Matrix M0(5,3);
+            M0(0,0) = 1; M0(0,1) = 6; M0(0,2) = 11;
+            M0(1,0) = 2; M0(1,1) = 7; M0(1,2) = 12;
+            M0(2,0) = 3; M0(2,1) = 8; M0(2,2) = 13;
+            M0(3,0) = 4; M0(3,1) = 9; M0(3,2) = 14;
+            M0(4,0) = 5; M0(4,1) = 10; M0(4,2) = 15;
+
+            Matrix M1(2,3);
+            M1(0,0) = 3; M1(0,1) = 8; M1(0,2) = 13;
+            M1(1,0) = 4; M1(1,1) = 9; M1(1,2) = 14;
+            // M1 holds rows 2..3 of M0
+            Matrix M2;
+            M2.AssignRowSliceValuesOf(M0, 2, 2);
+            Assert::IsTrue(M2.IsEqualTo(M1, 0.0001)); 
+
+            Matrix M3(5,3);
+            M3(0,0) = 0; M3(0,1) = 0; M3(0,2) = 0;
+            M3(1,0) = 0; M3(1,1) = 0; M3(1,2) = 0;
+            M3(2,0) = 3; M3(2,1) = 8; M3(2,2) = 13;
+            M3(3,0) = 4; M3(3,1) = 9; M3(3,2) = 14;
+            M3(4,0) = 0; M3(4,1) = 0; M3(4,2) = 0;
+            // M3 becomes M0 with M1 added into rows 2..3, which is what AddToRowSliceValuesOf must produce in M0
+            M3 += M0;
+            M0.AddToRowSliceValuesOf(M1, 2,2);
+            Assert::IsTrue(M3.IsEqualTo(M0, 0.0001)); 
+
+            M2.AddWithRowSliceValuesOf(M1, 0, 2);
+            Matrix M4(2, 3);
+            M4(0, 0) = 6; M4(0, 1) = 16; M4(0, 2) = 26;
+            M4(1, 0) = 8; M4(1, 1) = 18; M4(1, 2) = 28;
+            Assert::IsTrue(M2.IsEqualTo(M4, 0.0001));
+
+#if 0
+            Matrix M5, M6, M7, M8;
+            M5.AssignRowSliceValuesOf(M0, 0, 2);
+            M6.AssignRowSliceValuesOf(M0, 2, 1);
+            M7.AssignRowSliceValuesOf(M0, 3, 2);
+
+            std::vector<const Matrix*> inputMatrices;
+            inputMatrices.resize(3);
+            inputMatrices[0] = &M5;
+            inputMatrices[1] = &M6;
+            inputMatrices[2] = &M7;
+            M8.AssignRowStackValuesOf(inputMatrices, 0, 3);
+            
+            Assert::IsTrue(M8.IsEqualTo(M0, 0.0001));
+#endif
+        }
+
+        TEST_METHOD(CPUAssignRepeatOf)
+        {   // Checks AssignRepeatOf: a 1x1 repeat reproduces the source, and a (3, 2) repeat of a 2x3 matrix gives the 6x6 tiling in M3.
+            Matrix M0(2, 3);
+            M0(0, 0) = 1; M0(0, 1) = 6; M0(0, 2) = 11;
+            M0(1, 0) = 2; M0(1, 1) = 7; M0(1, 2) = 12;
+
+            Matrix M1;
+            M1.AssignRepeatOf(M0, 1, 1);
+            Assert::IsTrue(M1.IsEqualTo(M0, 0.0001));
+            // expected: M0 tiled three times down the rows and twice across the columns
+            Matrix M3(6, 6);
+            M3(0, 0) = 1; M3(0, 1) = 6; M3(0, 2) = 11; M3(0, 3) = 1; M3(0, 4) = 6; M3(0, 5) = 11;
+            M3(1, 0) = 2; M3(1, 1) = 7; M3(1, 2) = 12; M3(1, 3) = 2; M3(1, 4) = 7; M3(1, 5) = 12;
+            M3(2, 0) = 1; M3(2, 1) = 6; M3(2, 2) = 11; M3(2, 3) = 1; M3(2, 4) = 6; M3(2, 5) = 11;
+            M3(3, 0) = 2; M3(3, 1) = 7; M3(3, 2) = 12; M3(3, 3) = 2; M3(3, 4) = 7; M3(3, 5) = 12;
+            M3(4, 0) = 1; M3(4, 1) = 6; M3(4, 2) = 11; M3(4, 3) = 1; M3(4, 4) = 6; M3(4, 5) = 11;
+            M3(5, 0) = 2; M3(5, 1) = 7; M3(5, 2) = 12; M3(5, 3) = 2; M3(5, 4) = 7; M3(5, 5) = 12;
+
+            M1.AssignRepeatOf(M0, 3, 2);
+            Assert::IsTrue(M1.IsEqualTo(M3, 0.0001));
+        }
+
+        TEST_METHOD(CPURowElementOperations)
+        {   // Round trip: RowElementMultiplyWith followed by RowElementDivideBy with the same 1x28 operand must restore the original matrix.
+            Matrix M0 = Matrix::RandomUniform(20, 28, -1, 1);
+            Matrix M1 = Matrix::RandomUniform(1, 28, 1, 2);
+            // M1 is drawn with bounds 1 and 2, so the division below never divides by zero
+            Matrix M3;
+            M3.SetValue(M0);
+            M3.RowElementMultiplyWith(M1);
+            M3.RowElementDivideBy(M1);
+
+            Assert::IsTrue(M0.IsEqualTo(M3, 0.0001));
+        }
+        TEST_METHOD(CPUColumnElementOperations)
+        {   // Round trip: ColumnElementMultiplyWith followed by ColumnElementDivideBy with the same 20x1 operand must restore the original matrix.
+            Matrix M0 = Matrix::RandomUniform(20, 28, -1, 1);
+            Matrix M1 = Matrix::RandomUniform(20, 1, 1, 2);
+            // M1 is drawn with bounds 1 and 2, so the division below never divides by zero
+            Matrix M3;
+            M3.SetValue(M0);
+            M3.ColumnElementMultiplyWith(M1);
+            M3.ColumnElementDivideBy(M1);
+
+            Assert::IsTrue(M0.IsEqualTo(M3, 0.0001));
+        }
+
+        TEST_METHOD(CPUAssignMatrixByColumnSlice)
+        {
+            // Column 'des' of the result must equal column columnrange[des] of the source.
+            Matrix M0 = Matrix::RandomUniform(400, 50, -100, 100);
+
+            // deliberately unsorted column indices
+            vector<size_t> columnrange = { 0, 3, 5, 4 };
+            Matrix M1;
+            try
+            {
+                M1.AssignMatrixByColumnSlice(M0, columnrange);
+            }
+            catch (const exception& e)
+            {
+                printf("%s\n", e.what());
+                Assert::Fail();
+            }
+
+            // accumulate the squared difference over every row of each copied column
+            for (size_t des = 0; des < columnrange.size(); des++)
+            {
+                size_t src = columnrange[des];
+
+                double err = 0;
+                for (size_t r = 0; r < M0.GetNumRows(); r++)
+                {
+                    double diff = (M0(r, src) - M1(r, des));
+                    diff *= diff;
+                    err += diff;
+                }
+                Assert::AreEqual(0.0, err, 1e-7);   // expected value first, then actual, then tolerance
+            }
+
+        }
+
+    };
+}
\ No newline at end of file
diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp
index 79bf8d24e..26f4a8cd6 100644
--- a/Math/Math/CPUMatrix.cpp
+++ b/Math/Math/CPUMatrix.cpp
@@ -259,6 +259,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return slice;
     }
 
+    // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+    // Builds a fresh matrix via AssignRowSliceValuesOf(*this, startRow, numRows) and returns it by value.
+    template<class ElemType>
+    CPUMatrix<ElemType> CPUMatrix<ElemType>::RowSlice(size_t startRow, size_t numRows) const
+    {
+        // Range check written without the sum startRow + numRows, which can wrap around for size_t and let a bad range through.
+        if (startRow > m_numRows || numRows > m_numRows - startRow)
+            InvalidArgument("The row slice (%d+%d) is out of range of the source matrix (%d).", (int)startRow, (int)numRows, (int)m_numRows);
+        CPUMatrix<ElemType> slice;
+        slice.AssignRowSliceValuesOf(*this, startRow, numRows);
+        return slice;
+    }
+
     // set this(:, 0:numCols-1) = fromMatrix(:, startColumn : startColumn+numCols-1)
     // TODO: why not say *this = ColumnSlice()?
     template<class ElemType>
@@ -333,7 +346,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             }
         }
     }
-
+
     //for each column of a, we add all rows of a to this starting from startIndex
     template<class ElemType>
     CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows)
diff --git a/Math/Math/CPUMatrix.h b/Math/Math/CPUMatrix.h
index 83d63559b..0c7e7f34c 100644
--- a/Math/Math/CPUMatrix.h
+++ b/Math/Math/CPUMatrix.h
@@ -52,6 +52,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         CPUMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
         CPUMatrix<ElemType>& AssignColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
         CPUMatrix<ElemType>& SetColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
+        // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+        CPUMatrix<ElemType> RowSlice(size_t startRow, size_t numRows) const; // returns a separate matrix filled via AssignRowSliceValuesOf(); an out-of-range request is reported through InvalidArgument()
 
         void CopyColumnsStrided(const CPUMatrix<ElemType>& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride);
 
diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
index 29d5c96f4..1f9b81042 100644
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@@ -537,6 +537,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return slice;
     }
 
+    // Returns a new matrix filled via AssignRowSliceValuesOf(). BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::RowSlice(size_t startRow, size_t numRows) const
+    {
+        if (startRow > m_numRows || numRows > m_numRows - startRow) // avoids the sum startRow + numRows, which can wrap around for size_t
+            InvalidArgument("The row slice (%d+%d) is out of range of the source matrix (%d).", (int)startRow, (int)numRows, (int)m_numRows);
+        GPUMatrix<ElemType> slice(GetComputeDeviceId()); // the result is constructed with this matrix's compute device id
+        slice.AssignRowSliceValuesOf(*this, startRow, numRows);
+        return slice;
+    }
+
     template<class ElemType>
     GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
     {
diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h
index 07ef47683..f6fc150e5 100644
--- a/Math/Math/GPUMatrix.h
+++ b/Math/Math/GPUMatrix.h
@@ -142,6 +142,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         GPUMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
         GPUMatrix<ElemType>& AssignColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
         GPUMatrix<ElemType>& SetColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
+        // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+        GPUMatrix<ElemType> RowSlice(size_t startRow, size_t numRows) const; // returns a separate matrix constructed with GetComputeDeviceId() and filled via AssignRowSliceValuesOf()
 
         void CopyColumnsStrided(const GPUMatrix<ElemType>& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride);
 
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index 751345b3c..3c8d76cdf 100755
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -766,6 +766,46 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return slice;
     }
 
+    // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+    // Dense matrices only: forwards to CPUMatrix::RowSlice() or GPUMatrix::RowSlice() according to GetDeviceId().
+    template<class ElemType>
+    Matrix<ElemType> Matrix<ElemType>::RowSlice(size_t startRow, size_t numRow) const
+    {
+        const int deviceId = GetDeviceId();
+        Matrix<ElemType> result(deviceId);
+        result.m_preferredDeviceId = m_preferredDeviceId;
+
+        switch (GetMatrixType())
+        {
+        case MatrixType::DENSE:
+            if (deviceId != CPUDEVICE) // this branch works on m_GPUMatrix
+            {
+                if (result.m_GPUMatrix == nullptr)
+                    result.m_GPUMatrix = new GPUMatrix<ElemType>(static_cast<GPUMatrix<ElemType>&&>(m_GPUMatrix->RowSlice(startRow, numRow)));
+                else
+                    *result.m_GPUMatrix = static_cast<GPUMatrix<ElemType>&&>(m_GPUMatrix->RowSlice(startRow, numRow));
+                result.SetDataLocation(GPU, DENSE);
+            }
+            else // this branch works on m_CPUMatrix
+            {
+                if (result.m_CPUMatrix == nullptr)
+                    result.m_CPUMatrix = new CPUMatrix<ElemType>(static_cast<CPUMatrix<ElemType>&&>(m_CPUMatrix->RowSlice(startRow, numRow)));
+                else
+                    *result.m_CPUMatrix = static_cast<CPUMatrix<ElemType>&&>(m_CPUMatrix->RowSlice(startRow, numRow));
+                result.SetDataLocation(CPU, DENSE);
+            }
+            break;
+        case MatrixType::SPARSE:
+            NOT_IMPLEMENTED;
+            break;
+        default:
+            RuntimeError("Unknown matrix type");
+        }
+        return result;
+    }
+   
+
+
     template<class ElemType>
     Matrix<ElemType>& Matrix<ElemType>::AssignColumnSlice(const Matrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
     {            
diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h
index 9490adc41..b3a4b0258 100644
--- a/Math/Math/Matrix.h
+++ b/Math/Math/Matrix.h
@@ -153,7 +153,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const; 
 
         Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
-
+        // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+        Matrix<ElemType> RowSlice(size_t startRow, size_t numRows) const; // dense only (SPARSE hits NOT_IMPLEMENTED); forwards to the CPUMatrix/GPUMatrix RowSlice() for the current device
 
         // difference between AssignColumnSlice and SetColumnSlice 
         // AssignColumnSlice :      this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, startColumn: startColumn+numCols-1) 
diff --git a/Math/Math/NoGPU.cpp b/Math/Math/NoGPU.cpp
index 89f79586f..a776a3000 100644
--- a/Math/Math/NoGPU.cpp
+++ b/Math/Math/NoGPU.cpp
@@ -490,7 +490,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template<class ElemType> GPUMatrix<ElemType> GPUMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
     {
         GPUMatrix<ElemType> slice(0);
+        return slice; // startColumn/numCols are not used here; presumably a placeholder for builds without GPU support (NoGPU.cpp) - TODO confirm
+    }
 
+    template<class ElemType> GPUMatrix<ElemType> GPUMatrix<ElemType>::RowSlice(size_t startRow, size_t numRows) const
+    {
+        GPUMatrix<ElemType> slice(0); // startRow/numRows are not used here; presumably a placeholder for builds without GPU support (NoGPU.cpp) - TODO confirm
         return slice;
     }