diff --git a/MachineLearning/CNTK/CNTK.cpp b/MachineLearning/CNTK/CNTK.cpp index 1e1aed59e..41428acc5 100644 --- a/MachineLearning/CNTK/CNTK.cpp +++ b/MachineLearning/CNTK/CNTK.cpp @@ -1657,7 +1657,7 @@ void PrintBuiltInfo() fprintf(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_); #endif #ifdef _CUB_PATH_ - fprintf(stderr, "\t\tCUDA_PATH: %s\n", _CUB_PATH_); + fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_); #endif #ifdef _GIT_EXIST fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_); diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h index 4ea6f5a83..deaf37484 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h @@ -294,7 +294,7 @@ public: m_deviceId = EnforceOneGPUOnly(m_deviceId); // see EnforceOneGPUOnly() for comment on what this is } - DEVICEID_TYPE GetDeviceId() { return m_deviceId; } + DEVICEID_TYPE GetDeviceId() const { return m_deviceId; } unsigned long GetRandomSeedOffset() { return m_randomSeedOffset; } void SetRandomSeedOffset(unsigned long value) { m_randomSeedOffset = value; } diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 2296f3e4c..4dfd11bee 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -119,6 +119,25 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual ~IComputationNode() { } }; + // ======================================================================= + // This provide a interface for stateful node (e.g., DelayNodeBase) and definition of state + // This interface allows to Export and Import state from elsewhere + // It is needed when doing sub-minibatch implementation + // ======================================================================= + + class 
INodeState: public std::enable_shared_from_this + { + public: + virtual ~INodeState() {} + }; + + struct /*interface*/ IStateFulNode + { + typedef std::shared_ptr NodeStatePtr; + virtual NodeStatePtr ExportState() = 0; + virtual void ImportState(const NodeStatePtr& pImportedState) = 0; + }; + // ======================================================================= // ComputationNetworkOwnedNodeState -- class to collect ComputationNode members that are really owned by ComputationNetwork // These members are only to be set, changed, and read by ComputationNetwork code. @@ -1475,6 +1494,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { } }; + + // ======================================================================= // helper macro to ease access to base members in presence of C++ two-phase name lookup // ======================================================================= diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index c900963e0..20d850cbf 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -24,6 +24,51 @@ namespace Microsoft { namespace MSR { namespace CNTK { + // ----------------------------------------------------------------------- + // The following defines a state of a delay node which is going to be exported to others (saving for the next minibatch) + // ----------------------------------------------------------------------- + template + class DelayedValueNodeState: public INodeState + { + + public: + DelayedValueNodeState(int deviceID) : + m_cachedActivity((size_t)0, (size_t)0, deviceID), m_delayedActivationMBLayout(nullptr), m_isEmpty(true) + { } + void CacheDelayedMBLayout(const MBLayoutPtr& pMBLayout) + { + m_delayedActivationMBLayout = make_shared(); + m_delayedActivationMBLayout->CopyFrom(pMBLayout); + } + void CacheState(const Matrix& cachedActivity) + { + 
m_cachedActivity.SetValue(cachedActivity); + m_isEmpty = false; + } + void ExportDelayedMBLayout(MBLayoutPtr& pMBLayout) + { + pMBLayout->CopyFrom(m_delayedActivationMBLayout); + } + bool IsEmpty() + { + return m_isEmpty; + } + const Matrix& ExportCachedActivity() + { + return m_cachedActivity; + } + + protected: + Matrix m_cachedActivity; // 1 column per parallel sequence + // MBLayoutPtr m_shiftedMBLayout; + // Currently, we only support saving state for m_timeStep == 1 + // there is no need for this m_shiftedMBLayout if m_timeStep == 1 + MBLayoutPtr m_delayedActivationMBLayout; + bool m_isEmpty; // in some case + // (e.g., at the boundary of sentence end or begin/full utterance mode), we don't need to store state (but we do need to need know m_delayedActivationMBLayout) + }; + + // ----------------------------------------------------------------------- // DelayedValueNodeBase (input) -- abstract base class for PastValueNode and FutureValueNode to hold all shared code // The two differ in the step direction, some loop directions, and sequence-boundary flags. @@ -31,9 +76,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: 'direction' is really too general. signOfTimeOffset? 
template - class DelayedValueNodeBase : public ComputationNode, public ILateAttachingNode, public NumInputs<1> + class DelayedValueNodeBase : public ComputationNode, public + ILateAttachingNode, public IStateFulNode, public NumInputs<1> { typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; + typedef std::shared_ptr> DelayedNodeStatePtr; static const std::wstring TypeName() { return L"DelayedValue"; } private: void Init(size_t row_size, size_t col_size, ElemType initialActivationValue = (ElemType)DEFAULT_HIDDEN_ACTIVATION) @@ -352,6 +399,129 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + //======================================== + // implement the IStateFulNode interface + //======================================== + + virtual NodeStatePtr ExportState() override + { + NodeStatePtr pExportedState; + size_t nT = m_pMBLayout->GetNumTimeSteps(); + size_t nU = m_pMBLayout->GetNumParallelSequences(); + int dir = direction; + if (m_timeStep != 1) + { + // not support yet; give user a hint + RuntimeError("Currently importing/exporting state info for timeStep>1 is not supported. 
Contact erw@microsoft.com for more detail"); + } + if (dir == -1) // we look into past + { + bool allAtBoundary = true; + // if the current last frames are all sentence end or no feature , there is no need to carry on state info + if (m_pMBLayout->Is(nT-1, MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature)) + { + for (size_t u = 0; u < nU; u++) + { + if (!m_pMBLayout->Is(u, nT - 1, MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature)) + { + allAtBoundary = false; + break; + } + } + } + else + { + allAtBoundary = false; + } + + if (allAtBoundary) + { + auto pState = make_shared>(m_deviceId); + pState->CacheDelayedMBLayout(m_delayedActivationMBLayout); + // return an empty one + } + else + { + auto pState = make_shared>(m_deviceId); + //pState->CacheState(FunctionValues().Reshaped(nD*nU, nT).RowSlice(nD*(nT - 1), nD)); + pState->CacheState(m_delayedActivation.ColumnSlice((nT - 1)*nU, nU)); + pState->CacheDelayedMBLayout(m_delayedActivationMBLayout); + pExportedState = pState; + } + } + if (dir == 1) // we look into future + { + // TODO: check whether all at boundary and don't carry state if it is the case + size_t nT = m_pMBLayout->GetNumTimeSteps(); + size_t nU = m_pMBLayout->GetNumParallelSequences(); + bool allAtBoundary = true; + if (m_pMBLayout->Is(0, MinibatchPackingFlags::NoFeature | MinibatchPackingFlags::SequenceStart)) + { + for (size_t u = 0; u < nU; u++) + { + if (!m_pMBLayout->Is(u, 0, MinibatchPackingFlags::SequenceStart | MinibatchPackingFlags::NoFeature)) + { + allAtBoundary = false; + break; + } + } + } + + if (allAtBoundary) + { + auto pState = make_shared>(m_deviceId); + pState->CacheDelayedMBLayout(m_delayedActivationMBLayout); + pExportedState = pState; + } + else + { + auto pState = make_shared>(m_deviceId); + pState->CacheState(m_delayedActivation.ColumnSlice((nT-1)*nU, nU)); + pState->CacheDelayedMBLayout(m_delayedActivationMBLayout); + pExportedState = pState; + } + + + } + if (dir != -1 && dir != 1) + { 
+ RuntimeError("Unrecognized direction in DelayedValueNodeBase"); + } + return pExportedState; + } + virtual void ImportState(const NodeStatePtr& pImportedState) override + { + DelayedNodeStatePtr pState = dynamic_pointer_cast> (pImportedState); + + if (!pState) + RuntimeError("Expecting DelayValueNodeState after down casting"); + + pState->ExportDelayedMBLayout(m_delayedActivationMBLayout); // pstate copy to m_delayedActivationMBLayout + if (pState->IsEmpty()) + { + return; + } + + const Matrix& delayedActivation = pState->ExportCachedActivity(); + size_t nT = m_delayedActivationMBLayout->GetNumTimeSteps(); + size_t nU = m_delayedActivationMBLayout->GetNumParallelSequences(); + + int dir = direction; + if (dir == -1) // looking backward + { + m_delayedActivation.SetColumnSlice(delayedActivation, (nT - 1)*nU, nU); + } + if (dir == 1) + { + //m_delayedActivation.CopyColumnsStrided(delayedActivation, nU, 1, nT); + m_delayedActivation.SetColumnSlice(delayedActivation, 0, nU); + } + if (dir != -1 && dir == 1) + {// it is really a compile error ? 
+ RuntimeError("Unrecognized direction in DelayedValueNodeBase"); + } + + } protected: ElemType m_initialActivationValue; // starting value for hidden activation vector at boundary diff --git a/MachineLearning/CNTKSGDLib/DataReaderHelpers.h b/MachineLearning/CNTKSGDLib/DataReaderHelpers.h index a10acb927..17ea06071 100644 --- a/MachineLearning/CNTKSGDLib/DataReaderHelpers.h +++ b/MachineLearning/CNTKSGDLib/DataReaderHelpers.h @@ -10,6 +10,8 @@ #include #include "TrainingCriterionNodes.h" +//#define SMB_DEBUG + namespace Microsoft { namespace MSR { namespace CNTK { /*static*/ struct DataReaderHelpers @@ -166,4 +168,404 @@ namespace Microsoft { namespace MSR { namespace CNTK { } }; + // SubminibatchHelpers + // Helper for sub-minibatch implementation + // A sub-minibathc is a part of a minibatch which helps computing large minibatches that cannot load into GPU memory in one forward-backward computation + // The usage would be : + // SubminibatchHelpers sbhelper; + // for (;;) + // { + // size_t nsb=sb.GetMinibatchIntoCache(...); + // for (size_t i=0; i + class SubminibatchDispatcher + { + private: + typedef std::vector> Lattice; + typedef std::vector Uid; + typedef std::vector ExtrauttMap; + + typedef std::vector>* LatticePtr; + typedef std::vector* UidPtr; + typedef std::vector* ExtrauttMapPtr; + typedef std::map*> Matrices; + + + // member variables served as caching space + Matrices m_inputMatricesCache; + MBLayoutPtr m_MBLayoutCache; + LatticePtr m_LatticeCache; + UidPtr m_uidCache; + ExtrauttMapPtr m_extrauttmapCache; + shared_ptr> m_NetCriterionAccumulator; + shared_ptr> m_NetEvaluationAccumulator; + std::map>> m_NetStates; // m_NetStatefulNodes[node][i] caches the state of i-th subminibatch of node + + + Matrices m_CachedGraident; + // we also need to remember where to put into the net + MBLayoutPtr m_NetMBLayoutPtr; + std::map>> m_LearnableNodePtr; + // followings are lattice-related + Matrices m_NetInputMatrixPtr; + LatticePtr m_NetLatticePtr; + UidPtr 
m_NetUidPtr; + ExtrauttMapPtr m_NetExtrauttMapPtr; + // we remember the pointer to the learnable Nodes so that we can accumulate the gradient once a sub-minibatch is done + + + size_t m_numParallelSequences; // number of paralle sequence in the cached matrix and MBLayout + size_t m_numSubminibatches; // how many subminibatches we are going to use ? + + std::vector>> m_NetCriterionNodes; + std::vector>> m_NetEvaluationNodes; + std::map> m_NetStatefulNodes; // we need to Export/Import states of stateful nodes when we swtich subminibatches + + private: + + void EnumerateStatefulNodeWithRoot(ComputationNetwork& net, ComputationNodeBasePtr root, std::map>& statefulnode) + { + std::list evalorder = net.GetEvalOrder(root, false); + for (auto& x : evalorder) + { + wstring name = x->GetName(); + if (statefulnode.find(name )!=statefulnode.end()) continue; // already in the list + shared_ptr pNode = dynamic_pointer_cast(x); + if (pNode) + { + statefulnode[name] = pNode; + } + } + } + std::map> EnumerateStatefulNode(ComputationNetwork& net, + const std::vector& criterionNode, + const std::vector& evaluationNode) + { + std::map> statefulnodes; + for (auto& root : criterionNode) + { + EnumerateStatefulNodeWithRoot(net, root, statefulnodes); + } + for (auto& root : evaluationNode) + { + EnumerateStatefulNodeWithRoot(net, root, statefulnodes); + } + return statefulnodes; + } + + public: + SubminibatchDispatcher() : + m_MBLayoutCache(nullptr), m_LatticeCache(nullptr), m_uidCache(nullptr), m_extrauttmapCache(nullptr) + { } + + void Init(ComputationNetworkPtr & net, + const std::list& learnableNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes) + { + m_MBLayoutCache = make_shared(); + m_NetCriterionAccumulator = make_shared>(1, 1, net->GetDeviceId()); + m_NetEvaluationAccumulator = make_shared>(1, evaluationNodes.size(), net->GetDeviceId()); + // remember ptr to learnableNode + for (auto x : learnableNodes) + { + shared_ptr> pLearnableNode = 
dynamic_pointer_cast>(x); + wstring nodename = x->NodeName(); + m_LearnableNodePtr[nodename] = pLearnableNode; + } + for (auto& x : criterionNodes) + { + m_NetCriterionNodes.push_back(dynamic_pointer_cast>(x)); + } + for (auto& x : evaluationNodes) + { + m_NetEvaluationNodes.push_back(dynamic_pointer_cast>(x)); + } + m_NetCriterionAccumulator->SetValue((ElemType)0); + m_NetEvaluationAccumulator->SetValue((ElemType)0); + + // emulate all the nodes, find nodes that have state + m_NetStatefulNodes = EnumerateStatefulNode(*net, criterionNodes, evaluationNodes); + for (auto x : m_NetStatefulNodes) + { + wstring name = x.first; + m_NetStates[name] = vector>(); + } + } + + ~SubminibatchDispatcher() + { + // TODO: remove these by using shared_ptr + delete m_LatticeCache; + delete m_uidCache; + delete m_extrauttmapCache; + + for (auto x : m_inputMatricesCache) + { + delete x.second; + } + + for (auto x : m_CachedGraident) + { + delete x.second; + } + } + size_t GetMinibatchIntoCache( IDataReader& trainSetDataReader, + ComputationNetwork& net, + std::map*> & inputMatrices, + size_t requestedSubminibatches) + { + // first, remember interface to the net + m_NetMBLayoutPtr = net.GetMBLayoutPtr(); + m_NetInputMatrixPtr = inputMatrices; + + // second, get data from reader, stored it in cache + // 1. for each key, allocate the specific matrix on device + for (auto pa : inputMatrices) + { + wstring name = pa.first; + Matrix* M= pa.second; + if (m_inputMatricesCache.find(name) == m_inputMatricesCache.end()) + { + m_inputMatricesCache[name] = new Matrix(*M, M->GetDeviceId()); // deep copy from M + } + else + { + m_inputMatricesCache[name]->SetValue(*M); + } + } + // 2. 
MBlayout + m_MBLayoutCache->CopyFrom(net.GetMBLayoutPtr()); + size_t nParallelSequences = m_MBLayoutCache->GetNumParallelSequences(); + + if (m_NetCriterionNodes[0] != nullptr && (m_NetCriterionNodes[0]->OperationName() == L"SequenceWithSoftmax")) + { + // auto node = dynamic_pointer_cast>(criterionNode); + NOT_IMPLEMENTED; + // TODO: implement this for Sequence training !!! + } + + // subminibatches are cutted at the parallel sequence level; + // if #requested subminibatch is larger than #parallel sequence, + // we cannot split further; instead, each subsequence become a subminibatch + size_t actualnumSubminibatches = requestedSubminibatches > nParallelSequences ? nParallelSequences : requestedSubminibatches; + + // 3. third, allocate space for accumulated gradient + for (auto& n: m_LearnableNodePtr) + { + auto node = n.second; + if (node->IsParameterUpdateRequired()) + { + wstring nodeName = node->GetName(); + shared_ptr> pLearnableNode = node; + auto funvalue = pLearnableNode->FunctionValues(); // gradient may not be allocated when this function is first called + size_t nrow = funvalue.GetNumRows(); + size_t ncol = funvalue.GetNumCols(); + if (m_CachedGraident.find(nodeName) == m_CachedGraident.end()) + { + // not allocated yet + m_CachedGraident[nodeName] = new Matrix(nrow, ncol, funvalue.GetDeviceId()); + m_CachedGraident[nodeName]->SetValue((ElemType)0); + } + } + } + // 4. 
for stateful node + for (auto x : m_NetStatefulNodes) + { + wstring name = x.first; + if (m_NetStates[name].empty()) + { + // this only happens in the first minibatch in an epoch + m_NetStates[name].resize(actualnumSubminibatches); + } + } + + return (m_numSubminibatches = actualnumSubminibatches); + } + + void GetSubMinibatchToNet(size_t iSubminibatch) + { + Matrices decimatedMatrices; + MBLayoutPtr decimatedLayout; + DataReaderHelpers::DecimateMinibatch(m_inputMatricesCache, decimatedMatrices, m_MBLayoutCache, decimatedLayout, m_numSubminibatches, iSubminibatch); + // NOTE: decimatedMatrices must be released by caller + + //m_NetInputMatrixPtr = decimatedMatrices; + for (auto& x : decimatedMatrices) + { + wstring name = x.first; + m_NetInputMatrixPtr[name]->SetValue(*x.second); + delete x.second; // TODO: is it safe to delete here ? Yes! SetValue call cuda memcpy so it is a blocking call + x.second = nullptr; + } + + m_NetMBLayoutPtr->CopyFrom(decimatedLayout); + + for (auto& x : m_NetStatefulNodes) + { + wstring name = x.first; + shared_ptr pNode = x.second; + if (m_NetStates[name][iSubminibatch]) + pNode->ImportState(m_NetStates[name][iSubminibatch]); + } + + } + // TODO: encapsulate it into a destructor !!! Note: Cannot throw exceptions in destructor. 
+ void DoneWithCurrentSubMinibatch(size_t iSubminibatch) + { + // accumulate gradient here + for (auto x : m_CachedGraident) + { + wstring nodename = x.first; + if (m_LearnableNodePtr.find(nodename) == m_LearnableNodePtr.end()) + { + RuntimeError("ERROR: in DoneWithCurrentSubMinibatch: node %ls not found in LeanrableNode", nodename.c_str()); + } + shared_ptr> pNode = m_LearnableNodePtr[nodename]; + m_CachedGraident[nodename]->operator+=(pNode->GradientValues()); + pNode->GradientValues().SetValue((ElemType)0); + } + // accumulate criterion value + Matrix::AddElementToElement( + m_NetCriterionNodes[0]->FunctionValues() , 0, 0, + *m_NetCriterionAccumulator, 0, 0 + ); + m_NetCriterionNodes[0]->FunctionValues().SetValue((ElemType)0); + // accumulate evaluation value + for (size_t i = 0; i < m_NetEvaluationNodes.size(); i++) + { + Matrix::AddElementToElement( + m_NetEvaluationNodes[i]->FunctionValues(), 0, 0, + *m_NetEvaluationAccumulator, 0, i + ); + m_NetEvaluationNodes[i]->FunctionValues().SetValue((ElemType)0); + } + + // Export node state + for (auto& x : m_NetStatefulNodes) + { + wstring name = x.first; + m_NetStates[name][iSubminibatch] = x.second->ExportState(); + } + } + void DoneWithCurrentMinibatch() + { + for (auto& x : m_CachedGraident) + { + wstring name = x.first; + Matrix* accumulategrad = x.second; + + if (m_LearnableNodePtr.find(name) == m_LearnableNodePtr.end()) + { + // should never happen, remove this code later + RuntimeError("ERROR: in DoneWithCurrentSubMinibatch: node %ls not found in LearnableNode", name.c_str()); + } + m_LearnableNodePtr[name]->GradientValues().SetValue(*accumulategrad); + x.second->SetValue((ElemType)0); + } + // also revert net.m_MBLayoutPtr + m_NetMBLayoutPtr->CopyFrom(m_MBLayoutCache); + + //m_NetCriterionNodes[0]->FunctionValues().SetValue((ElemType)0); + Matrix::AddElementToElement( + *m_NetCriterionAccumulator, 0, 0, + m_NetCriterionNodes[0]->FunctionValues(), 0, 0 + ); + m_NetCriterionAccumulator->SetValue((ElemType)0); 
+ + for (size_t i = 0; i < m_NetEvaluationNodes.size(); i++) + { + //m_NetEvaluationNodes[i]->FunctionValues().SetValue((ElemType)0); + Matrix::AddElementToElement( + *m_NetEvaluationAccumulator, 0, i, + m_NetEvaluationNodes[i]->FunctionValues(), 0, 0 + ); + } + m_NetEvaluationAccumulator->SetValue((ElemType)0); + } + +#ifdef SMB_DEBUG + + template + void WriteMatrix(const Matrix& mat, string filename) + { + ElemType* pArray = mat.CopyToArray(); + size_t nRows = mat.GetNumRows(); + size_t nCols = mat.GetNumCols(); + FILE* fp = fopenOrDie(filename, "w"); + for (size_t r = 0; r < nRows; r++) + { + for (size_t c = 0; c < nCols; c++) + { + fprintf(fp, "%.9f ", pArray[nRows*c + r]); + } + fprintf(fp, "\n"); + } + fcloseOrDie(fp); + delete[]pArray; + } + void WriteMBLayout(MBLayoutPtr pMBLayout, wstring filename) + { + size_t nT = pMBLayout->GetNumTimeSteps(); + size_t nU = pMBLayout->GetNumParallelSequences(); + + FILE* fp = fopenOrDie(filename, L"w"); + for (size_t u = 0; u < nU; u++) + { + for (size_t t = 0; t < nT; t++) + { + MinibatchPackingFlags flag = pMBLayout->Get(u, t); + fprintf(fp, "%d\t", (int)flag); + } + fprintf(fp, "\n"); + } + fcloseOrDie(fp); + } + void WriteInputMatriceAndMBLayout(size_t mbID, size_t smbID) + { + wstring node = L"features"; + wstring filename = msra::strfun::wstrprintf(L"tmp/%s.%d.%d", node.c_str(), mbID, smbID); + if (m_NetInputMatrixPtr.find(node) != m_NetInputMatrixPtr.end()) + { + WriteMatrix, ElemType>(*m_NetInputMatrixPtr[node], msra::strfun::wcstombs(filename)); + } + wstring fn = msra::strfun::wstrprintf(L"tmp/Layout.%d.%d", mbID, smbID); + WriteMBLayout(m_NetMBLayoutPtr, fn); + } + void WriteInputMatriceAndMBLayout(Matrices m, MBLayoutPtr pMBLayout, size_t mbID) + { + wstring filename = msra::strfun::wstrprintf(L"tmp/features.%d", mbID); + wstring fn = msra::strfun::wstrprintf(L"tmp/layout.%d", mbID); + if (m.find(L"features") != m.end()) + { + WriteMatrix, ElemType>(*m[L"features"], msra::strfun::wcstombs(filename)); + } + 
WriteMBLayout(pMBLayout, fn); + } + + void WriteGradient(size_t mbID) + { + wstring node = L"LSTMoutput1.bias"; + wstring filename = msra::strfun::wstrprintf(L"%s.%d", L"tmp/gradient", mbID); + if (m_CachedGraident.find(node) != m_CachedGraident.end()) + { + WriteMatrix, ElemType>(*m_CachedGraident[node], msra::strfun::wcstombs(filename)); + } + } + + void WriteGradient(const Matrix& mat, wstring fn) + { + WriteMatrix, ElemType>(mat, msra::strfun::wcstombs(fn)); + } +#endif // SMB_DEBUG + }; + }}} diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp index cb033cb9a..c9053e23d 100644 --- a/MachineLearning/CNTKSGDLib/SGD.cpp +++ b/MachineLearning/CNTKSGDLib/SGD.cpp @@ -110,6 +110,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // truncated = truncation length m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector{ 256 }))); m_truncated = configSGD(L"truncated", false); + m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", ConfigRecordType::Array(intargvector(vector < int > {0}))); // the number of samples in each epoch (0 means, use all the samples in each epoch). 
m_epochSize = configSGD(L"epochSize", (size_t)0); @@ -1697,6 +1698,22 @@ namespace Microsoft { namespace MSR { namespace CNTK { refNet->StartEvaluateMinibatchLoop(refNode); } + SubminibatchDispatcher smbDisplatcher; + size_t samplesInRAM = m_maxSamplesInRAM[epochNumber]; + // convert it to SubminibatchRequested + size_t numSubminibatchRequested = 0; + if (samplesInRAM > 0) // if samplesInRAM = 0 , we will not use subminibatch dispatcher + { + size_t nParallelSequences = trainSetDataReader->GetNumParallelSequences(); + size_t estimatedMBSize = tunedMBSize * nParallelSequences; + numSubminibatchRequested = (size_t)std::ceil(estimatedMBSize / samplesInRAM); + } + if (numSubminibatchRequested > 1) // only use subminibatch dispatcher if more than 1 subminibatch is required + { + smbDisplatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes); + } + size_t actualNumSubminibatch=0; + // Attemps to compute the error signal for the whole utterance, which will // be fed to the neural network as features. 
Currently it is a workaround // for the two-forward-pass sequence and ctc training, which allows @@ -1710,10 +1727,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)", (int)g_mpi->CurrentNodeRank(), (int)g_mpi->NumNodesInUse(), (int)m_numGradientBits); } - if (useDistributedMBReading) { - fprintf(stderr, ", Distributed reading is ENABLED"); + fprintf(stderr, ", distributed reading is ENABLED"); + } + if (numSubminibatchRequested > 0) + { + fprintf(stderr, ", with %d Max Samples in RAM", (int)samplesInRAM); } fprintf(stderr, ".\n"); @@ -1735,6 +1755,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { nSamplesSinceLastModelSync += actualMBSize; + if (numSubminibatchRequested > 0) + { + actualNumSubminibatch = smbDisplatcher.GetMinibatchIntoCache(*trainSetDataReader, *net, *inputMatrices, numSubminibatchRequested); + } + else + { + actualNumSubminibatch = 0; + } + // node data was changed // TODO: move this to that function as well--just tired to pass everything as arguments // TODO: We should do this right after the GetMinibatch() call, since that's where these changed. 
@@ -1772,26 +1801,30 @@ namespace Microsoft { namespace MSR { namespace CNTK { //compute eval node first since when gradient is computed the forward function values //may be changed and need to be recomputed when gradient and function value share the same matrix - net->Evaluate(evaluationNodes); + if (actualNumSubminibatch > 0) + { + for (size_t ismb = 0; ismb < actualNumSubminibatch; ismb++) + { + smbDisplatcher.GetSubMinibatchToNet(ismb); +#ifdef SMB_DEBUG + //smbhelper.WriteInputMatriceAndMBLayout(numMBsRun, ismb); +#endif + ComputationNetwork::UpdateEvalTimeStamps(featureNodes); + ComputationNetwork::UpdateEvalTimeStamps(labelNodes); + ForwardBackward(*net, evaluationNodes, criterionNodes[0], learnRatePerSample > 0.01 * m_minLearnRate); + smbDisplatcher.DoneWithCurrentSubMinibatch(ismb); + } +#ifdef SMB_DEBUG + //smbhelper.WriteGradient(numMBsRun); +#endif + smbDisplatcher.DoneWithCurrentMinibatch(); - // only compute gradient when learning rate is large enough - if (learnRatePerSample > m_minLearnRate * 0.01) - { - // use only the first criterion. Is there any possibility to use more? - // ============================== - // forward prop, back-prop --this is where the magic happens baby, what we have all be waiting for! - // ============================== - net->ComputeGradient(criterionNodes[0]); - // TODO: we should split Evaluate() out from ComputeGradient(), then call them ForwardProp() and BackProp(), for clarity } - else + else { - // use only the first criterion. Is there any possibility to use more? - // ============================== - // forward prop - // ============================== - net->Evaluate(criterionNodes[0]); + ForwardBackward(*net, evaluationNodes, criterionNodes[0], learnRatePerSample > 0.01 * m_minLearnRate); } + } // if (actualMBSize > 0) // Some labels may be missing (e.g. forced alignment failed, or being gaps due to packing parallel sequences). 
diff --git a/MachineLearning/CNTKSGDLib/SGD.h b/MachineLearning/CNTKSGDLib/SGD.h index ee268e1ec..18efd9246 100644 --- a/MachineLearning/CNTKSGDLib/SGD.h +++ b/MachineLearning/CNTKSGDLib/SGD.h @@ -150,6 +150,14 @@ protected: // We really should only read it in SGD and pass it ourselves on to the Reader, instead of it being a Reader parameter. // BUGBUG: If m_truncated, then m_mbSize is interpreted as truncation length; the actual MB size is a combination of that and the #parallel sequences specified in the reader. // TODO: do not specify 'Truncated' but 'TruncatedLength', set m_truncated so given, and let m_mbSize control how many #parallel sequences the reader is allowed to pack into an MB. + intargvector m_maxSamplesInRAM; + // This is related with subminibatch implementation + // maxSamplesInRAM denotes how many samples we used in forward-backward on net. + // Due to the GPU memory limitations, it is sometime not possible to hold the m_mbSize in RAM. + // To mitigate this issue, we adopt the sub-minibatch implementation, where + // each m_mbSize[epoch] is divided by a few sub-minibatch of which size will be no more than m_maxSamplesInRAM[epoch] + // a forward-backward is performed for each sub-minibathch; a model update is performed after each minibatch + // the number of samples in each epoch (0 means, use all the samples in each epoch). size_t m_epochSize; @@ -485,6 +493,28 @@ protected: private: int SGDTrace(FILE *__restrict __stream, const char *__restrict __format, ...); + void ForwardBackward(ComputationNetwork& net, const std::vector& evalNodes, shared_ptr criterionNode, bool dobackpropogate=true) + { + net.Evaluate(evalNodes); + // only compute gradient when learning rate is large enough + if (dobackpropogate) + { + // use only the first criterion. Is there any possibility to use more? + // ============================== + // forward prop, back-prop --this is where the magic happens baby, what we have all be waiting for! 
+ // ============================== + net.ComputeGradient(criterionNode); + // TODO: we should split Evaluate() out from ComputeGradient(), then call them ForwardProp() and BackProp(), for clarity + } + else + { + // use only the first criterion. Is there any possibility to use more? + // ============================== + // forward prop + // ============================== + net.Evaluate(criterionNode); + } + } }; }}} diff --git a/Math/CNTKMathTest/CPUMatrixUnitTests.cpp b/Math/CNTKMathTest/CPUMatrixUnitTests.cpp new file mode 100644 index 000000000..5c581efe1 --- /dev/null +++ b/Math/CNTKMathTest/CPUMatrixUnitTests.cpp @@ -0,0 +1,712 @@ +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +#include "stdafx.h" +#include "CppUnitTest.h" +#include "..\Math\CPUMatrix.h" +#define DEBUG_FLAG 1 +using namespace Microsoft::MSR::CNTK; + +#pragma warning (disable: 4305) + +using namespace Microsoft::VisualStudio::CppUnitTestFramework; + +namespace CNTKMathTest +{ + TEST_CLASS(CPUMatrixUnitTest) + { + //typedef CPUSingleMatrix Matrix; + typedef CPUDoubleMatrix Matrix; + + public: + static void DebugPrint(FILE* gpuDebugFile, Matrix M, const char* str, const bool colwiseVec = true) + { + fprintf(gpuDebugFile, "\n %s\n", str); + const size_t matNumCol = M.GetNumCols(); + const size_t elemNum = M.GetNumElements(); + Matrix M1 = M.Transpose(); + double* pArray = M1.GetArray(); + if (colwiseVec) + { + for (size_t i = 0; i < elemNum; i++) + { + + fprintf(gpuDebugFile, "%3d ", (int)pArray[i]); + if ( (i+1)% matNumCol == 0) + fprintf(gpuDebugFile, "\n"); + } + } + //const size_t matNumRow = M.GetNumRows(); + //for (int i = 0; i < matNumRow; i++) + //{ + // for (int j = 0; j < matNumCol; j++) + // { + // fprintf(gpuDebugFile, "%3d ", M(i,j)); + // //if ( (j+1)% matNumCol == 0) + // } + // fprintf(gpuDebugFile, "\n"); + //} + } + TEST_METHOD(CPUMatrixConsturctors) + { + Matrix M0; + Assert::IsTrue(M0.IsEmpty()); + + M0.Resize(2,3); + 
Assert::IsFalse(M0.IsEmpty()); + Assert::AreEqual(2,M0.GetNumRows()); + Assert::AreEqual(3,M0.GetNumCols()); + Assert::AreEqual(6,M0.GetNumElements()); + + M0(0,0) = 1; M0(1,2) = 2; + Assert::IsTrue(M0(0,0) == 1); + Assert::IsTrue(M0(1,2) == 2); + + Matrix M1(12,53); + Assert::AreEqual(12,M1.GetNumRows()); + Assert::AreEqual(53,M1.GetNumCols()); + + + float *fArray = new float[6]; + fArray[0] = 1; fArray[1] = 2; fArray[2] = 3; + fArray[3] = 4; fArray[4] = 5; fArray[5] = 6; + CPUMatrix M2(2, 3, fArray, matrixFlagNormal); + Assert::AreEqual(M2(0,0), 1); + Assert::AreEqual(M2(0,1), 3); + Assert::AreEqual(M2(0,2), 5); + Assert::AreEqual(M2(1,0), 2); + Assert::AreEqual(M2(1,1), 4); + Assert::AreEqual(M2(1,2), 6); + + double *dArray = new double[6]; + dArray[0] = 1; dArray[1] = 2; dArray[2] = 3; + dArray[3] = 4; dArray[4] = 5; dArray[5] = 6; + CPUMatrix M3(2, 3, dArray, matrixFormatRowMajor); + Assert::AreEqual(M3(0,0), 1); + Assert::AreEqual(M3(0,1), 2); + Assert::AreEqual(M3(0,2), 3); + Assert::AreEqual(M3(1,0), 4); + Assert::AreEqual(M3(1,1), 5); + Assert::AreEqual(M3(1,2), 6); + + Matrix M4(M0); + Assert::IsTrue(M4.IsEqualTo(M0)); + + Matrix M5 = M0; + Assert::IsTrue(M5.IsEqualTo(M0)); + } + + TEST_METHOD(CPUMatrixAddAndSub) + { + Matrix M0(2,3); + M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3; + M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6; + + Matrix M1(2,3); + M1(0,0) = 11; M1(0,1) = 12; M1(0,2) = 13; + M1(1,0) = 14; M1(1,1) = 15; M1(1,2) = 16; + + Matrix M2(2,3); + M2(0,0) = 12; M2(0,1) = 14; M2(0,2) = 16; + M2(1,0) = 18; M2(1,1) = 20; M2(1,2) = 22; + + Matrix MC(2,1); + MC(0,0) = 10; + MC(1,0) = 10; + + Matrix MR(1,3); + MR(0,0) = 10; MR(0,1) = 10; MR(0,2) = 10; + + Matrix MS(1,1); + MS(0,0) = 10; + + Matrix M3 = M2 - M0; + Assert::IsTrue(M3.IsEqualTo(M1)); + + M3 += M0; + Assert::IsTrue(M3.IsEqualTo(M2)); + + M3 = M0 + 10; + Assert::IsTrue(M3.IsEqualTo(M1)); + + M3 -= 10; + Assert::IsTrue(M3.IsEqualTo(M0)); + + M3 = M1 + M0; + Assert::IsTrue(M3.IsEqualTo(M2)); + + M3 -= M0; + 
Assert::IsTrue(M3.IsEqualTo(M1)); + + M3 = M1 - 10; + Assert::IsTrue(M3.IsEqualTo(M0)); + + M3 += 10; + Assert::IsTrue(M3.IsEqualTo(M1)); + + M3 -= MC; + Assert::IsTrue(M3.IsEqualTo(M0)); + + M3 += MC; + Assert::IsTrue(M3.IsEqualTo(M1)); + + M3 -= MR; + Assert::IsTrue(M3.IsEqualTo(M0)); + + M3 += MR; + Assert::IsTrue(M3.IsEqualTo(M1)); + + M3.AssignDifferenceOf(M3, MS); + Assert::IsTrue(M3.IsEqualTo(M0)); + } + + TEST_METHOD(CPUMatrixMultiAndDiv) + { + Matrix M0(2,3); + M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3; + M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6; + + Matrix M00(2,3); + M00(0,0) = 10; M00(0,1) = 20; M00(0,2) = 30; + M00(1,0) = 40; M00(1,1) = 50; M00(1,2) = 60; + + Matrix M1(2,3); + M1.Reshape(3,2); + M1(0,0) = 11; M1(0,1) = 15; + M1(1,0) = 14; M1(1,1) = 13; + M1(2,0) = 12; M1(2,1) = 16; + + Matrix M2(2,2); + M2(0,0) = 75; M2(0,1) = 89; + M2(1,0) = 186; M2(1,1) = 221; + + Matrix M3 = M0 * M1; + Assert::IsTrue(M3.IsEqualTo(M2)); + + M3 = M0 * 10; + Assert::IsTrue(M3.IsEqualTo(M00)); + + M3 = M3 / 10; + Assert::IsTrue(M3.IsEqualTo(M0)); + + M3 *= 10; + Assert::IsTrue(M3.IsEqualTo(M00)); + + M3 /= 10; + Assert::IsTrue(M3.IsEqualTo(M0)); + + Matrix::MultiplyAndWeightedAdd(1, M0, false, M1, false, 0, M3); + Assert::IsTrue(M3.IsEqualTo(M2)); + + M1.Reshape(2,3); + Matrix::MultiplyAndWeightedAdd(1, M0, false, M1, true, 0, M3); + M2(0,0) = 74; M2(0,1) = 92; + M2(1,0) = 182; M2(1,1) = 227; + Assert::IsTrue(M3.IsEqualTo(M2)); + + Matrix::MultiplyAndWeightedAdd(10, M0, false, M1, true, 2, M3); + M2(0,0) = 888; M2(0,1) = 1104; + M2(1,0) = 2184; M2(1,1) = 2724; + Assert::IsTrue(M3.IsEqualTo(M2)); + + Matrix::MultiplyAndWeightedAdd(1, M0, true, M1, false, 0, M3); + M2.Resize(3,3); + M2(0,0) = 67; M2(0,1) = 72; M2(0,2) = 77; + M2(1,0) = 92; M2(1,1) = 99; M2(1,2) = 106; + M2(2,0) = 117; M2(2,1) = 126; M2(2,2) = 135; + Assert::IsTrue(M3.IsEqualTo(M2)); + } + + TEST_METHOD(CPUMatrixElementOps) + { + Matrix M0(2,3); + M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3; + M0(1,0) = 4; M0(1,1) = 5; 
M0(1,2) = 6; + + Matrix M00(2,3); + M00(0,0) = 1.0; M00(0,1) = 1/2.0; M00(0,2) = 1/3.0; + M00(1,0) = 1/4.0; M00(1,1) = 1/5.0; M00(1,2) = 1/6.0; + + Matrix M1(2,3); + M1(0,0) = 1; M1(0,1) = 1; M1(0,2) = 1; + M1(1,0) = 1; M1(1,1) = 1; M1(1,2) = 1; + + Matrix M3; + M3.AssignElementProductOf(M0, M00); + Assert::IsTrue(M3.IsEqualTo(M1, 0.0001)); + + M3 = M0 ^ 4; + Matrix M2(2,3); + M2(0,0) = 1; M2(0,1) = 16; M2(0,2) = 81; + M2(1,0) = 256; M2(1,1) = 625; M2(1,2) = 1296; + Assert::IsTrue(M3.IsEqualTo(M2)); + + M3.SetValue(M0); + M3 ^= 4; + Assert::IsTrue(M3.IsEqualTo(M2)); + + M3.SetValue(M0); + M3.ElementMultiplyWith(M00); + Assert::IsTrue(M3.IsEqualTo(M1)); + + M3.SetValue(M0); + M3.ElementInverse(); + Assert::IsTrue(M3.IsEqualTo(M00)); + + M2(0,0) = 0.7311; M2(0,1) = 0.8808; M2(0,2) = 0.9526; + M2(1,0) = 0.9820; M2(1,1) = 0.9933; M2(1,2) = 0.9975; + M3.AssignElementDivisionOf(M2, M0); + M2.ElementMultiplyWith(M00); + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.SetValue(M0); + M3.InplaceSigmoid(); + M2(0,0) = 0.7311; M2(0,1) = 0.8808; M2(0,2) = 0.9526; + M2(1,0) = 0.9820; M2(1,1) = 0.9933; M2(1,2) = 0.9975; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.SetValue(M0); + M3.InplaceTanh(); + M2(0,0) = 0.7616; M2(0,1) = 0.9640; M2(0,2) = 0.9951; + M2(1,0) = 0.9993; M2(1,1) = 0.9999; M2(1,2) = 1.0000; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.SetValue(M0); + M3.InplaceLogSoftmax(true); + M3.InplaceExp(); + M2(0,0) = 0.0474; M2(0,1) = 0.0474; M2(0,2) = 0.0474; + M2(1,0) = 0.9526; M2(1,1) = 0.9526; M2(1,2) = 0.9526; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.SetValue(M0); + M3.InplaceLogSoftmax(false); + M3.InplaceExp(); + M2(0,0) = 0.0900; M2(0,1) = 0.2447; M2(0,2) = 0.6652; + M2(1,0) = 0.0900; M2(1,1) = 0.2447; M2(1,2) = 0.6652; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.SetValue(M0); + M3.InplaceHardmax(true); + M2(0, 0) = 0.0; M2(0, 1) = 0.0; M2(0, 2) = 0.0; + M2(1, 0) = 1.0; M2(1, 1) = 1.0; M2(1, 2) = 1.0; + 
Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.SetValue(M0); + M3.InplaceHardmax(false); + M2(0, 0) = 0.0; M2(0, 1) = 0.0; M2(0, 2) = 1.0; + M2(1, 0) = 0.0; M2(1, 1) = 0.0; M2(1, 2) = 1.0; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.SetValue(M0); + M3.InplaceSqrt(); + M2(0,0) = 1; M2(0,1) = 1.4142; M2(0,2) = 1.7321; + M2(1,0) = 2; M2(1,1) = 2.2361; M2(1,2) = 2.4495; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.SetValue(M0); + M3.InplaceExp(); + M2(0,0) = 2.7183; M2(0,1) = 7.3891; M2(0,2) = 20.0855; + M2(1,0) = 54.5982; M2(1,1) = 148.4132; M2(1,2) = 403.4288; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.SetValue(M0); + M3.InplaceExp(); + M2(0,0) = 2.7183; M2(0,1) = 7.3891; M2(0,2) = 20.0855; + M2(1,0) = 54.5982; M2(1,1) = 148.4132; M2(1,2) = 403.4288; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M3.InplaceLog(); + Assert::IsTrue(M3.IsEqualTo(M0, 0.0001)); + + M3.SetValue(M0); + M3.InplaceTruncateBottom(2); + M2(0,0) = 2; M2(0,1) = 2; M2(0,2) = 3; + M2(1,0) = 4; M2(1,1) = 5; M2(1,2) = 6; + Assert::IsTrue(M3.IsEqualTo(M2)); + + M3.SetValue(M0); + M3.InplaceTruncateTop(4); + M2(0,0) = 1; M2(0,1) = 2; M2(0,2) = 3; + M2(1,0) = 4; M2(1,1) = 4; M2(1,2) = 4; + Assert::IsTrue(M3.IsEqualTo(M2)); + + double pi = 3.14159265358979323846264338327950288419716939937510; + + Matrix M_Trig(2,3); + M_Trig(0,0) = 0; M_Trig(0,1) = pi/2.0; M_Trig(0,2) = pi; + M_Trig(1,0) = 3.0*pi/2.0; M_Trig(1,1) = 2.0*pi; M_Trig(1,2) = 5.0*pi/2.0; + + Matrix M_Cos(2,3); + M_Cos.SetValue(M_Trig); + + Matrix M_Cos_expected(2,3); + M_Cos_expected(0,0) = 1; M_Cos_expected(0,1) = 0; M_Cos_expected(0,2) = -1; + M_Cos_expected(1,0) = 0; M_Cos_expected(1,1) = 1; M_Cos_expected(1,2) = 0; + + M_Cos.InplaceCosine(); + Assert::IsTrue(M_Cos.IsEqualTo(M_Cos_expected, 0.0001)); + + M_Cos.SetValue(M_Trig); + M_Cos.AssignCosineOf(M_Trig); + Assert::IsTrue(M_Cos.IsEqualTo(M_Cos_expected, 0.0001)); + + Matrix M_NegSine(2,3); + M_NegSine.SetValue(M_Trig); + + Matrix M_NegSine_expected(2,3); + 
M_NegSine_expected(0,0) = 0; M_NegSine_expected(0,1) = -1; M_NegSine_expected(0,2) = 0; + M_NegSine_expected(1,0) = 1; M_NegSine_expected(1,1) = 0; M_NegSine_expected(1,2) = -1; + + M_NegSine.InplaceNegativeSine(); + Assert::IsTrue(M_NegSine.IsEqualTo(M_NegSine_expected, 0.0001)); + + M_NegSine.SetValue(M_Trig); + M_NegSine.AssignNegativeSineOf(M_Trig); + Assert::IsTrue(M_NegSine.IsEqualTo(M_NegSine_expected, 0.0001)); + } + + TEST_METHOD(CPUMatrixNorms) + { + Matrix M0(2,3); + M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3; + M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6; + + Matrix M3; + M0.VectorNorm1(M3, true); + Matrix M2(1, 3); + M2(0,0) = 5; M2(0,1) = 7; M2(0,2) = 9; + Assert::IsTrue(M3.IsEqualTo(M2)); + + M0.VectorNorm1(M3, false); + M2.Resize(2,1); + M2(0,0) = 6; + M2(1,0) = 15; + Assert::IsTrue(M3.IsEqualTo(M2)); + + M0.VectorNorm2(M3, true); + M2.Resize(1, 3); + M2(0,0) = 4.1231; M2(0,1) = 5.3852; M2(0,2) = 6.7082; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M0.VectorNorm2(M3, false); + M2.Resize(2,1); + M2(0,0) = 3.7417; + M2(1,0) = 8.7750; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M0.VectorNormInf(M3, true); + M2.Resize(1, 3); + M2(0,0) = 4; M2(0,1) = 5; M2(0,2) = 6; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M0.VectorNormInf(M3, false); + M2.Resize(2,1); + M2(0,0) = 3; + M2(1,0) = 6; + Assert::IsTrue(M3.IsEqualTo(M2)); + + Assert::IsTrue(abs(M0.FrobeniusNorm() - 9.5394) < 0.0001); + Assert::IsTrue(abs(M0.MatrixNormInf() - 6) < 0.0001); + + Matrix M1; + M0.VectorMax(M1, M3, true); + M2.Resize(1, 3); + M2(0,0) = 4; M2(0,1) = 5; M2(0,2) = 6; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M0.VectorMax(M1, M3, false); + M2.Resize(2,1); + M2(0,0) = 3; + M2(1,0) = 6; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M0.VectorMin(M1, M3, true); + M2.Resize(1, 3); + M2(0,0) = 1; M2(0,1) = 2; M2(0,2) = 3; + Assert::IsTrue(M3.IsEqualTo(M2, 0.0001)); + + M0.VectorMin(M1, M3, false); + M2.Resize(2,1); + M2(0,0) = 1; + M2(1,0) = 4; + Assert::IsTrue(M3.IsEqualTo(M2, 
0.0001)); + } + + TEST_METHOD(CPUMatrixSetValues) + { + Matrix M0(3,3); + M0(0,0) = 10; M0(1,1) = 10; M0(2,2) = 10; + + Matrix M1(3,3); + M1.SetDiagonalValue(10); + Assert::IsTrue(M1.IsEqualTo(M0, 0.0001)); + + Matrix M2(3,1); + M2(0,0) = 10; M2(1,0) = 10; M2(2,0) = 10; + M1.SetDiagonalValue(M2); + Assert::IsTrue(M1.IsEqualTo(M0, 0.0001)); + + M1.SetUniformRandomValue(-0.01, 0.01); + for (int i=0; i= -0.01 && M1(i,j) < 0.01); + + M1.SetGaussianRandomValue(0, 0.01); + } + + TEST_METHOD(CPUMatrixTranspose) + { + Matrix M0(2,3); + M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3; + M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6; + + Matrix M1(3,2); + M1(0,0) = 1; M1(0,1) = 4; + M1(1,0) = 2; M1(1,1) = 5; + M1(2,0) = 3; M1(2,1) = 6; + + Matrix M2 = M0.Transpose(); + Assert::IsTrue(M2.IsEqualTo(M1, 0.0001)); + + M2.AssignTransposeOf(M1); + Assert::IsTrue(M2.IsEqualTo(M0, 0.0001)); + } + + TEST_METHOD(CPUMatrixColumnSlice) + { + Matrix M0(2,3); + M0(0,0) = 1; M0(0,1) = 2; M0(0,2) = 3; + M0(1,0) = 4; M0(1,1) = 5; M0(1,2) = 6; + + Matrix M1(2,2); + M1(0,0) = 1; M1(0,1) = 2; + M1(1,0) = 4; M1(1,1) = 5; + + Matrix M2 = M0.ColumnSlice(0,2); + Assert::IsTrue(M2.IsEqualTo(M1, 0.0001)); + + M1(0,0) = 2; M1(0,1) = 3; + M1(1,0) = 5; M1(1,1) = 6; + + M2 = M0.ColumnSlice(1,2); + Assert::IsTrue(M2.IsEqualTo(M1, 0.0001)); + + size_t k=100, n=20, m=50; + + Matrix AG((size_t)k,(size_t)n); + AG.SetUniformRandomValue(-1,1); + + Matrix BG((size_t)n,(size_t)m); + BG.SetUniformRandomValue(-1,1); + + Matrix CG((size_t)k,(size_t)m); + CG.SetUniformRandomValue(-1,1); + Matrix DG((size_t)k,(size_t)m); + DG.SetValue(CG); + + Matrix::MultiplyAndAdd(AG, false, BG, false, DG); + + for (int i=0; i inputMatrices; + inputMatrices.resize(3); + inputMatrices[0] = &M5; + inputMatrices[1] = &M6; + inputMatrices[2] = &M7; + M8.AssignRowStackValuesOf(inputMatrices, 0, 3); + + Assert::IsTrue(M8.IsEqualTo(M0, 0.0001)); +#endif + } + + TEST_METHOD(CPUAssignRepeatOf) + { + Matrix M0(2, 3); + M0(0, 0) = 1; M0(0, 1) = 6; M0(0, 2) = 
11; + M0(1, 0) = 2; M0(1, 1) = 7; M0(1, 2) = 12; + + Matrix M1; + M1.AssignRepeatOf(M0, 1, 1); + Assert::IsTrue(M1.IsEqualTo(M0, 0.0001)); + + Matrix M3(6, 6); + M3(0, 0) = 1; M3(0, 1) = 6; M3(0, 2) = 11; M3(0, 3) = 1; M3(0, 4) = 6; M3(0, 5) = 11; + M3(1, 0) = 2; M3(1, 1) = 7; M3(1, 2) = 12; M3(1, 3) = 2; M3(1, 4) = 7; M3(1, 5) = 12; + M3(2, 0) = 1; M3(2, 1) = 6; M3(2, 2) = 11; M3(2, 3) = 1; M3(2, 4) = 6; M3(2, 5) = 11; + M3(3, 0) = 2; M3(3, 1) = 7; M3(3, 2) = 12; M3(3, 3) = 2; M3(3, 4) = 7; M3(3, 5) = 12; + M3(4, 0) = 1; M3(4, 1) = 6; M3(4, 2) = 11; M3(4, 3) = 1; M3(4, 4) = 6; M3(4, 5) = 11; + M3(5, 0) = 2; M3(5, 1) = 7; M3(5, 2) = 12; M3(5, 3) = 2; M3(5, 4) = 7; M3(5, 5) = 12; + + M1.AssignRepeatOf(M0, 3, 2); + Assert::IsTrue(M1.IsEqualTo(M3, 0.0001)); + } + + TEST_METHOD(CPURowElementOperations) + { + Matrix M0 = Matrix::RandomUniform(20, 28, -1, 1); + Matrix M1 = Matrix::RandomUniform(1, 28, 1, 2); + + Matrix M3; + M3.SetValue(M0); + M3.RowElementMultiplyWith(M1); + M3.RowElementDivideBy(M1); + + Assert::IsTrue(M0.IsEqualTo(M3, 0.0001)); + } + TEST_METHOD(CPUColumnElementOperations) + { + Matrix M0 = Matrix::RandomUniform(20, 28, -1, 1); + Matrix M1 = Matrix::RandomUniform(20, 1, 1, 2); + + Matrix M3; + M3.SetValue(M0); + M3.ColumnElementMultiplyWith(M1); + M3.ColumnElementDivideBy(M1); + + Assert::IsTrue(M0.IsEqualTo(M3, 0.0001)); + } + + TEST_METHOD(CPUAssignMatrixByColumnSlice) + { + printf("starts here\n"); + Matrix M0 = Matrix::RandomUniform(400, 50, -100, 100); + + + vector columnrange = { 0, 3, 5, 4 }; + Matrix M1; + try + { + M1.AssignMatrixByColumnSlice(M0, columnrange); + } + catch (exception& e) + { + printf("%s\n", e.what()); + Assert::Fail(); + } + + + for (size_t des = 0; des < columnrange.size(); des ++) + { + size_t src = columnrange[des]; + + double err = 0; + for (size_t r = 0; r < 400; r++) + { + double diff = (M0(r, src) - M1(r, des)); + diff *= diff; + err += diff; + } + Assert::AreEqual(err, 0, 1e-7); + } + + } + + }; +} \ No newline at 
end of file
diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp
index 79bf8d24e..26f4a8cd6 100644
--- a/Math/Math/CPUMatrix.cpp
+++ b/Math/Math/CPUMatrix.cpp
@@ -259,6 +259,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return slice;
     }
 
+    // Builds a new matrix from numRows rows of this matrix, starting at row startRow.
+    // The result is filled through AssignRowSliceValuesOf(), i.e. it is a copy of the data.
+    // A range that does not fit into the m_numRows rows of this matrix is reported through InvalidArgument().
+    // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+    template<class ElemType>
+    CPUMatrix<ElemType> CPUMatrix<ElemType>::RowSlice(size_t startRow, size_t numRows) const
+    {
+        // Compare against the rows left after startRow: the sum startRow + numRows could wrap around size_t.
+        if (startRow > m_numRows || numRows > m_numRows - startRow)
+            InvalidArgument("The row slice (%d+%d) is out of range of the source matrix (%d).", (int)startRow, (int)numRows, (int)m_numRows);
+
+        CPUMatrix<ElemType> slice;
+        slice.AssignRowSliceValuesOf(*this, startRow, numRows);
+
+        return slice;
+    }
+
     // set this(:, 0:numCols-1) = fromMatrix(:, startColumn : startColumn+numCols-1)
     // TODO: why not say *this = ColumnSlice()?
     template<class ElemType>
@@ -333,7 +350,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             }
         }
     }
-
+
     //for each column of a, we add all rows of a to this starting from startIndex
     template<class ElemType>
     CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows)
diff --git a/Math/Math/CPUMatrix.h b/Math/Math/CPUMatrix.h
index 83d63559b..0c7e7f34c 100644
--- a/Math/Math/CPUMatrix.h
+++ b/Math/Math/CPUMatrix.h
@@ -52,6 +52,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         CPUMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
         CPUMatrix<ElemType>& AssignColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
         CPUMatrix<ElemType>& SetColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
+        // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+        CPUMatrix<ElemType> RowSlice(size_t startRow, size_t numRows) const;
 
         void CopyColumnsStrided(const CPUMatrix<ElemType>& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride);
 
diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
index 29d5c96f4..1f9b81042 100644
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@@ -537,6 +537,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return slice;
     }
 
+    // Builds a new matrix from numRows rows of this matrix, starting at row startRow.
+    // The result is constructed with GetComputeDeviceId() and filled through AssignRowSliceValuesOf().
+    // A range that does not fit into the m_numRows rows of this matrix is reported through InvalidArgument().
+    template<class ElemType>
+    GPUMatrix<ElemType> GPUMatrix<ElemType>::RowSlice(size_t startRow, size_t numRows) const
+    {
+        // Compare against the rows left after startRow: the sum startRow + numRows could wrap around size_t.
+        if (startRow > m_numRows || numRows > m_numRows - startRow)
+            InvalidArgument("The row slice (%d+%d) is out of range of the source matrix (%d).", (int)startRow, (int)numRows, (int)m_numRows);
+
+        GPUMatrix<ElemType> slice(GetComputeDeviceId());
+        slice.AssignRowSliceValuesOf(*this, startRow, numRows);
+        return slice;
+    }
+
     template<class ElemType>
     GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
     {
diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h
index 07ef47683..f6fc150e5 100644
--- a/Math/Math/GPUMatrix.h
+++ b/Math/Math/GPUMatrix.h
@@ -142,6 +142,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         GPUMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
         GPUMatrix<ElemType>& AssignColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
         GPUMatrix<ElemType>& SetColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
+        // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+        GPUMatrix<ElemType> RowSlice(size_t startRow, size_t numRows) const;
 
         void CopyColumnsStrided(const GPUMatrix<ElemType>& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride);
 
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index 751345b3c..3c8d76cdf 100755
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -766,6 +766,46 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return slice;
     }
 
+    // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+    // Dense matrices only: forwards to CPUMatrix::RowSlice() or GPUMatrix::RowSlice(), chosen by GetDeviceId(),
+    // and stores the result in a new Matrix that carries over m_preferredDeviceId.
+    // Sparse matrices end in NOT_IMPLEMENTED, any other matrix type in RuntimeError().
+    template<class ElemType>
+    Matrix<ElemType> Matrix<ElemType>::RowSlice(size_t startRow, size_t numRows) const
+    {
+        int devId = GetDeviceId();
+        Matrix<ElemType> slice(devId);
+        slice.m_preferredDeviceId = m_preferredDeviceId;
+        if (GetMatrixType() == MatrixType::DENSE)
+        {
+            // RowSlice() returns by value, so the temporary is already an rvalue and needs no explicit && cast.
+            if (devId == CPUDEVICE)
+            {
+                if (slice.m_CPUMatrix != nullptr)
+                    slice.m_CPUMatrix->operator=(m_CPUMatrix->RowSlice(startRow, numRows));
+                else
+                    slice.m_CPUMatrix = new CPUMatrix<ElemType>(m_CPUMatrix->RowSlice(startRow, numRows));
+                slice.SetDataLocation(CPU, DENSE);
+            }
+            else
+            {
+                if (slice.m_GPUMatrix != nullptr)
+                    slice.m_GPUMatrix->operator=(m_GPUMatrix->RowSlice(startRow, numRows));
+                else
+                    slice.m_GPUMatrix = new GPUMatrix<ElemType>(m_GPUMatrix->RowSlice(startRow, numRows));
+                slice.SetDataLocation(GPU, DENSE);
+            }
+        }
+        else if (GetMatrixType() == MatrixType::SPARSE)
+        {
+            NOT_IMPLEMENTED;
+        }
+        else
+        {
+            RuntimeError("Unknown matrix type");
+        }
+        return slice;
+    }
+
     template<class ElemType>
     Matrix<ElemType>& Matrix<ElemType>::AssignColumnSlice(const Matrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
     {
diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h
index 9490adc41..b3a4b0258 100644
--- a/Math/Math/Matrix.h
+++ b/Math/Math/Matrix.h
@@ -153,7 +153,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const;
 
         Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
-
+        // BUGBUG: Unlike ColumnSlice(), this does not return a view. Must be renamed.
+ Matrix RowSlice(size_t startRow, size_t numRows) const; // difference between AssignColumnSlice and SetColumnSlice // AssignColumnSlice : this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, startColumn: startColumn+numCols-1) diff --git a/Math/Math/NoGPU.cpp b/Math/Math/NoGPU.cpp index 89f79586f..a776a3000 100644 --- a/Math/Math/NoGPU.cpp +++ b/Math/Math/NoGPU.cpp @@ -490,7 +490,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { template GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const { GPUMatrix slice(0); + return slice; + } + /* Stub counterpart of GPUMatrix::RowSlice: ignores startRow and numRows and returns a GPUMatrix constructed with the argument 0, exactly like the ColumnSlice stub before it. NOTE(review): presumably the placeholder used by builds without GPU support, judging by the file name NoGPU.cpp - confirm. */ template GPUMatrix GPUMatrix::RowSlice(size_t startRow, size_t numRows) const + { + GPUMatrix slice(0); return slice; }