From 1fc15ed1058afc48ade968758b2d4d23974f156d Mon Sep 17 00:00:00 2001
From: Guoli Ye
Date: Thu, 20 Feb 2020 22:34:44 -0800
Subject: [PATCH] multithreaded GPU code check-in; no speedup yet, due to the stream in the kernel function

---
 .../ComputationNetwork.h                        | 1490 +++++++++++++----
 .../ComputationNetworkEditing.cpp               |   55 +-
 Source/ComputationNetworkLib/ComputationNode.h  |    5 +-
 Source/ComputationNetworkLib/ReshapingNodes.cpp |  188 ++-
 Source/ComputationNetworkLib/ReshapingNodes.h   |  611 ++++---
 Source/Math/CPUMatrix.h                         |    1 +
 Source/Math/CPUMatrixImpl.h                     |   18 +
 Source/Math/GPUMatrix.cu                        |  125 ++
 Source/Math/GPUMatrix.h                         |    6 +
 Source/Math/GPUMatrixCUDAKernels.cuh            |   13 +
 Source/Math/GPUSparseMatrix.cu                  |   13 +
 Source/Math/GPUSparseMatrix.h                   |    2 +-
 Source/Math/GPUTensor.cu                        |  316 ++++
 Source/Math/GPUTensor.h                         |    6 +
 Source/Math/Matrix.cpp                          |   32 +
 Source/Math/Matrix.h                            |    7 +
 Source/Math/NoGPU.cpp                           |   11 +-
 Source/Math/TensorView.cpp                      |   43 +-
 Source/Math/TensorView.h                        |    1 +
 Source/SGDLib/SGD.cpp                           |  112 +-
 Source/SGDLib/SGD.h                             |    1 +
 Source/SGDLib/SimpleOutputWriter.h              |   68 +-
 22 files changed, 2376 insertions(+), 748 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h
index d05d71494..748105a1f 100644
--- a/Source/ComputationNetworkLib/ComputationNetwork.h
+++ b/Source/ComputationNetworkLib/ComputationNetwork.h
@@ -59,7 +59,6 @@ inline std::wstring ToString(const ComputationNodeBasePtr& node)
 // ===========================================================================

 // move the shared function/data structure from SimpleOutputWriter.h to this file
-
 class ComputationNetwork : public ScriptableObjects::Object, public ScriptableObjects::HasToString, public ScriptableObjects::CustomConfigRecord
@@ -239,7 +238,6 @@ public:
         }
     }
-
     static void BumpEvalTimeStamp(const std::vector<ComputationNodeBasePtr>& nodes);
     void ResetEvalTimeStamps();
     void SetEvalTimeStampsOutdatedWithRegardToAll();
@@ -494,6 +492,7 @@ public:
     ComputationNodeBasePtr CopyNode(const ComputationNetwork& fromNet, const std::wstring fromName, std::wstring toName, const CopyNodeFlags flags);
     void CopySubTree(const ComputationNetwork& fromNet, const std::wstring fromName, std::wstring toNamePrefix, const CopyNodeFlags flags);
+    void ShowNodeMemory(const ComputationNetwork& fromNet, const std::wstring fromName);
     void CopyInputs(const std::wstring fromName, std::wstring toName);
     void RenameNode(const std::wstring& nodeNameOrig, const std::wstring& nodeNameNew);
     void RenameNode(ComputationNodeBasePtr node, const std::wstring& newNodeName);
@@ -1421,8 +1420,6 @@ public:
     vector<shared_ptr<Matrix<ElemType>>> m_decodeOutputCache;
     std::vector<wstring> m_nodesToCache;
-
-
     struct Sequence
     {
         //shared_ptr<Matrix<ElemType>> LabelMatrix;
@@ -1524,6 +1521,77 @@ public:
         vector<size_t>().swap(oneSeq.labelseq);
     }
+    // The two functions below, newSeq and deleteSeq, support the multithreaded decoder, where different threads must not share m_nameToPastValueNodeCache and m_decodeOutputCache.
Rather, each thread will have its own passed explicitly by parameter ; + + Sequence newSeq(Sequence& a, DEVICEID_TYPE deviceId, unordered_map>>>& m_nameToPastValueNodeCachePerThread, vector>>& m_decodeOutputCachePerThread) + { + Sequence oneSeq; + oneSeq.labelseq = a.labelseq; + oneSeq.logP = a.logP; + oneSeq.length = a.length; + oneSeq.lengthwithblank = a.lengthwithblank; + oneSeq.processlength = a.processlength; + if (m_decodeOutputCachePerThread.size() > 0) + { + oneSeq.decodeoutput = m_decodeOutputCachePerThread.back(); + m_decodeOutputCachePerThread.pop_back(); + } + else + { + oneSeq.decodeoutput = make_shared>(a.decodeoutput->GetNumRows(), (size_t) 1, a.decodeoutput->GetDeviceId()); + } + oneSeq.decodeoutput->SetValue(*(a.decodeoutput)); + + typename unordered_map>>::iterator it; + for (it = a.nameToNodeValues.begin(); it != a.nameToNodeValues.end(); it++) + { + if (oneSeq.processlength > 0) + { + if (it->second->Value().GetNumElements() > 0 && a.realValues) + { + oneSeq.nameToParentNodeValues[it->first] = it->second; + a.refs++; + } + else + oneSeq.nameToParentNodeValues[it->first] = a.nameToParentNodeValues[it->first]; + /*size_t ab = oneSeq.nameToParentNodeValues[it->first]->Value().GetNumElements(); + if (ab > 0) + fprintf(stderr, "test %ls %zu", it->first.c_str(), ab);*/ + } + auto itin = m_nameToPastValueNodeCachePerThread.find(it->first); + if (itin != m_nameToPastValueNodeCachePerThread.end() && m_nameToPastValueNodeCachePerThread[it->first].size() > 0) + { + oneSeq.nameToNodeValues[it->first] = m_nameToPastValueNodeCachePerThread[it->first].back(); + m_nameToPastValueNodeCachePerThread[it->first].pop_back(); + } + else + { + oneSeq.nameToNodeValues[it->first] = make_shared>(deviceId, it->first); + } + /*std::ostringstream address; + address << oneSeq.nameToNodeValues[it->first]; + fprintf(stderr, "newSeq %ls %s \n", it->first.c_str(), address.str().c_str());*/ + } + + return oneSeq; + } + + void deleteSeq(Sequence oneSeq, unordered_map>>>& m_nameToPastValueNodeCachePerThread, vector>>& m_decodeOutputCachePerThread) + { + typename unordered_map>>::iterator it; + for (it = oneSeq.nameToNodeValues.begin(); it != oneSeq.nameToNodeValues.end(); it++) + { + auto itin = m_nameToPastValueNodeCachePerThread.find(it->first); + if (itin == m_nameToPastValueNodeCachePerThread.end()) + m_nameToPastValueNodeCachePerThread[it->first] = vector>>(); + if (oneSeq.refs == 0) + m_nameToPastValueNodeCachePerThread[it->first].push_back(oneSeq.nameToNodeValues[it->first]); + } + m_decodeOutputCachePerThread.push_back(oneSeq.decodeoutput); + + vector().swap(oneSeq.labelseq); + } + void extendSeq(Sequence& insequence, size_t labelId, ElemType logP) { insequence.labelseq.push_back(labelId); @@ -1531,7 +1599,7 @@ public: insequence.length++; insequence.lengthwithblank++; } - vector> getTopN(Microsoft::MSR::CNTK::Matrix& prob, size_t N, size_t& blankid) + vector> getTopN(Microsoft::MSR::CNTK::Matrix& prob, size_t N, const size_t& blankid) { vector> datapair; typedef typename vector>::value_type ValueType; @@ -1568,121 +1636,227 @@ public: } void forward_decode(Sequence& oneSeq, StreamMinibatchInputs decodeinputMatrices, DEVICEID_TYPE deviceID, const std::vector& decodeOutputNodes, - const std::vector& decodeinputNodes, size_t vocabSize, size_t plength, ComputationNetwork& cn) + const std::vector& decodeinputNodes, size_t vocabSize, size_t plength, ComputationNetwork& net, int uttFrameNum = 0) { // size_t labelLength = oneSeq.length; if (oneSeq.processlength + 1 != plength && plength != 
oneSeq.processlength) LogicError("Current implementation assumes 1 step difference"); - if (plength != oneSeq.processlength) + /* + if (uttFrameNum == 94) { - /*m_logIndex = m_logIndex + 1; - wstring fileName = L"D:\\users\\vadimma\\cntk_3\\new_opt" + std::to_wstring(m_logIndex) + L".txt"; - std::ofstream out(fileName, std::ios::out); - out << fixed; - out.precision(3); - for (size_t li = 0; li < oneSeq.labelseq.size(); li++) - out << oneSeq.labelseq[li] << " "; - - out << "\n";*/ - - Matrix lmin(deviceID); - - lmin.Resize(vocabSize, 1); - lmin.SetValue(0.0); - lmin(oneSeq.labelseq[plength - 1], 0) = 1.0; - auto lminput = decodeinputMatrices.begin(); - lminput->second.pMBLayout->Init(1, 1); - //std::swap(lminput->second.GetMatrix(), lmin); - lminput->second.GetMatrix().SetValue(lmin); - if (plength == 1) + for (const auto& node : net.GetAllNodesForRoot(decodeOutputNodes[0])) { - lminput->second.pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); + + if (dynamic_pointer_cast>(node)->Value().IsEmpty()) + { + fprintf(stderr, "forward_decode 0 NodeName = %ls, Empty \n", node->NodeName().c_str()); + } + else + { + double pnorm = dynamic_pointer_cast>(node)->Value().FrobeniusNorm(); + fprintf(stderr, "forward_decode 0 NodeName = %ls, Norm = %f \n", node->NodeName().c_str(), pnorm); + } } - else + } + */ + if (plength != oneSeq.processlength) { - ///lminput->second.pMBLayout->//m_sequences.erase(0); - lminput->second.pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, SentinelValueIndicatingUnspecifedSequenceBeginIdx, 1); + Matrix lmin(deviceID); - //DataReaderHelpers::NotifyChangedNodes(m_net, decodeinputMatrices); + lmin.Resize(vocabSize, 1); + lmin.SetValue(0.0); + lmin(oneSeq.labelseq[plength - 1], 0) = 1.0; + auto lminput = decodeinputMatrices.begin(); + if (lminput->second.pMBLayout == NULL) + { + lminput->second.pMBLayout = make_shared(); + } + lminput->second.pMBLayout->Init(1, 1); + //std::swap(lminput->second.GetMatrix(), lmin); + lminput->second.GetMatrix().SetValue(lmin); + if (plength == 1) + { + lminput->second.pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); + } + else + { + ///lminput->second.pMBLayout->//m_sequences.erase(0); + lminput->second.pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, SentinelValueIndicatingUnspecifedSequenceBeginIdx, 1); + + //DataReaderHelpers::NotifyChangedNodes(m_net, decodeinputMatrices); + + for (size_t i = 0; i < m_nodesToCache.size(); i++) + { + auto nodePtr = net.GetNodeFromName(m_nodesToCache[i]); + + if (oneSeq.nameToNodeValues[m_nodesToCache[i]]->Value().GetNumElements() > 0) + + { + oneSeq.nameToNodeValues[m_nodesToCache[i]]->CopyTo(nodePtr, m_nodesToCache[i], CopyNodeFlags::copyNodeInputLinks); + } + } + } + + net.BumpEvalTimeStamp(decodeinputNodes); + // NotifyChangedNodes(m_net, decodeinputMatrices); + + net.ForwardProp(decodeOutputNodes[0]); + /* + if (uttFrameNum == 94) + { + + for (const auto& node : net.GetAllNodesForRoot(decodeOutputNodes[0])) + { + + if (dynamic_pointer_cast>(node)->Value().IsEmpty()) + { + fprintf(stderr, "forward_decode 1 NodeName = %ls, Empty \n", node->NodeName().c_str()); + } + else + { + double pnorm = dynamic_pointer_cast>(node)->Value().FrobeniusNorm(); + fprintf(stderr, "forward_decode 1 NodeName = %ls, Norm = %f \n", node->NodeName().c_str(), pnorm); + } + } + fprintf(stderr, "forward_decode decodeOutputNodes = %f, oneSeq.decodeoutput = %f, debug 1\n", (*(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value())).FrobeniusNorm(), (*(oneSeq.decodeoutput)).FrobeniusNorm()); + } + */ + //Matrix tempMatrix = 
*(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value()); + oneSeq.decodeoutput->SetValue((*(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value()))); + // fprintf(stderr, "forward_decode = %f \n", oneSeq.decodeoutput->FrobeniusNorm()); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "oneSeq.decodeoutput = %f, debug 2\n", (*(oneSeq.decodeoutput)).FrobeniusNorm()); + } + */ + oneSeq.processlength = plength; for (size_t i = 0; i < m_nodesToCache.size(); i++) { - auto nodePtr = cn.GetNodeFromName(m_nodesToCache[i]); - - if (oneSeq.nameToNodeValues[m_nodesToCache[i]]->Value().GetNumElements() > 0) + auto nodePtr = net.GetNodeFromName(m_nodesToCache[i]); + if (plength == 1) { - oneSeq.nameToNodeValues[m_nodesToCache[i]]->CopyTo(nodePtr, m_nodesToCache[i], CopyNodeFlags::copyNodeInputLinks); + nodePtr->CopyTo(oneSeq.nameToNodeValues[m_nodesToCache[i]], m_nodesToCache[i], CopyNodeFlags::copyNodeAll); } } + + lmin.ReleaseMemory(); } - - // cn.BumpEvalTimeStamp(decodeinputNodes); - - //m_net->DumpAllNodesToFile(true, true, L"D:\\users\\vadimma\\cntk_3\\After_model_opt.txt"); - cn.BumpEvalTimeStamp(decodeinputNodes); - // NotifyChangedNodes(m_net, decodeinputMatrices); - - cn.ForwardProp(decodeOutputNodes[0]); - //Matrix tempMatrix = *(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value()); - oneSeq.decodeoutput->SetValue((*(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value()))); - oneSeq.processlength = plength; - - for (size_t i = 0; i < m_nodesToCache.size(); i++) - { - auto nodePtr = cn.GetNodeFromName(m_nodesToCache[i]); - - if (plength == 1) - { - nodePtr->CopyTo(oneSeq.nameToNodeValues[m_nodesToCache[i]], m_nodesToCache[i], CopyNodeFlags::copyNodeAll); - } - } - - lmin.ReleaseMemory(); - } } - void forwardmerged(Sequence a, size_t t, Matrix& sumofENandDE, Matrix& encodeOutput, Matrix& decodeOutput, ComputationNodeBasePtr PlusNode, - ComputationNodeBasePtr PlusTransNode, std::vector Plusnodes, std::vector Plustransnodes, Matrix& Wm, Matrix& bm, ComputationNetwork& cn) + void forwardmerged(Sequence a, size_t t, const Matrix& encodeOutput, Matrix& decodeOutput, + std::vector Plusnodes, std::vector Plustransnodes, const Matrix& Wm, const Matrix& bm, const ComputationNetworkPtr& net, + int uttFrameNum = 0, DEVICEID_TYPE deviceID = CPUDEVICE) { + /* + if (uttFrameNum == 94) + { - sumofENandDE.AssignSumOf(encodeOutput.ColumnSlice(t, 1), *(a.decodeoutput)); - //sumofENandDE.InplaceLogSoftmax(true); - Matrix tempMatrix(encodeOutput.GetDeviceId()); + fprintf(stderr, "frowardmerged encodeoutput = %f, a.decodeoutput = %f, debug 1\n", encodeOutput.ColumnSlice(t, 1).FrobeniusNorm(), (*(a.decodeoutput)).FrobeniusNorm()); + } + */ + decodeOutput.AssignSumOf(encodeOutput.ColumnSlice(t, 1), *(a.decodeoutput)); // sum broadcast + //decodeOutput.AssignSumOf(encodeOutput.ColumnSlice(t, 1), encodeOutput.ColumnSlice(t, 1)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "frowardmerged decodeOutput = %f, debug 2\n", decodeOutput.FrobeniusNorm()); + } + */ + + Matrix tempMatrix(deviceID); //plus broadcast - (&dynamic_pointer_cast>(PlusNode)->Value())->SetValue(sumofENandDE); - //SumMatrix.SetValue(sumofENandDE); - ComputationNetwork::BumpEvalTimeStamp(Plusnodes); - auto PlusMBlayout = PlusNode->GetMBLayout(); - PlusMBlayout->Init(1, 1); - PlusMBlayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); - // cn.FormEvalOrder(Plustransnodes[0]); - cn.ForwardPropFromTo(Plusnodes, Plustransnodes); - decodeOutput.SetValue(*(&dynamic_pointer_cast>(PlusTransNode)->Value())); + if (!net) + { + //fprintf(stderr, "debug 
forwardmerge uttNum = %d, &decodeOutput = %p , 3 \n", uttNum, (void*) (&decodeOutput)); + decodeOutput.SetToZeroIfLessThan(0); // reLU + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 4 \n", uttNum, (void*) (&decodeOutput)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "frowardmerged decodeOutput = %f, debug 3\n", decodeOutput.FrobeniusNorm()); + } + */ + } + else + { + (&dynamic_pointer_cast>(Plusnodes[0])->Value())->SetValue(decodeOutput); + ComputationNetwork::BumpEvalTimeStamp(Plusnodes); + auto PlusMBlayout = Plusnodes[0]->GetMBLayout(); + PlusMBlayout->Init(1, 1); + PlusMBlayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); + + net->ForwardPropFromTo(Plusnodes, Plustransnodes); + decodeOutput.SetValue(*(&dynamic_pointer_cast>(Plustransnodes[0])->Value())); + } + + //fprintf(stderr, "forward merge = %f \n", decodeOutput.FrobeniusNorm()); + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 5 \n", uttNum, (void*) (&decodeOutput)); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "frowardmerged Wm = %f, decodeoutput = %f, debug 4\n", Wm.FrobeniusNorm(), decodeOutput.FrobeniusNorm()); + } + */ tempMatrix.AssignProductOf(Wm, true, decodeOutput, false); + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 6 \n", uttNum, (void*) (&decodeOutput)); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "frowardmerged bm = %f, tempMatrix = %f, debug 5\n", bm.FrobeniusNorm(), tempMatrix.FrobeniusNorm()); + } + */ decodeOutput.AssignSumOf(tempMatrix, bm); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "frowardmerged decodeOutput = %f, debug 6\n", decodeOutput.FrobeniusNorm()); + } + */ + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 7 \n", uttNum, (void*) (&decodeOutput)); //decodeOutput.VectorMax(maxIdx, maxVal, true); decodeOutput.InplaceLogSoftmax(true); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "frowardmerged decodeOutput = %f, debug 7\n", decodeOutput.FrobeniusNorm()); + } + */ + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 8 \n", uttNum, (void*) (&decodeOutput)); } - void forwardmergedSVD(Sequence a, size_t t, Matrix& sumofENandDE, Matrix& encodeOutput, Matrix& decodeOutput, ComputationNodeBasePtr PlusNode, - ComputationNodeBasePtr PlusTransNode, std::vector Plusnodes, std::vector Plustransnodes, Matrix& Wmu, Matrix& Wmv, Matrix& bm, ComputationNetwork& cn) + void forwardmergedSVD(Sequence a, size_t t, const Matrix& encodeOutput, Matrix& decodeOutput, std::vector Plusnodes, std::vector Plustransnodes, const Matrix& Wmu, const Matrix& Wmv, const Matrix& bm, const ComputationNetworkPtr& net) { - sumofENandDE.AssignSumOf(encodeOutput.ColumnSlice(t, 1), *(a.decodeoutput)); - //sumofENandDE.InplaceLogSoftmax(true); - Matrix tempMatrix(encodeOutput.GetDeviceId()), tempMatrix1(encodeOutput.GetDeviceId()); + decodeOutput.AssignSumOf(encodeOutput.ColumnSlice(t, 1), *(a.decodeoutput)); + Matrix tempMatrix(encodeOutput.GetDeviceId()), tempMatrix1(encodeOutput.GetDeviceId()); //broadcast //plus broadcast - (&dynamic_pointer_cast>(PlusNode)->Value())->SetValue(sumofENandDE); - //SumMatrix.SetValue(sumofENandDE); - ComputationNetwork::BumpEvalTimeStamp(Plusnodes); - auto PlusMBlayout = PlusNode->GetMBLayout(); - PlusMBlayout->Init(1, 1); - PlusMBlayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); - // cn.FormEvalOrder(Plustransnodes[0]); - cn.ForwardPropFromTo(Plusnodes, Plustransnodes); - decodeOutput.SetValue(*(&dynamic_pointer_cast>(PlusTransNode)->Value())); + + 
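// The net == null branch just below is the multithreaded path: rather than forward-propping
// the small Plus/PlusTrans joint subnetwork, it applies the joint nonlinearity directly via
// SetToZeroIfLessThan(0), i.e. a ReLU. A minimal standalone sketch of that shortcut, with
// std::vector standing in for CNTK's Matrix (ReluInPlace is a hypothetical name):

#include <algorithm>
#include <cassert>
#include <vector>

void ReluInPlace(std::vector<float>& v)
{
    for (auto& x : v)
        x = std::max(x, 0.0f); // what SetToZeroIfLessThan(0) does elementwise
}

int main()
{
    std::vector<float> decodeOutput = {-1.5f, 0.0f, 2.25f}; // encoder column + prediction output
    ReluInPlace(decodeOutput); // replaces net->ForwardPropFromTo(Plusnodes, Plustransnodes)
    assert(decodeOutput[0] == 0.0f && decodeOutput[2] == 2.25f);
    return 0;
}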
if (!net) + { + decodeOutput.SetToZeroIfLessThan(0); //reLu + } + else + { + (&dynamic_pointer_cast>(Plusnodes[0])->Value())->SetValue(decodeOutput); + //SumMatrix.SetValue(sumofENandDE); + ComputationNetwork::BumpEvalTimeStamp(Plusnodes); + auto PlusMBlayout = Plusnodes[0]->GetMBLayout(); + PlusMBlayout->Init(1, 1); + PlusMBlayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); + + net->ForwardPropFromTo(Plusnodes, Plustransnodes); + decodeOutput.SetValue(*(&dynamic_pointer_cast>(Plustransnodes[0])->Value())); + } + + // fprintf(stderr, "forward merge SVD = %f \n", decodeOutput.FrobeniusNorm()); tempMatrix.AssignProductOf(Wmu, true, decodeOutput, false); tempMatrix1.AssignProductOf(Wmv, true, tempMatrix, false); decodeOutput.AssignSumOf(tempMatrix1, bm); @@ -1710,7 +1884,7 @@ public: } } - float compute_wer(vector& ref, vector& rec) + float compute_wer(const vector& ref, vector& rec) { short** mat; size_t i, j; @@ -1745,221 +1919,60 @@ public: delete[] mat; return wer; } - - void RNNT_decode_nbest_MBR(const std::vector& outputNodeNames, Matrix& encodeOutput, MBLayoutPtr& encodeMBLayout, - Matrix& decodeInputMatrix, MBLayoutPtr& decodeMBLayout, std::vector decodeinputNodes, size_t numBestMBR, bool lengthNorm, const vector& vt_labels, vector>& uttPathsInfo, vector& vt_nws, vector& vt_onebest_wer, - bool SVD, ComputationNetwork& cn) - { - //time_t my_time = time(NULL); - //fprintf(stderr, "RNNT_decode_nbest_MBR time 1 = %s", ctime(&my_time)); - if (outputNodeNames.size() == 0) - fprintf(stderr, "OutputNodeNames are not specified, using the default outputnodes.\n"); - std::vector outputNodes = cn.OutputNodesByName(outputNodeNames); + /* + void RNNT_decode_oneutt_MBR(std::ref(cn), std::ref(vocabSize), std::ref(blankId), std::ref(deviceid), std::ref(uttFrameNum[uttID]), + std::ref(decodeOutputNodeNames), std::ref(decodeInputNodeNames), + std::ref(uttFrameBeginIdx[uttID]), std::ref(uttFrameToChanInd[uttID]), std::ref(numParallelSequences), + std::ref(SVD), std::ref(encondeOutput), std::ref(outputNodeNames), + std::ref(numBestMBR), std::ref(lengthNorm), + std::ref(wordSeqs[uttID]), std::ref(uttPathsInfo[uttID]), std::ref(vt_onebest_wer[uttID]) + */ + void RNNT_decode_oneutt_MBR(const ComputationNetwork& net, const size_t& vocabSize, const size_t& blankId, const size_t& deviceid, const size_t& uttFrameNum, - - //prediction related nodes - std::vector decodeOutputNodeNames(outputNodeNames.begin() + 1, outputNodeNames.begin() + 2); - std::vector decodeOutputNodes = cn.OutputNodesByName(decodeOutputNodeNames); - - std::list pastValueNodes = cn.PastValueNodesForOutputs(decodeOutputNodes); - - std::list::iterator it; - for (it = pastValueNodes.begin(); it != pastValueNodes.end(); ++it) + const std::vector& decodeOutputNodeNames, + const std::vector& decodeInputNodeNames, + const size_t& uttFrameBeginIdx, const size_t& uttFrameToChanInd, const size_t& numParallelSequences, + const bool& SVD, const Matrix& encodeOutput, const std::vector& outputNodeNames, + const size_t& numBestMBR, const bool& lengthNorm, const vector& vt_labels, + const std::vector& wordSeq, vector& oneuttPathsInfo, float& onebest_wer, + const Matrix& Wm, const Matrix& Wmu, const Matrix& Wmv, const Matrix& bm, const size_t uttID) { - auto pastValueNode = dynamic_pointer_cast>(*it); //DelayedValueNodeBase - if (pastValueNode || !(*it)->NodeName().compare(0, 5, L"Loop_")) + vector CurSequences, nextSequences; + ComputationNetwork decode_net; + unordered_map>>> m_nameToPastValueNodeCachePerThread; + time_t my_time; + my_time = time(NULL); + 
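// newSeq/deleteSeq use these per-thread caches so that each worker recycles its own matrices,
// with no cross-thread sharing and hence no locking. A rough sketch of the same
// pool-per-thread idea; Buffer, Acquire, and Release are hypothetical stand-ins, not CNTK APIs:

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

using Buffer = std::vector<float>;
// One free list per node name; owned by a single thread, so no synchronization is needed.
using PerThreadCache = std::unordered_map<std::wstring, std::vector<std::shared_ptr<Buffer>>>;

std::shared_ptr<Buffer> Acquire(PerThreadCache& cache, const std::wstring& name, size_t size)
{
    auto it = cache.find(name);
    if (it != cache.end() && !it->second.empty())
    {
        auto buf = it->second.back(); // reuse a previously released buffer
        it->second.pop_back();
        return buf;
    }
    return std::make_shared<Buffer>(size); // cache miss: allocate fresh
}

void Release(PerThreadCache& cache, const std::wstring& name, std::shared_ptr<Buffer> buf)
{
    cache[name].push_back(std::move(buf)); // return to this thread's free list
}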
fprintf(stderr, "RNNT_decode_oneutt_MBR time 1 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); + + vector>> m_decodeOutputCachePerThread; + + decode_net.CopySubTree(net, decodeOutputNodeNames[0], L"", CopyNodeFlags::copyNodeAll); + // m_pMBLayout->CopyFrom(m_minibatchBuffer[index].pMBLayout); + decode_net.CompileNetwork(); + std::vector decodeOutputNodes = decode_net.OutputNodesByName(decodeOutputNodeNames); + + decode_net.FormEvalOrder(decodeOutputNodes[0]); + decode_net.FormNestedNetwork(decodeOutputNodes[0]); + + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_oneutt_MBR time 2 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); + + + + for (const auto& node : decode_net.GetAllNodesForRoot(decodeOutputNodes[0])) { - m_nodesToCache.push_back((*it)->NodeName()); - } - } - //joint nodes - ComputationNodeBasePtr PlusNode = cn.GetNodeFromName(outputNodeNames[2]); - ComputationNodeBasePtr PlusTransNode = cn.GetNodeFromName(outputNodeNames[3]); - ComputationNodeBasePtr WmNode, WmuNode, WmvNode, bmNode; - WmNode; - WmuNode; - WmvNode; - if (SVD) - { - WmuNode = cn.GetNodeFromName(outputNodeNames[4]); - WmvNode = cn.GetNodeFromName(outputNodeNames[5]); - bmNode = cn.GetNodeFromName(outputNodeNames[6]); - } - else - { - WmNode = cn.GetNodeFromName(outputNodeNames[4]); - bmNode = cn.GetNodeFromName(outputNodeNames[5]); - } - std::vector Plusnodes, Plustransnodes; - Plusnodes.push_back(PlusNode); - Plustransnodes.push_back(PlusTransNode); - - size_t deviceid = decodeInputMatrix.GetDeviceId(); - std::map outputMatrices; - Matrix decodeOutput(deviceid), Wm(deviceid), Wmu(deviceid), Wmv(deviceid), bm(deviceid), tempMatrix(deviceid); - Matrix greedyOutput(deviceid); - Matrix sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid); - Wmu; - Wmv; - Wm; - if (SVD) - { - Wmu.SetValue(*(&dynamic_pointer_cast>(WmuNode)->Value())); - Wmv.SetValue(*(&dynamic_pointer_cast>(WmvNode)->Value())); - } - else - Wm.SetValue(*(&dynamic_pointer_cast>(WmNode)->Value())); - - bm.SetValue(*(&dynamic_pointer_cast>(bmNode)->Value())); - const size_t numIterationsBeforePrintingProgress = 100; - - //get MBlayer of encoder input - size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences(); - size_t numParallelPhoneSequences = decodeMBLayout->GetNumParallelSequences(); - const auto numSequences = encodeMBLayout->GetNumSequences(); - - std::vector uttFrameBeginIdx, uttPhoneBeginIdx; - // the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch - std::vector uttFrameNum, uttPhoneNum; - // map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance. - std::vector uttFrameToChanInd, uttPhoneToChanInd; - - uttFrameNum.clear(); - uttFrameToChanInd.clear(); - uttFrameBeginIdx.clear(); - - uttFrameNum.reserve(numSequences); - uttFrameToChanInd.reserve(numSequences); - uttFrameBeginIdx.reserve(numSequences); - - uttPhoneNum.clear(); - uttPhoneToChanInd.clear(); - uttPhoneBeginIdx.clear(); - - uttPhoneNum.reserve(numSequences); - uttPhoneToChanInd.reserve(numSequences); - uttPhoneBeginIdx.reserve(numSequences); - uttPathsInfo.clear(); - uttPathsInfo.resize(numSequences); - - vt_nws.clear(); - vt_nws.resize(numSequences); - - vt_onebest_wer.clear(); - vt_onebest_wer.resize(numSequences); - //get utt information, such as channel map id and utt begin frame, utt frame num, utt phone num for frame and phone respectively.... 
- size_t seqId = 0; //frame - size_t totalframenum = 0; - - // this->FormEvalOrder(Plustransnodes[0]); - - for (const auto& seq : encodeMBLayout->GetAllSequences()) - { - if (seq.seqId == GAP_SEQUENCE_ID) - { - continue; - } - assert(seq.seqId == seqId); - seqId++; - uttFrameToChanInd.push_back(seq.s); - size_t numFrames = seq.GetNumTimeSteps(); - uttFrameBeginIdx.push_back(seq.tBegin); - uttFrameNum.push_back(numFrames); - totalframenum += numFrames; - } - - //get utt information for prediction input.... - seqId = 0; //frame - - for (const auto& seq : decodeMBLayout->GetAllSequences()) - { - if (seq.seqId == GAP_SEQUENCE_ID) - { - continue; - } - assert(seq.seqId == seqId); - seqId++; - uttPhoneToChanInd.push_back(seq.s); - size_t numFrames = seq.GetNumTimeSteps(); - uttPhoneBeginIdx.push_back(seq.tBegin); - uttPhoneNum.push_back(numFrames); - } - - //get phone sequene - CNTK::Matrix maxIndex(deviceid), maxValue(deviceid); - decodeInputMatrix.VectorMax(maxIndex, maxValue, true); - maxIndex.TransferToDeviceIfNotThere(CPUDEVICE); - - //backup decoding input matrix and MBlayout - MBLayoutPtr decodebackupMBlayout; - decodebackupMBlayout = make_shared(); - decodebackupMBlayout->CopyFrom(decodeMBLayout); - - Matrix decodeInputMatrixBackup(deviceid); - decodeInputMatrixBackup.SetValue(decodeInputMatrix); - - std::vector> phoneSeqs; - phoneSeqs.resize(numSequences); - for (size_t utt = 0; utt < numSequences; utt++) - { - //phoneSeqs[utt].resize(uttPhoneNum[utt]); - for (size_t u = 0; u < uttPhoneNum[utt]; u++) - { - size_t uID = (u + uttPhoneBeginIdx[utt]) * numParallelPhoneSequences + uttPhoneToChanInd[utt]; - phoneSeqs[utt].push_back((size_t)(maxIndex(0, uID))); - } - } - // convert the phoneSeqs to word sequence, as reference, convert a string of "_ab_cdef_g" to word sequence of "ab cdef g". 
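// The conversion described in the comment above -- "_ab_cdef_g" becomes the word sequence
// "ab cdef g" -- amounts to splitting on '_' and dropping empty tokens. A sketch consistent
// with how convert_word_sequence_string_2_vector is used here (the real helper is defined
// elsewhere in the codebase):

#include <sstream>
#include <string>
#include <vector>

// Split "_ab_cdef_g" on '_' into {"ab", "cdef", "g"}, skipping empty pieces.
std::vector<std::string> SplitWordpieces(const std::string& s, char sep = '_')
{
    std::vector<std::string> words;
    std::stringstream ss(s);
    std::string tok;
    while (std::getline(ss, tok, sep))
        if (!tok.empty())
            words.push_back(tok);
    return words;
}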
- std::vector> wordSeqs; - wordSeqs.resize(numSequences); - for (size_t uttID = 0; uttID < numSequences; uttID++) - { - string word_sequence = ""; - for (size_t i = 0; i < phoneSeqs[uttID].size(); i++) - { - size_t labelID = phoneSeqs[uttID][i]; - if (labelID != (vt_labels.size() - 1)) // it is not + if (node->OperationName().find(L"ReduceElements") != string::npos) { - string wordpiece = vt_labels[labelID]; - word_sequence += wordpiece; + auto rNode = node->As>(); + //rNode->set_frame(uttFrameNum); + rNode->is_multi_thread(true); } } - convert_word_sequence_string_2_vector(word_sequence, wordSeqs[uttID], '_'); - vt_nws[uttID] = wordSeqs[uttID].size(); - /* - fprintf(stderr, "word sequence for uttID = %d .\n", int(uttID)); - for (size_t i = 0; i < wordSeqs[uttID].size(); i++) - { - fprintf(stderr, "%s ", wordSeqs[uttID][i].c_str()); - } - fprintf(stderr, "\n"); - */ - } + std::vector decodeinputNodes = decode_net.OutputNodesByName(decodeInputNodeNames); + StreamMinibatchInputs decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes); - // the data structure for phone sequence - - // do decoding for the utterances, and feed in the data structure, - - size_t vocabSize = bm.GetNumRows(); - size_t blankId = vocabSize - 1; - vector CurSequences, nextSequences; - // sanity check - if (vt_labels.size() != vocabSize) - { - RuntimeError("RNNT_decode_nbest_MBR: size not match, vt_labels.size() = %d, and vocabSize = %d.", int(vt_labels.size()), int(vocabSize)); - } - - StreamMinibatchInputs decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes); - - // this->FormEvalOrder(Plustransnodes[0]); - - //my_time = time(NULL); - //fprintf(stderr, "RNNT_decode_nbest_MBR time 2 = %s", ctime(&my_time)); - for (size_t uttID = 0; uttID < numSequences; uttID++) - { - // fprintf(stderr, "decode v0 uttID = %d .\n", int(uttID)); nextSequences.clear(); //initialize with blank ID Sequence oneSeq = newSeq(vocabSize, (size_t) 50, deviceid); @@ -1967,46 +1980,128 @@ public: nextSequences.push_back(oneSeq); + Matrix decodeOutput(deviceid); + + std::vector Plusnodes, Plustransnodes; // as a placeholder, will not be used in multithread case + + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_oneutt_MBR time 3 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); + // loop for each frame - for (size_t t = 0; t < uttFrameNum[uttID]; t++) + for (size_t t = 0; t < uttFrameNum; t++) { + //fprintf(stderr, "one utt, uttframenum = %d, t = %d, 1 \n", int(uttFrameNum), int(t)); for (size_t n = 0; n < CurSequences.size(); n++) { - deleteSeq(CurSequences[n]); + deleteSeq(CurSequences[n], m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); } vector().swap(CurSequences); CurSequences = nextSequences; vector().swap(nextSequences); - //fprintf(stderr, "t = %d .\n", int(t)); - - //deal with the same prefix - //int count = 0; + //fprintf(stderr,"one utt, uttframenum = %d, t = %d, 2 \n", int(uttFrameNum), int(t)); + int count = 0; while (true) { - // fprintf(stderr, "count = %d .\n", int(count++)); + //fprintf(stderr, "while, uttframenum = %d, t = %d, 1 \n", int(uttFrameNum), int(t)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 1 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + 
fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ auto maxSeq = std::max_element(CurSequences.begin(), CurSequences.end()); - Sequence tempSeq = newSeq(*maxSeq, deviceid); - deleteSeq(*maxSeq); + Sequence tempSeq = newSeq(*maxSeq, deviceid, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, debug 1 \n", tempSeq.logP); + } + */ + deleteSeq(*maxSeq, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); CurSequences.erase(maxSeq); + //fprintf(stderr, "while, uttframenum = %d, t = %d, 2 \n", int(uttFrameNum), int(t)); prepareSequence(tempSeq); - forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), cn); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, debug 2 \n", tempSeq.logP); + } + if (uttFrameNum == 94) + { - size_t tinMB = (t + uttFrameBeginIdx[uttID]) * numParallelSequences + uttFrameToChanInd[uttID]; + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 2 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + // mask for debug purpose + forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), decode_net, int(uttFrameNum)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, debug 3 \n", tempSeq.logP); + } + */ + size_t tinMB = (t + uttFrameBeginIdx) * numParallelSequences + uttFrameToChanInd; + //fprintf(stderr, "while, uttframenum = %d, t = %d, 3 \n", int(uttFrameNum), int(t)); if (SVD) - forwardmergedSVD(tempSeq, tinMB, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wmu, Wmv, bm, cn); + forwardmergedSVD(tempSeq, tinMB, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wmu, Wmv, bm, NULL); else - forwardmerged(tempSeq, tinMB, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, cn); + forwardmerged(tempSeq, tinMB, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, NULL, int(uttFrameNum), deviceid); + /* + if (uttFrameNum == 94) + { + ElemType* probdata = decodeOutput.CopyToArray(); + fprintf(stderr, "tempSeq.logp = %f, probdata[blankid] = %f, norm = %f, debug 4 \n", tempSeq.logP, probdata[blankId], decodeOutput.FrobeniusNorm()); + delete probdata; + } + */ + //fprintf(stderr, "while, uttframenum = %d, t = %d, 4 \n", int(uttFrameNum), int(t)); //sort log posterior and get best N labels vector> topN = getTopN(decodeOutput, numBestMBR, blankId); - + //fprintf(stderr, "while, uttframenum = %d, t = %d, 5 \n", int(uttFrameNum), int(t)); //expand blank - Sequence seqK = 
newSeq(tempSeq, deviceid); + Sequence seqK = newSeq(tempSeq, deviceid, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, debug 5 \n", tempSeq.logP); + } + */ ElemType newlogP = topN[vocabSize].second + tempSeq.logP; + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, topN = %f, debug 6 \n", tempSeq.logP, topN[vocabSize].second); + } + */ seqK.logP = newlogP; bool existseq = false; + //fprintf(stderr, "while, uttframenum = %d, t = %d, 6 \n", int(uttFrameNum), int(t)); + for (auto itseq = nextSequences.begin(); itseq != nextSequences.end(); itseq++) { //merge the score with same sequence @@ -2017,15 +2112,36 @@ public: break; } } + //fprintf(stderr, "while, uttframenum = %d, t = %d, 7 \n", int(uttFrameNum), int(t)); if (!existseq) { nextSequences.push_back(seqK); } + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 3 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + + //fprintf(stderr, "while, uttframenum = %d, t = %d, 8 \n", int(uttFrameNum), int(t)); int iLabel; for (iLabel = 0; iLabel < numBestMBR; iLabel++) { - seqK = newSeq(tempSeq, deviceid); + seqK = newSeq(tempSeq, deviceid, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); newlogP = topN[iLabel].second + tempSeq.logP; seqK.logP = newlogP; @@ -2037,30 +2153,111 @@ public: CurSequences.push_back(seqK); } } - vector>().swap(topN); - deleteSeq(tempSeq); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 4 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + + //fprintf(stderr, "while, uttframenum = %d, t = %d, 9 \n", int(uttFrameNum), int(t)); + vector>().swap(topN); + deleteSeq(tempSeq, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); + //fprintf(stderr, "while, uttframenum = %d, t = %d, 10 \n", int(uttFrameNum), int(t)); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "uttframenum = %d, t = %d, debug 5, count = %d, \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for 
(size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ if (CurSequences.size() == 0) break; auto ya = std::max_element(CurSequences.begin(), CurSequences.end()); auto yb = std::max_element(nextSequences.begin(), nextSequences.end()); if (nextSequences.size() > numBestMBR && yb->logP > ya->logP) break; + //fprintf(stderr, "while, uttframenum = %d, t = %d, 11 \n", int(uttFrameNum), int(t)); + count++; } + //fprintf(stderr,"one utt, uttframenum = %d, t = %d, 3 \n", int(uttFrameNum), int(t)); std::sort(nextSequences.begin(), nextSequences.end()); std::reverse(nextSequences.begin(), nextSequences.end()); if (nextSequences.size() > numBestMBR) { for (size_t n = numBestMBR; n < nextSequences.size(); n++) { - deleteSeq(nextSequences[n]); + deleteSeq(nextSequences[n], m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); } } + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 6 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + //fprintf(stderr,"one utt, uttframenum = %d, t = %d, 4 \n", int(uttFrameNum), int(t)); for (size_t iseq = nextSequences.size(); iseq > numBestMBR; iseq--) nextSequences.pop_back(); - } + //fprintf(stderr, "one utt, uttframenum = %d, t = %d, 5 \n", int(uttFrameNum), int(t)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 7 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + } + //fprintf(stderr, "one utt, uttframenum = %d, 6 \n", int(uttFrameNum)); //nbest output + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_oneutt_MBR time 4 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); if (nextSequences.size() != 0) { @@ -2113,63 +2310,648 @@ public: vector vt_words; convert_word_sequence_string_2_vector(word_sequence, vt_words, '_'); - pi.WER = compute_wer(wordSeqs[uttID], vt_words); + pi.WER = compute_wer(wordSeq, vt_words); pi.label_seq = nextSequences[n].labelseq; - uttPathsInfo[uttID].push_back(pi); + oneuttPathsInfo.push_back(pi); } - vt_onebest_wer[uttID] = uttPathsInfo[uttID][onebest_index].WER; + onebest_wer = oneuttPathsInfo[onebest_index].WER; } + //fprintf(stderr, "one utt, uttframenum = %d, 7 \n", int(uttFrameNum)); + for (size_t n = 0; n < CurSequences.size(); n++) { - deleteSeq(CurSequences[n]); + deleteSeq(CurSequences[n], m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); } + 
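// When an expansion reproduces an already-queued label sequence, the merge loop above combines
// the two scores with decodeOutput.LogAdd instead of keeping duplicates; numerically that is
// log(exp(a) + exp(b)) computed without overflow. A standalone sketch of that merge:

#include <algorithm>
#include <cassert>
#include <cmath>

// Stable log(exp(a) + exp(b)): factor out the max so the exp can never overflow.
double LogAdd(double a, double b)
{
    double hi = std::max(a, b), lo = std::min(a, b);
    return hi + std::log1p(std::exp(lo - hi));
}

int main()
{
    // Two hypotheses with probability 0.5 each merge to probability 1.0:
    double merged = LogAdd(std::log(0.5), std::log(0.5));
    assert(std::abs(merged) < 1e-12);
    return 0;
}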
//fprintf(stderr, "one utt, uttframenum = %d, 8 \n", int(uttFrameNum)); + vector().swap(CurSequences); for (size_t n = 0; n < nextSequences.size(); n++) { - deleteSeq(nextSequences[n]); + deleteSeq(nextSequences[n], m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); } + //fprintf(stderr, "one utt, uttframenum = %d, 9 \n", int(uttFrameNum)); + vector().swap(nextSequences); + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_oneutt_MBR time 5 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); + // end here + } - } // end of for loop - decodeInputMatrix.SetValue(decodeInputMatrixBackup); - //decodeInputMatrix.Print("after ss"); - decodeMBLayout->CopyFrom(decodebackupMBlayout); - //my_time = time(NULL); - //fprintf(stderr, "RNNT_decode_nbest_MBR time 3 = %s", ctime(&my_time)); + void RNNT_decode_nbest_MBR_Multithread(const std::vector& outputNodeNames, Matrix& encodeOutput, MBLayoutPtr& encodeMBLayout, + Matrix& decodeInputMatrix, MBLayoutPtr& decodeMBLayout, const std::vector decodeInputNodeNames, + size_t numBestMBR, bool lengthNorm, const vector& vt_labels, vector>& uttPathsInfo, vector& vt_nws, vector& vt_onebest_wer, + bool SVD, ComputationNetwork& net, + const Matrix& Wm, const Matrix& Wmu, const Matrix& Wmv, const Matrix& bm) /*, size_t num_utt, size_t start_utt) */ + { + + time_t my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR_Multithread time 1 = %s \n", ctime(&my_time)); + + if (outputNodeNames.size() == 0) + fprintf(stderr, "OutputNodeNames are not specified, using the default outputnodes.\n"); + + //prediction related nodes + std::vector decodeOutputNodeNames(outputNodeNames.begin() + 1, outputNodeNames.begin() + 2); + + std::vector decodeOutputNodes; + + std::list pastValueNodes; + decodeOutputNodes = net.OutputNodesByName(decodeOutputNodeNames); + pastValueNodes = net.PastValueNodesForOutputs(decodeOutputNodes); + + std::list::iterator it; + for (it = pastValueNodes.begin(); it != pastValueNodes.end(); ++it) + { + auto pastValueNode = dynamic_pointer_cast>(*it); //DelayedValueNodeBase + if (pastValueNode || !(*it)->NodeName().compare(0, 5, L"Loop_")) + { + m_nodesToCache.push_back((*it)->NodeName()); + } + } + //joint nodes + + int deviceid = decodeInputMatrix.GetDeviceId(); + std::map outputMatrices; + + Matrix maxIdx(deviceid), maxVal(deviceid); + + const size_t numIterationsBeforePrintingProgress = 100; + + //get MBlayer of encoder input + size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences(); + size_t numParallelPhoneSequences = decodeMBLayout->GetNumParallelSequences(); + const auto numSequences = encodeMBLayout->GetNumSequences(); + + std::vector uttFrameBeginIdx, uttPhoneBeginIdx; + // the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch + std::vector uttFrameNum, uttPhoneNum; + // map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance. 
+ std::vector uttFrameToChanInd, uttPhoneToChanInd; + + uttFrameNum.clear(); + uttFrameToChanInd.clear(); + uttFrameBeginIdx.clear(); + + uttFrameNum.reserve(numSequences); + uttFrameToChanInd.reserve(numSequences); + uttFrameBeginIdx.reserve(numSequences); + + uttPhoneNum.clear(); + uttPhoneToChanInd.clear(); + uttPhoneBeginIdx.clear(); + + uttPhoneNum.reserve(numSequences); + uttPhoneToChanInd.reserve(numSequences); + uttPhoneBeginIdx.reserve(numSequences); + uttPathsInfo.clear(); + uttPathsInfo.resize(numSequences); + + vt_nws.clear(); + vt_nws.resize(numSequences); + + vt_onebest_wer.clear(); + vt_onebest_wer.resize(numSequences); + //get utt information, such as channel map id and utt begin frame, utt frame num, utt phone num for frame and phone respectively.... + size_t seqId = 0; //frame + size_t totalframenum = 0; + + // this->FormEvalOrder(Plustransnodes[0]); + + for (const auto& seq : encodeMBLayout->GetAllSequences()) + { + if (seq.seqId == GAP_SEQUENCE_ID) + { + continue; + } + assert(seq.seqId == seqId); + seqId++; + uttFrameToChanInd.push_back(seq.s); + size_t numFrames = seq.GetNumTimeSteps(); + uttFrameBeginIdx.push_back(seq.tBegin); + uttFrameNum.push_back(numFrames); + totalframenum += numFrames; + } + + //get utt information for prediction input.... + seqId = 0; //frame + + for (const auto& seq : decodeMBLayout->GetAllSequences()) + { + if (seq.seqId == GAP_SEQUENCE_ID) + { + continue; + } + assert(seq.seqId == seqId); + seqId++; + uttPhoneToChanInd.push_back(seq.s); + size_t numFrames = seq.GetNumTimeSteps(); + uttPhoneBeginIdx.push_back(seq.tBegin); + uttPhoneNum.push_back(numFrames); + } + + //get phone sequene + CNTK::Matrix maxIndex(deviceid), maxValue(deviceid); + decodeInputMatrix.VectorMax(maxIndex, maxValue, true); + maxIndex.TransferToDeviceIfNotThere(CPUDEVICE); + + //backup decoding input matrix and MBlayout + MBLayoutPtr decodebackupMBlayout; + decodebackupMBlayout = make_shared(); + decodebackupMBlayout->CopyFrom(decodeMBLayout); + + Matrix decodeInputMatrixBackup(deviceid); + decodeInputMatrixBackup.SetValue(decodeInputMatrix); + + std::vector> phoneSeqs; + phoneSeqs.resize(numSequences); + for (size_t utt = 0; utt < numSequences; utt++) + { + //phoneSeqs[utt].resize(uttPhoneNum[utt]); + for (size_t u = 0; u < uttPhoneNum[utt]; u++) + { + size_t uID = (u + uttPhoneBeginIdx[utt]) * numParallelPhoneSequences + uttPhoneToChanInd[utt]; + phoneSeqs[utt].push_back((size_t)(maxIndex(0, uID))); + } + } + // convert the phoneSeqs to word sequence, as reference, convert a string of "_ab_cdef_g" to word sequence of "ab cdef g". 
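// A few lines below, the function fans out one std::thread per utterance and joins them all
// before restoring the minibatch state. A reduced sketch of that dispatch pattern;
// DecodeOneUtt is a simplified hypothetical stand-in for RNNT_decode_oneutt_MBR:

#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

void DecodeOneUtt(const std::vector<float>& encodeOutput, float& oneBestWer, size_t uttID)
{
    // ... the per-utterance beam search would run here ...
    oneBestWer = 0.0f;
    std::printf("decoded utt %zu\n", uttID);
}

int main()
{
    const size_t numSequences = 4;
    std::vector<float> encodeOutput(1024); // shared, read-only across workers
    std::vector<float> wer(numSequences);  // one writable slot per worker, no contention
    std::vector<std::thread> threads(numSequences);
    for (size_t uttID = 0; uttID < numSequences; ++uttID)
        threads[uttID] = std::thread(DecodeOneUtt, std::cref(encodeOutput), std::ref(wer[uttID]), uttID);
    for (auto& t : threads)
        t.join(); // all utterances finish before the minibatch state is restored
    return 0;
}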
+ std::vector> wordSeqs; + wordSeqs.resize(numSequences); + for (size_t uttID = 0; uttID < numSequences; uttID++) + { + string word_sequence = ""; + for (size_t i = 0; i < phoneSeqs[uttID].size(); i++) + { + size_t labelID = phoneSeqs[uttID][i]; + if (labelID != (vt_labels.size() - 1)) // it is not + { + string wordpiece = vt_labels[labelID]; + word_sequence += wordpiece; + } + } + convert_word_sequence_string_2_vector(word_sequence, wordSeqs[uttID], '_'); + + vt_nws[uttID] = wordSeqs[uttID].size(); + } + + // the data structure for phone sequence + + // do decoding for the utterances, and feed in the data structure, + + size_t vocabSize = vt_labels.size(); + size_t blankId = vocabSize - 1; + + // this->FormEvalOrder(Plustransnodes[0]); + + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR_Multithread time 2 = %s, numSequences = %d, uttFrameNum = %d \n ", ctime(&my_time), int(numSequences), int(uttFrameNum[0])); + std::vector vt_threads(numSequences); + deviceid = CPUDEVICE; + for (size_t uttID = 0; uttID < numSequences; uttID++) + //for (size_t uttID = start_utt; uttID < num_utt; uttID++) + { + vt_threads[uttID] = std::thread(&RNNTDecodeFunctions::RNNT_decode_oneutt_MBR, this, std::ref(net), std::ref(vocabSize), std::ref(blankId), std::ref(deviceid), std::ref(uttFrameNum[uttID]), + std::ref(decodeOutputNodeNames), std::ref(decodeInputNodeNames), + std::ref(uttFrameBeginIdx[uttID]), std::ref(uttFrameToChanInd[uttID]), std::ref(numParallelSequences), + std::ref(SVD), std::ref(encodeOutput), std::ref(outputNodeNames), + std::ref(numBestMBR), std::ref(lengthNorm), std::ref(vt_labels), + std::ref(wordSeqs[uttID]), std::ref(uttPathsInfo[uttID]), std::ref(vt_onebest_wer[uttID]), + std::ref(Wm), std::ref(Wmu), std::ref(Wmv), std::ref(bm), (uttID)); + + } // end of for loop + + for (size_t uttID = 0; uttID < numSequences; uttID++) + // for (size_t uttID = start_utt; uttID < num_utt; uttID++) + { + vt_threads[uttID].join(); + } + decodeInputMatrix.SetValue(decodeInputMatrixBackup); + //decodeInputMatrix.Print("after ss"); + decodeMBLayout->CopyFrom(decodebackupMBlayout); + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR_Multithread time 3 = %s \n", ctime(&my_time)); + } + + void RNNT_decode_nbest_MBR(const std::vector& outputNodeNames, Matrix& encodeOutput, MBLayoutPtr& encodeMBLayout, + Matrix& decodeInputMatrix, MBLayoutPtr& decodeMBLayout, std::vector decodeinputNodes, size_t numBestMBR, bool lengthNorm, const vector& vt_labels, vector>& uttPathsInfo, vector& vt_nws, vector& vt_onebest_wer, + bool SVD, const ComputationNetworkPtr& net) + { + time_t my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR time 1 = %s \n", ctime(&my_time)); + + if (outputNodeNames.size() == 0) + fprintf(stderr, "OutputNodeNames are not specified, using the default outputnodes.\n"); + std::vector outputNodes = net->OutputNodesByName(outputNodeNames); + + //prediction related nodes + std::vector decodeOutputNodeNames(outputNodeNames.begin() + 1, outputNodeNames.begin() + 2); + std::vector decodeOutputNodes = net->OutputNodesByName(decodeOutputNodeNames); + + std::list pastValueNodes = net->PastValueNodesForOutputs(decodeOutputNodes); + + std::list::iterator it; + for (it = pastValueNodes.begin(); it != pastValueNodes.end(); ++it) + { + auto pastValueNode = dynamic_pointer_cast>(*it); //DelayedValueNodeBase + if (pastValueNode || !(*it)->NodeName().compare(0, 5, L"Loop_")) + { + m_nodesToCache.push_back((*it)->NodeName()); + } + } + //joint nodes + ComputationNodeBasePtr WmNode, 
WmuNode, WmvNode, bmNode; + WmNode; + WmuNode; + WmvNode; + if (SVD) + { + WmuNode = net->GetNodeFromName(outputNodeNames[4]); + WmvNode = net->GetNodeFromName(outputNodeNames[5]); + bmNode = net->GetNodeFromName(outputNodeNames[6]); + } + else + { + WmNode = net->GetNodeFromName(outputNodeNames[4]); + bmNode = net->GetNodeFromName(outputNodeNames[5]); + } + std::vector Plusnodes, Plustransnodes; + Plusnodes.push_back(net->GetNodeFromName(outputNodeNames[2])); + Plustransnodes.push_back(net->GetNodeFromName(outputNodeNames[3])); + + size_t deviceid = decodeInputMatrix.GetDeviceId(); + std::map outputMatrices; + Matrix decodeOutput(deviceid), Wm(deviceid), Wmu(deviceid), Wmv(deviceid), bm(deviceid); + + Matrix maxIdx(deviceid), maxVal(deviceid); + Wmu; + Wmv; + Wm; + if (SVD) + { + Wmu.SetValue(*(&dynamic_pointer_cast>(WmuNode)->Value())); + Wmv.SetValue(*(&dynamic_pointer_cast>(WmvNode)->Value())); + } + else + Wm.SetValue(*(&dynamic_pointer_cast>(WmNode)->Value())); + + bm.SetValue(*(&dynamic_pointer_cast>(bmNode)->Value())); + const size_t numIterationsBeforePrintingProgress = 100; + + //get MBlayer of encoder input + size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences(); + size_t numParallelPhoneSequences = decodeMBLayout->GetNumParallelSequences(); + const auto numSequences = encodeMBLayout->GetNumSequences(); + + std::vector uttFrameBeginIdx, uttPhoneBeginIdx; + // the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch + std::vector uttFrameNum, uttPhoneNum; + // map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance. + std::vector uttFrameToChanInd, uttPhoneToChanInd; + + uttFrameNum.clear(); + uttFrameToChanInd.clear(); + uttFrameBeginIdx.clear(); + + uttFrameNum.reserve(numSequences); + uttFrameToChanInd.reserve(numSequences); + uttFrameBeginIdx.reserve(numSequences); + + uttPhoneNum.clear(); + uttPhoneToChanInd.clear(); + uttPhoneBeginIdx.clear(); + + uttPhoneNum.reserve(numSequences); + uttPhoneToChanInd.reserve(numSequences); + uttPhoneBeginIdx.reserve(numSequences); + uttPathsInfo.clear(); + uttPathsInfo.resize(numSequences); + + vt_nws.clear(); + vt_nws.resize(numSequences); + + vt_onebest_wer.clear(); + vt_onebest_wer.resize(numSequences); + //get utt information, such as channel map id and utt begin frame, utt frame num, utt phone num for frame and phone respectively.... + size_t seqId = 0; //frame + size_t totalframenum = 0; + + // this->FormEvalOrder(Plustransnodes[0]); + + for (const auto& seq : encodeMBLayout->GetAllSequences()) + { + if (seq.seqId == GAP_SEQUENCE_ID) + { + continue; + } + assert(seq.seqId == seqId); + seqId++; + uttFrameToChanInd.push_back(seq.s); + size_t numFrames = seq.GetNumTimeSteps(); + uttFrameBeginIdx.push_back(seq.tBegin); + uttFrameNum.push_back(numFrames); + totalframenum += numFrames; + } + + //get utt information for prediction input.... 
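// In the SVD branch, the joint projection Wm is stored as two low-rank factors, and
// forwardmergedSVD computes y = Wmv^T (Wmu^T x) + bm: two skinny products instead of one
// large one. A dimension-level sketch, with plain loops standing in for AssignProductOf:

#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>; // row-major: Mat[r][c]

// y = M^T x  (M is rows x cols; x has 'rows' entries; y gets 'cols' entries)
Vec TransposeTimes(const Mat& M, const Vec& x)
{
    Vec y(M[0].size(), 0.0f);
    for (size_t r = 0; r < M.size(); ++r)
        for (size_t c = 0; c < y.size(); ++c)
            y[c] += M[r][c] * x[r];
    return y;
}

// SVD path: Wm (d x V) factored as Wmu (d x r) times Wmv (r x V), with r << min(d, V).
// Per frame, the cost drops from d*V multiplies to r*(d + V).
Vec JointProjectSVD(const Mat& Wmu, const Mat& Wmv, const Vec& bm, const Vec& x)
{
    Vec t = TransposeTimes(Wmu, x); // r-dim bottleneck, as tempMatrix above
    Vec y = TransposeTimes(Wmv, t); // back up to vocabulary size
    for (size_t i = 0; i < y.size(); ++i)
        y[i] += bm[i];
    return y;
}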
+ seqId = 0; //frame + + for (const auto& seq : decodeMBLayout->GetAllSequences()) + { + if (seq.seqId == GAP_SEQUENCE_ID) + { + continue; + } + assert(seq.seqId == seqId); + seqId++; + uttPhoneToChanInd.push_back(seq.s); + size_t numFrames = seq.GetNumTimeSteps(); + uttPhoneBeginIdx.push_back(seq.tBegin); + uttPhoneNum.push_back(numFrames); + } + + //get phone sequene + CNTK::Matrix maxIndex(deviceid), maxValue(deviceid); + decodeInputMatrix.VectorMax(maxIndex, maxValue, true); + maxIndex.TransferToDeviceIfNotThere(CPUDEVICE); + + //backup decoding input matrix and MBlayout + MBLayoutPtr decodebackupMBlayout; + decodebackupMBlayout = make_shared(); + decodebackupMBlayout->CopyFrom(decodeMBLayout); + + Matrix decodeInputMatrixBackup(deviceid); + decodeInputMatrixBackup.SetValue(decodeInputMatrix); + + std::vector> phoneSeqs; + phoneSeqs.resize(numSequences); + for (size_t utt = 0; utt < numSequences; utt++) + { + //phoneSeqs[utt].resize(uttPhoneNum[utt]); + for (size_t u = 0; u < uttPhoneNum[utt]; u++) + { + size_t uID = (u + uttPhoneBeginIdx[utt]) * numParallelPhoneSequences + uttPhoneToChanInd[utt]; + phoneSeqs[utt].push_back((size_t)(maxIndex(0, uID))); + } + } + // convert the phoneSeqs to word sequence, as reference, convert a string of "_ab_cdef_g" to word sequence of "ab cdef g". + std::vector> wordSeqs; + wordSeqs.resize(numSequences); + for (size_t uttID = 0; uttID < numSequences; uttID++) + { + string word_sequence = ""; + for (size_t i = 0; i < phoneSeqs[uttID].size(); i++) + { + size_t labelID = phoneSeqs[uttID][i]; + if (labelID != (vt_labels.size() - 1)) // it is not + { + string wordpiece = vt_labels[labelID]; + word_sequence += wordpiece; + } + } + convert_word_sequence_string_2_vector(word_sequence, wordSeqs[uttID], '_'); + + vt_nws[uttID] = wordSeqs[uttID].size(); + } + + // the data structure for phone sequence + + // do decoding for the utterances, and feed in the data structure, + + size_t vocabSize = bm.GetNumRows(); + size_t blankId = vocabSize - 1; + vector CurSequences, nextSequences; + // sanity check + if (vt_labels.size() != vocabSize) + { + RuntimeError("RNNT_decode_nbest_MBR: size not match, vt_labels.size() = %d, and vocabSize = %d.", int(vt_labels.size()), int(vocabSize)); + } + + StreamMinibatchInputs decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes); + + // this->FormEvalOrder(Plustransnodes[0]); + + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR time 2 = %s, num_sequence = %d \n", ctime(&my_time), int(numSequences)); + + for (size_t uttID = 0; uttID < numSequences; uttID++) + { + // fprintf(stderr, "decode v0 uttID = %d .\n", int(uttID)); + nextSequences.clear(); + //initialize with blank ID + Sequence oneSeq = newSeq(vocabSize, (size_t) 50, deviceid); + extendSeq(oneSeq, blankId, 0.0); + + nextSequences.push_back(oneSeq); + + // loop for each frame + for (size_t t = 0; t < uttFrameNum[uttID]; t++) + { + for (size_t n = 0; n < CurSequences.size(); n++) + { + deleteSeq(CurSequences[n]); + } + vector().swap(CurSequences); + CurSequences = nextSequences; + + vector().swap(nextSequences); + //fprintf(stderr, "t = %d .\n", int(t)); + + //deal with the same prefix + //int count = 0; + while (true) + { + // fprintf(stderr, "count = %d .\n", int(count++)); + + auto maxSeq = std::max_element(CurSequences.begin(), CurSequences.end()); + Sequence tempSeq = newSeq(*maxSeq, deviceid); + deleteSeq(*maxSeq); + CurSequences.erase(maxSeq); + prepareSequence(tempSeq); + forward_decode(tempSeq, decodeinputMatrices, 
deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *net); + + size_t tinMB = (t + uttFrameBeginIdx[uttID]) * numParallelSequences + uttFrameToChanInd[uttID]; + if (SVD) + forwardmergedSVD(tempSeq, tinMB, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wmu, Wmv, bm, net); + else + forwardmerged(tempSeq, tinMB, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, net); + + //sort log posterior and get best N labels + vector> topN = getTopN(decodeOutput, numBestMBR, blankId); + + //expand blank + Sequence seqK = newSeq(tempSeq, deviceid); + ElemType newlogP = topN[vocabSize].second + tempSeq.logP; + seqK.logP = newlogP; + bool existseq = false; + for (auto itseq = nextSequences.begin(); itseq != nextSequences.end(); itseq++) + { + //merge the score with same sequence + if (seqK.labelseq == itseq->labelseq) + { + existseq = true; + itseq->logP = decodeOutput.LogAdd(seqK.logP, itseq->logP); + break; + } + } + if (!existseq) + { + nextSequences.push_back(seqK); + } + int iLabel; + for (iLabel = 0; iLabel < numBestMBR; iLabel++) + { + + seqK = newSeq(tempSeq, deviceid); + newlogP = topN[iLabel].second + tempSeq.logP; + seqK.logP = newlogP; + + if (topN[iLabel].first != blankId) + + { + extendSeq(seqK, topN[iLabel].first, newlogP); + + CurSequences.push_back(seqK); + } + } + vector>().swap(topN); + deleteSeq(tempSeq); + + if (CurSequences.size() == 0) + break; + auto ya = std::max_element(CurSequences.begin(), CurSequences.end()); + auto yb = std::max_element(nextSequences.begin(), nextSequences.end()); + if (nextSequences.size() > numBestMBR && yb->logP > ya->logP) + break; + } + std::sort(nextSequences.begin(), nextSequences.end()); + std::reverse(nextSequences.begin(), nextSequences.end()); + if (nextSequences.size() > numBestMBR) + { + for (size_t n = numBestMBR; n < nextSequences.size(); n++) + { + deleteSeq(nextSequences[n]); + } + } + for (size_t iseq = nextSequences.size(); iseq > numBestMBR; iseq--) + nextSequences.pop_back(); + } + + //nbest output + + if (nextSequences.size() != 0) + { + float totalProb = 0; + + ElemType onebest_lnLogP = ElemType(nextSequences[0].logP / nextSequences[0].labelseq.size()); + size_t onebest_index = 0; + + ElemType lnLogP; + for (size_t n = 0; n < nextSequences.size(); n++) + { + if (n == 0) + { + lnLogP = onebest_lnLogP; + } + else + { + lnLogP = ElemType(nextSequences[n].logP / nextSequences[n].labelseq.size()); + if (lnLogP > onebest_lnLogP) + { + onebest_lnLogP = lnLogP; + onebest_index = n; + } + } + + if (lengthNorm) + nextSequences[n].logP = lnLogP; + + nextSequences[n].logP = exp(nextSequences[n].logP); // the logP actually becomes P + totalProb += float(nextSequences[n].logP); + } + + for (size_t n = 0; n < nextSequences.size(); n++) + { + PathInfo pi; + pi.prob = float(nextSequences[n].logP / totalProb); + + string word_sequence = ""; + for (size_t k = 0; k < nextSequences[n].length - 1; k++) + { + size_t labelID = nextSequences[n].labelseq[k + 1]; + if (labelID != (vt_labels.size() - 1)) // it is not + { + + string wordpiece = vt_labels[labelID]; + word_sequence += wordpiece; + } + } + + vector vt_words; + convert_word_sequence_string_2_vector(word_sequence, vt_words, '_'); + + pi.WER = compute_wer(wordSeqs[uttID], vt_words); + + pi.label_seq = nextSequences[n].labelseq; + + uttPathsInfo[uttID].push_back(pi); + } + vt_onebest_wer[uttID] = uttPathsInfo[uttID][onebest_index].WER; + } + for (size_t n = 0; n < CurSequences.size(); n++) + { + deleteSeq(CurSequences[n]); + } + 
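// The n-best block above turns path log-scores into posteriors: optionally length-normalize
// logP, exponentiate, then divide by the sum, keeping the argmax of the length-normalized
// score as the one-best. A compact sketch; Path and NormalizeNBest are hypothetical names:

#include <cmath>
#include <cstddef>
#include <vector>

struct Path { double logP; size_t len; double prob; };

// Length-normalize (optionally), exponentiate, and renormalize to posteriors.
// Returns the index of the one-best (length-normalized) hypothesis.
size_t NormalizeNBest(std::vector<Path>& paths, bool lengthNorm)
{
    size_t best = 0;
    double total = 0.0;
    for (size_t n = 0; n < paths.size(); ++n)
    {
        double score = paths[n].logP / double(paths[n].len);
        if (score > paths[best].logP / double(paths[best].len))
            best = n;
        paths[n].prob = std::exp(lengthNorm ? score : paths[n].logP);
        total += paths[n].prob;
    }
    for (auto& p : paths)
        p.prob /= total; // posteriors now sum to 1 across the n-best list
    return best;
}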
+            vector<Sequence>().swap(CurSequences);
+            for (size_t n = 0; n < nextSequences.size(); n++)
+            {
+                deleteSeq(nextSequences[n]);
+            }
+            vector<Sequence>().swap(nextSequences);
+            // end here
+            my_time = time(NULL);
+
+            fprintf(stderr, "RNNT_decode_nbest_MBR time 3 = %s, uttID = %d \n", ctime(&my_time), int(uttID));
+
+        } // end of for loop
+        decodeInputMatrix.SetValue(decodeInputMatrixBackup);
+        //decodeInputMatrix.Print("after ss");
+        decodeMBLayout->CopyFrom(decodebackupMBlayout);
+        my_time = time(NULL);
+        fprintf(stderr, "RNNT_decode_nbest_MBR time 4 = %s \n", ctime(&my_time));
+    }
+};
+
+// helper that returns 'float' or 'double' depending on ElemType
+template <class ElemType>
+static inline const wchar_t* ElemTypeName();
+template <>
+/*static*/
+inline const wchar_t* ElemTypeName<float>()
+{
+    return L"float";
+}
+template <>
+/*static*/ inline const wchar_t* ElemTypeName<double>()
+{
+    return L"double";
+}
+template <>
+/*static*/ inline const wchar_t* ElemTypeName<half>()
+{
+    return L"half"; }
-};
-// helper that returns 'float' or 'double' depending on ElemType
-template <class ElemType>
-static inline const wchar_t* ElemTypeName();
-template <>
-/*static*/ inline const wchar_t* ElemTypeName<float>()
-{
-    return L"float";
-}
-template <>
-/*static*/ inline const wchar_t* ElemTypeName<double>()
-{
-    return L"double";
-}
-template <>
-/*static*/ inline const wchar_t* ElemTypeName<half>()
-{
-    return L"half";
-}
+// The following emits the class and enables the BaseMatrix<ElemType> to be available (used by EvalDll)
+// The corresponding Matrix<ElemType> is emitted in the SetDeviceId function above.
+template class Matrix<float>;
+template class Matrix<double>;
-// The following emits the class and enables the BaseMatrix<ElemType> to be available (used by EvalDll)
-// The corresponding Matrix<ElemType> is emitted in the SetDeviceId function above.
-template class Matrix<float>;
-template class Matrix<double>;
-
-// TODOs:
-// - automatic inference of time window w.r.t. delay nodes (and related nodes such as a temporal pooling)
-// - have overrides of RuntimeError etc. in ComputationNode, which prepend the error string with the node name and operation
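[Review note] On the 'template class Matrix<float>;' lines above: the Matrix method bodies live in .cpp/.cu files, so these explicit instantiations force the compiler to emit all members into this translation unit, which is what lets EvalDll-style consumers link without seeing the definitions. A minimal illustration of the pattern (Buffer is a made-up name):

    // buffer.h -- declaration only
    template <class ElemType>
    struct Buffer
    {
        ElemType value();
    };

    // buffer.cpp -- definition plus explicit instantiations
    template <class ElemType>
    ElemType Buffer<ElemType>::value()
    {
        return ElemType(0);
    }
    template struct Buffer<float>;  // emits Buffer<float>::value into this object file
    template struct Buffer<double>; // likewise for double, so clients link without the definition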
+} // namespace CNTK
} // namespace CNTK
} // namespace MSR
-} // namespace Microsoft
diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
index 6175d2c9c..b814f2f03 100644
--- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
+++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
@@ -75,6 +75,9 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
         LogicError("CopySubTree: you cannot copy a tree without copying the node values.");
 
     ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
+    std::set<std::wstring> set_cur_tonode_names;
+
+    set_cur_tonode_names.clear();
 
     if (!fromNet.EvalOrderExists(fromRoot))
         const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
@@ -84,20 +87,66 @@
         wstring fromNodeName = fromNode->NodeName();
         wstring toNodeName = toNamePrefix + fromNodeName;
 
+        // fprintf(stderr, "CopySubTree node = %ls\n", fromNodeName.c_str());
+
         ComputationNodeBasePtr toNode = CopyNode(fromNet, fromNodeName, toNodeName, CopyNodeFlags::copyNodeValue);
-
+        set_cur_tonode_names.insert(toNodeName);
+        ComputationNodeBasePtr fromNodeTmp;
+        fromNodeTmp = fromNet.GetNodeFromName(fromNodeName);
+
+        /* fprintf(stderr, "CopySubTree node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(),
+           int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumRows()),
+           int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumCols())
+           ); */
         if (flags & CopyNodeFlags::copyNodeInputLinks)
         {
             // copy the children structure but use the new nodes generated
             for (int i = 0; i < fromNode->GetNumInputs(); i++)
-                toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
+            {
+                wstring inputNodeName;
+                inputNodeName = (toNamePrefix + fromNode->GetInputs()[i]->NodeName());
+                if (set_cur_tonode_names.find(inputNodeName) == set_cur_tonode_names.end())
+                {
+                    // somehow the eval order went wrong and this input has not been copied into the target network yet, so copy it here
+                    // fprintf(stderr, "CopySubTree i = %d, missing node = %ls\n", i, inputNodeName.c_str());
+
+                    CopyNode(fromNet, fromNode->GetInputs()[i]->NodeName(),
+                             inputNodeName,
+                             CopyNodeFlags::copyNodeValue);
+                    set_cur_tonode_names.insert(inputNodeName);
+                }
+                //fprintf(stderr, "CopySubTree i = %d, node = %ls\n", i, toNodeName.c_str());
+
+                //toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
+                toNode->SetInput(i, GetNodeFromName(inputNodeName));
+            }
         }
     }
 }
 
-// you can only copy inputs from nodes in the same network
+void ComputationNetwork::ShowNodeMemory(const ComputationNetwork& fromNet,
+                                        const std::wstring fromName)
+{
+
+    ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
+
+    if (!fromNet.EvalOrderExists(fromRoot))
+        const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
+
+    for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
+ { + wstring fromNodeName = fromNode->NodeName(); + ComputationNodeBasePtr fromNodeTmp; + fromNodeTmp = fromNet.GetNodeFromName(fromNodeName); + + fprintf(stderr, "ShowNodeMemory node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(), + int((&dynamic_pointer_cast>(fromNodeTmp)->Value())->GetNumRows()), + int((&dynamic_pointer_cast>(fromNodeTmp)->Value())->GetNumCols())); + } +} + // you can only copy inputs from nodes in the same network void ComputationNetwork::CopyInputs(const std::wstring fromName, std::wstring toName) { CopyNode(*this, fromName, toName, CopyNodeFlags::copyNodeInputLinks); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 8a6fca012..9459289a6 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -13,7 +13,7 @@ #include "MatrixPool.h" #include "ComputationEnvironment.h" #include "Globals.h" - +#include #include #include #include @@ -233,7 +233,6 @@ struct ComputationNetworkOwnedNodeState return m_parentGradientOptimization == ParentGradientOptimization::Reuse; } - virtual void MarkValueNonSharable() { m_valueSharable = false; @@ -351,6 +350,8 @@ class ComputationNodeBase : public IComputationNode, public: typedef shared_ptr ComputationNodeBasePtr; + + // ----------------------------------------------------------------------- // constructors, copying, (de-)serialization // ----------------------------------------------------------------------- diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp index bd1fbf816..62f970c09 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.cpp +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -26,7 +26,12 @@ #include #include -namespace Microsoft { namespace MSR { namespace CNTK { +namespace Microsoft +{ +namespace MSR +{ +namespace CNTK +{ // ----------------------------------------------------------------------- // ReduceElements (op, axis=, input) @@ -39,10 +44,10 @@ template if (flags & CopyNodeFlags::copyNodeValue) { auto node = dynamic_pointer_cast>(nodeP); - node->m_axes = m_axes; - node->m_operation = m_operation; + node->m_axes = m_axes; + node->m_operation = m_operation; node->m_reductionOp = m_reductionOp; - node->m_scale = m_scale; + node->m_scale = m_scale; node->m_keepDimensions = m_keepDimensions; } } @@ -57,7 +62,7 @@ template fstream >> num_axes; for (int i = 0; i < num_axes; ++i) { - int axis; + int axis; fstream >> axis; m_axes.push_back(axis); } @@ -123,7 +128,7 @@ template if (ReduceSequenceAxis()) { ElemType gapPadValue = NeutralValue(m_reductionOp); - input = ComputationNode::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/ true, &gapPadValue); + input = ComputationNode::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/true, &gapPadValue); } else input = InputRef(0).ValueTensorFor(rank, frInput); @@ -142,7 +147,24 @@ template default: // the actual operation is a Copy with reduction, where the magic is in the reduction op // For "Mean", m_scale is 1/#elements, and 1 otherwise. 
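[Review note] The hunk below switches the copy to DoUnaryOpOfDebug when multi_thread is set; both calls have the same semantics: an elementwise opCopy whose outputs are folded together by m_reductionOp, with m_scale = 1/#elements (set during validation) turning a plain sum into a mean. A scalar sketch of those semantics (reduceMean is a made-up reference, not the TensorView API):

    #include <cstddef>

    double reduceMean(const double* input, size_t n)
    {
        const double scale = 1.0 / double(n); // m_scale analogue for "Mean"
        double acc = 0;                       // neutral value of opSum
        for (size_t i = 0; i < n; i++)
            acc += scale * input[i];          // opCopy per element, folded by the sum reduction
        return acc;
    }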
- result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp); + { + /* + if (num_frames == 94) + { + fprintf(stderr, "reshape 1, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale)); + } + */ + if (multi_thread) + result.DoUnaryOpOfDebug(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp); + else + result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp); + /* + if (num_frames == 94) + { + fprintf(stderr, "reshape 2, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale)); + } + */ + } } } @@ -156,15 +178,15 @@ template { // Broadcast along the sequence auto result = ValueFor(fr); - ComputationNode::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/ accumulateGradient ? (ElemType)1 : (ElemType)0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices); + ComputationNode::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/accumulateGradient ? (ElemType) 1 : (ElemType) 0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices); } else { const auto frInput = (ReduceAllAxes() || ReduceBatchAxis()) ? FrameRange(InputRef(0).GetMBLayout()) : fr; // can't use 'fr' for ReduceAllAxes() as it refers to the result (same as for training criteria) - // get the args + // get the args size_t rank = DetermineElementwiseTensorRank(); auto sliceOutputGrad = ReduceAllAxes() ? TensorView(GradientPtr(), GetSampleLayout()) : GradientTensorFor(rank, fr); // propagate from this one... - auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one + auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one // gradients are not as simple as passing an op-code, unfortunately switch (m_reductionOp) @@ -220,8 +242,8 @@ template break; case ElementWiseOperator::opElementwiseProduct: { - auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput); - auto output = ValueTensorFor(rank, fr.AllowBroadcast()); + auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput); + auto output = ValueTensorFor(rank, fr.AllowBroadcast()); if (accumulateGradient) sliceInputGrad.AddElementwiseProductWithQuotientOf(sliceOutputGrad, output, input); else @@ -242,13 +264,20 @@ template { switch (m_reductionOp) { - case ElementWiseOperator::opSum: return false; - case ElementWiseOperator::opLogSum: return true; - case ElementWiseOperator::opMin: return true; - case ElementWiseOperator::opMax: return true; - case ElementWiseOperator::opElementwiseProduct: return true; - case ElementWiseOperator::opArgmin: return false; - case ElementWiseOperator::opArgmax: return false; + case ElementWiseOperator::opSum: + return false; + case ElementWiseOperator::opLogSum: + return true; + case ElementWiseOperator::opMin: + return true; + case ElementWiseOperator::opMax: + return true; + case ElementWiseOperator::opElementwiseProduct: + return true; + case ElementWiseOperator::opArgmin: + return false; + case ElementWiseOperator::opArgmax: + return false; } LogicError("Should not get here."); } @@ -258,13 +287,20 @@ template { switch (m_reductionOp) { - case ElementWiseOperator::opSum: return false; - case ElementWiseOperator::opLogSum: return true; - case ElementWiseOperator::opMin: return true; - case ElementWiseOperator::opMax: return true; 
- case ElementWiseOperator::opElementwiseProduct: return true; - case ElementWiseOperator::opArgmin: return false; - case ElementWiseOperator::opArgmax: return false; + case ElementWiseOperator::opSum: + return false; + case ElementWiseOperator::opLogSum: + return true; + case ElementWiseOperator::opMin: + return true; + case ElementWiseOperator::opMax: + return true; + case ElementWiseOperator::opElementwiseProduct: + return true; + case ElementWiseOperator::opArgmin: + return false; + case ElementWiseOperator::opArgmax: + return false; } LogicError("Should not get here."); } @@ -282,7 +318,7 @@ template // validate the opcode (in case we got instantiated empty and never updated) ValidateOp(); - m_scale = (ElemType)1; + m_scale = (ElemType) 1; if (ReduceAllAxes()) Base::ValidateUnaryReduce(isFinalValidationPass, m_keepDimensions); else if (ReduceSequenceAxis()) @@ -319,29 +355,27 @@ template let shape = Input(0)->GetSampleLayout(); auto dims = shape.GetDims(); - size_t reducedDimProd = 1; + size_t reducedDimProd = 1; if (ReduceAllStaticAxes()) { reducedDimProd = shape.GetNumElements(); - dims = m_keepDimensions ? SmallVector(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector({}) : SmallVector({ 1 })); // entire sample is reduced to a scalar + dims = m_keepDimensions ? SmallVector(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector({}) : SmallVector({1})); // entire sample is reduced to a scalar } - else if (!m_axes.empty() - && std::all_of(m_axes.begin(), - m_axes.end(), - [&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); })) + else if (!m_axes.empty() && std::all_of(m_axes.begin(), + m_axes.end(), + [&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); })) { //Accumulate the number of elements for reduce_mean reducedDimProd = std::accumulate(m_axes.begin(), - m_axes.end(), - 1, - [&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; }); + m_axes.end(), + 1, + [&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; }); // axes reduced to a scalar if (m_keepDimensions) std::for_each(m_axes.begin(), - m_axes.end(), - [&dims](int axis) {dims[axis - 1] = 1; } - ); + m_axes.end(), + [&dims](int axis) { dims[axis - 1] = 1; }); else { SmallVector reducedDims(dims.size() - m_axes.size()); @@ -355,13 +389,12 @@ template dims = reducedDims; } } - else if (isFinalValidationPass) + else if (isFinalValidationPass) { InvalidArgument("The shape of %ls [%ls] can not be reduced along axes [%ls]", - NodeDescription().c_str(), - wstring(shape).c_str(), - boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str() - ); + NodeDescription().c_str(), + wstring(shape).c_str(), + boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str()); } // for "Mean", we must divide by #elements if (isFinalValidationPass && IsMean()) @@ -384,10 +417,14 @@ struct SequenceLengthVector { typedef vector> SequenceVector; typedef MBLayout::SequenceInfo SequenceInfo; - const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length) - const vector& m_sequenceInfo; // original sequence info (for seqId) - SequenceLengthVector(const vector& sequenceInfo, const SequenceVector& sequenceVector) : m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) { } - size_t size() const { return m_sequenceInfo.size(); } + const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length) + const 
vector& m_sequenceInfo; // original sequence info (for seqId) + SequenceLengthVector(const vector& sequenceInfo, const SequenceVector& sequenceVector) + : m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) {} + size_t size() const + { + return m_sequenceInfo.size(); + } MBLayout::SequenceInfo operator[](size_t i) const // return a descriptor of the new sequence { SequenceInfo seq; @@ -427,12 +464,12 @@ template for (size_t t = 0; t < seq.GetNumTimeSteps(); t++) { double delta = input(0, inMBLayout->GetColumnIndex(seq, t)); // how many frames the current time step should expand into - desiredCount += delta; // this is now how many frames we should have + desiredCount += delta; // this is now how many frames we should have // use a margin against round-off errors, so that we get non-binary ratios like 1/3 and 1/5 right // This really means generate a frame if too few, unless we are within machine accuracy of the target. // The assumption is that the delta has this error, while accumulation (in double) has no error. ElemType relativeMargin = 1 - std::numeric_limits::epsilon(); - while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0) + while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0) || indexSequence.size() < desiredCount * relativeMargin) indexSequence.push_back(t); } @@ -441,10 +478,10 @@ template input.CollapseDataLocation(); // BUGBUG: Move back, since BOTH state is broken at present. // create a new MBLayout let& outMBLayout = GetMBLayout(); - outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer); + outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/ m_placementBuffer, /*temp*/ m_rowAllocationsBuffer); // copy to output vector buf(outMBLayout->GetNumCols(), numeric_limits::quiet_NaN()); // STL cannot easily avoid initializing, so we might as well init with NaN for gaps - let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this + let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this for (size_t i = 0; i < size; i++) { let& seq = outMBLayout->GetAllSequences()[i]; @@ -452,7 +489,7 @@ template continue; let& indexSequence = indexSequences[i]; for (size_t t = 0; t < seq.GetNumTimeSteps(); t++) - buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t]; + buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType) indexSequence[t]; } // there may be dangling gaps at the end. Take the opportunity to verify this. 
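[Review note] To make the margin logic above concrete, here is the expansion loop lifted into a standalone function (expandIndices is a made-up name): delta[t] says how many output frames time step t should produce, possibly fractional, and relativeMargin absorbs accumulated round-off so ratios like 1/3 or 1/5 come out exact. The dangling-gap verification loop follows below.

    #include <cstddef>
    #include <limits>
    #include <vector>

    std::vector<size_t> expandIndices(const std::vector<double>& delta)
    {
        std::vector<size_t> indexSequence;
        double desiredCount = 0; // accumulated in double, assumed error-free
        const double relativeMargin = 1 - std::numeric_limits<double>::epsilon();
        for (size_t t = 0; t < delta.size(); t++)
        {
            desiredCount += delta[t];
            while ((indexSequence.empty() && desiredCount > 0) // always emit the first frame unless delta is 0
                   || indexSequence.size() < desiredCount * relativeMargin)
                indexSequence.push_back(t);
        }
        return indexSequence; // e.g. delta = {2, 2} yields {0, 0, 1, 1}
    }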
for (size_t i = size; i < sequences.size(); i++) @@ -460,7 +497,7 @@ template for (size_t i = size; i < outMBLayout->GetAllSequences().size(); i++) assert(outMBLayout->GetAllSequences()[i].seqId == GAP_SEQUENCE_ID); // the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode - Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true); + Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/true, /*emptyTransfer=*/true, /*updatePreferredDevice=*/true); Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor); } @@ -501,9 +538,9 @@ template /*virtual*/ void PackedIndexNode::ForwardPropNonLooping() /*override*/ { let& sourceMBLayout = InputRef(SOURCEDATA).GetMBLayout(); // only used for index conversion - let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout(); - let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped - auto& result = Value(); // packed index values as mapped to sourceData's layout + let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout(); + let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped + auto& result = Value(); // packed index values as mapped to sourceData's layout // loop over sourceSequences // Input matrix contains time indices for each sequence that refer to frames inside that sequence. // We replace every per-sequence index by the resolved column index w.r.t. the same MBLayout. @@ -516,10 +553,10 @@ template let& indexSeq = indexMBLayout->FindMatchingSequence(sourceSequences, i); // find corresponding entry in indexMBLayout for (size_t tIndex = 0; tIndex < indexSeq.GetNumTimeSteps(); tIndex++) // map all index values in index sequence { - let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object - let tSource = (size_t)index(0, jIndex); // the new time location (relative to source sequence) + let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object + let tSource = (size_t) index(0, jIndex); // the new time location (relative to source sequence) let jSource = sourceMBLayout->GetColumnIndex(sourceSeq, tSource); // map new time index as well. This performs a range check. - result(0, jIndex) = (ElemType)jSource; + result(0, jIndex) = (ElemType) jSource; } } // Note: maybe this is no longer needed, now that we do the same inside UpdateFunctionValueSize() for all nodes. 
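[Review note] The index mapping above leans on the packed-minibatch layout: parallel sequences are interleaved column-wise, so time step t of the sequence in parallel slot s (with begin offset tBegin) lives at column (tBegin + t) * numParallelSequences + s. This is the same arithmetic used earlier to read the phone sequences out of maxIndex; MBLayout::GetColumnIndex implements it with range checks. A one-line sketch:

    #include <cstddef>

    inline size_t columnIndex(size_t t, size_t tBegin, size_t s, size_t numParallelSequences)
    {
        return (tBegin + t) * numParallelSequences + s;
    }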
@@ -563,8 +600,8 @@ template /*virtual*/ void GatherPackedNode::ForwardPropNonLooping() /*override*/ { InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter - let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from - let& source = InputRef(SOURCEDATA).Value(); // source data to copy + let& index = InputRef(INDEXDATA).Value(); // column indices to copy from + let& source = InputRef(SOURCEDATA).Value(); // source data to copy #ifdef _MSC_VER auto& outputValuePtrRef = ValuePtrRef(); @@ -587,9 +624,9 @@ template { if (inputIndex == SOURCEDATA) { - let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from + let& index = InputRef(INDEXDATA).Value(); // column indices to copy from auto& sourceGradient = InputRef(SOURCEDATA).Gradient(); // source to propagate the gradient intpu - auto& outputGradient = Gradient(); // output gradient to propagate + auto& outputGradient = Gradient(); // output gradient to propagate sourceGradient.DoScatterColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1, true); } } @@ -614,7 +651,7 @@ template SetDims(Input(SOURCEDATA)->GetSampleLayout(), HasMBLayout()); else { - SmallVector layout = { 1 }; // Scalar + SmallVector layout = {1}; // Scalar if (Input(SOURCEDATA)->GetSampleLayout().GetRank() > 1) { auto srcLayout = Input(SOURCEDATA)->GetSampleLayout().GetDims(); @@ -638,8 +675,8 @@ template if (*InputRef(INDEXDATA).GetMBLayout() != *InputRef(SOURCEDATA).GetMBLayout()) InvalidArgument("%ls %ls operation requires the minibatch layout of index and source data to be the same.", NodeName().c_str(), OperationName().c_str()); InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter - let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from - let& source = InputRef(SOURCEDATA).Value(); // source data to copy + let& index = InputRef(INDEXDATA).Value(); // column indices to copy from + let& source = InputRef(SOURCEDATA).Value(); // source data to copy #ifdef _MSC_VER auto& outputValuePtrRef = ValuePtrRef(); @@ -653,7 +690,7 @@ template source.GetMatrixType(), source.GetFormat()); - auto& output = Value(); // output goes here + auto& output = Value(); // output goes here output.DoScatterColumnsOf(/*beta=*/0, index, source, /*alpha=*/1, true); } @@ -662,9 +699,9 @@ template { if (inputIndex == SOURCEDATA) { - let& index = InputRef(INDEXDATA).Value(); // column indices to copy from + let& index = InputRef(INDEXDATA).Value(); // column indices to copy from auto& sourceGradient = Input(SOURCEDATA)->Gradient(); // source to propagate the gradient input - auto& outputGradient = Gradient(); // output gradient to propagate + auto& outputGradient = Gradient(); // output gradient to propagate sourceGradient.DoGatherColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1); } } @@ -707,8 +744,8 @@ template CropNode::CropNode(size_t offsetX, size_t offsetY, DEVICEID_TYPE deviceId, const wstring& name) : CropNode(deviceId, name) { - m_xOffset = (double)(offsetX); - m_yOffset = (double)(offsetY); + m_xOffset = (double) (offsetX); + m_yOffset = (double) (offsetY); } template @@ -880,8 +917,7 @@ void CropNode::ComputeCropOffsets() // nodeToTransformMap contains coordinate maps for all nodes traversed so far, and is updated by this function. // Traversal stack contains all nodes traversed so far. 
Inputs of currNode are pushed to traversal stack so that their // inputs can be processed later on. - auto ProcessInputs = [](ComputationNodeBase* currNode, stack& traversalStack, unordered_map& nodeToTransformMap) - { + auto ProcessInputs = [](ComputationNodeBase* currNode, stack& traversalStack, unordered_map& nodeToTransformMap) { if (!currNode->Is()) RuntimeError("Node does not support affine transform for cropping."); @@ -1033,4 +1069,6 @@ template class CropNode; template class CropNode; template class CropNode; -}}} +} // namespace CNTK +} // namespace MSR +} // namespace Microsoft diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 25bb1aaf2..c84fd1263 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -21,7 +21,12 @@ #include #include -namespace Microsoft { namespace MSR { namespace CNTK { +namespace Microsoft +{ +namespace MSR +{ +namespace CNTK +{ // ----------------------------------------------------------------------- // Reshape(x, tensorShape, beginAxis=0, endAxis=0) -- reinterpret input samples as having different tensor dimensions @@ -46,8 +51,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { template class ReshapeNode : public UnaryElementWiseNode { - typedef UnaryElementWiseNode Base; UsingUnaryElementwiseNodeBaseMembers; - static const std::wstring TypeName() { return L"Reshape"; } + typedef UnaryElementWiseNode Base; + UsingUnaryElementwiseNodeBaseMembers; + static const std::wstring TypeName() + { + return L"Reshape"; + } public: ReshapeNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& replacementSampleLayout = TensorShape(), int beginAxis = 1, int endAxis = 0) @@ -70,7 +79,7 @@ public: { auto node = dynamic_pointer_cast>(nodeP); node->m_beginDimParameter = m_beginDimParameter; - node->m_endDimParameter = m_endDimParameter; + node->m_endDimParameter = m_endDimParameter; node->m_replacementSampleLayout = m_replacementSampleLayout; } } @@ -152,25 +161,25 @@ public: virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - auto result = ValueFor(fr); + auto result = ValueFor(fr); auto inputValue = InputRef(0).ValueFor(fr); ForwardPropImpl(result, inputValue); } - static void ForwardPropImpl(Matrix &result, Matrix &input) + static void ForwardPropImpl(Matrix& result, Matrix& input) { result.AssignValuesOf(input.Reshaped(result.GetNumRows(), result.GetNumCols())); } virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override { - auto gradient = GradientFor(fr); + auto gradient = GradientFor(fr); auto inputGradient = InputRef(inputIndex).GradientFor(fr); BackpropImpl(gradient, inputGradient, Input(inputIndex)->IsGradientOptimized(this), Input(inputIndex)->ParentGradientReused()); } - static void BackpropImpl(Matrix &gradient, Matrix &inputGradient, bool isGradientOptimized, bool isParentGradientReused) + static void BackpropImpl(Matrix& gradient, Matrix& inputGradient, bool isGradientOptimized, bool isParentGradientReused) { if (isGradientOptimized) { @@ -188,10 +197,19 @@ public: inputGradient += gradient.Reshaped(inputGradient.GetNumRows(), inputGradient.GetNumCols()); } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return 
false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } - virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override { return ParentGradientOptimization::Reuse; } + virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override + { + return ParentGradientOptimization::Reuse; + } private: TensorShape m_replacementSampleLayout; // user-specified dimensions to replace dimensions [beginAxis, endAxis] @@ -222,12 +240,19 @@ template class ReshapeNode; template class ReduceElementsNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"ReduceElements"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"ReduceElements"; + } void ValidateOp(); - static inline bool Contains(const std::vector& axes, int axis) { return std::find(axes.begin(), axes.end(), axis) != axes.end(); } + static inline bool Contains(const std::vector& axes, int axis) + { + return std::find(axes.begin(), axes.end(), axis) != axes.end(); + } static bool DefaultKeepDimensionsSetting(int axis) { return !((axis == CNTKInternalIdxValueForAllStaticAxes) || (axis == CNTKInternalIdxValueForAllAxes)); @@ -245,11 +270,16 @@ public: { switch (op) { - case ElementWiseOperator::opSum: return 0; - case ElementWiseOperator::opLogSum: return -std::numeric_limits::infinity(); - case ElementWiseOperator::opMin: return std::numeric_limits::infinity(); - case ElementWiseOperator::opMax: return -std::numeric_limits::infinity(); - case ElementWiseOperator::opElementwiseProduct: return 1; + case ElementWiseOperator::opSum: + return 0; + case ElementWiseOperator::opLogSum: + return -std::numeric_limits::infinity(); + case ElementWiseOperator::opMin: + return std::numeric_limits::infinity(); + case ElementWiseOperator::opMax: + return -std::numeric_limits::infinity(); + case ElementWiseOperator::opElementwiseProduct: + return 1; default: InvalidArgument("ReduceElementsNode::NeutralValue: Invalid operation code; allowed are: 'opSum', 'opMax', 'opMin', 'opElementwiseProduct', 'opLogSum'."); } @@ -258,47 +288,57 @@ public: // map the operation specified as a string to an ElementWiseOperator value. 
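[Review note] Backing up to NeutralValue above: each value is the identity element of its reduction, so gap columns padded with it cannot change the result: 0 for Sum, 1 for Prod, +infinity for Min, -infinity for Max, and -infinity for LogSum, since log(exp(x) + exp(-inf)) = x. A small sketch of the LogSum case (logAdd is a made-up helper):

    #include <cmath>
    #include <limits>
    #include <utility>

    static double logAdd(double a, double b)
    {
        if (a < b)
            std::swap(a, b); // ensure a >= b
        if (b == -std::numeric_limits<double>::infinity())
            return a;        // the neutral element contributes nothing
        return a + std::log1p(std::exp(b - a));
    }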
static ElementWiseOperator ReductionOpEnumValue(const std::wstring& opName) { - if (opName == L"Plus") return ElementWiseOperator::opSum; - else if (opName == L"Sum") return ElementWiseOperator::opSum; - else if (opName == L"Mean") return ElementWiseOperator::opSum; - else if (opName == L"LogSum") return ElementWiseOperator::opLogSum; - else if (opName == L"Min") return ElementWiseOperator::opMin; - else if (opName == L"Max") return ElementWiseOperator::opMax; - else if (opName == L"Prod") return ElementWiseOperator::opElementwiseProduct; - else if (opName == L"Argmin") return ElementWiseOperator::opArgmin; - else if (opName == L"Argmax") return ElementWiseOperator::opArgmax; + if (opName == L"Plus") + return ElementWiseOperator::opSum; + else if (opName == L"Sum") + return ElementWiseOperator::opSum; + else if (opName == L"Mean") + return ElementWiseOperator::opSum; + else if (opName == L"LogSum") + return ElementWiseOperator::opLogSum; + else if (opName == L"Min") + return ElementWiseOperator::opMin; + else if (opName == L"Max") + return ElementWiseOperator::opMax; + else if (opName == L"Prod") + return ElementWiseOperator::opElementwiseProduct; + else if (opName == L"Argmin") + return ElementWiseOperator::opArgmin; + else if (opName == L"Argmax") + return ElementWiseOperator::opArgmax; // more here - else InvalidArgument("Invalid operation code '%ls'. Allowed are: 'Sum', 'Max', 'Min', 'Prod', 'Argmax', 'Argmin'.", opName.c_str()); + else + InvalidArgument("Invalid operation code '%ls'. Allowed are: 'Sum', 'Max', 'Min', 'Prod', 'Argmax', 'Argmin'.", opName.c_str()); } public: - ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, int axis, bool keepDimensions) : - Base(deviceId, name), m_operation(operation), m_axes({ axis }), m_reductionOp((ElementWiseOperator)-1/*invalid*/), m_scale(0/*invalid*/), m_keepDimensions(keepDimensions) + ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, int axis, bool keepDimensions) + : Base(deviceId, name), m_operation(operation), m_axes({axis}), m_reductionOp((ElementWiseOperator) -1 /*invalid*/), m_scale(0 /*invalid*/), m_keepDimensions(keepDimensions) { if (!m_operation.empty()) // verify validity already here out of courtesy (would otherwise be caught in Validate()) ValidateOp(); } - ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation = std::wstring(), int axis = CNTKInternalIdxValueForAllStaticAxes) : - ReduceElementsNode(deviceId, name, operation, { axis }, DefaultKeepDimensionsSetting(axis)) + ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation = std::wstring(), int axis = CNTKInternalIdxValueForAllStaticAxes) + : ReduceElementsNode(deviceId, name, operation, {axis}, DefaultKeepDimensionsSetting(axis)) { } - ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, const std::vector& axis, bool keepDimensions) : - Base(deviceId, name), m_operation(operation), m_axes(axis), m_reductionOp((ElementWiseOperator)-1/*invalid*/), m_scale(0/*invalid*/), m_keepDimensions(keepDimensions) + ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, const std::vector& axis, bool keepDimensions) + : Base(deviceId, name), m_operation(operation), m_axes(axis), m_reductionOp((ElementWiseOperator) -1 /*invalid*/), m_scale(0 /*invalid*/), m_keepDimensions(keepDimensions) { if (!m_operation.empty()) // verify validity already 
here out of courtesy (would otherwise be caught in Validate()) ValidateOp(); } - ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, const std::vector& axis) : - ReduceElementsNode(deviceId, name, operation, axis, DefaultKeepDimensionsSetting(axis)) + ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, const std::vector& axis) + : ReduceElementsNode(deviceId, name, operation, axis, DefaultKeepDimensionsSetting(axis)) { } - ReduceElementsNode(const ScriptableObjects::IConfigRecordPtr configp) : - ReduceElementsNode(configp->Get(L"deviceId"), L"", configp->Get(L"reductionOp"), (int) configp->Get(L"axis")) + ReduceElementsNode(const ScriptableObjects::IConfigRecordPtr configp) + : ReduceElementsNode(configp->Get(L"deviceId"), L"", configp->Get(L"reductionOp"), (int) configp->Get(L"axis")) { AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); } @@ -352,25 +392,55 @@ public: ReleaseMatrixToPool(m_tempGatherIndices, matrixPool); } - std::wstring ReductionOpName() const { return m_operation; } - const std::vector& ReductionAxis() const { return m_axes; } + std::wstring ReductionOpName() const + { + return m_operation; + } + const std::vector& ReductionAxis() const + { + return m_axes; + } + // for debug purpose + void set_frame(int nfs) + { + num_frames = nfs; + } + void is_multi_thread(bool mt) + { + multi_thread = mt; + } - static const int CNTKInternalIdxValueForAllStaticAxes = 0; - static const int CNTKInternalIdxValueForAllAxes = -1; - static const int CNTKInternalIdxValueForSequenceAxis = -2; - static const int CNTKInternalIdxValueForBatchAxis = -3; + static const int CNTKInternalIdxValueForAllStaticAxes = 0; + static const int CNTKInternalIdxValueForAllAxes = -1; + static const int CNTKInternalIdxValueForSequenceAxis = -2; + static const int CNTKInternalIdxValueForBatchAxis = -3; private: - bool IsMean() const { return (m_operation == L"Mean"); } - bool ReduceAllStaticAxes() const { return Contains(m_axes, CNTKInternalIdxValueForAllStaticAxes); } - bool ReduceAllAxes() const { return Contains(m_axes, CNTKInternalIdxValueForAllAxes); } - bool ReduceSequenceAxis() const { return Contains(m_axes, CNTKInternalIdxValueForSequenceAxis); } - bool ReduceBatchAxis() const { return Contains(m_axes, CNTKInternalIdxValueForBatchAxis); } + bool IsMean() const + { + return (m_operation == L"Mean"); + } + bool ReduceAllStaticAxes() const + { + return Contains(m_axes, CNTKInternalIdxValueForAllStaticAxes); + } + bool ReduceAllAxes() const + { + return Contains(m_axes, CNTKInternalIdxValueForAllAxes); + } + bool ReduceSequenceAxis() const + { + return Contains(m_axes, CNTKInternalIdxValueForSequenceAxis); + } + bool ReduceBatchAxis() const + { + return Contains(m_axes, CNTKInternalIdxValueForBatchAxis); + } private: // operation attributes std::vector m_axes; - std::wstring m_operation; // the operation as a string, e.g. "Sum", see ValidateOp() + std::wstring m_operation; // the operation as a string, e.g. 
"Sum", see ValidateOp() bool m_keepDimensions; // things cached during validation @@ -381,6 +451,9 @@ private: shared_ptr> m_tempMask; shared_ptr> m_tempScatterIndices; shared_ptr> m_tempUnpackedData; + // debug + size_t num_frames; + bool multi_thread = false; }; // ----------------------------------------------------------------------- @@ -395,8 +468,12 @@ private: template class ReconcileDynamicAxisNode : public ComputationNode, public NumInputs<2> { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"ReconcileDynamicAxis"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"ReconcileDynamicAxis"; + } public: DeclareConstructorFromConfigWithNumInputs(ReconcileDynamicAxisNode); @@ -412,9 +489,9 @@ public: m_layoutsMatch = InputRef(0).HasMBLayout() && *m_pMBLayout == *InputRef(0).GetMBLayout(); // this does a deep value-level comparison - if (InputRef(0).HasMBLayout() && !m_layoutsMatch && // input is a mismatching data input --only allowed case is broadcast_as() - ((InputRef(0).GetMBLayout()->GetNumTimeSteps() != 1) || // not broadcast_as() - (InputRef(0).GetMBLayout()->GetNumSequences() != m_pMBLayout->GetNumSequences()))) // different batch?? + if (InputRef(0).HasMBLayout() && !m_layoutsMatch && // input is a mismatching data input --only allowed case is broadcast_as() + ((InputRef(0).GetMBLayout()->GetNumTimeSteps() != 1) || // not broadcast_as() + (InputRef(0).GetMBLayout()->GetNumSequences() != m_pMBLayout->GetNumSequences()))) // different batch?? { InvalidArgument("%ls %ls operation discovered that %ls %ls operation produced an MB layout that is incompatible with that of %ls %ls.", NodeName().c_str(), OperationName().c_str(), @@ -422,11 +499,11 @@ public: InputRef(1).NodeName().c_str(), InputRef(1).OperationName().c_str()); } - if (!InputRef(0).HasMBLayout() || m_layoutsMatch) // no shuffle-case: everything matches or non-data that can use tensor broadcast + if (!InputRef(0).HasMBLayout() || m_layoutsMatch) // no shuffle-case: everything matches or non-data that can use tensor broadcast { // copy the data from 'dataInput' size_t rank = GetSampleLayout().GetRank(); - auto result = ValueTensorFor(rank, fr); + auto result = ValueTensorFor(rank, fr); auto input0 = InputRef(0).ValueTensorFor(rank, InputRef(0).HasMBLayout() ? fr.WithLayout(InputRef(0).GetMBLayout()) : fr.AllowBroadcast()); // If data input has a layout (which is known to match), then replace the pointer here ^^ to avoid another runtime check. // If it has no layout, then set the broadcast-allowed flag, which will accept any layout to be passed in. @@ -436,7 +513,7 @@ public: else // Broadcasting along the sequence case: must reshuffle { auto result = ValueFor(fr); - ComputationNode::BroadcastToPacked(InputRef(0).Value(), InputRef(0).GetMBLayout(), /*beta =*/ 0, result, fr, m_tempGatherIndices); + ComputationNode::BroadcastToPacked(InputRef(0).Value(), InputRef(0).GetMBLayout(), /*beta =*/0, result, fr, m_tempGatherIndices); } } @@ -454,7 +531,7 @@ public: TensorView inputGradient; if (!InputRef(0).GetMBLayout() || m_layoutsMatch) { - gradient = GradientTensorFor(rank, fr); + gradient = GradientTensorFor(rank, fr); inputGradient = InputRef(inputIndex).GradientTensorFor(rank, InputRef(inputIndex).HasMBLayout() ? 
fr.WithLayout(InputRef(inputIndex).GetMBLayout()) : fr.AllowBroadcast()); } else @@ -464,7 +541,7 @@ public: InvalidArgument("%ls %ls operation does not support broadcasting the left operand to the right operand's dynamic axis, inside a recurrent loop.", NodeName().c_str(), OperationName().c_str()); ElemType gapPadValue = 0; - gradient = ComputationNode::Unpack(GetSampleLayout(), GradientFor(fr), m_pMBLayout, m_tempUnpackedData, m_tempScatterIndices, std::shared_ptr>(nullptr), /*batchMajor=*/ true, &gapPadValue); + gradient = ComputationNode::Unpack(GetSampleLayout(), GradientFor(fr), m_pMBLayout, m_tempUnpackedData, m_tempScatterIndices, std::shared_ptr>(nullptr), /*batchMajor=*/true, &gapPadValue); inputGradient = Input(inputIndex)->GradientTensorFor(rank, FrameRange(InputRef(inputIndex).GetMBLayout(), 0)); } @@ -477,8 +554,14 @@ public: } } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override { return (Input(0).get() == input) ? ParentGradientOptimization::Overwrite : ParentGradientOptimization::None; // no gradient propagation to input1 @@ -533,17 +616,19 @@ template class ReconcileDynamicAxisNode; template class ToBatchAxisNode : public ComputationNodeNonLooping, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { return L"ToBatchAxisNode"; } + public: ToBatchAxisNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { - } - + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { auto& inputValue = InputRef(0).Value(); @@ -570,7 +655,7 @@ public: return false; } - virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override + virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override { return ParentGradientOptimization::Reuse; } @@ -583,7 +668,7 @@ public: virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); - + if (!m_pMBLayout) { m_pMBLayout = make_shared(); // this generates a new layout @@ -614,14 +699,16 @@ public: template class ToBatchAxisNode; template class ToBatchAxisNode; - template class UnpackBatchAxisNode : public ComputationNodeNonLooping, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { return L"UnpackBatchAxis"; } + public: UnpackBatchAxisNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) @@ -644,12 +731,12 @@ public: ReshapeNode::BackpropImpl(gradient, inputGradient, Input(0)->IsGradientOptimized(this), Input(0)->ParentGradientReused()); } - virtual bool OutputUsedInComputingInputNodesGradients() const override + 
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } @@ -659,7 +746,7 @@ public: return ParentGradientOptimization::Reuse; } - bool ForceDynamicValidation() const override + bool ForceDynamicValidation() const override { return true; } @@ -705,19 +792,23 @@ template class UnpackBatchAxisNode; template class SliceNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"Slice"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"Slice"; + } public: SliceNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector beginIndex = {0}, std::vector endIndex = {0}, std::vector axis = {1}, std::vector stride_multiplier = {1}) : Base(deviceId, name), m_beginIndex(beginIndex), m_endIndex(endIndex), m_axis(axis), m_stride_multiplier(stride_multiplier) { if (m_beginIndex.size() != m_endIndex.size() || m_beginIndex.size() != m_axis.size()) - InvalidArgument("%ls %ls operation: invalid size of beginIndex (%d), endIndx (%d) and axis (%d). They must agree.", NodeName().c_str(), OperationName().c_str(), (int)m_beginIndex.size(), (int)m_endIndex.size(), (int)m_axis.size()); + InvalidArgument("%ls %ls operation: invalid size of beginIndex (%d), endIndx (%d) and axis (%d). They must agree.", NodeName().c_str(), OperationName().c_str(), (int) m_beginIndex.size(), (int) m_endIndex.size(), (int) m_axis.size()); } SliceNode(const ScriptableObjects::IConfigRecordPtr configp) - : SliceNode(configp->Get(L"deviceId"), L"", { configp->Get(L"beginIndex") }, { configp->Get(L"endIndex") }, { configp->Get(L"axis") }) + : SliceNode(configp->Get(L"deviceId"), L"", {configp->Get(L"beginIndex")}, {configp->Get(L"endIndex")}, {configp->Get(L"axis")}) { AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); } @@ -735,35 +826,35 @@ public: Base::CopyTo(nodeP, newName, flags); auto node = dynamic_pointer_cast>(nodeP); node->m_beginIndex = m_beginIndex; - node->m_endIndex = m_endIndex; - node->m_axis = m_axis; + node->m_endIndex = m_endIndex; + node->m_axis = m_axis; node->m_stride_multiplier = m_stride_multiplier; } virtual void Load(File& fstream, size_t modelVersion) override { Base::Load(fstream, modelVersion); - int num = 1, axis = 1, stride_multiplier = 1; // axis = 1 to emulate old RowSliceNode + int num = 1, axis = 1, stride_multiplier = 1; // axis = 1 to emulate old RowSliceNode ptrdiff_t beginIndex, height; if (modelVersion >= CNTK_MODEL_VERSION_22) - fstream >> num; + fstream >> num; if (num < 1) - InvalidArgument("Slice node number of axes (%d) invalid, must be >=1", num); + InvalidArgument("Slice node number of axes (%d) invalid, must be >=1", num); - m_beginIndex.clear(); + m_beginIndex.clear(); m_endIndex.clear(); - m_axis.clear(); + m_axis.clear(); m_stride_multiplier.clear(); for (int i = 0; i < num; i++) { fstream >> beginIndex >> height; // legacy format stored (end-begin) - m_beginIndex.push_back((int)beginIndex); - m_endIndex.push_back((int)(beginIndex + height)); + m_beginIndex.push_back((int) beginIndex); + m_endIndex.push_back((int) (beginIndex + height)); if (modelVersion >= CNTK_MODEL_VERSION_3) fstream >> axis; if (modelVersion >= CNTK_MODEL_VERSION_27) fstream >> 
stride_multiplier; - m_axis.push_back(axis); + m_axis.push_back(axis); m_stride_multiplier.push_back(stride_multiplier); } } @@ -771,57 +862,64 @@ public: virtual void Save(File& fstream) const override { Base::Save(fstream); - int num = (int)m_axis.size(); - fstream << num; + int num = (int) m_axis.size(); + fstream << num; for (auto i = 0; i < num; i++) { - fstream << (ptrdiff_t)m_beginIndex[i] << (ptrdiff_t)(m_endIndex[i] - m_beginIndex[i]); // legacy file format stores (end-begin), we keep it that way + fstream << (ptrdiff_t) m_beginIndex[i] << (ptrdiff_t)(m_endIndex[i] - m_beginIndex[i]); // legacy file format stores (end-begin), we keep it that way fstream << m_axis[i]; fstream << m_stride_multiplier[i]; } } // these implement numpy-style negative bound values to index from the end - std::vector BeginIndex() const { return m_beginIndex; } - size_t BeginIndex(int idx) const + std::vector BeginIndex() const { - if (idx >= (int)m_axis.size()) - InvalidArgument("Slice BeginIndex call with invalid index (%d) >= axis size (%d)", idx, (int)m_axis.size()); - return m_beginIndex[idx] >= 0 ? (size_t)m_beginIndex[idx] : (size_t)(m_beginIndex[idx] + InputRef(0).GetSampleLayout()[m_axis[idx] - 1]); + return m_beginIndex; } - std::vector EndIndex() const { return m_endIndex; } - size_t EndIndex(int idx) const + size_t BeginIndex(int idx) const { - if (idx >= (int)m_axis.size()) - InvalidArgument("Slice EndIndex call with invalid index (%d) >= axis size (%d)", idx, (int)m_axis.size()); - return m_endIndex[idx] > 0 ? (size_t)m_endIndex[idx] : (size_t)(m_endIndex[idx] + InputRef(0).GetSampleLayout()[m_axis[idx] - 1]); + if (idx >= (int) m_axis.size()) + InvalidArgument("Slice BeginIndex call with invalid index (%d) >= axis size (%d)", idx, (int) m_axis.size()); + return m_beginIndex[idx] >= 0 ? (size_t) m_beginIndex[idx] : (size_t)(m_beginIndex[idx] + InputRef(0).GetSampleLayout()[m_axis[idx] - 1]); } - std::vector Axis() const { return m_axis; } - int Axis(int idx) const - { - if (idx >= (int)m_axis.size()) - InvalidArgument("Slice Axis call with invalid index (%d) >= axis size (%d)", idx, (int)m_axis.size()); - return m_axis[idx]; + std::vector EndIndex() const + { + return m_endIndex; + } + size_t EndIndex(int idx) const + { + if (idx >= (int) m_axis.size()) + InvalidArgument("Slice EndIndex call with invalid index (%d) >= axis size (%d)", idx, (int) m_axis.size()); + return m_endIndex[idx] > 0 ? 
(size_t) m_endIndex[idx] : (size_t)(m_endIndex[idx] + InputRef(0).GetSampleLayout()[m_axis[idx] - 1]); + } + std::vector Axis() const + { + return m_axis; + } + int Axis(int idx) const + { + if (idx >= (int) m_axis.size()) + InvalidArgument("Slice Axis call with invalid index (%d) >= axis size (%d)", idx, (int) m_axis.size()); + return m_axis[idx]; } private: - // determine the tensor shape that represents slice of the input that we are taking - TensorShape GetInputSlice(size_t rank, const FrameRange & fr) const + TensorShape GetInputSlice(size_t rank, const FrameRange& fr) const { - auto inputSlice = InputRef(0).GetTensorSliceFor(rank, fr); // input must be narrowed down - for (int i = 0; i < (int)m_axis.size(); i++) - inputSlice.NarrowTo(Axis(i)-1, BeginIndex(i), EndIndex(i), m_stride_multiplier[i]); + auto inputSlice = InputRef(0).GetTensorSliceFor(rank, fr); // input must be narrowed down + for (int i = 0; i < (int) m_axis.size(); i++) + inputSlice.NarrowTo(Axis(i) - 1, BeginIndex(i), EndIndex(i), m_stride_multiplier[i]); return inputSlice; } public: - virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { size_t rank = DetermineElementwiseTensorRank(); auto output = ValueTensorFor(rank, fr); - let input = TensorView(InputRef(0).ValuePtr(), GetInputSlice(rank, fr.AllowBroadcast())); + let input = TensorView(InputRef(0).ValuePtr(), GetInputSlice(rank, fr.AllowBroadcast())); output.AssignCopyOf(input); } @@ -833,8 +931,14 @@ public: inputGrad.AddCopyOf(outputGrad); } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override { @@ -842,13 +946,13 @@ public: InferMBLayoutFromInputsForStandardCase(isFinalValidationPass); auto sampleLayout = Input(0)->GetSampleLayout(); - for (int i = 0; i < (int)m_axis.size(); i++) + for (int i = 0; i < (int) m_axis.size(); i++) { if (m_axis[i] < 1 || (isFinalValidationPass && m_axis[i] > sampleLayout.GetRank())) RuntimeError("%ls %ls operation: axis parameter %d (%d) must be in range 1..rank of input ([%s]).", NodeName().c_str(), OperationName().c_str(), i, m_axis[i], string(sampleLayout).c_str()); if (isFinalValidationPass && (sampleLayout[m_axis[i] - 1] < EndIndex(i) || EndIndex(i) < BeginIndex(i) || BeginIndex(i) < 0)) - RuntimeError("%ls %ls operation: Index range [%d,%d), interpreted as [%d,%d), is invalid for input ([%s]).", NodeName().c_str(), OperationName().c_str(), m_beginIndex[i], m_endIndex[i], (int)BeginIndex(i), (int)EndIndex(i), string(sampleLayout).c_str()); + RuntimeError("%ls %ls operation: Index range [%d,%d), interpreted as [%d,%d), is invalid for input ([%s]).", NodeName().c_str(), OperationName().c_str(), m_beginIndex[i], m_endIndex[i], (int) BeginIndex(i), (int) EndIndex(i), string(sampleLayout).c_str()); // propagate as much as we can if (isFinalValidationPass || (m_axis[i] - 1 < sampleLayout.GetRank() && 0 <= BeginIndex(i) && BeginIndex(i) <= EndIndex(i) && EndIndex(i) <= sampleLayout[m_axis[i] - 1])) // (the second condition guards against failing an out-of-bounds error if not isFinalValidationPass) @@ -868,40 +972,40 @@ template class SliceNode; enum class PaddingType { - 
CONSTANTPAD = 0, // the default, fill the padding cells with 0 - REFLECTPAD = 1, // Padding with reflect mode + CONSTANTPAD = 0, // the default, fill the padding cells with 0 + REFLECTPAD = 1, // Padding with reflect mode SYMMETRICPAD = 2, // Padding with symmetric mode }; template class PaddingNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() + static const std::wstring TypeName() { return L"Padding"; } -public: - +public: public: PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector head, std::vector foot, PaddingType mode = PaddingType::CONSTANTPAD, double constantValue = 0) - : Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value((ElemType)constantValue) + : Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value((ElemType) constantValue) { } - PaddingNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) + PaddingNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { } - + public: virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { size_t rank = DetermineElementwiseTensorRank(); auto outputSlice = GetTensorSliceFor(rank, fr); // tensor slice that represents the entire output for FrameRange let input = InputRef(0).ValueTensorFor(rank, fr.AllowBroadcast()); - int maxRank = (int)(Input(0)->GetSampleLayout().GetRank()); + int maxRank = (int) (Input(0)->GetSampleLayout().GetRank()); let dims = Input(0)->GetSampleLayout().GetDims(); let outputDims = GetSampleLayout().GetDims(); auto outputSubSlice = outputSlice; @@ -935,14 +1039,14 @@ public: } } } - + virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override { size_t rank = DetermineElementwiseTensorRank(); let outputSlice = GetTensorSliceFor(rank, fr); // tensor slice that represents the entire output for FrameRange auto inputGrad = InputRef(inputIndex).GradientTensorFor(rank, fr.AllowBroadcast()); - int maxRank = (int)(Input(inputIndex)->GetSampleLayout().GetRank()); + int maxRank = (int) (Input(inputIndex)->GetSampleLayout().GetRank()); let dims = Input(inputIndex)->GetSampleLayout().GetDims(); let outputDims = GetSampleLayout().GetDims(); // first folder the gradients if its padding mode is reflect or symmetric @@ -973,12 +1077,12 @@ public: inputGrad.AddCopyOf(outputGrad); } - virtual bool OutputUsedInComputingInputNodesGradients() const override + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } @@ -1017,7 +1121,6 @@ public: } private: - void FillPaddingCells(const FrameRange& fr, size_t rank, size_t axis, size_t outputIndex, size_t inputIndex, size_t size, bool reverse) { if (size > 0) @@ -1101,7 +1204,10 @@ class CropNode : public ComputationNode, public TransformerNode typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"Crop"; } + static const std::wstring TypeName() + { + return L"Crop"; + } public: CropNode(DEVICEID_TYPE deviceId, const std::wstring& name); @@ -1127,17 +1233,18 @@ private: using TransformerNode::m_transforms; // Declaration of matrix getting method to unify accessing values and 
gradients. - typedef MatrixBasePtr(ComputationNode::*MatrixGetter)() const; + typedef MatrixBasePtr (ComputationNode::*MatrixGetter)() const; // Helper structure to store input/output views which define parts of input and output we work with. struct CroppedIOViews { - CroppedIOViews(CropNode* cropNode, MatrixGetter matrixGetter, TensorShape inputShapeCropped, TensorShape outputShape) : - // Input view is derived from first input. - inputViewCropped((cropNode->Input(0).get()->*matrixGetter)(), inputShapeCropped), - // Output view corresponds to single output. - outputView((cropNode->*matrixGetter)(), outputShape) - {} + CroppedIOViews(CropNode* cropNode, MatrixGetter matrixGetter, TensorShape inputShapeCropped, TensorShape outputShape) + : // Input view is derived from first input. + inputViewCropped((cropNode->Input(0).get()->*matrixGetter)(), inputShapeCropped), + // Output view corresponds to single output. + outputView((cropNode->*matrixGetter)(), outputShape) + { + } TensorView inputViewCropped; TensorView outputView; @@ -1151,9 +1258,9 @@ private: // Performs offsets computation if necessary. void ComputeCropOffsets(); - virtual void /*TransformerNode::*/ComputeTransforms() override; + virtual void /*TransformerNode::*/ ComputeTransforms() override; - virtual bool /*TransformerNode::*/SupportsTransformOnInput(size_t inputIndex) override; + virtual bool /*TransformerNode::*/ SupportsTransformOnInput(size_t inputIndex) override; protected: // Offset along x axis. We need to store offsets as floats for precision if one crop node affects computation of other. @@ -1172,11 +1279,15 @@ protected: template class RowStackNode : public ComputationNode // note: not deriving from NumInputs<> like most other nodes, because this one takes a variable number of inputs { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"RowStack"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"RowStack"; + } public: - RowStackNode(DEVICEID_TYPE deviceId, const wstring& name, int spliceDim = 1/*TODO: complete this*/) + RowStackNode(DEVICEID_TYPE deviceId, const wstring& name, int spliceDim = 1 /*TODO: complete this*/) : Base(deviceId, name), m_spliceDim(spliceDim) { } @@ -1194,7 +1305,7 @@ public: { auto node = dynamic_pointer_cast>(nodeP); node->m_firstIndices = m_firstIndices; - node->m_spliceDim = m_spliceDim; + node->m_spliceDim = m_spliceDim; } } @@ -1215,11 +1326,11 @@ public: private: // changes the result slice (which includes all stacked inputs) to the stripe that matches where one of the inputs goes - TensorShape NarrowToStripe(const TensorShape & resultSlice, size_t inputIndex) + TensorShape NarrowToStripe(const TensorShape& resultSlice, size_t inputIndex) { auto resultSubSlice = resultSlice; assert(m_spliceDim > 0); - size_t index = (size_t)m_spliceDim - 1; + size_t index = (size_t) m_spliceDim - 1; resultSubSlice.NarrowTo(index, m_firstIndices[inputIndex], m_firstIndices[inputIndex + 1]); return resultSubSlice; } @@ -1250,8 +1361,14 @@ public: inputGrad.AddCopyOf(outputGrad); } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const 
override + { + return false; + } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override { @@ -1263,7 +1380,7 @@ public: // determine maximum rank (we can stack tensors with lower rank, which will have their dimensions padded to max automatically) assert(m_spliceDim > 0); - size_t index = (size_t)m_spliceDim - 1; + size_t index = (size_t) m_spliceDim - 1; size_t maxRank = index + 1; // spliceDim may exceed all of them, which will create a new dimension, e.g. stacking column vectors into a matrix for (int i = 0; i < GetNumInputs(); i++) if (maxRank < Input(i)->GetSampleLayout().GetRank()) @@ -1278,7 +1395,7 @@ public: for (int i = 0; i < GetNumInputs(); i++) { // check/fuse dims and accumulate the spliced dimension - let & shape = Input(i)->GetSampleLayout(); + let& shape = Input(i)->GetSampleLayout(); for (size_t k = 0; k < maxRank; k++) { size_t dim = shape.GetDimPadded(k); @@ -1286,15 +1403,15 @@ { // accumulate the spliced dimension dims[index] += dim; - m_firstIndices.push_back(dims[index]); // and remember it + m_firstIndices.push_back(dims[index]); // and remember it } else { // check/fuse dimensions if (isFinalValidationPass && dim != dims[k] && dim != 1 && dims[k] != 1) InvalidArgument("%ls %ls operation: Conflicting dimension %d between %ls %ls operation (%d) and other(s) (%d)", - NodeName().c_str(), OperationName().c_str(), (int)k, Input(i)->NodeName().c_str(), Input(i)->OperationName().c_str(), (int)dim, (int)dims[k]); - if (dims[k] == 1) // broadcast + NodeName().c_str(), OperationName().c_str(), (int) k, Input(i)->NodeName().c_str(), Input(i)->OperationName().c_str(), (int) dim, (int) dims[k]); + if (dims[k] == 1) // broadcast dims[k] = dim; } } @@ -1303,7 +1420,10 @@ public: SetDims(TensorShape(dims), HasMBLayout()); } - int GetSpliceDim() const { return m_spliceDim; } + int GetSpliceDim() const + { + return m_spliceDim; + } private: std::vector m_firstIndices; // start row number in the stacked matrix of each input (child) (cumsum of matrix heights); plus one final entry that equals the total dimension @@ -1320,8 +1440,12 @@ template class RowStackNode; template class RowRepeatNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"RowRepeat"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"RowRepeat"; + } public: RowRepeatNode(DEVICEID_TYPE deviceId, const wstring& name, size_t numRepeats = 1) @@ -1385,8 +1509,14 @@ public: InputRef(0).GradientFor(fr).AddToRowRepeatValuesOf(GradientFor(fr), m_numRepeat); } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } private: size_t m_numRepeat; @@ -1430,22 +1560,36 @@ and Scatter().
template class WhereNode : public ComputationNodeNonLooping, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"Where"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"Where"; + } + + static const std::wstring DefaultWhereNodeDynamicAxisName() + { + return L"WhereNodeAxis"; + } - static const std::wstring DefaultWhereNodeDynamicAxisName() { return L"WhereNodeAxis"; } public: DeclareConstructorFromConfigWithNumInputs(WhereNode); - WhereNode(DEVICEID_TYPE deviceId, const wstring& name, const wstring& dynamicAxisName = DefaultWhereNodeDynamicAxisName()) : - Base(deviceId, name), m_dynamicAxisName(dynamicAxisName) + WhereNode(DEVICEID_TYPE deviceId, const wstring& name, const wstring& dynamicAxisName = DefaultWhereNodeDynamicAxisName()) + : Base(deviceId, name), m_dynamicAxisName(dynamicAxisName) { MarkValueNonSharable(); } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void Validate(bool isFinalValidationPass) override; virtual void Load(File& fstream, size_t modelVersion) override @@ -1463,12 +1607,15 @@ public: fstream << m_dynamicAxisName; } - std::wstring DynamicAxisName() const { return m_dynamicAxisName; } + std::wstring DynamicAxisName() const + { + return m_dynamicAxisName; + } private: // buffers for creating the result sequences (kept as object state to avoid memory allocations) - std::vector> m_indexSequenceBuffer; // [sequenceIndex][t] for creating the result sequences - std::vector m_rowAllocationsBuffer; // [row] for determining new MBLayout packing + std::vector> m_indexSequenceBuffer; // [sequenceIndex][t] for creating the result sequences + std::vector m_rowAllocationsBuffer; // [row] for determining new MBLayout packing std::vector> m_placementBuffer; // [sequenceIndex] assigned location for a sequence std::wstring m_dynamicAxisName; }; @@ -1486,25 +1633,35 @@ private: template class PackedIndexNode : public ComputationNodeNonLooping, public NumInputs<2> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"PackedIndex"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"PackedIndex"; + } // our inputs static const size_t SOURCEDATA = 0; - static const size_t INDEXDATA = 1; + static const size_t INDEXDATA = 1; public: DeclareConstructorFromConfigWithNumInputs(PackedIndexNode); - PackedIndexNode(DEVICEID_TYPE deviceId, const wstring& name) : - Base(deviceId, name) + PackedIndexNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { MarkValueNonSharable(); } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; - virtual bool 
OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void Validate(bool isFinalValidationPass) override; }; @@ -1519,8 +1676,12 @@ public: template class GatherPackedNode : public ComputationNodeNonLooping, public NumInputs<2> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"GatherPacked"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"GatherPacked"; + } // our inputs static const size_t INDEXDATA = 0; @@ -1528,15 +1689,21 @@ class GatherPackedNode : public ComputationNodeNonLooping, public NumI public: DeclareConstructorFromConfigWithNumInputs(GatherPackedNode); - GatherPackedNode(DEVICEID_TYPE deviceId, const wstring& name) : - Base(deviceId, name) + GatherPackedNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t inputIndex) override; - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == INDEXDATA; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override + { + return childIndex == INDEXDATA; + } virtual void Validate(bool isFinalValidationPass) override; }; @@ -1552,25 +1719,35 @@ public: template class ScatterPackedNode : public ComputationNodeNonLooping, public NumInputs<3> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"ScatterPacked"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"ScatterPacked"; + } // our inputs static const size_t LAYOUTDATA = 0; - static const size_t INDEXDATA = 1; + static const size_t INDEXDATA = 1; static const size_t SOURCEDATA = 2; public: DeclareConstructorFromConfigWithNumInputs(ScatterPackedNode); - ScatterPackedNode(DEVICEID_TYPE deviceId, const wstring& name) : - Base(deviceId, name) + ScatterPackedNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t inputIndex) override; - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == INDEXDATA; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override + { + return childIndex == INDEXDATA; + } virtual void Validate(bool isFinalValidationPass) override; }; @@ -1581,8 +1758,12 @@ public: template class DiagonalNode : 
public ComputationNodeNonLooping, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"Diagonal"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"Diagonal"; + } public: DeclareConstructorFromConfigWithNumInputs(DiagonalNode); @@ -1620,8 +1801,14 @@ public: #endif } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void Validate(bool isFinalValidationPass) override { @@ -1802,7 +1989,7 @@ public: { } LegacyReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) - : LegacyReshapeNode(configp->Get(L"deviceId"), L"", configp->Get(L"numRows"), ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))/*legacy*/)) + : LegacyReshapeNode(configp->Get(L"deviceId"), L"", configp->Get(L"numRows"), ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout")) /*legacy*/)) { // BUGBUG: We should not operate on image layouts here, but on a proper tensor layout. AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); @@ -1919,8 +2106,14 @@ public: } } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override { @@ -2051,7 +2244,6 @@ private: template class LegacyReshapeNode; template class LegacyReshapeNode; - template class GatherNode : public ComputationNodeNonLooping, public NumInputs<2> { @@ -2063,7 +2255,8 @@ class GatherNode : public ComputationNodeNonLooping, public NumInputs< } public: - GatherNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) + GatherNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { } @@ -2092,7 +2285,7 @@ public: { if (inputIndex == 1) // only the right operand needs a gradient computed { - let& indices = InputRef(0).Value(); + let& indices = InputRef(0).Value(); const auto& indicesMask = InputRef(0).GetMBLayout()->GetColumnsValidityMask(indices.GetDeviceId()); auto& sourceGradient = InputRef(1).Gradient(); auto& outputGradient = Gradient(); @@ -2118,10 +2311,12 @@ public: } } - virtual bool OutputUsedInComputingInputNodesGradients() const override { + virtual bool OutputUsedInComputingInputNodesGradients() const override + { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override + { return childIndex == 0; } @@ -2143,7 +2338,7 @@ public: const auto& inputDims2 =
inputSampleLayout2.GetDims(); SmallVector dims; - dims.append(inputDims2.begin(), inputDims2.end() - 1);//pop the last dim of right operand + dims.append(inputDims2.begin(), inputDims2.end() - 1); //pop the last dim of right operand dims.append(inputDims1.begin(), inputDims1.end()); auto sampleLayout = TensorShape(dims); @@ -2353,4 +2548,6 @@ other */ -}}} +} // namespace CNTK +} // namespace MSR +} // namespace Microsoft diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 3ac37bfc9..f9a0c1a8a 100755 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -311,6 +311,7 @@ public: CPUMatrix& InplaceSoftThreshold(const ElemType threshold); CPUMatrix& SetToZeroIfAbsLessThan(const ElemType threshold); + CPUMatrix& SetToZeroIfLessThan(const ElemType threshold); ElemType SumOfAbsElements() const; // sum of all abs(elements) ElemType SumOfElements() const; // sum of all elements diff --git a/Source/Math/CPUMatrixImpl.h b/Source/Math/CPUMatrixImpl.h index 2f997f564..df09d92db 100644 --- a/Source/Math/CPUMatrixImpl.h +++ b/Source/Math/CPUMatrixImpl.h @@ -3215,6 +3215,24 @@ CPUMatrix& CPUMatrix::SetToZeroIfAbsLessThan(const ElemType return *this; } +template +CPUMatrix& CPUMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + if (IsEmpty()) + LogicError("SetToZeroIfLessThan: Matrix is empty."); + + auto& us = *this; + +#pragma omp parallel for + foreach_coord (i, j, us) + { + if ((us(i, j)) < threshold) + us(i, j) = 0; + } + + return *this; +} + //sum of all abs(elements) template ElemType CPUMatrix::SumOfAbsElements() const diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index c9b27cdb3..ae7661e65 100755 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -2486,6 +2486,18 @@ GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan(const ElemType return *this; } +template +GPUMatrix& GPUMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + if (IsEmpty()) + LogicError("SetToZeroIfLessThan: Matrix is empty."); + CUDA_LONG N = (CUDA_LONG) GetNumElements(); + int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); + PrepareDevice(); + SyncGuard syncGuard; + _setToZeroIfLessThan<<>>(Data(), threshold, N); + return *this; +} template ElemType GPUMatrix::SumOfAbsElements() const { @@ -5318,6 +5330,119 @@ void GPUMatrix::TensorOp(ElemType beta, const GPUMatrix& a, return TensorOpN(beta, array{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } +// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides +// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects. 
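+// (TensorOpDebug below mirrors TensorOp, but additionally threads the source and destination matrices through to the launcher, which carries optional, commented-out norm printouts and allocates its reduction buffer per call via GetReductionBufferNoCache so that concurrent decode threads do not share state.)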
+template +void GPUMatrix::TensorOpDebug(ElemType beta, const GPUMatrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides) +{ + if (reductionOp != ElementWiseOperator::opSum && + reductionOp != ElementWiseOperator::opLogSum && + reductionOp != ElementWiseOperator::opMin && + reductionOp != ElementWiseOperator::opMax && + reductionOp != ElementWiseOperator::opElementwiseProduct) + InvalidArgument("TensorOp: Unary reduction operations other than opSum, opLogSum, opMin, opMax, and opElementwiseProduct are not implemented."); + + a.PrepareDevice(); + if (a.GetComputeDeviceId() != GetComputeDeviceId()) + InvalidArgument("All matrices must be on the same GPU"); + + // special case: linear processing + // The case statement has measurable impact for unary ops (but not for binary ops it seems, due to double mem access). + // Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all. + if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0) + { + // special case: for copy, use cudaMemcpy() instead, or cublas_axpy() + // TODO: We should observe if these actually make a speed difference, and if not, remove these special cases. + if (op == ElementWiseOperator::opCopy && beta == 0 && alpha == 1) + { + //fprintf(stderr, "TensorOpDebug 1 \n"); + return CUDA_CALL(cudaMemcpy(Data() + offsets[1], a.Data() + offsets[0], sizeof(ElemType) * regularOpDims[0], cudaMemcpyDeviceToDevice)); + } + else if (op == ElementWiseOperator::opCopy && beta == 1) + { + //fprintf(stderr, "TensorOpDebug 2 \n"); + return CUBLAS_CALL(cublasaxpyHelper(GetCublasHandle(GetComputeDeviceId()), (int) regularOpDims[0], &alpha, a.Data() + offsets[0], 1, Data() + offsets[1], 1)); + } + else + { + //fprintf(stderr, "TensorOpDebug 3 \n"); + + return LaunchUnaryTensorOp(beta, a.Data() + offsets[0], Data() + offsets[1], alpha, op, regularOpDims[0]); + } + } + + // special case: sum-reducing a matrix onto a column vector; can be done with SGEMM + // Note: A minor risk is that with this, our own reduction function will rarely be used. + // That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
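+ // (Illustration of the GEMM trick used in this branch: summing the ACols columns of an [ARows x ACols] block into a single column is expressed as result = alpha * A * ones(ACols, 1) + beta * result, which is exactly what the cublasgemmHelper call below computes.)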
+ else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation + reductionOp == ElementWiseOperator::opSum && +#ifdef _DEBUG + sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath +#endif + regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column + reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t) regularOpDims[0]) // reducing across columns and no overlap + { + assert(reducingStrides[1][0] == 0); + auto ARows = regularOpDims[0]; // vertical steps + auto ACols = reducingOpDims[0]; // horizontal steps (reduction) + auto ALd = reducingStrides[0][0]; // horizontal step width through matrix + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + CUBLAS_CALL(cublasgemmHelper(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int) /*CRows=*/ARows, /*CCols=*/1, (int) ACols, &alpha, + /*A00=*/a.Data() + offsets[0], (int) ALd, + /*B00=*/GetOnesVector(ACols, a.GetComputeDeviceId())->Data(), (int) /*BRows=*/ACols, &beta, + /*C00=*/Data() + offsets[1], (int) /*CRows=*/ARows)); + //fprintf(stderr, "TensorOpDebug 4 \n"); + + return; + } + + // TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E. + + // regular case + else + { + /* + fprintf(stderr, "TensorOpDebug 5 \n"); + for (size_t i = 0; i < 2; i++) + { + // fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i])); + fprintf(stderr, "TensorOpDebug 5.1, i = %d, offsets = %d\n ", int(i), int(offsets[i])); + } + fprintf(stderr, "TensorOpDebug 5.1, regularOpDims.size() = %d \n", int(regularOpDims.size())); + + for (size_t i = 0; i < regularOpDims.size(); i++) + fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i])); + + fprintf(stderr, "TensorOpDebug 5.1, reducingOpDims.size() = %d \n", int(reducingOpDims.size())); + for (size_t i = 0; i < reducingOpDims.size(); i++) + fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i])); + + for (size_t i = 0; i < 2; i++) + { + fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularStrides.size() = %d \n", int(i), int(regularStrides[i].size())); + for (size_t j = 0; j < regularStrides[i].size(); j++) + { + fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j])); + } + + fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingStrides.size() = %d \n", int(i), int(reducingStrides[i].size())); + for (size_t j = 0; j < reducingStrides[i].size(); j++) + { + fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j])); + } + } + + fprintf(stderr, "TensorOpDebug 5.2, beta = %f, alpha = %f, a.data = %f, data = %f \n", double(beta), double(alpha), double(a.FrobeniusNorm()), double(FrobeniusNorm())); + // return TensorOpN(beta, array{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + */ + return TensorOpNDebug(beta, array{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, *this); + + } +} + // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides template void GPUMatrix::TensorOp(ElemType 
beta, const GPUMatrix& a, const GPUMatrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 02391dc0a..628e88fd1 100755 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -437,6 +437,7 @@ public: GPUMatrix& InplaceSoftThreshold(const ElemType threshold); GPUMatrix& SetToZeroIfAbsLessThan(const ElemType threshold); + GPUMatrix& SetToZeroIfLessThan(const ElemType threshold); DeviceBoundNumber Sum_AsDeviceBoundNum() const; ElemType SumOfAbsElements() const; // sum of all abs(elements) @@ -607,6 +608,11 @@ public: const std::array& offsets, const SmallVector& regularOpDims, const std::array, 2>& regularStrides, const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides); + void TensorOpDebug(ElemType beta, const GPUMatrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const std::array& offsets, + const SmallVector& regularOpDims, const std::array, 2>& regularStrides, + const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides); + void TensorOp(ElemType beta, const GPUMatrix& a, const GPUMatrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const std::array& offsets, const SmallVector& regularOpDims, const std::array, 3>& regularStrides, diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh index 281a50391..60f6db240 100755 --- a/Source/Math/GPUMatrixCUDAKernels.cuh +++ b/Source/Math/GPUMatrixCUDAKernels.cuh @@ -1306,6 +1306,19 @@ __global__ void _setToZeroIfAbsLessThan( a[id] = 0; } +template +__global__ void _setToZeroIfLessThan( + ElemType* a, + const ElemType threshold, + const CUDA_LONG N) +{ + typedef typename TypeSelector::comp_t comp_t; + CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= N) + return; + if (((comp_t) a[id]) < (comp_t) threshold) + a[id] = 0; +} template __global__ void _areEqual( const ElemType* a, diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu index eee40d1b5..4284b2b56 100755 --- a/Source/Math/GPUSparseMatrix.cu +++ b/Source/Math/GPUSparseMatrix.cu @@ -2987,6 +2987,19 @@ GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfAbsLessThan(con return *this; } +template +GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + VerifyWritable(__FUNCTION__); + + if (IsEmpty()) + LogicError("SetToZeroIfLessThan: Matrix is empty."); + CUDA_LONG N = (CUDA_LONG) GetNumNZElements(); + int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); + SyncGuard syncGuard; + _setToZeroIfLessThan<<>>(NzValues(), threshold, N); + return *this; +} #pragma endregion #pragma region Helper Functions diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h index 49b9439dd..00832b4c4 100755 --- a/Source/Math/GPUSparseMatrix.h +++ b/Source/Math/GPUSparseMatrix.h @@ -395,7 +395,7 @@ public: GPUSparseMatrix& AssignTruncateTopOf(const GPUSparseMatrix& a, const ElemType threshold); GPUSparseMatrix& SetToZeroIfAbsLessThan(const ElemType threshold); - + GPUSparseMatrix& SetToZeroIfLessThan(const ElemType threshold); GPUSparseMatrix& AssignOneHot(const GPUMatrix& a, vector& shape, size_t axis); void SetDiagonalValue(const ElemType v); void SetDiagonalValue(const GPUMatrix& vector); diff --git a/Source/Math/GPUTensor.cu b/Source/Math/GPUTensor.cu index 2fdfdb515..3a94e8ab1 100644 --- a/Source/Math/GPUTensor.cu +++ b/Source/Math/GPUTensor.cu @@ -875,6 +875,12 @@ static shared_ptr 
GetReductionBuffer(size_t N) { return reductionBuffersCache[deviceId]; } +// allocates a fresh buffer on every call (no shared cache), so this is safe for multithreaded calling in RNNT_EMBR +template +static shared_ptr GetReductionBufferNoCache(size_t N) +{ + return AllocateReductionBuffer(N); +} // All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now. template static void LaunchTensorOpWithReduction(ElemType beta, array pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, @@ -1112,6 +1118,272 @@ } + + +template +static void LaunchTensorOpWithReductionDebug(ElemType beta, array pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const SmallVector& regularOpDims, const array, N>& regularStrideVectors, + const SmallVector& reducingOpDimVector, const array, N>& reducingStrideVectors, const GPUMatrix& a, GPUMatrix& result) +{ + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + // return TensorOpN(beta, array{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + a; + result; // no-op references: 'a' and 'result' are otherwise used only by the commented-out diagnostics below; touching them avoids unused-parameter warnings + typedef typename TypeSelector::comp_t ReduceElemType; + // copy all parameters to CUDA-compatible data structures + FixedArray pointers(pointerVector); + SmallVector regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index + C_size_t numElements = 1; + // input divisors + SmallVector regularOpStrideDivmodVector; + for (C_size_t k = 0; k < regularOpDims.size(); k++) + { + regularOpStrideVector.push_back(numElements); // stride for dense representation of our output elements (if they were flattened) + regularOpStrideDivmodVector.push_back(fast_divmod((unsigned int) numElements)); + numElements *= (C_size_t) regularOpDims[k]; + } + // output divisors + SmallVector reducingOpDimDivmodVector; + C_size_t stride = 1; + for (C_size_t k = 0; k < reducingOpDimVector.size(); ++k) + { + reducingOpDimDivmodVector.push_back(fast_divmod(stride)); + stride *= (C_size_t) reducingOpDimVector[k]; + } + + FixedArray regularOpStrides(regularOpStrideVector); + FixedMatrix regularStrides(regularStrideVectors); + FixedArray reducingOpDims(reducingOpDimVector); + FixedMatrix reducingStrides(reducingStrideVectors); + // reduced divisors + FixedArray regularOpStrideDivmod(regularOpStrideDivmodVector); + FixedArray reducingOpDimDivmod(reducingOpDimDivmodVector); + + // launch the kernel + CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual output element + SyncGuard syncGuard; + + // do some optimization for reductions + // - example: 30 GPU procs, warp size 32 --> 960 GPU cores + // - NN elements must be computed, each involving a reduction over reductionDim elements + // Cases: + // - #output elements NN >= GPU cores --> use one proc per element, do reduction in inner loop + // E.g. if >=960 elements are computed, each gets its own GPU thread. + // - reduction dimension would benefit from multiple blocks --> multiple blocks work on a single output element + // E.g. + // - gradient of adding a bias: reducing to a bias, e.g. 512-dim + // - gradient of scalar multiplication: big elementwise product reduced to a scalar (big dot product, e.g.
[1024 x 1024] = 1M elements) + // - softmax in seq-2-seq attention model: reduce over length of attention window (e.g. 20) + // - summation of criterion value: scalar reduction over a few hundred or thousand samples in the minibatch + C_size_t reductionDim = 1; // number of elements to reduce over + for (C_size_t k = 0; k < reducingOpDimVector.size(); k++) + reductionDim *= (C_size_t) reducingOpDimVector[k]; + GridDim grid(NN); + let& props = GridDim::GetDeviceProps(); + bool disableParallelReduction = false; // (for debugging) + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + // === arg based reduction, one thread per output element + if ((reductionOp == ElementWiseOperator::opArgmax) || + (reductionOp == ElementWiseOperator::opArgmin)) + { + + _launchTensorArgOp<<>>( + pointers, reductionOp, + regularOpStrides, regularStrides, grid.m_N, + reducingOpDims, reducingStrides, + regularOpStrideDivmod, reducingOpDimDivmod); + } + // === simple case: NN large, one thread per output element + else if (reductionDim == 1 || // no reduction + grid.m_blocksPerGrid >= props.multiProcessorCount || // enough output elements to fill all multiprocs + reductionDim * numElements <= 2 * props.warpSize || // trivial operation not worth the trouble (2* because the more complex one also needs 2 kernel launches) + disableParallelReduction || // (for debugging) + reductionDim * numElements <= props.multiProcessorCount) // recursive call from reduction below + { + // we got enough elements to generate: do one element per thread, and reduction inside + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + _launchTensorOp<<>>( + beta, pointers, alpha, op, reductionOp, + regularOpStrides, regularStrides, grid.m_N, + reducingOpDims, reducingStrides, + regularOpStrideDivmod, reducingOpDimDivmod); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + } + // === optimization: simple case would not use all multiprocs + else + { + // m_blocksPerGrid can be thought of NN / 512, with appropriate rounding + + // we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel + // If we get here, then + // - the total number of outputs to produce is < #multiprocs * warpSize, e.g. < 960 + // - each output has at least two inputs, but possibly millions + // Examples: + // (a1) NN=900 + // - each multiproc processes multiple elements concurrently, each reducing over its inputs inside + // - use one block per output element + // (a2) NN=30 + // - same as (a1) except 30 multiprocs run only a single block each + // (a3) NN=16 + // - same as (a1) except only 16 multiproc run one block + // (b1) NN=15 + // - 2 blocks work together on a single output element + // (b2) NN=1 (NN < #multiprocs, e.g. NN < 30) + // - multiple blocks work together on a single output element + // - only this case requires memory, and only K * NN + // where K = blocks that work together, + // both K and NN < #multiprocs, + // and K * NN = on the order of NN, but generally a bit larger due to rounding. + + // By how much do we underutilize? + // We increase #blocks by that factor by breaking reduction into that many chunks. 
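+ // (Worked example, illustrative numbers only: with 30 multiprocessors and NN = 6 output elements, numReductionChunks = max(30 / 6, 1) = 5 below, so 5 blocks cooperate on each output element and each block reduces a chunk of roughly reductionDim / 5 inputs.)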
+ //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + int numReductionChunks = std::max(props.multiProcessorCount / NN, 1); // only >1 for NN < multiProcessorCount + + // distribute NN over block X and Y + int blockXOverBy = CeilDiv(NN, props.maxGridSize[0]); + int numBlocksX = CeilDiv(NN, blockXOverBy); + int numBlocksY = CeilDiv(NN, numBlocksX); + // while block Z is for multiple blocks working together on a single output element + int numBlocksZ = numReductionChunks; + // Block dim is now: + // - X, Y: such that X*Y covers NN + // - Z: reduction chunks + + // reduction goes into thread dim X + int reductionChunkSize = CeilDiv(reductionDim, numReductionChunks); + int numThreadsX = std::min(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel + + // --- cases (a1) and (a2) + // This involves no reduction across blocks. + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.4, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + if (numReductionChunks == 1) + { + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.5, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + _launchTensorOpWithReduction<<>>( + beta, pointers, alpha, op, reductionOp, + regularOpStrides, regularStrides, NN, + reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.6, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + } + // --- case (b) + // Reduction across blocks. This is the difficult one. +#ifndef ALLOW_ATOMIC_REDUCTION // temporarily disabled to ensure it is not causing the non-reproducibility + else + { + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.7, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + // we get here if NN <= #multiprocs + assert(NN <= props.multiProcessorCount && numBlocksX == NN && numBlocksY == 1); + // dims are: + // - numBlocksZ = numReductionChunks = how many multiprocs work together to produce one output element + // - numBlocksX = NN = number of output elements + // - numThreadsX = reductionChunkSize clipped to 512; reductionChunkSize > 512 is handled by an inner for loop inside of the kernel + + // we need memory for block outputs of dimension [numBlocksX x numBlocksZ] + // - total elements = NN * Floor(#multiprocs / NN), which is <= #multiprocs + let reductionBufferSize = props.multiProcessorCount; + assert(reductionBufferSize >= NN * numBlocksZ); + shared_ptr reductionBuffer = GetReductionBufferNoCache(reductionBufferSize); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.8, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + // 'pointers', 'regularOpStrides', and 'regularStrides' are set up to point to the target memory. + // We need to reroute them to point to our reductionBuffer.
+ // - pointer[N-1] -> replace by reductionBuffer + // - regularStrides -> replace [N-1] by regularOpStrides which already represent the NN elements for a dense memory layout + // - beta -> 0 since we write into temp memory + // - kernel must use block.z as second index into the output buffer; add (block.z * NN) to the pointer + FixedArray pointers1 = pointers; + pointers1[N - 1] = reductionBuffer.get(); + auto regularStrideVectors1 = regularStrideVectors; + for (size_t k = 0; k < regularOpStrides.size(); k++) + regularStrideVectors1[N - 1][k] = (ptrdiff_t) regularOpStrideVector[k]; + FixedMatrix regularStrides1(regularStrideVectors1); + ElemType beta1 = 0; + ElemType alpha1 = 1; + // fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.9, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + _launchTensorOpWithReduction<<>>( + beta1, pointers1, alpha1, op, reductionOp, + regularOpStrides, regularStrides1, NN, + reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + +#if 1 + // now reduce and redistribute + // Create a new tensor task, and execute it recursively: + // - input = reductionBuffer + // - output = true output + // - op dims/strides = output elements + // - reduce dims/strides = numBlocksZ + // - op = opCopy + array pointerVector2{reductionBuffer.get(), pointerVector[N - 1]}; + const array, 2> regularStrideVectors2{regularStrideVectors1[N - 1], regularStrideVectors[N - 1]}; + const array, 2> reducingStrideVectors2{SmallVector{NN}, SmallVector{0}}; + const SmallVector reducingOpDimVector2{(size_t) numReductionChunks}; + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + LaunchTensorOpWithReduction( + beta, pointerVector2, alpha, ElementWiseOperator::opCopy, reductionOp, + regularOpDims, regularStrideVectors2, + reducingOpDimVector2, reducingStrideVectors2); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + // (note: ^^this will have a nested syncGuard, which is fine) + +#else + _launchTensorOp<<>>( + beta, pointers, alpha, op, reductionOp, + regularOpStrides, regularStrides, grid.m_N, + reducingOpDims, reducingStrides); + //for (size_t z = 0; z < numBlocksZ; z++) + // _launchTensorOpWithReduction<<>>(z == 0 ? 
beta : 1, pointers, alpha, op, + // regularOpStrides, regularStrides, NN, + // reducingOpDims, reducingStrides, reductionChunkSize * z, reductionChunkSize); + vector peekPartial(NN * numBlocksZ, -42); + vector peekFinal(NN, -42); + CUDA_CALL(cudaMemcpy(peekPartial.data(), reductionBuffer, sizeof(ElemType) * peekPartial.size(), cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaMemcpy(peekFinal.data(), pointers[pointers.size() - 1], sizeof(ElemType) * peekFinal.size(), cudaMemcpyDeviceToHost)); + double s1 = 0, s2 = 0; + for (auto v : peekPartial) + s1 += v; + for (auto v : peekFinal) + s2 += v; + sin(1.0); +#endif + } +#else + else if (beta == 1) + { + // no need to pre-scale; just add (common for gradients) + _launchTensorOpWithReduction<<>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, + regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + return; + } + else + { + // We need more than one chunk, we will use atomicAdd(). + // First reset/pre-multiply input; then do the remaining chunks using atomicAdd(). + _launchTensorOpWithReduction<<>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + // We will leave it like this for a while, but eventually need to revisit using temporary memory. + _launchTensorOpWithReduction<<>>(/*beta=*/1, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + } +#endif + } +} // ----------------------------------------------------------------------- // kernel and launch --linear unary // ----------------------------------------------------------------------- @@ -1205,6 +1477,24 @@ static void TensorOpWithRegularLoop(ElemType beta, const array& po } } +template +static void TensorOpWithRegularLoopDebug(ElemType beta, const array& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const SmallVector& regularOpDims, const array, N>& regularStrides, + const SmallVector& reducingOpDims, const array, N>& reducingStrides) +{ + size_t dims = reducingOpDims.size(); + switch (dims) + { + case 2: + return LaunchTensorOpWithReduction(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: + return LaunchTensorOpWithReduction(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: + return LaunchTensorOp(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides); + default: + LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int) dims); + } +} // tensor operation, generalized in number of arguments // This function now expands into different k. It also eliminates the offsets by adding them to the pointers. 
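// (For example, with N = 2 the offset elimination below unrolls to pointers[0] += offsets[0]; pointers[1] += offsets[1];, after which the launch helpers can treat every pointer as zero-based.)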
template @@ -1236,6 +1526,17 @@ void TensorOpN(ElemType beta, array pointers, ElemType alpha, Elem } } +template +void TensorOpNDebug(ElemType beta, array pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, N>& regularStrides, + const SmallVector& reducingOpDims, const array, N>& reducingStrides, const GPUMatrix& a, GPUMatrix& result) +{ + for (C_size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + pointers[i] += offsets[i]; + return LaunchTensorOpWithReductionDebug(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, result); +} + //------------------------------------------------------------------------ // explicit instantiations--these are being called from GPUMatrix.cu //------------------------------------------------------------------------ @@ -1277,6 +1578,21 @@ template void TensorOpN(half beta, array pointers, half alpha const SmallVector& regularOpDims, const array, 4>& regularStrides, const SmallVector& reducingOpDims, const array, 4>& reducingStrides); +template void TensorOpNDebug(half beta, array pointers, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides, const GPUMatrix& a, GPUMatrix& result); + +template void TensorOpNDebug(double beta, array pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides, const GPUMatrix& a, GPUMatrix& result); + +template void TensorOpNDebug(float beta, array pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides, const GPUMatrix& a, GPUMatrix& result); + template void LaunchUnaryTensorOp(float beta, const float* pa, float* pb, float alpha, ElementWiseOperator op, size_t regularOpDim); template void LaunchUnaryTensorOp(double beta, const double* pa, double* pb, double alpha, ElementWiseOperator op, size_t regularOpDim); diff --git a/Source/Math/GPUTensor.h b/Source/Math/GPUTensor.h index dfe0b6ad4..b66559b7b 100644 --- a/Source/Math/GPUTensor.h +++ b/Source/Math/GPUTensor.h @@ -23,6 +23,12 @@ void TensorOpN(ElemType beta, array pointers, ElemType alpha, Elem const SmallVector& regularOpDims, const array, N>& regularStrides, const SmallVector& reducingOpDims, const array, N>& reducingStrides); +template +void TensorOpNDebug(ElemType beta, array pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, N>& regularStrides, + const SmallVector& reducingOpDims, const array, N>& reducingStrides, const GPUMatrix& a, GPUMatrix& result); + template void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 17df4cf4c..0b27957b4 100755 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -3669,6 +3669,21 @@ Matrix& Matrix::SetToZeroIfAbsLessThan(const ElemType thresh return *this; } 
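+// SetToZeroIfLessThan is the one-sided counterpart of SetToZeroIfAbsLessThan: it zeroes, in place, every entry strictly below 'threshold', dispatching to the CPU, dense-GPU, or sparse-GPU implementation.
+// Usage sketch (illustrative only, assuming an already populated Matrix m): m.SetToZeroIfLessThan(0); // ReLU-style clamp of all negative entries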
+template +Matrix& Matrix::SetToZeroIfLessThan(const ElemType threshold) +{ + if (IsEmpty()) + LogicError("SetToZeroIfLessThan: Matrix is empty."); + + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->SetToZeroIfLessThan(threshold), + m_GPUMatrix->SetToZeroIfLessThan(threshold), + NOT_IMPLEMENTED, + m_GPUSparseMatrix->SetToZeroIfLessThan(threshold)); + + return *this; +} //sum of all elements template ElemType Matrix::SumOfElements() const @@ -6399,6 +6414,23 @@ void Matrix::TensorOp(ElemType beta, const Matrix& a, ElemTy NOT_IMPLEMENTED, NOT_IMPLEMENTED); } +template +void Matrix::TensorOpDebug(ElemType beta, const Matrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides) +{ + VerifyIsDense(*this) && VerifyIsDense(a); + + DecideAndMoveToRightDevice(*this, a); + + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + m_GPUMatrix->TensorOpDebug(beta, *a.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); +} template void Matrix::TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 241be55d2..529a91c52 100755 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -474,6 +474,8 @@ public: void InplaceTranspose(); Matrix& SetToZeroIfAbsLessThan(const ElemType threshold); + Matrix& SetToZeroIfLessThan(const ElemType threshold); + DeviceBoundNumber Sum_AsDeviceBoundNum() const; ElemType SumOfAbsElements() const; // sum of all abs(elements) @@ -658,6 +660,11 @@ public: const std::array& offsets, const SmallVector& regularOpDims, const std::array, 2>& regularStrides, const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides); + void TensorOpDebug(ElemType beta, const Matrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const std::array& offsets, + const SmallVector& regularOpDims, const std::array, 2>& regularStrides, + const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides); + void TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const std::array& offsets, const SmallVector& regularOpDims, const std::array, 3>& regularStrides, diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp index 995932e34..720343e43 100755 --- a/Source/Math/NoGPU.cpp +++ b/Source/Math/NoGPU.cpp @@ -709,6 +709,11 @@ GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfAbsLessThan(con return *this; } +template +GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + return *this; +} template GPUSparseMatrix& GPUSparseMatrix::InplaceSoftThreshold(const ElemType threshold) { @@ -1723,7 +1728,11 @@ GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan(const ElemType { return *this; } - +template +GPUMatrix& GPUMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + return *this; +} template ElemType GPUMatrix::SumOfAbsElements() const { diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 42175e8e0..fefd95945 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ 
-217,7 +217,7 @@ static void PrepareTensorOperands(array shapes, array } for (size_t i = 0; i < N; i++) - offsets[i] = shapes[i].GetOffset(); + offsets[i] = shapes[i].GetOffset(); } // enforce that in case of broadcasting, the output must not be an input @@ -249,6 +249,47 @@ void TensorView::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemT GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } +template +void TensorView::DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp) +{ + // static int cc = 0; if (cc++ == 0) + // fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str()); + + // prepare all tensor descriptor information as needed for execution + array offsets; + array, 2> regularStrides, reducingStrides; + SmallVector regularOpDims, reducingOpDims; + PrepareTensorOperands(array{a.GetShape(), GetShape()}, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + + // output cannot be input when reducing + if (reducingOpDims.size() > 0) + CheckDifferentObject(a, *this); + /* + for (size_t i = 0; i < 2; i++) + { + // fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i])); + fprintf(stderr, "i = %d, offsets = %d\n ", int(i), int(offsets[i])); + } + for (size_t i = 0; i < regularOpDims.size(); i++) + fprintf(stderr, "i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i])); + for (size_t i = 0; i < reducingOpDims.size(); i++) + fprintf(stderr, "i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i])); + + for (size_t i = 0; i < 2; i++) + { + for (size_t j = 0; j < regularStrides[i].size(); j++) + { + fprintf(stderr, "i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j])); + } + for (size_t j = 0; j < reducingStrides[i].size(); j++) + { + fprintf(stderr, "i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j])); + } + } + */ + // now perform the operation + GetSOB().TensorOpDebug(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); +} template void TensorView::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp) { diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index a462fce7e..2cd76ba53 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -131,6 +131,7 @@ public: #pragma pop_macro("DeclareTernaryTensorOp") void DoUnaryOpOf (ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp); + void DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp); void DoBinaryOpOf (ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp); void DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp); diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index ef06834df..ae3a28a84 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -1051,10 +1051,18 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr 
net, string showWERMode, bool SVD) { - PROFILE_SCOPE(profilerEvtMainEpoch); + PROFILE_SCOPE(profilerEvtMainEpoch); + std::vector decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2); ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); + ComputationNetwork decode_cn_root; + decode_cn_root.CopySubTree(*net, decodeOutputNodeNames[0], L"", CopyNodeFlags::copyNodeAll); + decode_cn_root.CompileNetwork(); + std::vector decodeOutputNodesTmp = decode_cn_root.OutputNodesByName(decodeOutputNodeNames); + decode_cn_root.FormEvalOrder(decodeOutputNodesTmp[0]); + decode_cn_root.FormNestedNetwork(decodeOutputNodesTmp[0]); + // bring our 'out' values into consistent state epochCriterion = EpochCriterion(0); epochEvalErrors.assign(epochEvalErrors.size(), EpochCriterion(0)); @@ -1078,7 +1086,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, auto ContainsAccumulatedResult = [&evaluationNodesWhichAccumulateResult](ComputationNodeBasePtr node) { return evaluationNodesWhichAccumulateResult.find(node) != evaluationNodesWhichAccumulateResult.end(); }; - // MA-related variables size_t nSamplesSinceLastModelSync = 0; size_t blockSizePerWorker = 0; @@ -1122,7 +1129,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, // this is non-trivial, we need a manager object to handle this if (numSubminibatchesNeeded > 1) smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes); - // The following is a special feature only supported by the Kaldi2Reader for more efficient sequence training. // This attempts to compute the error signal for the whole utterance, which will // be fed to the neural network as features. Currently it is a workaround @@ -1241,6 +1247,7 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, numSamplesWithLabelOfNetworkMBR = 0; auto profGetMinibatch = ProfilerTimeBegin(); + bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, criterionNodes[0], useDistributedMBReading, useParallelTrain, *inputMatrices, actualMBSize, m_mpi); @@ -1321,12 +1328,10 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, //net->CompileNetwork(); std::vector encodeOutputNodeNames(outputNodeNamesVector.begin(), outputNodeNamesVector.begin() + 1); std::vector encodeOutputNodes = net->OutputNodesByName(encodeOutputNodeNames); - // //net->CollectInputAndLearnableParameters(encodeOutputNodes[0]); std::list InputNodesList = net->InputNodes(criterionNodes[0]); std::vector encodeInputNodeNames; - if (SVD) encodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8); else @@ -1336,7 +1341,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, *encodeInputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(encodeInputNodes); //get decode input matrix - std::vector decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2); std::vector decodeOutputNodes = net->OutputNodesByName(decodeOutputNodeNames); //net->CollectInputAndLearnableParameters(decodeOutputNodes[0]); @@ -1346,6 +1350,7 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, else decodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8); std::vector decodeinputNodes = net->OutputNodesByName(decodeInputNodeNames); + *decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes); if (!ordered) @@ -1359,7 +1364,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, std::vector Plustransnodes = 
@@ -1359,7 +1364,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
             std::vector<ComputationNodeBasePtr> Plustransnodes = net->OutputNodesByName(plusTransNodeNames);
             net->FormEvalOrder(Plustransnodes[0]);
         }
-
         //form eval order for RELU

         auto reffeainput = (*encodeInputMatrices).begin();
@@ -1367,9 +1371,13 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
         auto reflminput = (*decodeinputMatrices).begin();
         auto decodeMBLayout = reflminput->second.pMBLayout;

-        net->ForwardProp(encodeOutputNodes);
+        //form eval order for RELU
+        time_t my_time = time(NULL);
+        fprintf(stderr, "SGD time 0 = %s", ctime(&my_time));

-        Matrix<ElemType> encodeOutput(net->GetDeviceId());
+        net->ForwardProp(encodeOutputNodes);
+        size_t deviceid = net->GetDeviceId();
+        Matrix<ElemType> encodeOutput(deviceid);
         encodeOutput.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(encodeOutputNodes[0])->Value()));

         vector<vector<PathInfo>> uttPathsInfo;
@@ -1379,12 +1387,50 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
         vector<float> vt_onebest_wer;
         vt_onebest_wer.clear();
-        // time_t my_time = time(NULL);
-        // fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));
+        my_time = time(NULL);
+        fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));

         RNNTDecodeFunctions<ElemType> rnntdfs;
-        rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);
-        //my_time = time(NULL);
-        //fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));
+        //rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);
+
+        //vt_printname.push_back(L"DecodeOutputLN");
+
+        if (m_enableMultiThreadDecodeMBR)
+        {
+            ComputationNodeBasePtr WmNode, WmuNode, WmvNode, bmNode;
+            Matrix<ElemType> Wm(deviceid), Wmu(deviceid), Wmv(deviceid), bm(deviceid);
+            if (SVD)
+            {
+                WmuNode = net->GetNodeFromName(outputNodeNamesVector[4]);
+                WmvNode = net->GetNodeFromName(outputNodeNamesVector[5]);
+                bmNode = net->GetNodeFromName(outputNodeNamesVector[6]);
+                Wmu.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmuNode)->Value()));
+                Wmv.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmvNode)->Value()));
+                WmNode;
+                Wm;
+            }
+            else
+            {
+                WmNode = net->GetNodeFromName(outputNodeNamesVector[4]);
+                bmNode = net->GetNodeFromName(outputNodeNamesVector[5]);
+                Wm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmNode)->Value()));
+                WmuNode;
+                WmvNode;
+                Wmu;
+                Wmv;
+            }
+
+            bm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(bmNode)->Value()));
+            /*
+            size_t num_utt = 7;
+            size_t start_utt = 0;
+            */
+            rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeInputNodeNames, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, decode_cn_root, Wm, Wmu, Wmv, bm); /*, num_utt, start_utt); */
+        }
+        else
+            rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, net);
+        // rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net, decode_cn, decodeinputNodes_tmp);
+        my_time = time(NULL);
+        fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));

         //fprintf(stderr, "decode SGD v0 .\n");
         //net->BumpEvalTimeStamp(decodeinputNodes);
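The SVD branch above exists because the joint-network projection can ship either as one dense matrix Wm or, after low-rank SVD compression, as the factor pair Wmu and Wmv, in which case the decoder applies two thin products instead of one wide one. A sketch of the intended arithmetic; it assumes Matrix::AssignProductOf and operator+= as declared in Source/Math/Matrix.h, and h, proj, and the function itself are illustrative:

    // Project a decoder hidden vector h through the joint network's output
    // layer: proj = Wm*h + bm, or proj = Wmu*(Wmv*h) + bm in the SVD case.
    template <class ElemType>
    void ProjectJoint(bool SVD, int deviceid,
                      const Matrix<ElemType>& Wm, const Matrix<ElemType>& Wmu,
                      const Matrix<ElemType>& Wmv, const Matrix<ElemType>& bm,
                      const Matrix<ElemType>& h, Matrix<ElemType>& proj)
    {
        if (SVD)
        {
            Matrix<ElemType> tmp(deviceid);
            tmp.AssignProductOf(Wmv, false, h, false);    // [r x 1], r = SVD rank
            proj.AssignProductOf(Wmu, false, tmp, false); // [out x 1]
        }
        else
            proj.AssignProductOf(Wm, false, h, false);    // [out x 1]
        proj += bm; // bias added before the softmax over output labels
    }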
@@ -1400,21 +1446,11 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
         refFeaMatBackup.SetValue(reffeainput->second.GetMatrix());

         size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences();
-        //my_time = time(NULL);
-        //fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));
+        my_time = time(NULL);
+        fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));
         for (const auto& seq : encodeMBLayout->GetAllSequences())
         {
-            /*
-            if (seqId == 1)
-            {
-                for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
-                {
-                    ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
-                    node->force_gradient_accumulate(true);
-                }
-            }
-            */
             if (seq.seqId == GAP_SEQUENCE_ID)
             {
                 continue;
@@ -1423,15 +1459,12 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
             {
                 continue;
             }
-
-            //if (firstdebug)
             cNode->SetMWERInfo(uttPathsInfo[seqId], lengthNorm, wordPathPosteriorFromDecodeMBR, doMBR, vt_nws[seqId]);

             // get the feature MBLayout
             size_t numFrames = seq.GetNumTimeSteps();

             numSamplesWithLabelOfNetworkMBR += numFrames;
-            // if (firstdebug)
             reffeainput->second.pMBLayout->Init(1, numFrames); // 1 channel, 1 utterance

             Matrix<ElemType> fea(deviceID);
@@ -1447,11 +1480,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
                 fea.SetColumn(refFeaMatBackup.ColumnSlice(uID, 1), t);
             }

-            //if (firstdebug)
-            {
-                reffeainput->second.GetMatrix().SetValue(fea);
-                reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: first 0 is for utterance ID, second 0 means 0th channel, lenght is 0 to numFrames
-            }
+
+            reffeainput->second.GetMatrix().SetValue(fea);
+            reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: first 0 is the utterance ID, second 0 means 0th channel, and the sequence runs from frame 0 to numFrames

             // guoye: the two commands below reset the state, to make sure ForwardProp always gets carried out
             ComputationNetwork::BumpEvalTimeStamp(encodeInputNodes); // guoye: update the time stamp before doing forward prop
@@ -1463,9 +1494,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
             size_t nBest = uttPathsInfo[seqId].size();
             if (nBest > (m_maxFrameNumPerMinibatchMBR / numFrames))
             {
-                // reset nBest to make the MB size framenum with budget
+                // reset nBest to keep the minibatch frame count within budget
                 nBest = (m_maxFrameNumPerMinibatchMBR / numFrames);
-            }
+            }

             size_t maxPhoneSeqLen = uttPathsInfo[seqId][0].label_seq.size();
             for (size_t n = 1; n < nBest; n++)
@@ -1583,8 +1614,8 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
             }
         }

-        //my_time = time(NULL);
-        //fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
+        my_time = time(NULL);
+        fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
     }

     // ===========================================================
     // forward prop for evaluate eval nodes
@@ -1790,8 +1821,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
     {
 #ifdef _DEBUG
         if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): "))
-            LogicError("%ls %ls operation has NaNs in smoothedGradient.",
-                       node->NodeName().c_str(), node->OperationName().c_str());
+            LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
 #endif
         double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier();
         double nodeDependentRegMultiplier = dynamic_pointer_cast<ComputationNode<ElemType>>(node)->GetRegMultiplier();
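The nBest cap in hunk @@ -1463,9 +1494,9 @@ above keeps nBest * numFrames under m_maxFrameNumPerMinibatchMBR, since the per-utterance MBR minibatch evidently stacks one copy of the utterance's frames per surviving hypothesis. Worked numbers with the default budget of 2000 frames (the concrete values below are illustrative):

    // Integer-division cap: a 500-frame utterance admits at most 2000/500 = 4
    // hypotheses; a 1200-frame utterance admits only 2000/1200 = 1.
    size_t maxFrameBudget = 2000; // m_maxFrameNumPerMinibatchMBR default
    size_t numFrames = 500;       // frames in the current utterance
    size_t nBest = 16;            // hypotheses returned by the decoder
    if (nBest > maxFrameBudget / numFrames)
        nBest = maxFrameBudget / numFrames; // -> 4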
@@ -3460,7 +3490,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
     m_lengthNorm = configSGD(L"LengthNorm", true);
     m_showWERMode = configSGD(L"showWERMode", "average");
     m_isSVD = configSGD(L"SVD", true);
-
+    m_enableMultiThreadDecodeMBR = configSGD(L"enableMultiThreadDecodeMBR", true);
     m_maxFrameNumPerMinibatchMBR = configSGD(L"MaxFrameNumPerMinibatchMBR", (size_t) 2000);
     if (m_doGradientCheck && sizeofElemType != sizeof(double))
     {
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index 8db27b79e..106d06b2e 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -292,6 +292,7 @@ protected:
     string m_showWERMode;
     bool m_isSVD;
     size_t m_maxFrameNumPerMinibatchMBR;
+    bool m_enableMultiThreadDecodeMBR;

     // Parallel training
     MPIWrapperPtr m_mpi;
diff --git a/Source/SGDLib/SimpleOutputWriter.h b/Source/SGDLib/SimpleOutputWriter.h
index a008636c7..0926a9eeb 100644
--- a/Source/SGDLib/SimpleOutputWriter.h
+++ b/Source/SGDLib/SimpleOutputWriter.h
@@ -421,7 +421,7 @@ public:
         Matrix<ElemType> encodeOutput(deviceid);
         Matrix<ElemType> decodeOutput(deviceid);
         Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
-        Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
+        Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
         Matrix<ElemType> lmin(deviceid);
         MatrixPool m_matrixPool;
         m_matrixPool.OptimizedMemoryAllocation();
@@ -472,35 +472,7 @@ public:
             CurSequences = nextSequences;
             vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);

-            //deal with the same prefix
-            /*sort(CurSequences.begin(), CurSequences.end(),
-                  [](const Sequence& a, const Sequence& b) -> bool {
-                      return a.labelseq.size() > b.labelseq.size();
-                  });
-            for (size_t n = 0; n < CurSequences.size() - 1; n++)
-            {
-                for (size_t h = n + 1; h < CurSequences.size(); h++)
-                {
-                    if (isPrefix(CurSequences[h], CurSequences[n]))
-                    {
-                        //forward_prop the prefix
-                        forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());
-                        forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
-
-                        size_t idx = CurSequences[h].labelseq.size();
-                        ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
-                        for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
-                        {
-                            forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
-                            forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
-
-                            curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
-                        }
-                        CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
-                    }
-                }
-            }*/
             //nextSequences.clear();
             while (true)
             {
@@ -515,8 +487,8 @@ public:
                 rnntdfs.prepareSequence(tempSeq);
                 rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
                 if (isSVD)
-                    rnntdfs.forwardmergedSVD(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wmu, Wmv, bm, *m_net);
-                else rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
+                    rnntdfs.forwardmergedSVD(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wmu, Wmv, bm, m_net);
+                else rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);

                 //sumofENandDE.Print("sum");
                 //sort log posterior and get best N labels
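For reference, the while (true) loop above repeatedly expands the current best hypothesis: pop the highest-logP sequence, run the prediction network over its label history, and merge the result with encoder frame t. A condensed sketch of one iteration built from the calls visible in this hunk; the max-selection lambda and the copy of the winner are illustrative, not a verbatim excerpt:

    // One expansion step of the breadth-first beam loop.
    auto maxSeq = std::max_element(CurSequences.begin(), CurSequences.end(),
                                   [](const auto& a, const auto& b) { return a.logP < b.logP; });
    auto tempSeq = *maxSeq; // copy the winner out before erasing it
    CurSequences.erase(maxSeq);
    rnntdfs.prepareSequence(tempSeq);
    rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes,
                           decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
    // decodeOutput then holds the log-posteriors for frame t, from which the
    // top-N labels are taken to extend the beam.
    rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput,
                          Plusnodes, Plustransnodes, Wm, bm, m_net);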
@@ -715,7 +687,7 @@ public:
         Matrix<ElemType> encodeOutput(deviceid);
         Matrix<ElemType> decodeOutput(deviceid);
         Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
-        Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
+        Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
         Matrix<ElemType> lmin(deviceid);
         MatrixPool m_matrixPool;
         m_matrixPool.OptimizedMemoryAllocation();
@@ -760,36 +732,6 @@ public:
             CurSequences = nextSequences;
             vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);

-            //deal with the same prefix
-            /*sort(CurSequences.begin(), CurSequences.end(),
-                  [](const Sequence& a, const Sequence& b) -> bool {
-                      return a.labelseq.size() > b.labelseq.size();
-                  });
-            for (size_t n = 0; n < CurSequences.size() - 1; n++)
-            {
-                for (size_t h = n + 1; h < CurSequences.size(); h++)
-                {
-                    if (isPrefix(CurSequences[h], CurSequences[n]))
-                    {
-                        //forward_prop the prefix
-                        forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());
-
-                        forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
-
-                        size_t idx = CurSequences[h].labelseq.size();
-                        ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
-                        for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
-                        {
-                            forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
-                            forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
-
-                            curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
-                        }
-                        CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
-                    }
-                }
-            }*/
-
             //nextSequences.clear();
             while (true)
             {
@@ -802,7 +744,7 @@ public:
                 CurSequences.erase(maxSeq);
                 rnntdfs.prepareSequence(tempSeq);
                 rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
-                rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
+                rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);

                 //sumofENandDE.Print("sum");
                 //sort log posterior and get best N labels