Remove cudaEvent profiling from AssignRNNTScore; save further memory by no longer scaling RNNTDerivative into m_tmpMatrix

2019-05-28 23:07:46 -07:00 · 2019-05-28 23:07:46 -07:00 · ea3c0df6ac
--- a/Source/ComputationNetworkLib/SpecialPurposeNodes.h
+++ b/Source/ComputationNetworkLib/SpecialPurposeNodes.h
@@ -1655,8 +1655,8 @@ public:
        //m_tmpMatrix->AssignUserOp2(RNNTDerivative, InputRef(2).Value().GetNumCols(), InputRef(1).Value().GetNumCols(), InputRef(0).GetMBLayout()->GetNumParallelSequences(), 0);
        //m_tmpMatrix->TransferFromDeviceToDevice(CPUDEVICE, InputRef(0).Value().GetDeviceId());
        // inputGradientValues+= gradientValues*(softmaxOfRight - CTCposterior)
-        Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
-        Matrix<ElemType>::VectorSum(*m_tmpMatrix,  inputGradientValues, false);
+        //Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
+        Matrix<ElemType>::VectorSum(RNNTDerivative, inputGradientValues, false);
        //inputGradientValues.Print("gradient");
        /*printf("back to F\n");
        if (gradientValues.GetDeviceId() != CPUDEVICE)
@@ -1678,8 +1678,8 @@ public:
        //m_tmpMatrix->AssignUserOp2(RNNTDerivative, InputRef(2).Value().GetNumCols(), InputRef(1).Value().GetNumCols(), InputRef(0).GetMBLayout()->GetNumParallelSequences(), 0);
        //m_tmpMatrix->TransferFromDeviceToDevice(CPUDEVICE, InputRef(0).Value().GetDeviceId());
        // inputGradientValues+= gradientValues*(softmaxOfRight - CTCposterior)
-        Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
-        inputGradientValues.AssignProductOf(inputValue, false, *m_tmpMatrix, true);
+        //Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
+        inputGradientValues.AssignProductOf(inputValue, false, RNNTDerivative, true);
        //inputGradientValues.Print("gradient");
        /*printf("back to F\n");
        if (gradientValues.GetDeviceId() != CPUDEVICE)
@@ -1701,8 +1701,8 @@ public:
        //m_tmpMatrix->AssignUserOp2(RNNTDerivative, InputRef(2).Value().GetNumCols(), InputRef(1).Value().GetNumCols(), InputRef(0).GetMBLayout()->GetNumParallelSequences(), 0);
        //m_tmpMatrix->TransferFromDeviceToDevice(CPUDEVICE, InputRef(0).Value().GetDeviceId());
        // inputGradientValues+= gradientValues*(softmaxOfRight - CTCposterior)
-        Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
-        inputGradientValues.AssignProductOf(inputValue, false, *m_tmpMatrix, false);
+        //Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
+        inputGradientValues.AssignProductOf(inputValue, false, RNNTDerivative, false);
        //inputGradientValues.Print("gradient");
        /*printf("back to F\n");
        if (gradientValues.GetDeviceId() != CPUDEVICE)
--- a/Source/Math/GPUMatrix.cu
+++ b/Source/Math/GPUMatrix.cu
@@ -4702,9 +4702,6 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
        // Max number of phones in utterances in this minibatch
        //size_t maxPhoneNum = phoneSeq.GetNumRows();

-        cudaEvent_t start, stop;
-        cudaEventCreate(&start);
-        cudaEventCreate(&stop);

        

@@ -4744,7 +4741,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy

        int blocksPerGrid = (int) ceil(1.0 * uttNum / GridDim::maxThreadsPerBlock);
        //_AssignSequenceError<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(hsmoothingWeight, Data(), label.Data(), dnnoutput.Data(), gamma.Data(), alpha, N);
-        cudaEventRecord(start);
+
        for (size_t t = 0; t < maxFrameNum; t++)
        {
            for (size_t u = 0; u < maxPhoneNum; u++)
@@ -4760,12 +4757,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
                                                                                                  gpuFrameNum, gpuPhoneNum, gpuBeginFrame, gpuFrameToChanInd, gpuUttBeginForMergedinput, numParallelSequences, t, u,
                                                                                                  maxPhoneNum, totalPhoneNum, blankTokenId, uttNum);
        }
-        cudaEventRecord(stop);
-        cudaEventSynchronize(stop);
-        float milliseconds = 0;
-        cudaEventElapsedTime(&milliseconds, start, stop);

-        printf("time for fb:%f\n", milliseconds);
            //beta.Print("beta");
            //alpha.Print("alpha");
            ElemType zerVar = 0.0;
@@ -4774,7 +4766,6 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy

        this->SetValue(0.0);

-        cudaEventRecord(start);
        // x dimension is for each phone
        // y dimention is for each time
        // Ensure that we allocate correct number of blocks for given number of utterances and max number of phones in those utterances
@@ -4797,11 +4788,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
            _assignRNNTScoreS3<<<block_tail, thread_tail, 0, t_stream>>>(Data(), alpha.Data(), beta.Data(), phoneSeq.Data(), uttFrameNum[s], uttPhoneNum[s], uttFrameBeginIdx[s], uttFrameToChanInd[s],
                                                                         uttBeginForOutputditribution[s], numParallelSequences, maxPhoneNum, totalPhoneNum, blankTokenId, s);
        }
-        cudaEventRecord(stop);
-        cudaEventSynchronize(stop);
-        cudaEventElapsedTime(&milliseconds, start, stop);

-        printf("time for error cal:%f\n", milliseconds);
        CUDA_CALL(cudaFree(gpuFrameNum));
        CUDA_CALL(cudaFree(gpuPhoneNum));
        CUDA_CALL(cudaFree(gpuBeginFrame));
--- a/Source/SequenceTrainingLib/gammacalculation.h
+++ b/Source/SequenceTrainingLib/gammacalculation.h
@@ -555,6 +555,7 @@ public:
        
        mergedinput.InplaceExp();
        m_derivative.AssignElementProductOf(m_derivative, mergedinput);
+        //mergedinput.ReleaseMemory();
        ElemType finalscore = 0;
        //m_derivative.Print("RNNT");
        finalscore =  totalScore.Get00Element();