Remove profiling code and further reduce memory usage

This commit is contained in:
Rui Zhao (SPEECH) 2019-05-28 23:07:46 -07:00
Родитель 9bd990319a
Коммит ea3c0df6ac
3 изменённых файлов: 8 добавлений и 20 удалений

Просмотреть файл

@ -1655,8 +1655,8 @@ public:
//m_tmpMatrix->AssignUserOp2(RNNTDerivative, InputRef(2).Value().GetNumCols(), InputRef(1).Value().GetNumCols(), InputRef(0).GetMBLayout()->GetNumParallelSequences(), 0);
//m_tmpMatrix->TransferFromDeviceToDevice(CPUDEVICE, InputRef(0).Value().GetDeviceId());
// inputGradientValues+= gradientValues*(softmaxOfRight - CTCposterior)
Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
Matrix<ElemType>::VectorSum(*m_tmpMatrix, inputGradientValues, false);
//Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
Matrix<ElemType>::VectorSum(RNNTDerivative, inputGradientValues, false);
//inputGradientValues.Print("gradient");
/*printf("back to F\n");
if (gradientValues.GetDeviceId() != CPUDEVICE)
@ -1678,8 +1678,8 @@ public:
//m_tmpMatrix->AssignUserOp2(RNNTDerivative, InputRef(2).Value().GetNumCols(), InputRef(1).Value().GetNumCols(), InputRef(0).GetMBLayout()->GetNumParallelSequences(), 0);
//m_tmpMatrix->TransferFromDeviceToDevice(CPUDEVICE, InputRef(0).Value().GetDeviceId());
// inputGradientValues+= gradientValues*(softmaxOfRight - CTCposterior)
Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
inputGradientValues.AssignProductOf(inputValue, false, *m_tmpMatrix, true);
//Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
inputGradientValues.AssignProductOf(inputValue, false, RNNTDerivative, true);
//inputGradientValues.Print("gradient");
/*printf("back to F\n");
if (gradientValues.GetDeviceId() != CPUDEVICE)
@ -1701,8 +1701,8 @@ public:
//m_tmpMatrix->AssignUserOp2(RNNTDerivative, InputRef(2).Value().GetNumCols(), InputRef(1).Value().GetNumCols(), InputRef(0).GetMBLayout()->GetNumParallelSequences(), 0);
//m_tmpMatrix->TransferFromDeviceToDevice(CPUDEVICE, InputRef(0).Value().GetDeviceId());
// inputGradientValues+= gradientValues*(softmaxOfRight - CTCposterior)
Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
inputGradientValues.AssignProductOf(inputValue, false, *m_tmpMatrix, false);
//Matrix<ElemType>::Scale(gradientValues.Get00Element(), RNNTDerivative, *m_tmpMatrix);
inputGradientValues.AssignProductOf(inputValue, false, RNNTDerivative, false);
//inputGradientValues.Print("gradient");
/*printf("back to F\n");
if (gradientValues.GetDeviceId() != CPUDEVICE)

Просмотреть файл

@ -4702,9 +4702,6 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
// Max number of phones in utterances in this minibatch
//size_t maxPhoneNum = phoneSeq.GetNumRows();
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
@ -4744,7 +4741,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
int blocksPerGrid = (int) ceil(1.0 * uttNum / GridDim::maxThreadsPerBlock);
//_AssignSequenceError<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(hsmoothingWeight, Data(), label.Data(), dnnoutput.Data(), gamma.Data(), alpha, N);
cudaEventRecord(start);
for (size_t t = 0; t < maxFrameNum; t++)
{
for (size_t u = 0; u < maxPhoneNum; u++)
@ -4760,12 +4757,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
gpuFrameNum, gpuPhoneNum, gpuBeginFrame, gpuFrameToChanInd, gpuUttBeginForMergedinput, numParallelSequences, t, u,
maxPhoneNum, totalPhoneNum, blankTokenId, uttNum);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("time for fb:%f\n", milliseconds);
//beta.Print("beta");
//alpha.Print("alpha");
ElemType zerVar = 0.0;
@ -4774,7 +4766,6 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
this->SetValue(0.0);
cudaEventRecord(start);
// x dimension is for each phone
// y dimension is for each time
// Ensure that we allocate correct number of blocks for given number of utterances and max number of phones in those utterances
@ -4797,11 +4788,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
_assignRNNTScoreS3<<<block_tail, thread_tail, 0, t_stream>>>(Data(), alpha.Data(), beta.Data(), phoneSeq.Data(), uttFrameNum[s], uttPhoneNum[s], uttFrameBeginIdx[s], uttFrameToChanInd[s],
uttBeginForOutputditribution[s], numParallelSequences, maxPhoneNum, totalPhoneNum, blankTokenId, s);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("time for error cal:%f\n", milliseconds);
CUDA_CALL(cudaFree(gpuFrameNum));
CUDA_CALL(cudaFree(gpuPhoneNum));
CUDA_CALL(cudaFree(gpuBeginFrame));

Просмотреть файл

@ -555,6 +555,7 @@ public:
mergedinput.InplaceExp();
m_derivative.AssignElementProductOf(m_derivative, mergedinput);
//mergedinput.ReleaseMemory();
ElemType finalscore = 0;
//m_derivative.Print("RNNT");
finalscore = totalScore.Get00Element();