profile cost
This commit is contained in:
Родитель
35935a189c
Коммит
d0675c2731
|
@ -4702,6 +4702,12 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
|
|||
// Max number of phones in utterances in this minibatch
|
||||
//size_t maxPhoneNum = phoneSeq.GetNumRows();
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
|
||||
|
||||
|
||||
size_t* gpuFrameNum;
|
||||
CUDA_CALL(cudaMalloc((void**) &gpuFrameNum, uttNum * sizeof(size_t)));
|
||||
CUDA_CALL(cudaMemcpy(gpuFrameNum, uttFrameNum.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
|
||||
|
@ -4738,7 +4744,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
|
|||
|
||||
int blocksPerGrid = (int) ceil(1.0 * uttNum / GridDim::maxThreadsPerBlock);
|
||||
//_AssignSequenceError<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(hsmoothingWeight, Data(), label.Data(), dnnoutput.Data(), gamma.Data(), alpha, N);
|
||||
|
||||
cudaEventRecord(start);
|
||||
for (size_t t = 0; t < maxFrameNum; t++)
|
||||
{
|
||||
for (size_t u = 0; u < maxPhoneNum; u++)
|
||||
|
@ -4754,15 +4760,21 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
|
|||
gpuFrameNum, gpuPhoneNum, gpuBeginFrame, gpuFrameToChanInd, gpuUttBeginForMergedinput, numParallelSequences, t, u,
|
||||
maxPhoneNum, totalPhoneNum, blankTokenId, uttNum);
|
||||
}
|
||||
cudaEventRecord(stop);
|
||||
cudaEventSynchronize(stop);
|
||||
float milliseconds = 0;
|
||||
cudaEventElapsedTime(&milliseconds, start, stop);
|
||||
|
||||
//beta.Print("beta");
|
||||
//alpha.Print("alpha");
|
||||
ElemType zerVar = 0.0;
|
||||
printf("time for fb:%f\n", milliseconds);
|
||||
//beta.Print("beta");
|
||||
//alpha.Print("alpha");
|
||||
ElemType zerVar = 0.0;
|
||||
totalScore.SetColumn(&zerVar, 0);
|
||||
_assignRNNTTotalScore<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(alpha.Data(), beta.Data(), totalScore.Data(), uttNum, gpuFrameNum, gpuFrameToChanInd, gpuBeginFrame, numParallelSequences, maxPhoneNum);
|
||||
|
||||
this->SetValue(0.0);
|
||||
|
||||
cudaEventRecord(start);
|
||||
// x dimension is for each phone
|
||||
// y dimension is for each time
|
||||
// Ensure that we allocate correct number of blocks for given number of utterances and max number of phones in those utterances
|
||||
|
@ -4785,7 +4797,11 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRNNTScore(const GPUMatrix<ElemTy
|
|||
_assignRNNTScoreS3<<<block_tail, thread_tail, 0, t_stream>>>(Data(), alpha.Data(), beta.Data(), phoneSeq.Data(), uttFrameNum[s], uttPhoneNum[s], uttFrameBeginIdx[s], uttFrameToChanInd[s],
|
||||
uttBeginForOutputditribution[s], numParallelSequences, maxPhoneNum, totalPhoneNum, blankTokenId, s);
|
||||
}
|
||||
cudaEventRecord(stop);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&milliseconds, start, stop);
|
||||
|
||||
printf("time for error cal:%f\n", milliseconds);
|
||||
CUDA_CALL(cudaFree(gpuFrameNum));
|
||||
CUDA_CALL(cudaFree(gpuPhoneNum));
|
||||
CUDA_CALL(cudaFree(gpuBeginFrame));
|
||||
|
|
|
@ -539,18 +539,21 @@ public:
|
|||
// totalcol, numParallelSequences, numPhoneParallelSequences);
|
||||
//matrixOutputDistribution.Print("h");
|
||||
//log softmax of f+g
|
||||
mergedinput.InplaceLogSoftmax(true);
|
||||
//mergedinput.InplaceLogSoftmax(true);
|
||||
Microsoft::MSR::CNTK::Matrix<ElemType> logsoftmax(m_deviceid_gpu);
|
||||
logsoftmax.SetValue(mergedinput);
|
||||
|
||||
logsoftmax.InplaceLogSoftmax(true);
|
||||
//matrixOutputDistribution.Print("prob");
|
||||
// forward backward to compute alpha, beta derivatives
|
||||
Microsoft::MSR::CNTK::Matrix<ElemType> alpha(m_deviceid_gpu);
|
||||
Microsoft::MSR::CNTK::Matrix<ElemType> beta(m_deviceid_gpu);
|
||||
m_derivative.TransferToDeviceIfNotThere(m_deviceid_gpu);
|
||||
m_derivative.AssignRNNTScore(mergedinput, alpha, beta, matrixPhoneSeqs, matrixPhoneSeqs, uttFrameToChanInd, uttFrameBeginIdx, uttBeginForOutputditribution, uttPhoneToChanInd, uttPhoneBeginIdx,
|
||||
m_derivative.AssignRNNTScore(logsoftmax, alpha, beta, matrixPhoneSeqs, matrixPhoneSeqs, uttFrameToChanInd, uttFrameBeginIdx, uttBeginForOutputditribution, uttPhoneToChanInd, uttPhoneBeginIdx,
|
||||
uttFrameNum, uttPhoneNum, numParallelSequences, numPhoneParallelSequences, maxPhoneNum, maxFrameNum, totalScore, blankTokenId, -1,true);
|
||||
|
||||
mergedinput.InplaceExp();
|
||||
m_derivative.AssignElementProductOf(m_derivative, mergedinput);
|
||||
logsoftmax.InplaceExp();
|
||||
m_derivative.AssignElementProductOf(m_derivative, logsoftmax);
|
||||
ElemType finalscore = 0;
|
||||
//m_derivative.Print("RNNT");
|
||||
finalscore = totalScore.Get00Element();
|
||||
|
|
Загрузка…
Ссылка в новой задаче