(fix for previous fix)
This commit is contained in:
Родитель
8f716986ae
Коммит
ee23bb2000
|
@ -5788,14 +5788,14 @@ void CPUMatrix<ElemType>::RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha,
|
|||
#pragma omp parallel for
|
||||
for (int k = 0; k < iNumLab; k++)
|
||||
{
|
||||
_rcrfBackwardCompute1024Threads(t, k, alpha, beta, pair_scores);
|
||||
_rcrfBackwardCompute(t, k, alpha, beta, pair_scores);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// the kernel function for RCRF backward computation
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::_rcrfBackwardCompute1024Threads(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
|
||||
void CPUMatrix<ElemType>::_rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
|
||||
CPUMatrix<ElemType>& beta,
|
||||
const CPUMatrix<ElemType>& pair_scores)
|
||||
{
|
||||
|
@ -5859,7 +5859,7 @@ void CPUMatrix<ElemType>::RCRFTransGrdCompute(const CPUMatrix<ElemType>& lbls,
|
|||
#pragma omp parallel for
|
||||
for (int i = 0; i < iNumLab; i++)
|
||||
{
|
||||
_rcrfTransGrdCompute1024Threads(i, lbls, alpha, beta, pair_scores, grd, tPos);
|
||||
_rcrfTransGrdCompute(i, lbls, alpha, beta, pair_scores, grd, tPos);
|
||||
}
|
||||
|
||||
// transition score
|
||||
|
@ -5891,7 +5891,7 @@ void CPUMatrix<ElemType>::RCRFTransGrdCompute(const CPUMatrix<ElemType>& lbls,
|
|||
};
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::_rcrfTransGrdCompute1024Threads(size_t i,
|
||||
void CPUMatrix<ElemType>::_rcrfTransGrdCompute(size_t i,
|
||||
const CPUMatrix<ElemType>& lbls,
|
||||
const CPUMatrix<ElemType>& alpha,
|
||||
const CPUMatrix<ElemType>& beta,
|
||||
|
|
|
@ -486,7 +486,7 @@ public:
|
|||
static void RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta,
|
||||
const CPUMatrix<ElemType>& lbls,
|
||||
const CPUMatrix<ElemType>& pair_scores);
|
||||
static void _rcrfBackwardCompute1024Threads(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
|
||||
static void _rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
|
||||
CPUMatrix<ElemType>& beta,
|
||||
const CPUMatrix<ElemType>& pair_scores);
|
||||
|
||||
|
@ -496,7 +496,7 @@ public:
|
|||
const CPUMatrix<ElemType>& pair_scores,
|
||||
CPUMatrix<ElemType>& grd);
|
||||
|
||||
static void _rcrfTransGrdCompute1024Threads(size_t i,
|
||||
static void _rcrfTransGrdCompute(size_t i,
|
||||
const CPUMatrix<ElemType>& lbls,
|
||||
const CPUMatrix<ElemType>& alpha,
|
||||
const CPUMatrix<ElemType>& beta,
|
||||
|
|
|
@ -1950,7 +1950,7 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
|
|||
p = p / 2;
|
||||
|
||||
// note: kernel has hard-coded dimension of 512
|
||||
_computeNceOutput512Threads<ElemType> << <GetNumElements() / 2, p >> >(
|
||||
_computeNceOutputMax512Threads<ElemType> << <GetNumElements() / 2, p >> >(
|
||||
Data(),
|
||||
sampleCount,
|
||||
m_numRows / 2,
|
||||
|
@ -1965,7 +1965,7 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
|
|||
p = p / 2;
|
||||
// summing up objective must be done in one block
|
||||
// note: kernel has hard-coded dimension of 512
|
||||
_assignNoiseContrastiveEstimation512Threads<ElemType> << <1, p >> >(
|
||||
_assignNoiseContrastiveEstimationMax512Threads<ElemType> << <1, p >> >(
|
||||
Data(),
|
||||
sampleCount,
|
||||
m_numRows / 2,
|
||||
|
@ -2011,7 +2011,7 @@ void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatr
|
|||
p = p / 2;
|
||||
|
||||
// note: kernel has hard-coded dimension of 512
|
||||
_assignSoftmaxSum512Threads<ElemType> << <1, p >> >(
|
||||
_assignSoftmaxSumMax512Threads<ElemType> << <1, p >> >(
|
||||
my_a.Data(),
|
||||
width,
|
||||
Data(),
|
||||
|
@ -4293,11 +4293,11 @@ void GPUMatrix<ElemType>::RCRFBackwardCompute(
|
|||
{
|
||||
szMemSize = sizeof(ElemType) * iNumLab;
|
||||
// note: kernel has hard-coded dimension of 1024
|
||||
_rcrfBackwardComputeZeta1024Threads<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
|
||||
_rcrfBackwardComputeZeta1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
|
||||
szMemSize = iNumLab * 3;
|
||||
szMemSize *= sizeof(ElemType);
|
||||
// note: kernel has hard-coded dimension of 1024
|
||||
_rcrfBackwardCompute1024Threads<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t, iNumPos, alpha.Data(), beta.Data(),
|
||||
_rcrfBackwardCompute1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t, iNumPos, alpha.Data(), beta.Data(),
|
||||
d_zeta, pair_scores.Data(), iNumLab, shift);
|
||||
}
|
||||
/*
|
||||
|
@ -4336,12 +4336,12 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
|
|||
{
|
||||
szMemSize = sizeof(ElemType) * iNumLab;
|
||||
// note: kernel has hard-coded dimension of 1024
|
||||
_rcrfTransGrdComputeZeta<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
|
||||
_rcrfTransGrdComputeZeta1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
|
||||
szMemSize = iNumLab * 3;
|
||||
szMemSize *= sizeof(ElemType);
|
||||
// note: kernel has hard-coded dimension of 1024
|
||||
_rcrfTransGrdCompute1024Threads<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t, startLbl, alpha.Data(), beta.Data(),
|
||||
d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
|
||||
_rcrfTransGrdCompute1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t, startLbl, alpha.Data(), beta.Data(),
|
||||
d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
|
||||
}
|
||||
TracingGPUMemoryAllocator::Free<ElemType>(alpha.GetComputeDeviceId(), d_zeta);
|
||||
};
|
||||
|
|
|
@ -3410,7 +3410,7 @@ __global__ void computeNCEForwardProp512Threads(
|
|||
#endif
|
||||
|
||||
template <class ElemType>
|
||||
__global__ void _computeNceOutput512Threads(
|
||||
__global__ void _computeNceOutputMax512Threads(
|
||||
const ElemType* col,
|
||||
int numRows,
|
||||
int sampleCount,
|
||||
|
@ -3479,7 +3479,7 @@ __global__ void _computeNceOutput512Threads(
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
__global__ void _assignSoftmaxSum512Threads(
|
||||
__global__ void _assignSoftmaxSumMax512Threads(
|
||||
const ElemType* softmax,
|
||||
int sampleCount,
|
||||
const ElemType* a,
|
||||
|
@ -3491,7 +3491,7 @@ __global__ void _assignSoftmaxSum512Threads(
|
|||
// col is an array that contains the indices of the word samples
|
||||
// a is a matrix in column-major format that contains the output from the hidden layer
|
||||
// b is the weight matrix for output layer
|
||||
// tmp is the buffer that stores NCE output calculated from _computeNceOutput512Threads
|
||||
// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
|
||||
// c is the matrix to store objective
|
||||
|
||||
__shared__ ElemType partials[512];
|
||||
|
@ -3531,7 +3531,7 @@ __global__ void _assignSoftmaxSum512Threads(
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
__global__ void _assignNoiseContrastiveEstimation512Threads(
|
||||
__global__ void _assignNoiseContrastiveEstimationMax512Threads(
|
||||
const ElemType* val,
|
||||
int numRows,
|
||||
int sampleCount,
|
||||
|
@ -3547,7 +3547,7 @@ __global__ void _assignNoiseContrastiveEstimation512Threads(
|
|||
// col is an array that contains the indices of the word samples
|
||||
// a is a matrix in column-major format that contains the output from the hidden layer
|
||||
// b is the weight matrix for output layer
|
||||
// tmp is the buffer that stores NCE output calculated from _computeNceOutput512Threads
|
||||
// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
|
||||
// c is the matrix to store objective
|
||||
|
||||
__shared__ ElemType partials[512];
|
||||
|
@ -4742,7 +4742,7 @@ __global__ void _rcrfBackwardComputeZeta1024Threads(
|
|||
|
||||
/// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
|
||||
template <class ElemType>
|
||||
__global__ void _rcrfTransGrdComputeZeta(
|
||||
__global__ void _rcrfTransGrdComputeZeta1024Threads(
|
||||
const int t, // time position
|
||||
const size_t iNumPos,
|
||||
const ElemType* galpha, // column slice at current time t
|
||||
|
|
Загрузка…
Ссылка в новой задаче