Frank Seide 2016-07-26 18:03:52 -07:00
Parent 8f716986ae
Commit ee23bb2000
4 changed files with 20 additions and 20 deletions

View file

@@ -5788,14 +5788,14 @@ void CPUMatrix<ElemType>::RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha,
 #pragma omp parallel for
 for (int k = 0; k < iNumLab; k++)
 {
-_rcrfBackwardCompute1024Threads(t, k, alpha, beta, pair_scores);
+_rcrfBackwardCompute(t, k, alpha, beta, pair_scores);
 }
 }
 };
 /// the kernel function for RCRF backward computation
 template <class ElemType>
-void CPUMatrix<ElemType>::_rcrfBackwardCompute1024Threads(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
+void CPUMatrix<ElemType>::_rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
 CPUMatrix<ElemType>& beta,
 const CPUMatrix<ElemType>& pair_scores)
 {
@@ -5859,7 +5859,7 @@ void CPUMatrix<ElemType>::RCRFTransGrdCompute(const CPUMatrix<ElemType>& lbls,
 #pragma omp parallel for
 for (int i = 0; i < iNumLab; i++)
 {
-_rcrfTransGrdCompute1024Threads(i, lbls, alpha, beta, pair_scores, grd, tPos);
+_rcrfTransGrdCompute(i, lbls, alpha, beta, pair_scores, grd, tPos);
 }
 // transition score
@@ -5891,7 +5891,7 @@ void CPUMatrix<ElemType>::RCRFTransGrdCompute(const CPUMatrix<ElemType>& lbls,
 };
 template <class ElemType>
-void CPUMatrix<ElemType>::_rcrfTransGrdCompute1024Threads(size_t i,
+void CPUMatrix<ElemType>::_rcrfTransGrdCompute(size_t i,
 const CPUMatrix<ElemType>& lbls,
 const CPUMatrix<ElemType>& alpha,
 const CPUMatrix<ElemType>& beta,

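Context for the CPU-side renames above: in this codebase the `1024Threads` suffix tags CUDA kernels whose hard-coded `__shared__` sizes pin the launch dimension, so the suffix is dropped from these OpenMP-dispatched member functions, which carry no such constraint. One wrinkle in the dispatch pattern is that the helper takes `size_t` while the loop index is a signed `int`; OpenMP 2.0, the level MSVC implements, only accepts signed integral loop variables in `parallel for`. A minimal sketch of that pattern, with a hypothetical `PerLabelStep` standing in for the per-label helper:

#include <cstddef>

// Hypothetical stand-in for the per-label helper (_rcrfBackwardCompute et al.).
static void PerLabelStep(std::size_t /*k*/) { /* one label's update */ }

void FanOutOverLabels(int iNumLab)
{
// OpenMP 2.0 requires a signed loop index, hence int rather than size_t.
#pragma omp parallel for
    for (int k = 0; k < iNumLab; k++)
        PerLabelStep(static_cast<std::size_t>(k)); // widen for the callee
}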
View file

@@ -486,7 +486,7 @@ public:
 static void RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta,
 const CPUMatrix<ElemType>& lbls,
 const CPUMatrix<ElemType>& pair_scores);
-static void _rcrfBackwardCompute1024Threads(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
+static void _rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
 CPUMatrix<ElemType>& beta,
 const CPUMatrix<ElemType>& pair_scores);
@@ -496,7 +496,7 @@ public:
 const CPUMatrix<ElemType>& pair_scores,
 CPUMatrix<ElemType>& grd);
-static void _rcrfTransGrdCompute1024Threads(size_t i,
+static void _rcrfTransGrdCompute(size_t i,
 const CPUMatrix<ElemType>& lbls,
 const CPUMatrix<ElemType>& alpha,
 const CPUMatrix<ElemType>& beta,

View file

@@ -1950,7 +1950,7 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
 p = p / 2;
 // note: kernel has hard-coded dimension of 512
-_computeNceOutput512Threads<ElemType> << <GetNumElements() / 2, p >> >(
+_computeNceOutputMax512Threads<ElemType> << <GetNumElements() / 2, p >> >(
 Data(),
 sampleCount,
 m_numRows / 2,
@@ -1965,7 +1965,7 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
 p = p / 2;
 // summing up objective must be done in one block
 // note: kernel has hard-coded dimension of 512
-_assignNoiseContrastiveEstimation512Threads<ElemType> << <1, p >> >(
+_assignNoiseContrastiveEstimationMax512Threads<ElemType> << <1, p >> >(
 Data(),
 sampleCount,
 m_numRows / 2,
@@ -2011,7 +2011,7 @@ void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatr
 p = p / 2;
 // note: kernel has hard-coded dimension of 512
-_assignSoftmaxSum512Threads<ElemType> << <1, p >> >(
+_assignSoftmaxSumMax512Threads<ElemType> << <1, p >> >(
 my_a.Data(),
 width,
 Data(),
@@ -4293,11 +4293,11 @@ void GPUMatrix<ElemType>::RCRFBackwardCompute(
 {
 szMemSize = sizeof(ElemType) * iNumLab;
 // note: kernel has hard-coded dimension of 1024
-_rcrfBackwardComputeZeta1024Threads<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
+_rcrfBackwardComputeZeta1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
 szMemSize = iNumLab * 3;
 szMemSize *= sizeof(ElemType);
 // note: kernel has hard-coded dimension of 1024
-_rcrfBackwardCompute1024Threads<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t, iNumPos, alpha.Data(), beta.Data(),
+_rcrfBackwardCompute1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t, iNumPos, alpha.Data(), beta.Data(),
 d_zeta, pair_scores.Data(), iNumLab, shift);
 }
 /*
@@ -4336,12 +4336,12 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
 {
 szMemSize = sizeof(ElemType) * iNumLab;
 // note: kernel has hard-coded dimension of 1024
-_rcrfTransGrdComputeZeta<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
+_rcrfTransGrdComputeZeta1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
 szMemSize = iNumLab * 3;
 szMemSize *= sizeof(ElemType);
 // note: kernel has hard-coded dimension of 1024
-_rcrfTransGrdCompute1024Threads<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t, startLbl, alpha.Data(), beta.Data(),
-d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
+_rcrfTransGrdCompute1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t, startLbl, alpha.Data(), beta.Data(),
+d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
 }
 TracingGPUMemoryAllocator::Free<ElemType>(alpha.GetComputeDeviceId(), d_zeta);
 };
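The launch-site change in the two RCRF hunks above swaps `GridDim::maxThreadsPerBlock` for the literal `1024`: the renamed kernels size their internal `__shared__` buffers for exactly 1024 threads, so the block dimension must match that hard-coded figure rather than whatever the device reports as its maximum. The third `<<<...>>>` argument, `szMemSize`, is CUDA's dynamic shared-memory byte count, consumed in the kernel through an `extern __shared__` array. A minimal sketch of that pattern; the kernel and the three-way split of the `3 * iNumLab` elements are assumptions for illustration, not CNTK's actual layout:

#include <cuda_runtime.h>

// Hypothetical kernel: the extern __shared__ array is sized at launch time
// by the third <<<...>>> argument. It is declared as bytes and cast, since a
// typed extern __shared__ name clashes across template instantiations.
template <class ElemType>
__global__ void _scratch1024Threads(int iNumLab)
{
    extern __shared__ unsigned char smem[];
    ElemType* bufA = reinterpret_cast<ElemType*>(smem); // iNumLab slots
    ElemType* bufB = bufA + iNumLab;                    // iNumLab slots
    ElemType* bufC = bufB + iNumLab;                    // iNumLab slots
    if (threadIdx.x < (unsigned)iNumLab)                // cooperative fill
        bufA[threadIdx.x] = bufB[threadIdx.x] = bufC[threadIdx.x] = ElemType(0);
    __syncthreads();
    // ... the real kernels run the RCRF recursion over such buffers ...
}

template <class ElemType>
void LaunchScratch(int blocksPerGrid, int iNumLab)
{
    size_t szMemSize = 3 * iNumLab * sizeof(ElemType);
    // block dimension must be the hard-coded 1024 the kernel name advertises
    _scratch1024Threads<ElemType><<<blocksPerGrid, 1024, szMemSize>>>(iNumLab);
}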

View file

@@ -3410,7 +3410,7 @@ __global__ void computeNCEForwardProp512Threads(
 #endif
 template <class ElemType>
-__global__ void _computeNceOutput512Threads(
+__global__ void _computeNceOutputMax512Threads(
 const ElemType* col,
 int numRows,
 int sampleCount,
@@ -3479,7 +3479,7 @@ __global__ void _computeNceOutput512Threads(
 }
 template <class ElemType>
-__global__ void _assignSoftmaxSum512Threads(
+__global__ void _assignSoftmaxSumMax512Threads(
 const ElemType* softmax,
 int sampleCount,
 const ElemType* a,
@@ -3491,7 +3491,7 @@ __global__ void _assignSoftmaxSum512Threads(
 // col is an array contains index of the word samples
 // a is a matrix in column major format contains output from hidden layer
 // b is the weight matrix for output layer
-// tmp is the buffer that stores NCE output calculated from _computeNceOutput512Threads
+// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
 // c is the matrix to store objective
 __shared__ ElemType partials[512];
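The `Max512` spelling matches how these kernels are launched: the caller halves `p` until it fits, so `blockDim.x` may be any power of two up to the 512 slots of `partials`, and the objective sum is launched as `<<<1, p>>>` so a single block sees every partial. A sketch of that reduction shape under those assumptions; this is a hypothetical standalone kernel, not CNTK's actual NCE code:

// Single-block sum: correct for any power-of-two blockDim.x <= 512.
template <class ElemType>
__global__ void _sumMax512Threads(const ElemType* in, int n, ElemType* out)
{
    __shared__ ElemType partials[512]; // hard-coded upper bound on blockDim.x
    ElemType acc = 0;
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        acc += in[i];                  // each thread strides over the input
    partials[threadIdx.x] = acc;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) // tree reduction
    {
        if (threadIdx.x < s)
            partials[threadIdx.x] += partials[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        *out = partials[0];            // one launch block, one grand total
}
// e.g. _sumMax512Threads<ElemType><<<1, p>>>(data, n, result); // p = 2^k, p <= 512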
@@ -3531,7 +3531,7 @@ __global__ void _assignSoftmaxSum512Threads(
 }
 template <class ElemType>
-__global__ void _assignNoiseContrastiveEstimation512Threads(
+__global__ void _assignNoiseContrastiveEstimationMax512Threads(
 const ElemType* val,
 int numRows,
 int sampleCount,
@@ -3547,7 +3547,7 @@ __global__ void _assignNoiseContrastiveEstimation512Threads(
 // col is an array contains index of the word samples
 // a is a matrix in column major format contains output from hidden layer
 // b is the weight matrix for output layer
-// tmp is the buffer that stores NCE output calculated from _computeNceOutput512Threads
+// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
 // c is the matrix to store objective
 __shared__ ElemType partials[512];
@@ -4742,7 +4742,7 @@ __global__ void _rcrfBackwardComputeZeta1024Threads(
 /// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
 template <class ElemType>
-__global__ void _rcrfTransGrdComputeZeta(
+__global__ void _rcrfTransGrdComputeZeta1024Threads(
 const int t, // time position
 const size_t iNumPos,
 const ElemType* galpha, // column slice at current time t
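The zeta comment carried into the hunk above is the usual CRF normalizer, $\zeta_t(j) = \sum_k \exp(\delta_{t-1}(k) + a_{kj}(t))$. Evaluated naively, the exponentials overflow for large scores; the standard remedy, and presumably what the log-space kernels amount to, is log-sum-exp with the maximum shifted out first. A host-side sketch of that identity (hypothetical helper, not the kernel body):

#include <algorithm>
#include <cmath>

// log zeta_t(j) for one label j: delta holds the previous time step's
// scores, aColJ the j-th column of the transition scores.
template <class ElemType>
ElemType LogZeta(const ElemType* delta, const ElemType* aColJ, int iNumLab)
{
    ElemType m = delta[0] + aColJ[0];
    for (int k = 1; k < iNumLab; k++)
        m = std::max(m, delta[k] + aColJ[k]);   // largest exponent
    ElemType s = 0;
    for (int k = 0; k < iNumLab; k++)
        s += std::exp(delta[k] + aColJ[k] - m); // every exponent is <= 0 now
    return m + std::log(s);                     // overflow-safe log of the sum
}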