Frank Seide 2016-07-26 18:03:52 -07:00
Parent 8f716986ae
Commit ee23bb2000
4 changed files with 20 additions and 20 deletions

View file

@@ -5788,14 +5788,14 @@ void CPUMatrix<ElemType>::RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha,
 #pragma omp parallel for
 for (int k = 0; k < iNumLab; k++)
 {
-_rcrfBackwardCompute1024Threads(t, k, alpha, beta, pair_scores);
+_rcrfBackwardCompute(t, k, alpha, beta, pair_scores);
 }
 }
 };
 /// the kernel function for RCRF backward computation
 template <class ElemType>
-void CPUMatrix<ElemType>::_rcrfBackwardCompute1024Threads(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
+void CPUMatrix<ElemType>::_rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
 CPUMatrix<ElemType>& beta,
 const CPUMatrix<ElemType>& pair_scores)
 {
@@ -5859,7 +5859,7 @@ void CPUMatrix<ElemType>::RCRFTransGrdCompute(const CPUMatrix<ElemType>& lbls,
 #pragma omp parallel for
 for (int i = 0; i < iNumLab; i++)
 {
-_rcrfTransGrdCompute1024Threads(i, lbls, alpha, beta, pair_scores, grd, tPos);
+_rcrfTransGrdCompute(i, lbls, alpha, beta, pair_scores, grd, tPos);
 }
 // transition score
@@ -5891,7 +5891,7 @@ void CPUMatrix<ElemType>::RCRFTransGrdCompute(const CPUMatrix<ElemType>& lbls,
 };
 template <class ElemType>
-void CPUMatrix<ElemType>::_rcrfTransGrdCompute1024Threads(size_t i,
+void CPUMatrix<ElemType>::_rcrfTransGrdCompute(size_t i,
 const CPUMatrix<ElemType>& lbls,
 const CPUMatrix<ElemType>& alpha,
 const CPUMatrix<ElemType>& beta,

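Context for the CPU-side renames above: in this codebase the `1024Threads` suffix tags CUDA kernels whose hard-coded `__shared__` sizes pin the launch dimension, so the suffix is dropped from these OpenMP-dispatched member functions, which carry no such constraint. One wrinkle in the dispatch pattern is that the helper takes `size_t` while the loop index is a signed `int`; OpenMP 2.0, the level MSVC implements, only accepts signed integral loop variables in `parallel for`. A minimal sketch of that pattern, with a hypothetical `PerLabelStep` standing in for the per-label helper:

#include <cstddef>

// Hypothetical stand-in for the per-label helper (_rcrfBackwardCompute et al.).
static void PerLabelStep(std::size_t /*k*/) { /* one label's update */ }

void FanOutOverLabels(int iNumLab)
{
// OpenMP 2.0 requires a signed loop index, hence int rather than size_t.
#pragma omp parallel for
    for (int k = 0; k < iNumLab; k++)
        PerLabelStep(static_cast<std::size_t>(k)); // widen for the callee
}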
View file

@@ -486,7 +486,7 @@ public:
 static void RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta,
 const CPUMatrix<ElemType>& lbls,
 const CPUMatrix<ElemType>& pair_scores);
-static void _rcrfBackwardCompute1024Threads(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
+static void _rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
 CPUMatrix<ElemType>& beta,
 const CPUMatrix<ElemType>& pair_scores);
@@ -496,7 +496,7 @@ public:
 const CPUMatrix<ElemType>& pair_scores,
 CPUMatrix<ElemType>& grd);
-static void _rcrfTransGrdCompute1024Threads(size_t i,
+static void _rcrfTransGrdCompute(size_t i,
 const CPUMatrix<ElemType>& lbls,
 const CPUMatrix<ElemType>& alpha,
 const CPUMatrix<ElemType>& beta,

View file

@@ -1950,7 +1950,7 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
 p = p / 2;
 // note: kernel has hard-coded dimension of 512
-_computeNceOutput512Threads<ElemType> << <GetNumElements() / 2, p >> >(
+_computeNceOutputMax512Threads<ElemType> << <GetNumElements() / 2, p >> >(
 Data(),
 sampleCount,
 m_numRows / 2,
@@ -1965,7 +1965,7 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
 p = p / 2;
 // summing up objective must be done in one block
 // note: kernel has hard-coded dimension of 512
-_assignNoiseContrastiveEstimation512Threads<ElemType> << <1, p >> >(
+_assignNoiseContrastiveEstimationMax512Threads<ElemType> << <1, p >> >(
 Data(),
 sampleCount,
 m_numRows / 2,
@@ -2011,7 +2011,7 @@ void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatr
 p = p / 2;
 // note: kernel has hard-coded dimension of 512
-_assignSoftmaxSum512Threads<ElemType> << <1, p >> >(
+_assignSoftmaxSumMax512Threads<ElemType> << <1, p >> >(
 my_a.Data(),
 width,
 Data(),
@@ -4293,11 +4293,11 @@ void GPUMatrix<ElemType>::RCRFBackwardCompute(
 {
 szMemSize = sizeof(ElemType) * iNumLab;
 // note: kernel has hard-coded dimension of 1024
-_rcrfBackwardComputeZeta1024Threads<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
+_rcrfBackwardComputeZeta1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
 szMemSize = iNumLab * 3;
 szMemSize *= sizeof(ElemType);
 // note: kernel has hard-coded dimension of 1024
-_rcrfBackwardCompute1024Threads<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t, iNumPos, alpha.Data(), beta.Data(),
+_rcrfBackwardCompute1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t, iNumPos, alpha.Data(), beta.Data(),
 d_zeta, pair_scores.Data(), iNumLab, shift);
 }
 /*
@@ -4336,12 +4336,12 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
 {
 szMemSize = sizeof(ElemType) * iNumLab;
 // note: kernel has hard-coded dimension of 1024
-_rcrfTransGrdComputeZeta<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
+_rcrfTransGrdComputeZeta1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
 szMemSize = iNumLab * 3;
 szMemSize *= sizeof(ElemType);
 // note: kernel has hard-coded dimension of 1024
-_rcrfTransGrdCompute1024Threads<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize >> >(t, startLbl, alpha.Data(), beta.Data(),
-d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
+_rcrfTransGrdCompute1024Threads<ElemType> << <blocksPerGrid, 1024, szMemSize >> >(t, startLbl, alpha.Data(), beta.Data(),
+d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
 }
 TracingGPUMemoryAllocator::Free<ElemType>(alpha.GetComputeDeviceId(), d_zeta);
 };
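The launch-site change in the two RCRF hunks above swaps `GridDim::maxThreadsPerBlock` for the literal `1024`: the renamed kernels size their internal `__shared__` buffers for exactly 1024 threads, so the block dimension must match that hard-coded figure rather than whatever the device reports as its maximum. The third `<<<...>>>` argument, `szMemSize`, is CUDA's dynamic shared-memory byte count, consumed in the kernel through an `extern __shared__` array. A minimal sketch of that pattern; the kernel and the three-way split of the `3 * iNumLab` elements are assumptions for illustration, not CNTK's actual layout:

#include <cuda_runtime.h>

// Hypothetical kernel: the extern __shared__ array is sized at launch time
// by the third <<<...>>> argument. It is declared as bytes and cast, since a
// typed extern __shared__ name clashes across template instantiations.
template <class ElemType>
__global__ void _scratch1024Threads(int iNumLab)
{
    extern __shared__ unsigned char smem[];
    ElemType* bufA = reinterpret_cast<ElemType*>(smem); // iNumLab slots
    ElemType* bufB = bufA + iNumLab;                    // iNumLab slots
    ElemType* bufC = bufB + iNumLab;                    // iNumLab slots
    if (threadIdx.x < (unsigned)iNumLab)                // cooperative fill
        bufA[threadIdx.x] = bufB[threadIdx.x] = bufC[threadIdx.x] = ElemType(0);
    __syncthreads();
    // ... the real kernels run the RCRF recursion over such buffers ...
}

template <class ElemType>
void LaunchScratch(int blocksPerGrid, int iNumLab)
{
    size_t szMemSize = 3 * iNumLab * sizeof(ElemType);
    // block dimension must be the hard-coded 1024 the kernel name advertises
    _scratch1024Threads<ElemType><<<blocksPerGrid, 1024, szMemSize>>>(iNumLab);
}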

View file

@@ -3410,7 +3410,7 @@ __global__ void computeNCEForwardProp512Threads(
 #endif
 template <class ElemType>
-__global__ void _computeNceOutput512Threads(
+__global__ void _computeNceOutputMax512Threads(
 const ElemType* col,
 int numRows,
 int sampleCount,
@@ -3479,7 +3479,7 @@ __global__ void _computeNceOutput512Threads(
 }
 template <class ElemType>
-__global__ void _assignSoftmaxSum512Threads(
+__global__ void _assignSoftmaxSumMax512Threads(
 const ElemType* softmax,
 int sampleCount,
 const ElemType* a,
@@ -3491,7 +3491,7 @@ __global__ void _assignSoftmaxSum512Threads(
 // col is an array contains index of the word samples
 // a is a matrix in column major format contains output from hidden layer
 // b is the weight matrix for output layer
-// tmp is the buffer that stores NCE output calculated from _computeNceOutput512Threads
+// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
 // c is the matrix to store objective
 __shared__ ElemType partials[512];
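The `Max512` spelling matches how these kernels are launched: the caller halves `p` until it fits, so `blockDim.x` may be any power of two up to the 512 slots of `partials`, and the objective sum is launched as `<<<1, p>>>` so a single block sees every partial. A sketch of that reduction shape under those assumptions; this is a hypothetical standalone kernel, not CNTK's actual NCE code:

// Single-block sum: correct for any power-of-two blockDim.x <= 512.
template <class ElemType>
__global__ void _sumMax512Threads(const ElemType* in, int n, ElemType* out)
{
    __shared__ ElemType partials[512]; // hard-coded upper bound on blockDim.x
    ElemType acc = 0;
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        acc += in[i];                  // each thread strides over the input
    partials[threadIdx.x] = acc;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) // tree reduction
    {
        if (threadIdx.x < s)
            partials[threadIdx.x] += partials[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        *out = partials[0];            // one launch block, one grand total
}
// e.g. _sumMax512Threads<ElemType><<<1, p>>>(data, n, result); // p = 2^k, p <= 512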
@@ -3531,7 +3531,7 @@ __global__ void _assignSoftmaxSum512Threads(
 }
 template <class ElemType>
-__global__ void _assignNoiseContrastiveEstimation512Threads(
+__global__ void _assignNoiseContrastiveEstimationMax512Threads(
 const ElemType* val,
 int numRows,
 int sampleCount,
@@ -3547,7 +3547,7 @@ __global__ void _assignNoiseContrastiveEstimation512Threads(
 // col is an array contains index of the word samples
 // a is a matrix in column major format contains output from hidden layer
 // b is the weight matrix for output layer
-// tmp is the buffer that stores NCE output calculated from _computeNceOutput512Threads
+// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
 // c is the matrix to store objective
 __shared__ ElemType partials[512];
@@ -4742,7 +4742,7 @@ __global__ void _rcrfBackwardComputeZeta1024Threads(
 /// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
 template <class ElemType>
-__global__ void _rcrfTransGrdComputeZeta(
+__global__ void _rcrfTransGrdComputeZeta1024Threads(
 const int t, // time position
 const size_t iNumPos,
 const ElemType* galpha, // column slice at current time t
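The zeta comment carried into the hunk above is the usual CRF normalizer, $\zeta_t(j) = \sum_k \exp(\delta_{t-1}(k) + a_{kj}(t))$. Evaluated naively, the exponentials overflow for large scores; the standard remedy, and presumably what the log-space kernels amount to, is log-sum-exp with the maximum shifted out first. A host-side sketch of that identity (hypothetical helper, not the kernel body):

#include <algorithm>
#include <cmath>

// log zeta_t(j) for one label j: delta holds the previous time step's
// scores, aColJ the j-th column of the transition scores.
template <class ElemType>
ElemType LogZeta(const ElemType* delta, const ElemType* aColJ, int iNumLab)
{
    ElemType m = delta[0] + aColJ[0];
    for (int k = 1; k < iNumLab; k++)
        m = std::max(m, delta[k] + aColJ[k]);   // largest exponent
    ElemType s = 0;
    for (int k = 0; k < iNumLab; k++)
        s += std::exp(delta[k] + aColJ[k] - m); // every exponent is <= 0 now
    return m + std::log(s);                     // overflow-safe log of the sum
}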