Finish GPU NCE training

2015-07-06 20:56:16 -07:00 · 2015-07-06 20:56:16 -07:00 · 204b879dfe
--- a/DataReader/LMSequenceReader/SequenceReader.cpp
+++ b/DataReader/LMSequenceReader/SequenceReader.cpp
@ -2052,7 +2052,7 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
    }
    if (curDevId != CPUDEVICE)
    {
-        labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);
+        labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, false, false, false);
    }
 }

--- a/MachineLearning/CNTK/SGD.h
+++ b/MachineLearning/CNTK/SGD.h
@ -1523,8 +1523,6 @@ protected:
        trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize);

        startReadMBTime = Timer::MilliSecondElapsed();
-        int a = 0;
-        if (a)
        while (trainSetDataReader->GetMinibatch(inputMatrices))
        {
 #ifdef MPI_SUPPORT
--- a/MachineLearning/CNTK/TrainingCriterionNodes.h
+++ b/MachineLearning/CNTK/TrainingCriterionNodes.h
@ -948,10 +948,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            if (Inputs(0)->FunctionValues().GetNumRows() == 1)
            {
                for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
-                if (Inputs(0)->FunctionValues()(0, i) > 0)
-                    positive++;
-                else if (Inputs(0)->FunctionValues()(0, i) < 0)
-                    negative++;
+                {
+                    if (Inputs(0)->FunctionValues()(0, i) > 0)
+                        positive++;
+                    else if (Inputs(0)->FunctionValues()(0, i) < 0)
+                        negative++;
+                }
                assert(positive * negative == 0);
            }
            if (m_evalMode == NCEEvalMode::Softmax || (Inputs(0)->FunctionValues().GetNumRows() == 1 && positive > 0))
@ -960,10 +962,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false);
                m_logSoftmax += Inputs(3)->FunctionValues();
                m_logSoftmax.InplaceLogSoftmax(false);
-                FunctionValues().Resize(1, 1);
-                FunctionValues().SetValue(0);
-                for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
-                    FunctionValues()(0, 0) -= m_logSoftmax(i, (size_t)Inputs(0)->FunctionValues()(0, i));
+                FunctionValues().AssignSoftmaxSum(Inputs(0)->FunctionValues(), m_logSoftmax);
            }
            else if (m_evalMode == NCEEvalMode::Unnormalized || (Inputs(0)->FunctionValues().GetNumRows() == 1 && negative > 0))
            {
--- a/Math/Math/CPUMatrix.cpp
+++ b/Math/Math/CPUMatrix.cpp
@ -3842,9 +3842,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        return CPUMatrix<ElemType>::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c);
    }
    template<class ElemType>
-    void CPUMatrix<ElemType>::AssignSoftmaxSum(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& softmax)
+    void CPUMatrix<ElemType>::AssignSoftmaxSum(const CPUMatrix<ElemType>& softmax, CPUMatrix<ElemType>& c)
    {
-
+        ElemType log_likelihood = 0.0;
+        size_t batch_size = this->GetNumCols();
+#pragma omp parallel for reduction(+:log_likelihood)
+        for (int instance_id = 0; instance_id < batch_size; instance_id++)
+        {
+            int sample = (int)(*this)(0, instance_id);
+            log_likelihood += softmax(instance_id, sample);
+        }
+        c(0, 0) = -log_likelihood;
    }

    template<class ElemType>
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@ -1930,9 +1930,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
    template<class ElemType>
-    void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& softmax)
+    void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
    {
+        UNCONST(ElemType, a, my_a);
+        cudaEvent_t done = nullptr;
+        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+        int p = 512;
+        int width = a.GetNumRows();
+        while (p / 2 > width) p = p / 2;

+        _assignSoftmaxSum<ElemType> << <1, p >> >(
+            my_a.GetArray(),
+            width,
+            GetArray(),
+            c.GetArray()
+            );
+
+        if (do_sync) CUDA_CALL(cudaEventRecord(done));
+        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
+        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
    template<class ElemType>
    void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
--- a/Math/Math/GPUMatrix.h
+++ b/Math/Math/GPUMatrix.h
@ -294,7 +294,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c);
        void AssignNCEDerivative(GPUMatrix<ElemType>& tmp, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, size_t inputIndex, GPUMatrix<ElemType>& c);    
        void AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
-
+        void AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& softmax);

        void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
        void Print(const char* matrixName = NULL) const; //print whole matrix. can be expensive
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@ -2868,6 +2868,59 @@ __global__ void _computeNceOutput(
    }
 }

+
+template<class ElemType>
+__global__ void _assignSoftmaxSum(
+    const ElemType* softmax,    
+    int sampleCount,
+    const ElemType* a, 
+    ElemType* c) // run on 512 threads per block
+{
+    // val and col are in CSR format
+    // val is an array contains log_Pn(w). To differentiate positive and negative samples, 
+    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
+    // col is an array contains index of the word samples
+    // a is a matrix in column major format contains output from hidden layer
+    // b is the weight matrix for output layer
+    // tmp is the buffer that stores NCE output calculated from _computeNceOutput
+    // c is the matrix to store objective
+
+    __shared__ ElemType partials[512];
+    partials[threadIdx.x] = 0;
+
+    int total = sampleCount;
+    int loadPerThread = (total + blockDim.x - 1) / blockDim.x;
+
+    // find out the items this thread is responsible for
+    int start = loadPerThread * threadIdx.x;
+    int end = min(total, loadPerThread * (threadIdx.x + 1));    
+    for (int i = start; i < end; i++)
+    {
+        int wid = (int)a[i];
+        partials[threadIdx.x] += softmax[IDX2C(i, wid, sampleCount)];
+    }
+
+    __syncthreads();
+
+    // now sum up the objective function
+    int nTotalThreads = blockDim.x;
+
+    while (nTotalThreads >1)
+    {
+        int halfPoint = (nTotalThreads >> 1);
+
+        if (threadIdx.x < halfPoint)
+            partials[threadIdx.x] += partials[threadIdx.x + halfPoint];
+
+        __syncthreads();
+
+        nTotalThreads = (nTotalThreads >> 1);
+    }
+
+    if (threadIdx.x == 0)
+        c[0] = -partials[0];
+}
+
 template<class ElemType>
 __global__ void _assignNoiseContrastiveEstimation(
    const ElemType* val,
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@ -747,9 +747,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 #define NUM_MATRIXTYPE_CHANGED_WARN 20
        m_numTimesMatrixTypeChanged++;
+     
        if (m_numTimesMatrixTypeChanged == NUM_MATRIXTYPE_CHANGED_WARN)
-            fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
-
+        {            
+            fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);         
+        }      
        if (GetDeviceId()<0) //CPU
        {
            if (newMatrixType==MatrixType::SPARSE)
--- a/Math/Math/NoGPU.cpp
+++ b/Math/Math/NoGPU.cpp
@ -1067,6 +1067,10 @@ namespace Microsoft {

            }

+            template<class ElemType>
+            void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
+            {
+            }

            template<class ElemType>
            void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)