Finish GPU NCE training
This commit is contained in:
Родитель
b5038c7bf1
Коммит
204b879dfe
|
@ -2052,7 +2052,7 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
|
|||
}
|
||||
if (curDevId != CPUDEVICE)
|
||||
{
|
||||
labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);
|
||||
labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, false, false, false);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1523,8 +1523,6 @@ protected:
|
|||
trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize);
|
||||
|
||||
startReadMBTime = Timer::MilliSecondElapsed();
|
||||
int a = 0;
|
||||
if (a)
|
||||
while (trainSetDataReader->GetMinibatch(inputMatrices))
|
||||
{
|
||||
#ifdef MPI_SUPPORT
|
||||
|
|
|
@ -948,10 +948,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (Inputs(0)->FunctionValues().GetNumRows() == 1)
|
||||
{
|
||||
for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
|
||||
if (Inputs(0)->FunctionValues()(0, i) > 0)
|
||||
positive++;
|
||||
else if (Inputs(0)->FunctionValues()(0, i) < 0)
|
||||
negative++;
|
||||
{
|
||||
if (Inputs(0)->FunctionValues()(0, i) > 0)
|
||||
positive++;
|
||||
else if (Inputs(0)->FunctionValues()(0, i) < 0)
|
||||
negative++;
|
||||
}
|
||||
assert(positive * negative == 0);
|
||||
}
|
||||
if (m_evalMode == NCEEvalMode::Softmax || (Inputs(0)->FunctionValues().GetNumRows() == 1 && positive > 0))
|
||||
|
@ -960,10 +962,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false);
|
||||
m_logSoftmax += Inputs(3)->FunctionValues();
|
||||
m_logSoftmax.InplaceLogSoftmax(false);
|
||||
FunctionValues().Resize(1, 1);
|
||||
FunctionValues().SetValue(0);
|
||||
for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
|
||||
FunctionValues()(0, 0) -= m_logSoftmax(i, (size_t)Inputs(0)->FunctionValues()(0, i));
|
||||
FunctionValues().AssignSoftmaxSum(Inputs(0)->FunctionValues(), m_logSoftmax);
|
||||
}
|
||||
else if (m_evalMode == NCEEvalMode::Unnormalized || (Inputs(0)->FunctionValues().GetNumRows() == 1 && negative > 0))
|
||||
{
|
||||
|
|
|
@ -3842,9 +3842,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
return CPUMatrix<ElemType>::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c);
|
||||
}
|
||||
template<class ElemType>
|
||||
void CPUMatrix<ElemType>::AssignSoftmaxSum(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& softmax)
|
||||
void CPUMatrix<ElemType>::AssignSoftmaxSum(const CPUMatrix<ElemType>& softmax, CPUMatrix<ElemType>& c)
{
    // Accumulates softmax(i, (int)(*this)(0, i)) over every column i of this
    // matrix and stores the negated total in c(0, 0).
    //
    // this    : row 0 is read; each entry is truncated to int and used as the
    //           column index into softmax
    // softmax : read at (i, sample) for each column i of this matrix
    // c       : only element (0, 0) is written; it is not resized here, so the
    //           caller must have made it at least 1 x 1

    ElemType log_likelihood = 0.0;

    // The loop index is a signed int, so convert the column count once instead
    // of comparing signed against size_t on every iteration.
    const int batch_size = (int)this->GetNumCols();

#pragma omp parallel for reduction(+:log_likelihood)
    for (int instance_id = 0; instance_id < batch_size; instance_id++)
    {
        int sample = (int)(*this)(0, instance_id);
        log_likelihood += softmax(instance_id, sample);
    }

    c(0, 0) = -log_likelihood;
}
|
||||
|
||||
template<class ElemType>
|
||||
|
|
|
@ -1930,9 +1930,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
}
|
||||
template<class ElemType>
|
||||
void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& softmax)
|
||||
void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
{
    // Launches _assignSoftmaxSum on a single block.
    //
    // a    : its buffer is passed as the kernel's `softmax` argument and its
    //        row count as the kernel's `sampleCount`
    // this : its buffer is passed as the kernel's `a` argument
    // c    : its buffer receives the kernel's output
    //
    // When do_sync is set, an event is recorded after the launch and waited on
    // before returning.

    // NOTE(review): UNCONST is a project macro; presumably it provides my_a as a
    // non-const alias of a -- confirm against its definition.
    UNCONST(ElemType, a, my_a);

    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));

    // Start from 512 threads and halve the block while half of it would still
    // exceed the row count; the result is always a power of two in [1, 512].
    int p = 512;
    int width = (int)a.GetNumRows();
    while (p / 2 > width) p = p / 2;

    _assignSoftmaxSum<ElemType> << <1, p >> >(
        my_a.GetArray(),
        width,
        GetArray(),
        c.GetArray()
        );

    // A kernel launch returns no status itself; pick up launch failures here
    // instead of letting them surface at some unrelated later call.
    CUDA_CALL(cudaGetLastError());

    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
|
||||
template<class ElemType>
|
||||
void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
|
||||
|
|
|
@ -294,7 +294,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c);
|
||||
void AssignNCEDerivative(GPUMatrix<ElemType>& tmp, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, size_t inputIndex, GPUMatrix<ElemType>& c);
|
||||
void AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
|
||||
|
||||
void AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& softmax);
|
||||
|
||||
void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
|
||||
void Print(const char* matrixName = NULL) const; //print whole matrix. can be expensive
|
||||
|
|
|
@ -2868,6 +2868,59 @@ __global__ void _computeNceOutput(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
template<class ElemType>
__global__ void _assignSoftmaxSum(
    const ElemType* softmax,
    int sampleCount,
    const ElemType* a,
    ElemType* c) // run on 512 threads per block
{
    // Computes c[0] = -sum_{i in [0, sampleCount)} softmax[IDX2C(i, (int)a[i], sampleCount)].
    //
    // softmax     : buffer indexed through IDX2C with sampleCount as the leading dimension
    // sampleCount : number of entries read from a (one term of the sum per entry)
    // a           : a[i] is truncated to int and used as the second IDX2C index
    // c           : only c[0] is written
    //
    // Launch requirements: blockDim.x must not exceed 512 (the size of the
    // shared scratch array below). Each block covers the full range and writes
    // c[0], so the kernel is meant to be launched with a single block.

    __shared__ ElemType partials[512];

    int total = sampleCount;
    int loadPerThread = (total + blockDim.x - 1) / blockDim.x;

    // find out the items this thread is responsible for
    int start = loadPerThread * threadIdx.x;
    int end = min(total, loadPerThread * (threadIdx.x + 1));

    // Accumulate in a register and publish once, rather than doing a
    // read-modify-write on shared memory for every item.
    ElemType threadSum = 0;
    for (int i = start; i < end; i++)
    {
        int wid = (int)a[i];
        // Operands are widened so the index arithmetic is not carried out in
        // int, where wid * sampleCount could overflow for large inputs.
        threadSum += softmax[IDX2C((size_t)i, (size_t)wid, (size_t)sampleCount)];
    }
    partials[threadIdx.x] = threadSum;

    __syncthreads();

    // Tree reduction of the per-thread sums into partials[0]. Rounding the
    // half point up keeps the middle element when the active count is odd, so
    // the result is also correct for block sizes that are not powers of two;
    // for powers of two this is the same pairing as a plain halving scheme.
    int nActive = blockDim.x;
    while (nActive > 1)
    {
        int halfPoint = (nActive + 1) >> 1;

        if ((int)threadIdx.x + halfPoint < nActive)
            partials[threadIdx.x] += partials[threadIdx.x + halfPoint];

        // Reached by every thread in the block on every iteration.
        __syncthreads();

        nActive = halfPoint;
    }

    if (threadIdx.x == 0)
        c[0] = -partials[0];
}
|
||||
|
||||
template<class ElemType>
|
||||
__global__ void _assignNoiseContrastiveEstimation(
|
||||
const ElemType* val,
|
||||
|
|
|
@ -747,9 +747,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
#define NUM_MATRIXTYPE_CHANGED_WARN 20
|
||||
m_numTimesMatrixTypeChanged++;
|
||||
|
||||
if (m_numTimesMatrixTypeChanged == NUM_MATRIXTYPE_CHANGED_WARN)
|
||||
fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
|
||||
|
||||
{
|
||||
fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
|
||||
}
|
||||
if (GetDeviceId()<0) //CPU
|
||||
{
|
||||
if (newMatrixType==MatrixType::SPARSE)
|
||||
|
|
|
@ -1067,6 +1067,10 @@ namespace Microsoft {
|
|||
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
// Empty definition of AssignSoftmaxSum: the body performs no work and leaves
// both arguments untouched.
// NOTE(review): presumably the placeholder used when building without GPU
// support -- confirm against the enclosing file.
void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
|
||||
{
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
|
||||
|
|
Загрузка…
Ссылка в новой задаче