This commit is contained in:
Yinggong ZHAO 2015-07-06 20:56:16 -07:00
Родитель b5038c7bf1
Коммит 204b879dfe
9 изменённых файлов: 97 добавлений и 17 удалений

Просмотреть файл

@ -2052,7 +2052,7 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
}
if (curDevId != CPUDEVICE)
{
labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);
labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, false, false, false);
}
}

Просмотреть файл

@ -1523,8 +1523,6 @@ protected:
trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize);
startReadMBTime = Timer::MilliSecondElapsed();
int a = 0;
if (a)
while (trainSetDataReader->GetMinibatch(inputMatrices))
{
#ifdef MPI_SUPPORT

Просмотреть файл

@ -948,10 +948,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (Inputs(0)->FunctionValues().GetNumRows() == 1)
{
for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
if (Inputs(0)->FunctionValues()(0, i) > 0)
positive++;
else if (Inputs(0)->FunctionValues()(0, i) < 0)
negative++;
{
if (Inputs(0)->FunctionValues()(0, i) > 0)
positive++;
else if (Inputs(0)->FunctionValues()(0, i) < 0)
negative++;
}
assert(positive * negative == 0);
}
if (m_evalMode == NCEEvalMode::Softmax || (Inputs(0)->FunctionValues().GetNumRows() == 1 && positive > 0))
@ -960,10 +962,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false);
m_logSoftmax += Inputs(3)->FunctionValues();
m_logSoftmax.InplaceLogSoftmax(false);
FunctionValues().Resize(1, 1);
FunctionValues().SetValue(0);
for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
FunctionValues()(0, 0) -= m_logSoftmax(i, (size_t)Inputs(0)->FunctionValues()(0, i));
FunctionValues().AssignSoftmaxSum(Inputs(0)->FunctionValues(), m_logSoftmax);
}
else if (m_evalMode == NCEEvalMode::Unnormalized || (Inputs(0)->FunctionValues().GetNumRows() == 1 && negative > 0))
{

Просмотреть файл

@ -3842,9 +3842,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return CPUMatrix<ElemType>::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c);
}
// AssignSoftmaxSum (CPU): for every column i of *this, reads the value stored at
// row 0, column i, truncates it to int, and uses it as the column index into
// row i of `softmax`. The selected entries are summed and the negated total is
// written to c(0, 0).
//   softmax - read-only matrix, accessed as softmax(instance, sample);
//             presumably holds log-softmax values — TODO confirm against caller
//   c       - output matrix; only element (0, 0) is written, and c is not
//             resized here, so it must already have at least one element
// NOTE(review): two signature lines follow. The body reads `softmax` and writes
// `c`, which matches only the second signature; the first looks like the
// pre-change signature left behind by a diff — confirm and remove it.
template<class ElemType>
void CPUMatrix<ElemType>::AssignSoftmaxSum(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& softmax)
void CPUMatrix<ElemType>::AssignSoftmaxSum(const CPUMatrix<ElemType>& softmax, CPUMatrix<ElemType>& c)
{
// Running total of the selected softmax entries; negated on output.
ElemType log_likelihood = 0.0;
// One instance per column of *this.
size_t batch_size = this->GetNumCols();
// Each OpenMP thread keeps a private partial sum that is combined with + at the end.
// NOTE(review): instance_id is int while batch_size is size_t, so the loop
// condition is a mixed signed/unsigned comparison.
#pragma omp parallel for reduction(+:log_likelihood)
for (int instance_id = 0; instance_id < batch_size; instance_id++)
{
// Row 0 of *this stores the index to pick for this instance (truncated to int).
int sample = (int)(*this)(0, instance_id);
log_likelihood += softmax(instance_id, sample);
}
// Store the negated sum in the single output cell.
c(0, 0) = -log_likelihood;
}
template<class ElemType>

Просмотреть файл

@ -1930,9 +1930,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
// AssignSoftmaxSum (GPU host side): launches the _assignSoftmaxSum kernel with a
// single block and hands it, in order, the device array of `a`, the row count of
// `a`, the device array of *this, and the device array of `c`.
//   a - read-only matrix; its array is the kernel's first argument and its row
//       count is the element count the kernel iterates over
//   c - output matrix; its array is the kernel's last argument
// When do_sync is set, a CUDA event is created, recorded after the launch,
// waited on, and destroyed, so the call returns only after the kernel finished.
// NOTE(review): two signature lines follow. The body uses `c`, which exists only
// in the second signature; the first looks like the pre-change signature left
// behind by a diff — confirm and remove it.
// NOTE(review): there is no error query after the kernel launch in this function.
template<class ElemType>
void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& softmax)
void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
{
// presumably introduces my_a as a non-const alias of a — verify against the UNCONST macro
UNCONST(ElemType, a, my_a);
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
// Thread count for the single block: start at 512 and keep halving while half
// of it still exceeds the row count, so p always stays a power of two.
int p = 512;
int width = a.GetNumRows();
while (p / 2 > width) p = p / 2;
// Grid of 1 block, p threads.
_assignSoftmaxSum<ElemType> << <1, p >> >(
my_a.GetArray(),
width,
GetArray(),
c.GetArray()
);
// Optional blocking wait for the kernel, followed by event cleanup.
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
template<class ElemType>
void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)

Просмотреть файл

@ -294,7 +294,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c);
void AssignNCEDerivative(GPUMatrix<ElemType>& tmp, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, size_t inputIndex, GPUMatrix<ElemType>& c);
void AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
void AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& softmax);
void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
void Print(const char* matrixName = NULL) const; //print whole matrix. can be expensive

Просмотреть файл

@ -2868,6 +2868,59 @@ __global__ void _computeNceOutput(
}
}
// Computes c[0] = -( sum over i in [0, sampleCount) of
//                    softmax[IDX2C(i, (int)a[i], sampleCount)] ).
//   softmax     - read-only input, addressed through IDX2C(i, column, sampleCount)
//   sampleCount - number of entries of `a` to process; also the last argument
//                 handed to IDX2C
//   a           - read-only input; a[i] is truncated to int and used as the
//                 column index for entry i
//   c           - output; only c[0] is written
// Launch layout: only threadIdx.x and blockDim.x are consulted, so the kernel is
// written for a single block of at most 512 threads (the size of the shared
// scratch array). The halving reduction at the end folds in every partial sum
// only when blockDim.x is a power of two.
template<class ElemType>
__global__ void _assignSoftmaxSum(
    const ElemType* softmax,
    int sampleCount,
    const ElemType* a,
    ElemType* c) // expects one block of at most 512 threads
{
    // One slot per thread for the block-wide reduction.
    __shared__ ElemType partials[512];

    // Split [0, sampleCount) into contiguous chunks, one chunk per thread.
    const int chunk = (sampleCount + blockDim.x - 1) / blockDim.x;
    const int first = chunk * threadIdx.x;
    const int last = min(sampleCount, chunk * (threadIdx.x + 1));

    // Accumulate this thread's share locally, then publish it to shared memory.
    ElemType threadSum = 0;
    for (int i = first; i < last; ++i)
    {
        const int column = (int)a[i];
        threadSum += softmax[IDX2C(i, column, sampleCount)];
    }
    partials[threadIdx.x] = threadSum;
    __syncthreads();

    // Tree reduction: on every pass the lower half of the active range absorbs
    // the upper half. The barrier sits outside the branch so all threads reach it.
    for (int stride = (blockDim.x >> 1); stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
            partials[threadIdx.x] += partials[threadIdx.x + stride];
        __syncthreads();
    }

    // Thread 0 now holds the full sum; write out its negation.
    if (threadIdx.x == 0)
        c[0] = -partials[0];
}
template<class ElemType>
__global__ void _assignNoiseContrastiveEstimation(
const ElemType* val,

Просмотреть файл

@ -747,9 +747,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define NUM_MATRIXTYPE_CHANGED_WARN 20
m_numTimesMatrixTypeChanged++;
if (m_numTimesMatrixTypeChanged == NUM_MATRIXTYPE_CHANGED_WARN)
fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
{
fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
}
if (GetDeviceId()<0) //CPU
{
if (newMatrixType==MatrixType::SPARSE)

Просмотреть файл

@ -1067,6 +1067,10 @@ namespace Microsoft {
}
// Empty-bodied definition of GPUMatrix<ElemType>::AssignSoftmaxSum: it accepts
// the same arguments as the real implementation but performs no work and leaves
// `c` untouched.
// NOTE(review): presumably the stub used when building without GPU support —
// confirm from the enclosing file.
template<class ElemType>
void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
{
}
template<class ElemType>
void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)