update

2019-05-20 14:26:59 +08:00 · 2019-05-20 14:26:59 +08:00 · 453ff07438
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@ -1234,9 +1234,7 @@ void* DistributedGatheredLabels<ElemType>::initializeNodePtr(NULL);
 template <class ElemType>
 shared_ptr<Matrix<ElemType>> DistributedGatheredLabels<ElemType>::m_gatheredLabels;
 template <class ElemType>
-shared_ptr<Matrix<ElemType>> DistributedGatheredLabels<ElemType>::m_labelsIndex;
-template <class ElemType>
-shared_ptr<Matrix<ElemType>> DistributedGatheredLabels<ElemType>::m_labelsValue;
+shared_ptr<Matrix<ElemType>> DistributedGatheredLabels<ElemType>::m_labels;
 template <class ElemType>
 size_t DistributedGatheredLabels<ElemType>::m_minibatchSize(0);

--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@ -2259,10 +2259,10 @@ template <class ElemType>
 class DistributedGatheredLabels
 {
 public:
-    static void gatherDistributedLabels(const Matrix<ElemType>& labels)
+    static void gatherDistributedLabels(const Matrix<ElemType>& oneHotLabels)
    {
-        labels.VectorMax(*m_labelsIndex, *m_labelsValue, true);
-        m_distGradAggPtr->DistributedAllGather(*m_labelsIndex, *m_gatheredLabels, m_minibatchSize);
+        Matrix<ElemType>::GetDenseLabelsFromOneHot(oneHotLabels, *m_labels);
+        m_distGradAggPtr->DistributedAllGather(*m_labels, *m_gatheredLabels, m_minibatchSize);
    }

    static void setMinibatchSize(size_t minibatchSize)
@ -2273,8 +2273,7 @@ public:
            m_distGradAggPtr = (IDistGradAggregator<ElemType>*) Globals::GetDistGradAggPtr();
        }
        m_gatheredLabels->Resize(1, m_minibatchSize * Globals::GetProcessNum());
-        m_labelsIndex->Resize(1, m_minibatchSize);
-        m_labelsValue->Resize(1, m_minibatchSize);
+        m_labels->Resize(1, m_minibatchSize);
    }

    static void setInitializeNode(void* nodePtr)
@ -2291,8 +2290,7 @@ public:
    static IDistGradAggregator<ElemType>* m_distGradAggPtr;
    static void* initializeNodePtr;
    static shared_ptr<Matrix<ElemType>> m_gatheredLabels;
-    static shared_ptr<Matrix<ElemType>> m_labelsIndex;
-    static shared_ptr<Matrix<ElemType>> m_labelsValue;
+    static shared_ptr<Matrix<ElemType>> m_labels;
    static size_t m_minibatchSize;
 };

--- a/Source/ComputationNetworkLib/EvaluationNodes.h
+++ b/Source/ComputationNetworkLib/EvaluationNodes.h
@ -231,8 +231,7 @@ public:
        if (DistributedGatheredLabels<ElemType>::isInitializeNode(this))
        {
            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_gatheredLabels, matrixPool);
-            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_labelsIndex, matrixPool, 1, true);
-            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_labelsValue, matrixPool, 1, true);
+            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_labels, matrixPool, 1, true);
        }
    }

@ -244,8 +243,7 @@ public:
        if (DistributedGatheredLabels<ElemType>::isInitializeNode(this))
        {
            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_gatheredLabels, matrixPool);
-            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_labelsIndex, matrixPool);
-            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_labelsValue, matrixPool);
+            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_labels, matrixPool);
        }
    }

--- a/Source/ComputationNetworkLib/TrainingNodes.h
+++ b/Source/ComputationNetworkLib/TrainingNodes.h
@ -495,8 +495,7 @@ public:
        if (DistributedGatheredLabels<ElemType>::isInitializeNode(this))
        {
            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_gatheredLabels, matrixPool);
-            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_labelsIndex, matrixPool, 1, true);
-            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_labelsValue, matrixPool, 1, true);
+            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_labels, matrixPool, 1, true);
        }
    }

@ -511,8 +510,7 @@ public:
        if (DistributedGatheredLabels<ElemType>::isInitializeNode(this))
        {
            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_gatheredLabels, matrixPool);
-            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_labelsIndex, matrixPool);
-            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_labelsValue, matrixPool);
+            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_labels, matrixPool);
        }
    }

@ -686,8 +684,7 @@ public:
        if (DistributedGatheredLabels<ElemType>::isInitializeNode(this))
        {
            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_gatheredLabels, matrixPool);
-            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_labelsIndex, matrixPool, 1, true);
-            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_labelsValue, matrixPool, 1, true);
+            RequestMatrixFromPool(DistributedGatheredLabels<ElemType>::m_labels, matrixPool, 1, true);
        }
    }

@ -701,8 +698,7 @@ public:
        if (DistributedGatheredLabels<ElemType>::isInitializeNode(this))
        {
            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_gatheredLabels, matrixPool);
-            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_labelsIndex, matrixPool);
-            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_labelsValue, matrixPool);
+            ReleaseMatrixToPool(DistributedGatheredLabels<ElemType>::m_labels, matrixPool);
        }
    }

--- a/Source/Math/CPUMatrix.h
+++ b/Source/Math/CPUMatrix.h
@ -487,6 +487,8 @@ public:

 #pragma region DistributedFC

+    static void GetDenseLabelsFromOneHot(const CPUMatrix<ElemType>& oneHotLabels, const CPUMatrix<ElemType>& labels);
+
    static void Scatter(const CPUMatrix<ElemType>& src, const CPUMatrix<ElemType>& dst, size_t minibatchSize, size_t rank, size_t processNum);

    static void AddColumnVector(const CPUMatrix<ElemType>& src, const CPUMatrix<ElemType>& columnVector, const CPUMatrix<ElemType>& dst);
--- a/Source/Math/CPUMatrixImpl.h
+++ b/Source/Math/CPUMatrixImpl.h
@ -5457,7 +5457,7 @@ void CPUMatrix<ElemType>::LabelSmoothing(const CPUMatrix<ElemType>& label, ElemT
    // handle remaining stuffs
    for (long i = numElements & ~3; i < numElements; i++)
    {
-        if (labelPtr[i] == (ElemType)0)
+        if (labelPtr[i] == (ElemType)0.5)
            labelPtr[i] = smoothValue;
        else
            labelPtr[i] = labelPtr[i] * keepRate + smoothValue;
@ -5468,6 +5468,36 @@ void CPUMatrix<ElemType>::LabelSmoothing(const CPUMatrix<ElemType>& label, ElemT

 #pragma region DistributedFC

+// For each column j of oneHotLabels, stores in labels(0, j) the row index of the last entry
+// above 0.5; a column with no such entry leaves labels(0, j) untouched.
+// The 0.5 cut-off mirrors the GPU kernel _getDenseLabelsFromOneHot so both backends agree.
+// NOTE(review): labels is assumed to be pre-sized to at least 1 x cols - confirm at callers.
+template <class ElemType>
+void CPUMatrix<ElemType>::GetDenseLabelsFromOneHot(const CPUMatrix<ElemType>& oneHotLabels, const CPUMatrix<ElemType>& labels)
+{
+    const long rows = (long)oneHotLabels.GetNumRows();
+    const long cols = (long)oneHotLabels.GetNumCols();
+    const long unrolledRows = rows & ~3; // largest multiple of four not exceeding rows
+    const ElemType threshold = (ElemType)0.5;
+#pragma omp parallel for
+    for (long j = 0; j < cols; ++j)
+    {
+        // four-way unrolling; rows are visited in ascending order, so the highest match wins
+        for (long i = 0; i < unrolledRows; i += 4)
+        {
+            if (oneHotLabels(i, j) > threshold) labels(0, j) = (ElemType)i;
+            if (oneHotLabels(i + 1, j) > threshold) labels(0, j) = (ElemType)(i + 1);
+            if (oneHotLabels(i + 2, j) > threshold) labels(0, j) = (ElemType)(i + 2);
+            if (oneHotLabels(i + 3, j) > threshold) labels(0, j) = (ElemType)(i + 3);
+        }
+        // handle the remaining (rows % 4) rows
+        for (long i = unrolledRows; i < rows; ++i)
+        {
+            if (oneHotLabels(i, j) > threshold) labels(0, j) = (ElemType)i;
+        }
+    }
+}
+
 template <class ElemType>
 void CPUMatrix<ElemType>::Scatter(const CPUMatrix<ElemType>& src, const CPUMatrix<ElemType>& dst, size_t minibatchSize, size_t rank, size_t processNum)
 {
--- a/Source/Math/GPUMatrix.cu
+++ b/Source/Math/GPUMatrix.cu
@ -3852,6 +3852,30 @@ void GPUMatrix<ElemType>::LabelSmoothing(const GPUMatrix<ElemType>& label, ElemT

 #pragma region DistributedFC

+// Per thread: if element idx of oneHotLabels exceeds 0.5, stores idx % rows into labels[idx / rows] (presumably row-within-column of a column-major buffer - confirm).
+template <class ElemType>
+__global__ void _getDenseLabelsFromOneHot(ElemType* oneHotLabels, ElemType* labels, CUDA_LONG rows, CUDA_LONG numElements)
+{
+    const CUDA_LONG idx = GridDim::GetLinearThreadId();
+    if (idx >= numElements)
+        return;
+    if (oneHotLabels[idx] > (ElemType)0.5)
+        labels[idx / rows] = idx % rows;
+}
+
+// Launches _getDenseLabelsFromOneHot with one thread per element of oneHotLabels, writing into labels; does nothing for an empty input.
+template <class ElemType>
+void GPUMatrix<ElemType>::GetDenseLabelsFromOneHot(const GPUMatrix<ElemType>& oneHotLabels, const GPUMatrix<ElemType>& labels)
+{
+    const CUDA_LONG numElements = (CUDA_LONG)oneHotLabels.GetNumElements();
+    const CUDA_LONG rows = (CUDA_LONG)oneHotLabels.GetNumRows();
+    if (numElements == 0) return; // a grid of 0 blocks is not a valid launch configuration
+    // round up so that every element is covered by a thread
+    const int blocksPerGrid = (int)ceil(1.0 * numElements / GridDim::maxThreadsPerBlock);
+    oneHotLabels.PrepareDevice();
+    SyncGuard syncGuard;
+    _getDenseLabelsFromOneHot<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream >> > (oneHotLabels.Data(), labels.Data(), rows, numElements);
+}
+
 template <class ElemType>
 __global__ void _scatter(ElemType* src, ElemType* dst, CUDA_LONG outputDim, CUDA_LONG minioutputDim, CUDA_LONG blockSize, CUDA_LONG blockOffset, CUDA_LONG numElements)
 {
--- a/Source/Math/GPUMatrix.h
+++ b/Source/Math/GPUMatrix.h
@ -603,6 +603,8 @@ public:

 #pragma region DistributedFC

+    static void GetDenseLabelsFromOneHot(const GPUMatrix<ElemType>& oneHotLabels, const GPUMatrix<ElemType>& labels);
+
    static void Scatter(const GPUMatrix<ElemType>& src, const GPUMatrix<ElemType>& dst, size_t minibatchSize, size_t rank, size_t processNum);

    static void AddColumnVector(const GPUMatrix<ElemType>& src, const GPUMatrix<ElemType>& columnVector, const GPUMatrix<ElemType>& dst);
--- a/Source/Math/Matrix.cpp
+++ b/Source/Math/Matrix.cpp
@ -5206,6 +5206,17 @@ void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Ma

 #pragma region DistributedFC

+// Forwards to the CPUMatrix or GPUMatrix implementation, selected by DISPATCH_MATRIX_ON_FLAG on `labels`;
+// the last two macro slots are NOT_IMPLEMENTED. NOTE(review): assumes oneHotLabels lives alongside labels - confirm.
+template <class ElemType>
+/*static*/ void Matrix<ElemType>::GetDenseLabelsFromOneHot(const Matrix<ElemType>& oneHotLabels, const Matrix<ElemType>& labels)
+{
+    DISPATCH_MATRIX_ON_FLAG(&labels, &labels,
+        CPUMatrix<ElemType>::GetDenseLabelsFromOneHot(*(oneHotLabels.m_CPUMatrix), *(labels.m_CPUMatrix)),
+        GPUMatrix<ElemType>::GetDenseLabelsFromOneHot(*(oneHotLabels.m_GPUMatrix), *(labels.m_GPUMatrix)),
+        NOT_IMPLEMENTED, NOT_IMPLEMENTED);
+}
+
 template <class ElemType>
 /*static*/ void Matrix<ElemType>::Scatter(const Matrix<ElemType>& src, const Matrix<ElemType>& dst, size_t minibatchSize, size_t rank, size_t processNum)
 {
--- a/Source/Math/Matrix.h
+++ b/Source/Math/Matrix.h
@ -646,6 +646,8 @@ public:

 #pragma region DistributedFC

+    static void GetDenseLabelsFromOneHot(const Matrix<ElemType>& oneHotLabels, const Matrix<ElemType>& labels);
+
    static void Scatter(const Matrix<ElemType>& src, const Matrix<ElemType>& dst, size_t minibatchSize, size_t rank, size_t processNum);

    static void AddColumnVector(const Matrix<ElemType>& src, const Matrix<ElemType>& columnVector, const Matrix<ElemType>& dst);
--- a/Source/Math/NoGPU.cpp
+++ b/Source/Math/NoGPU.cpp
@ -2175,6 +2175,11 @@ void GPUMatrix<ElemType>::LabelSmoothing(const GPUMatrix<ElemType>& label, ElemT

 #pragma region DistributedFC

+template <class ElemType>
+void GPUMatrix<ElemType>::GetDenseLabelsFromOneHot(const GPUMatrix<ElemType>& oneHotLabels, const GPUMatrix<ElemType>& labels)
+{ // Empty stub defined in NoGPU.cpp: performs no work and leaves both arguments untouched.
+}
+
 template <class ElemType>
 void GPUMatrix<ElemType>::Scatter(const GPUMatrix<ElemType>& src, const GPUMatrix<ElemType>& dst, size_t minibatchSize, size_t rank, size_t processNum)
 {