diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 0f17fa622..25bb1aaf2 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -2110,7 +2110,7 @@ public: row_elements *= dims[i]; } - sourceGradient.ScatterToIndicesWithMask(outputGradient, indices, indicesMask, row_elements); + sourceGradient.ScatterToIndices(outputGradient, indices, row_elements, &indicesMask); } else { diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index f84adedd7..cedf0e8d0 100755 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -303,8 +303,7 @@ public: CPUMatrix& AssignOneHot(const CPUMatrix& a, vector& shape, size_t axis); CPUMatrix& GatherFromTarget(const CPUMatrix& indices, const CPUMatrix& target, size_t row_elements); - CPUMatrix& ScatterToIndices(const CPUMatrix& values, const CPUMatrix& indices, size_t row_elements); - CPUMatrix& ScatterToIndices(const CPUMatrix& values, const CPUMatrix& indices, const CPUMatrix& mask, size_t row_elements); + CPUMatrix& ScatterToIndices(const CPUMatrix& values, const CPUMatrix& indices, size_t row_elements, const CPUMatrix* mask = nullptr); bool IsEqualTo(const CPUMatrix& a, const ElemType threshold = 1e-8) const; @@ -593,7 +592,8 @@ protected: private: void Clear(); - void ScatterValues(ElemType* indices, ElemType* value, ElemType* data, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step = 1, char* mask = nullptr); + void ScatterValues(ElemType* indices, ElemType* value, ElemType* data, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step = 1); + void ScatterValues(ElemType* indices, ElemType* value, ElemType* data, char* mask, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step = 1); private: static int m_optimizationFlags; diff --git a/Source/Math/CPUMatrixImpl.h b/Source/Math/CPUMatrixImpl.h index 7b2a8fd75..db235ce78 100644 --- a/Source/Math/CPUMatrixImpl.h +++ b/Source/Math/CPUMatrixImpl.h @@ -3338,35 +3338,25 @@ CPUMatrix& CPUMatrix::GatherFromTarget(const CPUMatrix -CPUMatrix& CPUMatrix::ScatterToIndices(const CPUMatrix& values, const CPUMatrix& indices, size_t row_elements) +CPUMatrix& CPUMatrix::ScatterToIndices(const CPUMatrix& values, const CPUMatrix& indices, size_t row_elements, + const CPUMatrix* mask/*= nullptr*/) { if (indices.IsEmpty() || values.IsEmpty()) LogicError("ScatterToIndices: input matrix is empty."); + if (mask) + { + if (mask->IsEmpty()) + LogicError("ScatterToIndices: input matrix is empty."); + if (indices.GetNumRows() != mask->GetNumRows() || indices.GetNumCols() != mask->GetNumCols()) + LogicError("ScatterToIndices: indices matrix must have same shape with mask matrix."); + } ElemType* indicesBufPtr = indices.Data(); ElemType* valueBufPtr = values.Data(); + char* maskBufPtr = mask ? mask->Data() : nullptr; ElemType* buffer = Data(); - ScatterValues(indicesBufPtr, valueBufPtr, buffer, (ElemType)1, indices.GetNumElements(), row_elements, this->GetNumCols()); - - return *this; -} - -template -CPUMatrix& CPUMatrix::ScatterToIndices(const CPUMatrix& values, const CPUMatrix& indices, - const CPUMatrix& mask, size_t row_elements) -{ - if (indices.IsEmpty() || values.IsEmpty() || mask.IsEmpty()) - LogicError("ScatterToIndices: input matrix is empty."); - if (indices.GetNumRows() != mask.GetNumRows() || indices.GetNumCols() != mask.GetNumCols()) - LogicError("ScatterToIndices: indices matrix must have same shape with mask matrix."); - - ElemType* indicesBufPtr = indices.Data(); - ElemType* valueBufPtr = values.Data(); - char* maskBufPtr = mask.Data(); - ElemType* buffer = Data(); - - ScatterValues(indicesBufPtr, valueBufPtr, buffer, (ElemType)1, indices.GetNumElements(), row_elements, this->GetNumCols(), /*indices_step=*/1, maskBufPtr); + ScatterValues(indicesBufPtr, valueBufPtr, buffer, maskBufPtr, (ElemType)1, indices.GetNumElements(), row_elements, this->GetNumCols()); return *this; } @@ -7293,7 +7283,13 @@ void CPUMatrix::TensorArgOp(const CPUMatrix& a, ElementWiseO } template -void CPUMatrix::ScatterValues(ElemType* indices, ElemType* value, ElemType* data, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step/*=1*/, char* mask/*=nullptr*/) +void CPUMatrix::ScatterValues(ElemType* indices, ElemType* value, ElemType* data, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step/*=1*/) +{ + ScatterValues(indices, value, data, nullptr, alpha, num_indices, rows, cols, indices_step); +} + +template +void CPUMatrix::ScatterValues(ElemType* indices, ElemType* value, ElemType* data, char* mask, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step/*=1*/) { if (!indices || !value || !data) LogicError("ScatterValues: input data is null."); diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index 8fb48f6cc..7c1c24edf 100755 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -4436,38 +4436,20 @@ GPUMatrix& GPUMatrix::GatherFromTarget(const GPUMatrix -GPUMatrix& GPUMatrix::ScatterToIndices(const GPUMatrix& values, const GPUMatrix& indices, size_t row_elements) +GPUMatrix& GPUMatrix::ScatterToIndices(const GPUMatrix& values, const GPUMatrix& indices, size_t row_elements, const GPUMatrix* mask/*= nullptr*/) { - if (indices.IsEmpty() || values.IsEmpty()) + if (indices.IsEmpty() || values.IsEmpty() || (mask && mask->IsEmpty())) LogicError("ScatterToIndices: input matrix is empty."); ElemType* indicesBufPtr = indices.Data(); ElemType* valueBufPtr = values.Data(); + char* maskBufPtr = mask ? mask->Data() : nullptr; ElemType* buffer = Data(); size_t num_indices = indices.GetNumElements(); CUDA_LONG N = (CUDA_LONG)num_indices * row_elements; int blocksPerGrid = (int)ceil(((double)N) / GridDim::maxThreadsPerBlock); - _scatterToIndices << > > (indicesBufPtr, valueBufPtr, buffer, row_elements, num_indices, N); - - return *this; -} - -template -GPUMatrix& GPUMatrix::ScatterToIndices(const GPUMatrix& values, const GPUMatrix& indices, const GPUMatrix& mask, size_t row_elements) -{ - if (indices.IsEmpty() || values.IsEmpty() || mask.IsEmpty()) - LogicError("ScatterToIndices: input matrix is empty."); - - ElemType* indicesBufPtr = indices.Data(); - ElemType* valueBufPtr = values.Data(); - char* maskBufPtr = mask.Data(); - ElemType* buffer = Data(); - - size_t num_indices = indices.GetNumElements(); - CUDA_LONG N = (CUDA_LONG)num_indices * row_elements; - int blocksPerGrid = (int)ceil(((double)N) / GridDim::maxThreadsPerBlock); - _scatterToIndices << > > (indicesBufPtr, valueBufPtr, buffer, row_elements, num_indices, N, maskBufPtr); + _scatterToIndices << > > (indicesBufPtr, valueBufPtr, buffer, maskBufPtr, row_elements, num_indices, N); return *this; } diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 6dbb615f1..80f650069 100755 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -292,8 +292,7 @@ public: GPUMatrix& AssignOneHot(const GPUMatrix& a, vector& shape, size_t axis); GPUMatrix& GatherFromTarget(const GPUMatrix& indices, const GPUMatrix& target, size_t row_elements); - GPUMatrix& ScatterToIndices(const GPUMatrix& values, const GPUMatrix& indices, size_t row_elements); - GPUMatrix& ScatterToIndices(const GPUMatrix& values, const GPUMatrix& indices, const GPUMatrix& mask, size_t row_elements); + GPUMatrix& ScatterToIndices(const GPUMatrix& values, const GPUMatrix& indices, size_t row_elements, const GPUMatrix* mask = nullptr); GPUMatrix Transpose() const; GPUMatrix& AssignTransposeOf(const GPUMatrix& a); diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh index 7986e2f15..33537b073 100755 --- a/Source/Math/GPUMatrixCUDAKernels.cuh +++ b/Source/Math/GPUMatrixCUDAKernels.cuh @@ -5818,25 +5818,51 @@ __global__ void _gatherFromTarget(ElemType *indices, } } +template +inline __device__ void _scatterToIndices4Index(ElemType *indices, + ElemType *value, + ElemType *buffer, + char *mask, + CUDA_LONG index, + size_t num_row_elements) +{ + size_t indices_index = index / num_row_elements; + size_t offset = index % num_row_elements; + //Skip missing values + if (mask && mask[indices_index] == 0) return; + //We resort to nondeterministic behavior (floating point addition is not associative). + //Note that the CPU parallel algorithm will have poor performance on the GPU because of thread divergence + atomicAdd(&buffer[(size_t)(unsigned long long int)indices[indices_index] * num_row_elements + offset], value[index]); +} + template __global__ void _scatterToIndices(ElemType *indices, ElemType *value, ElemType *buffer, size_t num_row_elements, size_t num_indices, - CUDA_LONG num_elements, - char *mask = nullptr) + CUDA_LONG num_elements) { const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x; if (index < num_elements) { - size_t indices_index = index / num_row_elements; - size_t offset = index % num_row_elements; - //Skip missing values - if (mask && mask[indices_index] == 0) return; - //We resort to nondeterministic behavior (floating point addition is not associative). - //Note that the CPU parallel algorithm will have poor performance on the GPU because of thread divergence - atomicAdd(&buffer[(size_t)(unsigned long long int)indices[indices_index] * num_row_elements + offset], value[index]); + _scatterToIndices4Index(indices, value, buffer, /*mask*/nullptr, index, num_row_elements); + } +} + +template +__global__ void _scatterToIndices(ElemType *indices, + ElemType *value, + ElemType *buffer, + char *mask, + size_t num_row_elements, + size_t num_indices, + CUDA_LONG num_elements) +{ + const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num_elements) + { + _scatterToIndices4Index(indices, value, buffer, mask, index, num_row_elements); } } diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index a9746a251..d3dfd7d08 100755 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -3692,32 +3692,15 @@ Matrix& Matrix::GatherFromTarget(const Matrix& ind return *this; } template -Matrix& Matrix::ScatterToIndices(const Matrix& values, const Matrix& indices, size_t row_elements) +Matrix& Matrix::ScatterToIndices(const Matrix& values, const Matrix& indices, size_t row_elements, const Matrix* mask/* = nullptr*/) { if (indices.IsEmpty() || values.IsEmpty()) LogicError("ScatterAccordingIndices: input matrix is empty."); DISPATCH_MATRIX_ON_FLAG(&values, this, - m_CPUMatrix->ScatterToIndices(*values.m_CPUMatrix, *indices.m_CPUMatrix, row_elements), - m_GPUMatrix->ScatterToIndices(*values.m_GPUMatrix, *indices.m_GPUMatrix, row_elements), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED); - - return *this; -} -template -Matrix& Matrix::ScatterToIndicesWithMask(const Matrix& values, const Matrix& indices, const Matrix& mask, size_t row_elements) -{ - if (indices.IsEmpty() || values.IsEmpty() || mask.IsEmpty()) - LogicError("ScatterAccordingIndices: input matrix is empty."); - if (indices.GetNumRows() != mask.GetNumRows() || indices.GetNumCols() != mask.GetNumCols()) - LogicError("ScatterAccordingIndices: indices matrix must have same shape with mask matrix."); - - DISPATCH_MATRIX_ON_FLAG(&values, - this, - m_CPUMatrix->ScatterToIndices(*values.m_CPUMatrix, *indices.m_CPUMatrix, *mask.m_CPUMatrix, row_elements), - m_GPUMatrix->ScatterToIndices(*values.m_GPUMatrix, *indices.m_GPUMatrix, *mask.m_GPUMatrix, row_elements), + m_CPUMatrix->ScatterToIndices(*values.m_CPUMatrix, *indices.m_CPUMatrix, row_elements, mask ? mask->m_CPUMatrix.get() : nullptr), + m_GPUMatrix->ScatterToIndices(*values.m_GPUMatrix, *indices.m_GPUMatrix, row_elements, mask ? mask->m_GPUMatrix.get() : nullptr), NOT_IMPLEMENTED, NOT_IMPLEMENTED); diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index c69ad63c0..414e7e166 100755 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -320,8 +320,7 @@ public: Matrix& AssignOneHot(const Matrix& a, vector& shape, size_t axis, bool is_sparse); Matrix& GatherFromTarget(const Matrix& indices, const Matrix& target, size_t row_elements); - Matrix& ScatterToIndices(const Matrix& values, const Matrix& indices, size_t row_elements); - Matrix& ScatterToIndicesWithMask(const Matrix& values, const Matrix& indices, const Matrix& mask, size_t row_elements); + Matrix& ScatterToIndices(const Matrix& values, const Matrix& indices, size_t row_elements, const Matrix* mask = nullptr); Matrix Transpose(); // This method doesn't change state of Matrix. It should be a const function Matrix& AssignTransposeOf(const Matrix& a);