diff --git a/octree/octree/octree_nn.cpp b/octree/octree/octree_nn.cpp index 8918647..549fe7f 100644 --- a/octree/octree/octree_nn.cpp +++ b/octree/octree/octree_nn.cpp @@ -240,6 +240,64 @@ void col2octree_cpu(const Dtype* data_col, Dtype* data_octree, } } + +template +void octree2colP_cpu(Dtype* data_col, const Dtype* data_octree, const int channel, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n) { + for (int c = 0; c < channel; ++c) { + for (int k = 0; k < kernel_sdim; ++k) { + int h_start = n * height_col; + int i_start = (c * kernel_sdim + k) * height_col - h_start; + for (int h = h_start; h < h_start + height_col; ++h) { + // boundary condition + if (h >= height) { + data_col[i_start + h] = Dtype(0); + continue; + } + // neighborhood searching + const int hp = ichild[h]; + const int index = stride == 2 ? (h << 6) + ni[k] : + (hp >> 3 << 6) + ni[(hp % 8) * kernel_sdim + k]; + int p = neigh[index]; + if (p >= 0) { p = child[p]; } + // assign values + data_col[i_start + h] = + p < 0 ? Dtype(0) : data_octree[c * octree_h + p]; + } + } + } +} + +template +void col2octreeP_cpu(const Dtype* data_col, Dtype* data_octree, const int channel, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n) { + // set data_octree to zero ONCE when n ==0 + if (n == 0) { memset_cpu(channel * octree_h, Dtype(0), data_octree); } + for (int c = 0; c < channel; ++c) { + for (int k = 0; k < kernel_sdim; ++k) { + int h_start = n * height_col; + int i_start = (c * kernel_sdim + k) * height_col - h_start; + for (int h = h_start; h < h_start + height_col; ++h) { + // boundary condition + if (h >= height) continue; + // neighborhood searching + const int hp = ichild[h]; + const int index = stride == 2 ? (h << 6) + ni[k] : + (hp >> 3 << 6) + ni[(hp % 8) * kernel_sdim + k]; + int p = neigh[index]; + if (p >= 0) { p = child[p]; } + // assign values + if (p >= 0) { data_octree[c * octree_h + p] += data_col[i_start + h]; } + } + } + } +} + + template void octree_max_pool_cpu(Dtype* top_data, int top_h, int* mask, const Dtype* btm_data, int btm_h, int channel) { @@ -604,7 +662,8 @@ template void memset_cpu(const size_t N, const char alpha, char* Y); template void memset_cpu(const size_t N, const int8_t alpha, int8_t* Y); template void memset_cpu(const size_t N, const uint8_t alpha, uint8_t* Y); template void memcpy_cpu(const size_t N, const int* X, int* Y); -template void memcpy_cpu(const size_t N, const unsigned* X, unsigned* Y); +template void memcpy_cpu(const size_t N, const uint32* X, uint32* Y); +template void memcpy_cpu(const size_t N, const uint64* X, uint64* Y); template void memcpy_cpu(const size_t N, const float* X, float* Y); template void memcpy_cpu(const size_t N, const double* X, double* Y); template void sequence_cpu(int* ptr, const int num); @@ -629,6 +688,22 @@ template void col2octree_cpu(const float* data_col, float* data_octree, template void col2octree_cpu(const double* data_col, double* data_octree, const int channel, const int height, const int kernel_sdim, const int stride, const int* neigh, const int* ni, const int height_col, const int n); +template void octree2colP_cpu(float* data_col, const float* data_octree, + const int channel, const int height, const int octree_h, const int kernel_sdim, + const int stride, const int* neigh, const int* ni, const int* child, + const int* ichild, const int height_col, const int n); +template void col2octreeP_cpu(const float* data_col, float* data_octree, + const int channel, const int height, const int octree_h, const int kernel_sdim, + const int stride, const int* neigh, const int* ni, const int* child, + const int* ichild, const int height_col, const int n); +template void octree2colP_cpu(double* data_col, const double* data_octree, + const int channel, const int height, const int octree_h, const int kernel_sdim, + const int stride, const int* neigh, const int* ni, const int* child, + const int* ichild, const int height_col, const int n); +template void col2octreeP_cpu(const double* data_col, double* data_octree, + const int channel, const int height, const int octree_h, const int kernel_sdim, + const int stride, const int* neigh, const int* ni, const int* child, + const int* ichild, const int height_col, const int n); template void generate_label_cpu(int* label_data, int& top_h, const float* bottom_data, const int bottom_h, const int mask); template void generate_label_cpu(int* label_data, int& top_h, diff --git a/octree/octree/octree_nn.cu b/octree/octree/octree_nn.cu index 3ac5ea4..5526b7e 100644 --- a/octree/octree/octree_nn.cu +++ b/octree/octree/octree_nn.cu @@ -151,10 +151,10 @@ void octree2col_gpu(Dtype* data_col, const Dtype* data_octree, const int channel, const int height, const int kernel_sdim, const int stride, const int* neigh, const int* ni, const int height_col, const int n) { - const int kernel = kernel_sdim; - const int thread_num = channel * kernel * height_col; + const int thread_num = channel * kernel_sdim * height_col; octree2col_kernel <<< CudaGetBlocks(thread_num), kCudaThreadsNum >>> ( - data_col, data_octree, height, kernel, stride, neigh, ni, height_col, n, thread_num); + data_col, data_octree, height, kernel_sdim, stride, neigh, ni, height_col, + n, thread_num); CUDA_POST_KERNEL_CHECK; } @@ -163,17 +163,97 @@ void col2octree_gpu(const Dtype* data_col, Dtype* data_octree, const int channel, const int height, const int kernel_sdim, const int stride, const int* neigh, const int* ni, const int height_col, const int n) { - const int kernel = kernel_sdim; // kernel size: 3*3*3 - const int thread_num = channel * kernel * height_col; + const int thread_num = channel * kernel_sdim * height_col; int octree_h = height << 3 * (stride - 1); // set data_octree to zero ONCE when n ==0 if (n == 0) memset_gpu(channel * octree_h, Dtype(0), data_octree); col2octree_kernel <<< CudaGetBlocks(thread_num), kCudaThreadsNum >>> ( - data_col, data_octree, height, kernel, stride, neigh, ni, height_col, n, thread_num); + data_col, data_octree, height, kernel_sdim, stride, neigh, ni, height_col, + n, thread_num); CUDA_POST_KERNEL_CHECK; } + +template +__global__ void octree2colP_kernel(Dtype* data_col, const Dtype* data_octree, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n, const int thread_num) { + CUDA_KERNEL_LOOP(i, thread_num) { + int h = i % height_col; + int h1 = h + n * height_col; + if (h1 >= height) { data_col[i] = 0; continue; } + int t = i / height_col; + int k = t % kernel_sdim; + int c = t / kernel_sdim; + + // neighborhood searching + const int hp = ichild[h]; + const int index = stride == 2 ? (h << 6) + ni[k] : + (hp >> 3 << 6) + ni[(hp % 8) * kernel_sdim + k]; + int p = neigh[index]; + if (p >= 0) { p = child[p]; } + + data_col[i] = p < 0 ? Dtype(0) : data_octree[c * octree_h + p]; + } +} + +template +__global__ void col2octreeP_kernel(const Dtype* data_col, Dtype* data_octree, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n, const int thread_num) { + CUDA_KERNEL_LOOP(i, thread_num) { + int h = i % height_col; + int h1 = h + n * height_col; + if (h1 >= height) continue; + int t = i / height_col; + int k = t % kernel_sdim; + int c = t / kernel_sdim; + + // neighborhood searching + const int hp = ichild[h]; + const int index = stride == 2 ? (h << 6) + ni[k] : + (hp >> 3 << 6) + ni[(hp % 8) * kernel_sdim + k]; + int p = neigh[index]; + if (p >= 0) { p = child[p]; } + + // assign values + if (p >= 0) { + caffe_gpu_atomic_add(data_col[i], data_octree + c * octree_h + p); + } + } +} + +template +void octree2colP_gpu(Dtype* data_col, const Dtype* data_octree, const int channel, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n) { + const int thread_num = channel * kernel_sdim * height_col; + octree2colP_kernel <<< CudaGetBlocks(thread_num), kCudaThreadsNum >>> ( + data_col, data_octree, height, octree_h, kernel_sdim, stride, neigh, ni, + child, ichild, height_col, n, thread_num); + CUDA_POST_KERNEL_CHECK; +} + +template +void col2octreeP_gpu(const Dtype* data_col, Dtype* data_octree, const int channel, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n) { + const int thread_num = channel * kernel_sdim * height_col; + // set data_octree to zero ONCE when n ==0 + if (n == 0) { memset_gpu(channel * octree_h, Dtype(0), data_octree); } + col2octreeP_kernel <<< CudaGetBlocks(thread_num), kCudaThreadsNum >>> ( + data_col, data_octree, height, octree_h, kernel_sdim, stride, neigh, ni, + child, ichild, height_col, n, thread_num); + CUDA_POST_KERNEL_CHECK; +} + + + template __global__ void octree_max_pool_kernel(Dtype* top_data, const int top_h, int* mask, const Dtype* btm_data, const int btm_h, const int nthreads) { @@ -762,6 +842,8 @@ template void pad_backward_gpu(float* X, const int Hx, const int Cx, const float* Y, const int Hy, const int* label); template void pad_backward_gpu(double* X, const int Hx, const int Cx, const double* Y, const int Hy, const int* label); +template void pad_backward_gpu(int* X, const int Hx, const int Cx, + const int* Y, const int Hy, const int* label); template void octree2col_gpu(float* data_col, const float* data_octree, const int channel, const int height, const int kernel_sdim, const int stride, const int* neigh, const int* ni, const int height_col, const int n); @@ -774,6 +856,22 @@ template void col2octree_gpu(const float* data_col, float* data_octree, template void col2octree_gpu(const double* data_col, double* data_octree, const int channel, const int height, const int kernel_sdim, const int stride, const int* neigh, const int* ni, const int height_col, const int n); +template void octree2colP_gpu(float* data_col, const float* data_octree, + const int channel, const int height, const int octree_h, const int kernel_sdim, + const int stride, const int* neigh, const int* ni, const int* child, + const int* ichild, const int height_col, const int n); +template void col2octreeP_gpu(const float* data_col, float* data_octree, + const int channel, const int height, const int octree_h, const int kernel_sdim, + const int stride, const int* neigh, const int* ni, const int* child, + const int* ichild, const int height_col, const int n); +template void octree2colP_gpu(double* data_col, const double* data_octree, + const int channel, const int height, const int octree_h, const int kernel_sdim, + const int stride, const int* neigh, const int* ni, const int* child, + const int* ichild, const int height_col, const int n); +template void col2octreeP_gpu(const double* data_col, double* data_octree, + const int channel, const int height, const int octree_h, const int kernel_sdim, + const int stride, const int* neigh, const int* ni, const int* child, + const int* ichild, const int height_col, const int n); template void generate_label_gpu(int* label_data, int& top_h, const float* bottom_data, const int bottom_h, const int mask); template void generate_label_gpu(int* label_data, int& top_h, diff --git a/octree/octree/octree_nn.h b/octree/octree/octree_nn.h index fc31df1..27e89e3 100644 --- a/octree/octree/octree_nn.h +++ b/octree/octree/octree_nn.h @@ -98,6 +98,26 @@ void col2octree_gpu(const Dtype* data_col, Dtype* data_octree, const int stride, const int* neigh, const int* ni, const int height_col, const int n); +template +void octree2colP_cpu(Dtype* data_col, const Dtype* data_octree, const int channel, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n); +template +void octree2colP_gpu(Dtype* data_col, const Dtype* data_octree, const int channel, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n); +template +void col2octreeP_cpu(const Dtype* data_col, Dtype* data_octree, const int channel, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n); +template +void col2octreeP_gpu(const Dtype* data_col, Dtype* data_octree, const int channel, + const int height, const int octree_h, const int kernel_sdim, const int stride, + const int* neigh, const int* ni, const int* child, const int* ichild, + const int height_col, const int n); template void octree_max_pool_cpu(Dtype* top_data, int top_h, int* mask,