зеркало из https://github.com/microsoft/O-CNN.git
Add octree2colP
This commit is contained in:
Родитель
6e5bd2d1de
Коммит
0d4e048f9c
|
@ -240,6 +240,64 @@ void col2octree_cpu(const Dtype* data_col, Dtype* data_octree,
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// Gathers octree features into one chunk of the column buffer (CPU version).
//
// data_col is written as channel * kernel_sdim rows of height_col entries.
// Chunk `n` covers the node indices [n * height_col, (n + 1) * height_col);
// entries whose node index is >= height are filled with zero.
//
// For every node the neighbour slot is looked up in `neigh` (64 slots per
// group, addressed through the offset table `ni`), the result is remapped
// through `child`, and a negative index at either step produces a zero entry.
// NOTE(review): `child` / `ichild` look like forward / inverse node index
// maps between two octree layouts -- confirm against the caller.
template <typename Dtype>
void octree2colP_cpu(Dtype* data_col, const Dtype* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim, const int stride,
    const int* neigh, const int* ni, const int* child, const int* ichild,
    const int height_col, const int n) {
  // Index of the first node handled by this chunk.
  const int first_node = n * height_col;

  for (int c = 0; c < channel; ++c) {
    const Dtype* src_channel = data_octree + c * octree_h;

    for (int k = 0; k < kernel_sdim; ++k) {
      Dtype* col_row = data_col + (c * kernel_sdim + k) * height_col;

      for (int j = 0; j < height_col; ++j) {
        const int node = first_node + j;

        // Padding rows past the last valid node are zero-filled.
        if (node >= height) {
          col_row[j] = Dtype(0);
          continue;
        }

        // Locate the slot inside the neighbour table.
        const int mapped_node = ichild[node];
        int slot;
        if (stride == 2) {
          slot = (node << 6) + ni[k];
        } else {
          slot = ((mapped_node >> 3) << 6) +
              ni[(mapped_node % 8) * kernel_sdim + k];
        }

        // A negative index (before or after the `child` remap) means there is
        // no source value, so the entry stays zero.
        Dtype value = Dtype(0);
        const int neighbour = neigh[slot];
        if (neighbour >= 0) {
          const int source = child[neighbour];
          if (source >= 0) { value = src_channel[source]; }
        }
        col_row[j] = value;
      }
    }
  }
}
|
||||
|
||||
// Scatters one chunk of the column buffer back into the octree buffer by
// accumulation (CPU version); it walks the same index mapping as
// octree2colP_cpu but adds data_col entries into data_octree.
//
// data_octree (channel * octree_h values) is cleared only when n == 0, so the
// chunks of one pass must be processed starting from chunk 0 for the sums to
// be correct. Rows whose node index is >= height, and lookups that end in a
// negative index, contribute nothing.
template <typename Dtype>
void col2octreeP_cpu(const Dtype* data_col, Dtype* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim, const int stride,
    const int* neigh, const int* ni, const int* child, const int* ichild,
    const int height_col, const int n) {
  // Reset the accumulator ONCE, on the first chunk.
  if (n == 0) {
    memset_cpu(channel * octree_h, Dtype(0), data_octree);
  }

  // Index of the first node handled by this chunk.
  const int first_node = n * height_col;

  for (int c = 0; c < channel; ++c) {
    Dtype* dst_channel = data_octree + c * octree_h;

    for (int k = 0; k < kernel_sdim; ++k) {
      const Dtype* col_row = data_col + (c * kernel_sdim + k) * height_col;

      for (int j = 0; j < height_col; ++j) {
        const int node = first_node + j;

        // Padding rows past the last valid node carry no data.
        if (node >= height) { continue; }

        // Locate the slot inside the neighbour table.
        const int mapped_node = ichild[node];
        int slot;
        if (stride == 2) {
          slot = (node << 6) + ni[k];
        } else {
          slot = ((mapped_node >> 3) << 6) +
              ni[(mapped_node % 8) * kernel_sdim + k];
        }

        // Skip entries with no destination (negative index before or after
        // the `child` remap).
        const int neighbour = neigh[slot];
        if (neighbour < 0) { continue; }
        const int target = child[neighbour];
        if (target < 0) { continue; }

        dst_channel[target] += col_row[j];
      }
    }
  }
}
|
||||
|
||||
|
||||
template<typename Dtype>
|
||||
void octree_max_pool_cpu(Dtype* top_data, int top_h, int* mask,
|
||||
const Dtype* btm_data, int btm_h, int channel) {
|
||||
|
@ -604,7 +662,8 @@ template void memset_cpu<char>(const size_t N, const char alpha, char* Y);
|
|||
template void memset_cpu<int8_t>(const size_t N, const int8_t alpha, int8_t* Y);
|
||||
template void memset_cpu<uint8_t>(const size_t N, const uint8_t alpha, uint8_t* Y);
|
||||
template void memcpy_cpu<int>(const size_t N, const int* X, int* Y);
|
||||
template void memcpy_cpu<unsigned>(const size_t N, const unsigned* X, unsigned* Y);
|
||||
template void memcpy_cpu<uint32>(const size_t N, const uint32* X, uint32* Y);
|
||||
template void memcpy_cpu<uint64>(const size_t N, const uint64* X, uint64* Y);
|
||||
template void memcpy_cpu<float>(const size_t N, const float* X, float* Y);
|
||||
template void memcpy_cpu<double>(const size_t N, const double* X, double* Y);
|
||||
template void sequence_cpu<int>(int* ptr, const int num);
|
||||
|
@ -629,6 +688,22 @@ template void col2octree_cpu<float>(const float* data_col, float* data_octree,
|
|||
template void col2octree_cpu<double>(const double* data_col, double* data_octree,
|
||||
const int channel, const int height, const int kernel_sdim, const int stride,
|
||||
const int* neigh, const int* ni, const int height_col, const int n);
|
||||
template void octree2colP_cpu<float>(float* data_col, const float* data_octree,
|
||||
const int channel, const int height, const int octree_h, const int kernel_sdim,
|
||||
const int stride, const int* neigh, const int* ni, const int* child,
|
||||
const int* ichild, const int height_col, const int n);
|
||||
template void col2octreeP_cpu<float>(const float* data_col, float* data_octree,
|
||||
const int channel, const int height, const int octree_h, const int kernel_sdim,
|
||||
const int stride, const int* neigh, const int* ni, const int* child,
|
||||
const int* ichild, const int height_col, const int n);
|
||||
template void octree2colP_cpu<double>(double* data_col, const double* data_octree,
|
||||
const int channel, const int height, const int octree_h, const int kernel_sdim,
|
||||
const int stride, const int* neigh, const int* ni, const int* child,
|
||||
const int* ichild, const int height_col, const int n);
|
||||
template void col2octreeP_cpu<double>(const double* data_col, double* data_octree,
|
||||
const int channel, const int height, const int octree_h, const int kernel_sdim,
|
||||
const int stride, const int* neigh, const int* ni, const int* child,
|
||||
const int* ichild, const int height_col, const int n);
|
||||
template void generate_label_cpu<float>(int* label_data, int& top_h,
|
||||
const float* bottom_data, const int bottom_h, const int mask);
|
||||
template void generate_label_cpu<double>(int* label_data, int& top_h,
|
||||
|
|
|
@ -151,10 +151,10 @@ void octree2col_gpu(Dtype* data_col, const Dtype* data_octree,
|
|||
const int channel, const int height, const int kernel_sdim,
|
||||
const int stride, const int* neigh, const int* ni,
|
||||
const int height_col, const int n) {
|
||||
const int kernel = kernel_sdim;
|
||||
const int thread_num = channel * kernel * height_col;
|
||||
const int thread_num = channel * kernel_sdim * height_col;
|
||||
octree2col_kernel<Dtype> <<< CudaGetBlocks(thread_num), kCudaThreadsNum >>> (
|
||||
data_col, data_octree, height, kernel, stride, neigh, ni, height_col, n, thread_num);
|
||||
data_col, data_octree, height, kernel_sdim, stride, neigh, ni, height_col,
|
||||
n, thread_num);
|
||||
CUDA_POST_KERNEL_CHECK;
|
||||
}
|
||||
|
||||
|
@ -163,17 +163,97 @@ void col2octree_gpu(const Dtype* data_col, Dtype* data_octree,
|
|||
const int channel, const int height, const int kernel_sdim,
|
||||
const int stride, const int* neigh, const int* ni,
|
||||
const int height_col, const int n) {
|
||||
const int kernel = kernel_sdim; // kernel size: 3*3*3
|
||||
const int thread_num = channel * kernel * height_col;
|
||||
const int thread_num = channel * kernel_sdim * height_col;
|
||||
int octree_h = height << 3 * (stride - 1);
|
||||
// set data_octree to zero ONCE when n ==0
|
||||
if (n == 0) memset_gpu(channel * octree_h, Dtype(0), data_octree);
|
||||
col2octree_kernel<Dtype> <<< CudaGetBlocks(thread_num), kCudaThreadsNum >>> (
|
||||
data_col, data_octree, height, kernel, stride, neigh, ni, height_col, n, thread_num);
|
||||
data_col, data_octree, height, kernel_sdim, stride, neigh, ni, height_col,
|
||||
n, thread_num);
|
||||
CUDA_POST_KERNEL_CHECK;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Device kernel: gathers octree features into one chunk of the column buffer.
//
// Work item i (0 <= i < thread_num, iterated through the project's
// CUDA_KERNEL_LOOP macro) produces data_col[i], where
//   i = (c * kernel_sdim + k) * height_col + h
// and h is the row inside the current chunk. The node the row refers to is the
// GLOBAL index h1 = h + n * height_col; rows with h1 >= height are zero-filled.
//
// The neighbour lookup must be driven by the global index h1 (as the CPU
// implementation does with its loop variable). Using the chunk-local h would
// read the neighbourhood of the wrong node for every chunk with n > 0.
//
// A negative index from `neigh`, or from the subsequent `child` remap, yields
// a zero entry.
template <typename Dtype>
__global__ void octree2colP_kernel(Dtype* data_col, const Dtype* data_octree,
    const int height, const int octree_h, const int kernel_sdim, const int stride,
    const int* neigh, const int* ni, const int* child, const int* ichild,
    const int height_col, const int n, const int thread_num) {
  CUDA_KERNEL_LOOP(i, thread_num) {
    const int h = i % height_col;        // row inside this chunk
    const int h1 = h + n * height_col;   // global node index
    if (h1 >= height) { data_col[i] = Dtype(0); continue; }
    const int t = i / height_col;
    const int k = t % kernel_sdim;       // kernel offset
    const int c = t / kernel_sdim;       // channel

    // neighborhood searching (indexed by the global node h1)
    const int hp = ichild[h1];
    const int index = stride == 2 ? (h1 << 6) + ni[k] :
        (hp >> 3 << 6) + ni[(hp % 8) * kernel_sdim + k];
    int p = neigh[index];
    if (p >= 0) { p = child[p]; }

    // assign values
    data_col[i] = p < 0 ? Dtype(0) : data_octree[c * octree_h + p];
  }
}
|
||||
|
||||
// Device kernel: scatters one chunk of the column buffer back into the octree
// buffer by atomic accumulation.
//
// Work item i (0 <= i < thread_num, iterated through the project's
// CUDA_KERNEL_LOOP macro) consumes data_col[i], where
//   i = (c * kernel_sdim + k) * height_col + h
// and h is the row inside the current chunk. The node the row refers to is the
// GLOBAL index h1 = h + n * height_col; rows with h1 >= height are skipped.
//
// The neighbour lookup must be driven by the global index h1 (as the CPU
// implementation does with its loop variable). Using the chunk-local h would
// accumulate into the neighbourhood of the wrong node for every chunk with
// n > 0.
//
// Several work items can target the same data_octree element, hence the
// atomic add. data_octree is expected to have been zeroed by the caller
// before the first chunk.
template <typename Dtype>
__global__ void col2octreeP_kernel(const Dtype* data_col, Dtype* data_octree,
    const int height, const int octree_h, const int kernel_sdim, const int stride,
    const int* neigh, const int* ni, const int* child, const int* ichild,
    const int height_col, const int n, const int thread_num) {
  CUDA_KERNEL_LOOP(i, thread_num) {
    const int h = i % height_col;        // row inside this chunk
    const int h1 = h + n * height_col;   // global node index
    if (h1 >= height) continue;
    const int t = i / height_col;
    const int k = t % kernel_sdim;       // kernel offset
    const int c = t / kernel_sdim;       // channel

    // neighborhood searching (indexed by the global node h1)
    const int hp = ichild[h1];
    const int index = stride == 2 ? (h1 << 6) + ni[k] :
        (hp >> 3 << 6) + ni[(hp % 8) * kernel_sdim + k];
    int p = neigh[index];
    if (p >= 0) { p = child[p]; }

    // assign values
    if (p >= 0) {
      caffe_gpu_atomic_add(data_col[i], data_octree + c * octree_h + p);
    }
  }
}
|
||||
|
||||
// Host launcher for octree2colP_kernel.
//
// Launches one work item per element of the column chunk
// (channel * kernel_sdim * height_col) and then runs the project's
// post-launch error check.
template <typename Dtype>
void octree2colP_gpu(Dtype* data_col, const Dtype* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim, const int stride,
    const int* neigh, const int* ni, const int* child, const int* ichild,
    const int height_col, const int n) {
  // Total number of column entries produced for this chunk.
  const int col_count = channel * kernel_sdim * height_col;

  octree2colP_kernel<Dtype>
      <<< CudaGetBlocks(col_count), kCudaThreadsNum >>> (
          data_col, data_octree,
          height, octree_h, kernel_sdim, stride,
          neigh, ni, child, ichild,
          height_col, n, col_count);
  CUDA_POST_KERNEL_CHECK;
}
|
||||
|
||||
// Host launcher for col2octreeP_kernel.
//
// On the first chunk (n == 0) the whole destination buffer
// (channel * octree_h values) is cleared; subsequent chunks accumulate into
// it. One work item is launched per element of the column chunk
// (channel * kernel_sdim * height_col), followed by the project's post-launch
// error check.
template <typename Dtype>
void col2octreeP_gpu(const Dtype* data_col, Dtype* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim, const int stride,
    const int* neigh, const int* ni, const int* child, const int* ichild,
    const int height_col, const int n) {
  // Total number of column entries consumed for this chunk.
  const int col_count = channel * kernel_sdim * height_col;

  // Clear the accumulator ONCE, before the first chunk is scattered.
  if (n == 0) {
    memset_gpu(channel * octree_h, Dtype(0), data_octree);
  }

  col2octreeP_kernel<Dtype>
      <<< CudaGetBlocks(col_count), kCudaThreadsNum >>> (
          data_col, data_octree,
          height, octree_h, kernel_sdim, stride,
          neigh, ni, child, ichild,
          height_col, n, col_count);
  CUDA_POST_KERNEL_CHECK;
}
|
||||
|
||||
|
||||
|
||||
template <typename Dtype>
|
||||
__global__ void octree_max_pool_kernel(Dtype* top_data, const int top_h,
|
||||
int* mask, const Dtype* btm_data, const int btm_h, const int nthreads) {
|
||||
|
@ -762,6 +842,8 @@ template void pad_backward_gpu<float>(float* X, const int Hx, const int Cx,
|
|||
const float* Y, const int Hy, const int* label);
|
||||
template void pad_backward_gpu<double>(double* X, const int Hx, const int Cx,
|
||||
const double* Y, const int Hy, const int* label);
|
||||
template void pad_backward_gpu<int>(int* X, const int Hx, const int Cx,
|
||||
const int* Y, const int Hy, const int* label);
|
||||
template void octree2col_gpu<float>(float* data_col, const float* data_octree,
|
||||
const int channel, const int height, const int kernel_sdim, const int stride,
|
||||
const int* neigh, const int* ni, const int height_col, const int n);
|
||||
|
@ -774,6 +856,22 @@ template void col2octree_gpu<float>(const float* data_col, float* data_octree,
|
|||
template void col2octree_gpu<double>(const double* data_col, double* data_octree,
|
||||
const int channel, const int height, const int kernel_sdim, const int stride,
|
||||
const int* neigh, const int* ni, const int height_col, const int n);
|
||||
// Explicit instantiations of the child-remapped ("P") GPU gather / scatter
// launchers for the two supported floating-point types.
template void octree2colP_gpu<float>(
    float* data_col, const float* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim,
    const int stride, const int* neigh, const int* ni, const int* child,
    const int* ichild, const int height_col, const int n);
template void octree2colP_gpu<double>(
    double* data_col, const double* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim,
    const int stride, const int* neigh, const int* ni, const int* child,
    const int* ichild, const int height_col, const int n);
template void col2octreeP_gpu<float>(
    const float* data_col, float* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim,
    const int stride, const int* neigh, const int* ni, const int* child,
    const int* ichild, const int height_col, const int n);
template void col2octreeP_gpu<double>(
    const double* data_col, double* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim,
    const int stride, const int* neigh, const int* ni, const int* child,
    const int* ichild, const int height_col, const int n);
|
||||
template void generate_label_gpu<float>(int* label_data, int& top_h,
|
||||
const float* bottom_data, const int bottom_h, const int mask);
|
||||
template void generate_label_gpu<double>(int* label_data, int& top_h,
|
||||
|
|
|
@ -98,6 +98,26 @@ void col2octree_gpu(const Dtype* data_col, Dtype* data_octree,
|
|||
const int stride, const int* neigh, const int* ni,
|
||||
const int height_col, const int n);
|
||||
|
||||
// Gather: fills chunk `n` of the column buffer (channel * kernel_sdim rows of
// height_col entries) from data_octree, remapping neighbour indices through
// `child`. CPU and GPU variants share the same parameter list.
template <typename Dtype>
void octree2colP_cpu(
    Dtype* data_col, const Dtype* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim,
    const int stride, const int* neigh, const int* ni, const int* child,
    const int* ichild, const int height_col, const int n);
template <typename Dtype>
void octree2colP_gpu(
    Dtype* data_col, const Dtype* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim,
    const int stride, const int* neigh, const int* ni, const int* child,
    const int* ichild, const int height_col, const int n);

// Scatter: accumulates chunk `n` of the column buffer into data_octree
// (channel * octree_h values), which is cleared when n == 0. CPU and GPU
// variants share the same parameter list.
template <typename Dtype>
void col2octreeP_cpu(
    const Dtype* data_col, Dtype* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim,
    const int stride, const int* neigh, const int* ni, const int* child,
    const int* ichild, const int height_col, const int n);
template <typename Dtype>
void col2octreeP_gpu(
    const Dtype* data_col, Dtype* data_octree, const int channel,
    const int height, const int octree_h, const int kernel_sdim,
    const int stride, const int* neigh, const int* ni, const int* child,
    const int* ichild, const int height_col, const int n);
|
||||
|
||||
template <typename Dtype>
|
||||
void octree_max_pool_cpu(Dtype* top_data, int top_h, int* mask,
|
||||
|
|
Загрузка…
Ссылка в новой задаче