Mirror of https://github.com/microsoft/O-CNN.git
Update octree
Parent: e2a39b11ef
Commit: f74c9539a5
@@ -1244,7 +1244,8 @@ void Octree::octree2mesh(vector<float>& V, vector<int>& F, int depth_start,
     vector<float> pts, normals, pts_ref;
     for (int i = 0; i < num; ++i) {
-      if (node_type(child_d[i]) == kInternelNode && d != depth) continue;
+      if ((node_type(child_d[i]) == kInternelNode && d != depth) ||
+          (node_type(child_d[i]) == kLeaf && d == depth)) continue;
 
       float n[3], pt[3], pt_ref[3];
       node_normal(n, i, d);
 
@@ -3,11 +3,14 @@
 #include "logs.h"
+#include <algorithm>
 
 
 namespace octree {
 
 template <typename Dtype>
-void OctreeBaseConv<Dtype>::setup(const vector<int>& kernel_size, const int stride,
-    const int curr_depth, const int channel_in, const int channel_out) {
+void OctreeBaseConv<Dtype>::setup(const vector<int>& kernel_size,
+                                  const int stride, const int curr_depth,
+                                  const int channel_in, const int channel_out,
+                                  const bool nempty) {
   // kernel size
   kernel_size_ = kernel_size;
   CHECK(kernel_size_[0] < 4 && kernel_size_[1] < 4 && kernel_size_[2] < 4)
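The only behavioral change to setup() is the trailing nempty flag (it defaults to false in the header). A minimal usage sketch follows; it is not part of the diff, MyOctreeConvLayer is a hypothetical subclass of octree::OctreeBaseConv<float>, and the numeric values are made up:

    // Hypothetical caller; only the final `nempty` argument is new in this commit.
    MyOctreeConvLayer conv;
    conv.setup(std::vector<int>{3, 3, 3},             // 3x3x3 kernel
               /*stride=*/1, /*curr_depth=*/5,        // octree depth of the input
               /*channel_in=*/32, /*channel_out=*/64,
               /*nempty=*/true);                      // convolve non-empty nodes only
    // As the header notes: set engine_cpu_/engine_gpu_, octree_ and ni_gpu_ptr_
    // after setup() and before reshape().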
@@ -31,98 +34,155 @@ void OctreeBaseConv<Dtype>::setup(const vector<int>& kernel_size, const int stri
     std::swap(conv_out_channels_, conv_in_channels_);
   }
 
+  // !!! perform the convolution on non-empty octree nodes or not
+  nempty_ = nempty;
+
   kernel_sdim_ = kernel_size_[0] * kernel_size_[1] * kernel_size_[2];
   kernel_dim_ = kernel_sdim_ * conv_in_channels_;
 
   ni_cpu_ptr_ = NeighHelper::get_ni(kernel_size_).data();
-  ni_gpu_ptr_ = nullptr;   // must be set before using
+  ni_gpu_ptr_ = nullptr;  // must be set before using
 }
 
 template <typename Dtype>
 void OctreeBaseConv<Dtype>::reshape() {
-  // weight shape
-  weights_shape_ = vector<int> {conv_out_channels_, conv_in_channels_ * kernel_sdim_};
-
-  // compute top shape
-  int btm_h = octree_.info().node_num(curr_depth_);
-  int top_blob_depth = curr_depth_, top_h = btm_h;
+  // assign depth for different blobs
+  // curr_depth_ and top_depth are the octree depth of the input and output
+  // data; workspace_depth_ is the octree depth of the `col` data, different
+  // from top_depth, workspace_depth_ is always the same as the depth of larger
+  // data when doing octree2col or col2octree
+  int top_depth = workspace_depth_ = curr_depth_;
   if (stride_ == 2) {
     if (is_deconvolution_layer()) {
-      top_blob_depth++;
-      top_h = octree_.info().node_num(top_blob_depth);
+      top_depth = workspace_depth_ = curr_depth_ + 1;
     } else {
-      top_blob_depth--;
-      top_h = octree_.info().node_num_nempty(top_blob_depth);
+      top_depth = curr_depth_ - 1;
     }
-    CHECK(0 <= top_blob_depth && top_blob_depth <= octree_.info().depth());
+    CHECK(0 <= top_depth && top_depth <= octree_.info().depth());
   }
-  if (top_h == 0) top_h = 1; // avoid degenerated case
-  top_shape_ = vector<int> { 1, num_output_, top_h, 1 };
-
-  // reshape workspce
-  workspace_depth_ = curr_depth_; // the depth value used for octree2col
-  if (is_deconvolution_layer() && stride_ == 2) workspace_depth_++;
-  workspace_h_ = btm_h;
-  if (stride_ == 2) {
-    if (is_deconvolution_layer()) { workspace_h_ = top_h >> 3; }
-    else { workspace_h_ = btm_h >> 3; }
+
+  // weight shape
+  weights_shape_ =
+      vector<int>{conv_out_channels_, conv_in_channels_ * kernel_sdim_};
+
+  // top shape
+  int top_h = 0;
+  if (!nempty_) {
+    top_h = octree_.info().node_num(top_depth);
+    if (stride_ == 2 && !is_deconvolution_layer()) {
+      // In this case, the octree_pad is needed to pad the output data,
+      // so the top_h is equal to the non-empty node number.
+      top_h = octree_.info().node_num_nempty(top_depth);
+    }
+  } else {
+    top_h = octree_.info().node_num_nempty(top_depth);
+  }
+  if (top_h == 0) top_h = 1;  // avoid degenerated case
+  top_shape_ = vector<int>{1, num_output_, top_h, 1};
+
+  // workspce shape
+  workspace_h_ = top_h;  // equals to the output height if stride is 1
+  if (stride_ == 2) {
+    if (is_deconvolution_layer()) {
+      workspace_h_ = octree_.info().node_num(top_depth) / 8;
+    } else {
+      workspace_h_ = octree_.info().node_num(curr_depth_) / 8;
+    }
   }
 
+  // child_h_, ichild_h_, octree_h_ are used for octree2col/col2octree
+  // only if nempty_ is True.
+  if (nempty_) {
+    child_h_ = octree_.info().node_num(workspace_depth_);
+    ichild_h_ = octree_.info().node_num_nempty(workspace_depth_);
+
+    // octree_h is the height of octree data for octree2col/col2octree
+    octree_h_ = octree_.info().node_num_nempty(curr_depth_);
+    if (stride_ == 2 && is_deconvolution_layer()) {
+      octree_h_ = octree_.info().node_num_nempty(top_depth);
+    }
+  }
+
   // workspace number and workspace actual shape
   workspace_n_ = 1;
   workspace_ha_ = workspace_h_;
-  uint64 ideal_size = (uint64) workspace_h_ * (uint64) kernel_dim_;
+  uint64 ideal_size = (uint64)workspace_h_ * (uint64)kernel_dim_;
   if (ideal_size > MAX_SIZE && !is_1x1_) {
     workspace_n_ = (ideal_size + MAX_SIZE - 1) / MAX_SIZE;
     workspace_ha_ = (workspace_h_ + workspace_n_ - 1) / workspace_n_;
   }
-  workspace_shape_ = vector<int> { kernel_dim_, workspace_ha_};
+  workspace_shape_ = vector<int>{kernel_dim_, workspace_ha_};
 
-  // reshape result_buffer_
+  // result_buffer_ shape
  if (workspace_n_ > 1) {
-    result_buffer_shape_ = vector<int> { conv_out_channels_, workspace_ha_ };
+    result_buffer_shape_ = vector<int>{conv_out_channels_, workspace_ha_};
   } else {
     result_buffer_shape_.clear();
   }
+}
 
-  // reshape data_buffer_
-  if (stride_ == 2) {
-    data_buffer_shape_ = vector<int> { 1, conv_out_channels_, workspace_h_, 1 };
+template <typename Dtype>
+void OctreeBaseConv<Dtype>::octree2col_cpu_wrapper(Dtype* workspace,
+                                                   const Dtype* bottom_data,
+                                                   const int n) {
+  if (!nempty_) {
+    octree2col_cpu<Dtype>(workspace, bottom_data, conv_in_channels_,
+                          workspace_h_, kernel_sdim_, stride_,
+                          octree_.neighbor_cpu(workspace_depth_), ni_cpu_ptr_,
+                          workspace_ha_, n);
   } else {
-    data_buffer_shape_.clear();
+    octree2colP_cpu<Dtype>(workspace, bottom_data, conv_in_channels_,
+                           workspace_h_, octree_h_, kernel_sdim_, stride_,
+                           octree_.neighbor_gpu(workspace_depth_), ni_cpu_ptr_,
+                           child_, ichild_, workspace_ha_, n);
   }
 }
 
+template <typename Dtype>
+void OctreeBaseConv<Dtype>::col2octree_cpu_wrapper(const Dtype* col_diff,
+                                                   Dtype* bottom_diff, int n) {
+  if (!nempty_) {
+    col2octree_cpu<Dtype>(col_diff, bottom_diff, conv_in_channels_,
+                          workspace_h_, kernel_sdim_, stride_,
+                          octree_.neighbor_cpu(workspace_depth_), ni_cpu_ptr_,
+                          workspace_ha_, n);
+  } else {
+    col2octreeP_cpu<Dtype>(col_diff, bottom_diff, conv_in_channels_,
+                           workspace_h_, octree_h_, kernel_sdim_, stride_,
+                           octree_.neighbor_gpu(workspace_depth_), ni_cpu_ptr_,
+                           child_, ichild_, workspace_ha_, n);
+  }
+}
+
 template <typename Dtype>
 void OctreeBaseConv<Dtype>::forward_cpu_gemm(Dtype* top_data,
-    const Dtype* bottom_data, const Dtype* weights) {
+                                             const Dtype* bottom_data,
+                                             const Dtype* weights) {
   const Dtype* col_data = bottom_data;
   Dtype* result_data = workspace_n_ == 1 ? top_data : result_buffer_;
   for (int n = 0; n < workspace_n_; ++n) {
     if (!is_1x1_) {
-      octree2col_cpu<Dtype>(workspace_,
-          bottom_data, conv_in_channels_, workspace_h_, kernel_sdim_,
-          stride_, octree_.neighbor_cpu(workspace_depth_),
-          ni_cpu_ptr_, workspace_ha_, n);
+      octree2col_cpu_wrapper(workspace_, bottom_data, n);
       col_data = workspace_;
     }
 
-    engine_cpu_->gemm(false, false, conv_out_channels_,
-        workspace_ha_, kernel_dim_, Dtype(1.0), weights, col_data,
-        Dtype(0), result_data);
+    engine_cpu_->gemm(false, false, conv_out_channels_, workspace_ha_,
+                      kernel_dim_, Dtype(1.0), weights, col_data, Dtype(0),
+                      result_data);
 
     if (workspace_n_ == 1) return;
     int num = std::min(workspace_ha_, workspace_h_ - n * workspace_ha_);
     for (int c = 0; c < conv_out_channels_; ++c) {
       memcpy_cpu(num, result_data + c * workspace_ha_,
-          top_data + c * workspace_h_ + n * workspace_ha_);
+                 top_data + c * workspace_h_ + n * workspace_ha_);
     }
   }
 }
 
 template <typename Dtype>
 void OctreeBaseConv<Dtype>::backward_cpu_gemm(Dtype* bottom_diff,
-    const Dtype* top_diff, const Dtype* weights) {
+                                              const Dtype* top_diff,
+                                              const Dtype* weights) {
   Dtype* col_diff = is_1x1_ ? bottom_diff : workspace_;
   for (int n = 0; n < workspace_n_; ++n) {
     const Dtype* result_buffer = top_diff;
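The chunking arithmetic in reshape() above is easy to misread, so here is a standalone restatement (an illustration, not repository code) of how the col buffer is split once it would exceed MAX_SIZE elements, followed by a worked example with made-up numbers:

    #include <cstdint>

    // Same rule as in reshape(): if the full col buffer (kernel_dim x workspace_h)
    // would exceed max_size elements, process it in workspace_n chunks of
    // workspace_ha rows each (the last chunk may be shorter).
    void split_workspace(int64_t workspace_h, int64_t kernel_dim, int64_t max_size,
                         bool is_1x1, int& workspace_n, int& workspace_ha) {
      workspace_n = 1;
      workspace_ha = static_cast<int>(workspace_h);
      int64_t ideal_size = workspace_h * kernel_dim;
      if (ideal_size > max_size && !is_1x1) {
        workspace_n = static_cast<int>((ideal_size + max_size - 1) / max_size);  // ceil
        workspace_ha = static_cast<int>((workspace_h + workspace_n - 1) / workspace_n);
      }
    }
    // Example (illustrative values): workspace_h = 500000 and kernel_dim = 27 * 64 = 1728
    // give ideal_size = 864000000; with max_size = 256 * 1024 * 1024 this yields
    // workspace_n = 4 and workspace_ha = 125000.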
@@ -131,28 +191,25 @@ void OctreeBaseConv<Dtype>::backward_cpu_gemm(Dtype* bottom_diff,
       int num = std::min(workspace_ha_, workspace_h_ - n * workspace_ha_);
       for (int c = 0; c < conv_out_channels_; ++c) {
         memcpy_cpu(num, top_diff + c * workspace_h_ + n * workspace_ha_,
-            buffer_ + c * workspace_ha_);
+                   buffer_ + c * workspace_ha_);
       }
       result_buffer = result_buffer_;
     }
 
-    engine_cpu_->gemm(true, false, kernel_dim_,
-        workspace_ha_, conv_out_channels_, Dtype(1.0), weights,
-        result_buffer, Dtype(0.0), col_diff);
+    engine_cpu_->gemm(true, false, kernel_dim_, workspace_ha_,
+                      conv_out_channels_, Dtype(1.0), weights, result_buffer,
+                      Dtype(0.0), col_diff);
 
     if (!is_1x1_) {
-      col2octree_cpu<Dtype>(col_diff, bottom_diff,
-          conv_in_channels_, workspace_h_, kernel_sdim_,
-          stride_, octree_.neighbor_cpu(workspace_depth_),
-          ni_cpu_ptr_, workspace_ha_, n);
+      col2octree_cpu_wrapper(col_diff, bottom_diff, n);
     }
   }
 }
 
-
 template <typename Dtype>
 void OctreeBaseConv<Dtype>::weight_cpu_gemm(Dtype* weights_diff,
-    const Dtype* bottom_data, const Dtype* top_diff) {
+                                            const Dtype* bottom_data,
+                                            const Dtype* top_diff) {
   int num = num_elements(weights_shape_);
   memset_cpu(num, Dtype(0), weights_diff);
 
@@ -160,10 +217,7 @@ void OctreeBaseConv<Dtype>::weight_cpu_gemm(Dtype* weights_diff,
   const Dtype* result_buffer = top_diff;
   for (int n = 0; n < workspace_n_; ++n) {
     if (!is_1x1_) {
-      octree2col_cpu<Dtype>(workspace_,
-          bottom_data, conv_in_channels_, workspace_h_, kernel_sdim_,
-          stride_, octree_.neighbor_cpu(workspace_depth_),
-          ni_cpu_ptr_, workspace_ha_, n);
+      octree2col_cpu_wrapper(workspace_, bottom_data, n);
       col_data = workspace_;
     }
 
@@ -172,49 +226,81 @@ void OctreeBaseConv<Dtype>::weight_cpu_gemm(Dtype* weights_diff,
       Dtype* buffer = result_buffer_;
       for (int c = 0; c < conv_out_channels_; ++c) {
         memcpy_cpu(num, top_diff + c * workspace_h_ + n * workspace_ha_,
-            buffer + c * workspace_ha_);
+                   buffer + c * workspace_ha_);
       }
       result_buffer = result_buffer_;
     }
 
-    engine_cpu_->gemm(false, true, conv_out_channels_,
-        kernel_dim_, workspace_ha_, Dtype(1.0), result_buffer, col_data,
-        Dtype(1.0), weights_diff);
+    engine_cpu_->gemm(false, true, conv_out_channels_, kernel_dim_,
+                      workspace_ha_, Dtype(1.0), result_buffer, col_data,
+                      Dtype(1.0), weights_diff);
   }
 }
 
 #ifdef USE_CUDA
 
+template <typename Dtype>
+void OctreeBaseConv<Dtype>::octree2col_gpu_wrapper(Dtype* workspace,
+                                                   const Dtype* bottom_data,
+                                                   const int n) {
+  if (!nempty_) {
+    octree2col_gpu<Dtype>(workspace, bottom_data, conv_in_channels_,
+                          workspace_h_, kernel_sdim_, stride_,
+                          octree_.neighbor_gpu(workspace_depth_), ni_gpu_ptr_,
+                          workspace_ha_, n);
+  } else {
+    octree2colP_gpu<Dtype>(workspace, bottom_data, conv_in_channels_,
+                           workspace_h_, octree_h_, kernel_sdim_, stride_,
+                           octree_.neighbor_gpu(workspace_depth_), ni_gpu_ptr_,
+                           child_, ichild_, workspace_ha_, n);
+  }
+}
+
+template <typename Dtype>
+void OctreeBaseConv<Dtype>::col2octree_gpu_wrapper(const Dtype* col_diff,
+                                                   Dtype* bottom_diff, int n) {
+  if (!nempty_) {
+    col2octree_gpu<Dtype>(col_diff, bottom_diff, conv_in_channels_,
+                          workspace_h_, kernel_sdim_, stride_,
+                          octree_.neighbor_gpu(workspace_depth_), ni_gpu_ptr_,
+                          workspace_ha_, n);
+  } else {
+    col2octreeP_gpu<Dtype>(col_diff, bottom_diff, conv_in_channels_,
+                           workspace_h_, octree_h_, kernel_sdim_, stride_,
+                           octree_.neighbor_gpu(workspace_depth_), ni_gpu_ptr_,
+                           child_, ichild_, workspace_ha_, n);
+  }
+}
+
 template <typename Dtype>
 void OctreeBaseConv<Dtype>::forward_gpu_gemm(Dtype* top_data,
-    const Dtype* bottom_data, const Dtype* weights) {
+                                             const Dtype* bottom_data,
+                                             const Dtype* weights) {
   const Dtype* col_data = bottom_data;
   Dtype* result_data = workspace_n_ == 1 ? top_data : result_buffer_;
   for (int n = 0; n < workspace_n_; ++n) {
     if (!is_1x1_) {
-      octree2col_gpu<Dtype>(workspace_,
-          bottom_data, conv_in_channels_, workspace_h_, kernel_sdim_,
-          stride_, octree_.neighbor_gpu(workspace_depth_),
-          ni_gpu_ptr_, workspace_ha_, n);
+      octree2col_gpu_wrapper(workspace_, bottom_data, n);
       col_data = workspace_;
     }
 
-    engine_gpu_->gemm(false, false, conv_out_channels_,
-        workspace_ha_, kernel_dim_, Dtype(1.0), weights, col_data,
-        Dtype(0), result_data);
+    engine_gpu_->gemm(false, false, conv_out_channels_, workspace_ha_,
+                      kernel_dim_, Dtype(1.0), weights, col_data, Dtype(0),
+                      result_data);
 
     if (workspace_n_ == 1) return;
     int num = std::min(workspace_ha_, workspace_h_ - n * workspace_ha_);
     for (int c = 0; c < conv_out_channels_; ++c) {
       memcpy_gpu(num, result_data + c * workspace_ha_,
-          top_data + c * workspace_h_ + n * workspace_ha_);
+                 top_data + c * workspace_h_ + n * workspace_ha_);
     }
   }
 }
 
 template <typename Dtype>
 void OctreeBaseConv<Dtype>::backward_gpu_gemm(Dtype* bottom_diff,
-    const Dtype* top_diff, const Dtype* weights) {
+                                              const Dtype* top_diff,
+                                              const Dtype* weights) {
   Dtype* col_diff = is_1x1_ ? bottom_diff : workspace_;
   for (int n = 0; n < workspace_n_; ++n) {
     const Dtype* result_buffer = top_diff;
@@ -223,28 +309,25 @@ void OctreeBaseConv<Dtype>::backward_gpu_gemm(Dtype* bottom_diff,
       int num = std::min(workspace_ha_, workspace_h_ - n * workspace_ha_);
       for (int c = 0; c < conv_out_channels_; ++c) {
         memcpy_gpu(num, top_diff + c * workspace_h_ + n * workspace_ha_,
-            buffer_ + c * workspace_ha_);
+                   buffer_ + c * workspace_ha_);
       }
       result_buffer = result_buffer_;
     }
 
-    engine_gpu_->gemm(true, false, kernel_dim_,
-        workspace_ha_, conv_out_channels_, Dtype(1.0), weights,
-        result_buffer, Dtype(0.0), col_diff);
+    engine_gpu_->gemm(true, false, kernel_dim_, workspace_ha_,
+                      conv_out_channels_, Dtype(1.0), weights, result_buffer,
+                      Dtype(0.0), col_diff);
 
     if (!is_1x1_) {
-      col2octree_gpu<Dtype>(col_diff, bottom_diff,
-          conv_in_channels_, workspace_h_, kernel_sdim_,
-          stride_, octree_.neighbor_gpu(workspace_depth_),
-          ni_gpu_ptr_, workspace_ha_, n);
+      col2octree_gpu_wrapper(col_diff, bottom_diff, n);
     }
   }
 }
 
-
 template <typename Dtype>
 void OctreeBaseConv<Dtype>::weight_gpu_gemm(Dtype* weights_diff,
-    const Dtype* bottom_data, const Dtype* top_diff) {
+                                            const Dtype* bottom_data,
+                                            const Dtype* top_diff) {
   int num = num_elements(weights_shape_);
   memset_gpu(num, Dtype(0), weights_diff);
 
@@ -252,10 +335,7 @@ void OctreeBaseConv<Dtype>::weight_gpu_gemm(Dtype* weights_diff,
   const Dtype* result_buffer = top_diff;
   for (int n = 0; n < workspace_n_; ++n) {
     if (!is_1x1_) {
-      octree2col_gpu<Dtype>(workspace_,
-          bottom_data, conv_in_channels_, workspace_h_, kernel_sdim_,
-          stride_, octree_.neighbor_gpu(workspace_depth_),
-          ni_gpu_ptr_, workspace_ha_, n);
+      octree2col_gpu_wrapper(workspace_, bottom_data, n);
       col_data = workspace_;
     }
 
@@ -264,18 +344,18 @@ void OctreeBaseConv<Dtype>::weight_gpu_gemm(Dtype* weights_diff,
       Dtype* buffer = result_buffer_;
       for (int c = 0; c < conv_out_channels_; ++c) {
         memcpy_gpu(num, top_diff + c * workspace_h_ + n * workspace_ha_,
-            buffer + c * workspace_ha_);
+                   buffer + c * workspace_ha_);
       }
       result_buffer = result_buffer_;
     }
 
-    engine_gpu_->gemm(false, true, conv_out_channels_,
-        kernel_dim_, workspace_ha_, Dtype(1.0), result_buffer, col_data,
-        Dtype(1.0), weights_diff);
+    engine_gpu_->gemm(false, true, conv_out_channels_, kernel_dim_,
+                      workspace_ha_, Dtype(1.0), result_buffer, col_data,
+                      Dtype(1.0), weights_diff);
   }
 }
 
-#endif // USE_CUDA
+#endif  // USE_CUDA
 
 template class OctreeBaseConv<float>;
 template class OctreeBaseConv<double>;
@@ -13,11 +13,13 @@ template <typename Dtype>
 class OctreeBaseConv {
  public:
   explicit OctreeBaseConv(int max_size = 256 * 1024 * 1024)
-      : MAX_SIZE(max_size), engine_cpu_(nullptr), engine_gpu_(nullptr) {}
+      : MAX_SIZE(max_size), engine_cpu_(nullptr), engine_gpu_(nullptr),
+        nempty_(false), child_(nullptr), ichild_(nullptr) {}
   void setup(const vector<int>& kernel_size, const int stride,
-      const int curr_depth, const int channel_in, const int channel_out);
-  // after setup() and before reshpae(),
-  // please set engine_cpu/gpu_, octree_ and ni_gpu_ptr_
+             const int curr_depth, const int channel_in, const int channel_out,
+             const bool nempty = false);
+  // !!! Please set engine_cpu/gpu_, octree_ and ni_gpu_ptr_
+  // after calling setup() and before reshpae(),
   void reshape();
 
  protected:
@@ -40,11 +42,16 @@ class OctreeBaseConv {
   void weight_gpu_gemm(Dtype* weights_diff, const Dtype* bottom_data,
                        const Dtype* top_diff);
 
+  void octree2col_cpu_wrapper(Dtype* workspace, const Dtype* bottom_data, int n);
+  void col2octree_cpu_wrapper(const Dtype* workspace, Dtype* bottom_data, int n);
+  void octree2col_gpu_wrapper(Dtype* workspace, const Dtype* bottom_data, int n);
+  void col2octree_gpu_wrapper(const Dtype* workspace, Dtype* bottom_data, int n);
+
  protected:
   int stride_;
   vector<int> kernel_size_;
   int kernel_dim_;
-  int kernel_sdim_; // spatial dim of the kernel
+  int kernel_sdim_;  // spatial dim of the kernel
   bool is_1x1_;
 
   // input channel & output channel
@@ -59,19 +66,17 @@ class OctreeBaseConv {
   OctreeParser octree_;
 
   int workspace_n_;
-  int workspace_ha_; // actual worksapce h
-  int workspace_h_; // ideal workspace h
-  int workspace_depth_;
+  int workspace_ha_;     // actual worksapce h, the height of `col` data
+  int workspace_h_;      // ideal workspace h
+  int workspace_depth_;  // the depth value used for octree2col
 
   vector<int> top_shape_;
   vector<int> weights_shape_;
   vector<int> workspace_shape_;
-  vector<int> data_buffer_shape_;
   vector<int> result_buffer_shape_;
 
   Dtype* workspace_;
-  Dtype* data_buffer_;
-  Dtype* result_buffer_;
+  Dtype* result_buffer_;  // hold the temporary result of octree2col
 
   const int* ni_cpu_ptr_; // hold cpu data from NeighHelper::get_ni(kernel_size_)
   const int* ni_gpu_ptr_; // hold gpu data from NeighHelper::get_ni(kernel_size_)
@@ -80,6 +85,15 @@ class OctreeBaseConv {
 
   GEMMEngine<Dtype>* engine_cpu_;
   GEMMEngine<Dtype>* engine_gpu_;
+
+  bool nempty_;  // perform convolution on non-empty voxels
+
+  // used for octree2col and col2octree on non-empty voxels
+  int octree_h_;  // the height of octree data
+  int child_h_;
+  int ichild_h_;
+  const int* child_;
+  const int* ichild_;
 };
 
 }  // namespace octree
@@ -41,6 +41,9 @@ class OctreeInfo {
   int node_num(int d) const { return nnum_[d]; }
   int node_num_cum(int d) const { return nnum_cum_[d]; }
   int node_num_nempty(int d) const { return nnum_nempty_[d]; }
+  const int* node_num_ptr() const { return nnum_; }
+  const int* node_nempty_ptr() const { return nnum_nempty_; }
+  const int* node_num_cum_ptr() const { return nnum_cum_; }
   int total_nnum() const { return nnum_cum_[depth_ + 1]; }
   int total_nnum_capacity() const { return nnum_cum_[depth_ + 2]; }
   int content_flags() const { return content_flags_; }
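A short, hedged usage sketch of the new accessors (a fragment, not from the diff; `info` stands for an OctreeInfo instance obtained elsewhere): they expose the per-depth counter arrays as raw pointers, e.g. for copying all depths at once instead of querying node_num(d) in a loop.

    // Copy the per-depth node counts for depths 0..info.depth() in one go.
    // The pointer-based accessors return the same data that node_num(d),
    // node_num_nempty(d) and node_num_cum(d) read element-wise.
    const int* nnum = info.node_num_ptr();
    const int* nnum_nempty = info.node_nempty_ptr();
    std::vector<int> counts(nnum, nnum + info.depth() + 1);
    std::vector<int> counts_nempty(nnum_nempty, nnum_nempty + info.depth() + 1);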
@@ -10,7 +10,7 @@ void NeighHelper::init_neigh_index() {
     { "331", 6 }, { "313", 7 }, { "133", 8 } };
 
   const vector<vector<int> > vec{ {} /* 333, 27 */, { 13 } /* 111, 1 */,
-    { 13, 14, 16, 17, 22, 23, 25, 26 } /* 222, 8 */,
+    { 13, 14, 16, 17, 22, 23, 25, 26 } /* 222, 8, 8 octants */,
     { 4, 13, 22 } /* 311, 3 */,
     { 10, 13, 16 } /* 131, 3 */,
     { 12, 13, 14 } /* 113, 3 */,
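The index lists above address positions inside the 3x3x3 neighborhood, flattened with the first kernel dimension as the slowest-varying axis (index = a * 9 + b * 3 + c, each coordinate in {0, 1, 2}; the center is 13). A tiny standalone check (not repository code) reproduces the "222" entry:

    #include <cstdio>
    int main() {
      // The "222" kernel keeps the eight offsets with every coordinate in {1, 2}.
      for (int a = 1; a <= 2; ++a)
        for (int b = 1; b <= 2; ++b)
          for (int c = 1; c <= 2; ++c)
            printf("%d ", a * 9 + b * 3 + c);  // prints: 13 14 16 17 22 23 25 26
      return 0;
    }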
@@ -825,6 +825,7 @@ template void memset_gpu<double>(const size_t N, const double alpha, double* Y);
 template void memset_gpu<char>(const size_t N, const char alpha, char* Y);
 template void memset_gpu<int8_t>(const size_t N, const int8_t alpha, int8_t* Y);
+template void memset_gpu<uint8_t>(const size_t N, const uint8_t alpha, uint8_t* Y);
 template void memcpy_gpu<char>(const size_t N, const char* X, char* Y);
 template void memcpy_gpu<int>(const size_t N, const int* X, int* Y);
 template void memcpy_gpu<int64_t>(const size_t N, const int64_t* X, int64_t* Y);
 template void memcpy_gpu<int16_t>(const size_t N, const int16_t* X, int16_t* Y);
@@ -844,6 +845,8 @@ template void pad_backward_gpu<double>(double* X, const int Hx, const int Cx,
     const double* Y, const int Hy, const int* label);
 template void pad_backward_gpu<int>(int* X, const int Hx, const int Cx,
     const int* Y, const int Hy, const int* label);
+template void pad_backward_gpu<uintk>(uintk* X, const int Hx, const int Cx,
+    const uintk* Y, const int Hy, const int* label);
 template void octree2col_gpu<float>(float* data_col, const float* data_octree,
     const int channel, const int height, const int kernel_sdim, const int stride,
     const int* neigh, const int* ni, const int height_col, const int n);
@@ -150,7 +150,7 @@ void PointsParser::transform(const float* mat) {
   }
 }
 
-void PointsParser::clip(const float* bbmin, const float* bbmax) {
+vector<int> PointsParser::clip(const float* bbmin, const float* bbmax) {
   int npt = info_->pt_num(), npt_in_bbox = 0;
   float* pts = mutable_points();
   vector<int> in_bbox(npt, 0);
@@ -162,7 +162,10 @@ void PointsParser::clip(const float* bbmin, const float* bbmax) {
     npt_in_bbox += in_bbox[i];
   }
 
-  if (npt_in_bbox == npt) return; // early stop
+  if (npt_in_bbox == npt) {  // early stop
+    return in_bbox;
+  }
+
   if (npt_in_bbox == 0) { // no points
     // just keep one point to avoid the degenerated case
     npt_in_bbox = 1;
@@ -171,7 +174,7 @@ void PointsParser::clip(const float* bbmin, const float* bbmax) {
     for (int i = 0; i < 3; ++i) { p[i] = bbmin[i]; }
   }
 
-  // Just discard the points which are out of the bbox
+  // discard the points which are out of the bbox
   for (int t = 0; t < PointsInfo::kPTypeNum; ++t) {
     auto ptype = static_cast<PointsInfo::PropType>(1 << t);
     int channel = info_->channel(ptype);
@@ -188,6 +191,7 @@ void PointsParser::clip(const float* bbmin, const float* bbmax) {
   }
 
   info_->set_pt_num(npt_in_bbox);
+  return in_bbox;
 }
 
 void PointsParser::add_noise(const float std_pt, const float std_nm) {
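Since clip() now returns the in_bbox mask, a caller can keep parallel per-point data in sync with the clipped point set. A hedged sketch, not from the diff; parser, bbmin, bbmax and labels are assumed to exist in the caller, and a 1 marks a point inside the bbox that is kept, in the original point order (the degenerate all-outside case, where one fallback point is kept, is ignored here):

    std::vector<int> mask = parser.clip(bbmin, bbmax);
    std::vector<int> kept_labels;
    for (size_t i = 0; i < mask.size(); ++i) {
      if (mask[i] != 0) { kept_labels.push_back(labels[i]); }
    }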
@@ -39,7 +39,7 @@ class PointsParser {
   void rotate(const float angle, const float* axis); // angle in radian
   void rotate(const float* angles);
   void transform(const float* trans_matrix);
-  void clip(const float* bbmin, const float* bbmax);
+  vector<int> clip(const float* bbmin, const float* bbmax);
   void add_noise(const float std_pt, const float std_nm);
   void normalize(); // translate and scale the points to unit sphere
   void orient_normal(const string axis);