refine categorical split (#919)

* refine categorical split * add test
2017-09-28 12:29:18 +08:00 · 2017-09-28 12:29:18 +08:00 · cc11525d26
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@ -224,7 +224,12 @@ public:
  int gpu_device_id = -1;
  /*! \brief Set to true to use double precision math on GPU (default using single precision) */
  bool gpu_use_dp = false;
+  int max_cat_group = 64;
+  int min_data_per_group = 10;
  int max_cat_threshold = 256;
+  double cat_smooth_ratio = 0.01;
+  double min_cat_smooth = 5;
+  double max_cat_smooth = 100;
  LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
 };

@ -463,8 +468,8 @@ struct ParameterAlias {
      "snapshot_freq", "verbosity", "sparse_threshold", "enable_load_from_binary_file",
      "max_conflict_rate", "poisson_max_delta_step", "gaussian_eta",
      "histogram_pool_size", "output_freq", "is_provide_training_metric", "machine_list_filename",
-      "zero_as_missing", "max_cat_threshold",
-      "init_score_file", "valid_init_score_file", "is_predict_contrib"
+      "zero_as_missing", "init_score_file", "valid_init_score_file", "is_predict_contrib",
+      "max_cat_threshold", "max_cat_group", "cat_smooth_ratio", "min_cat_smooth", "max_cat_smooth", "min_data_per_group"
    });
    std::unordered_map<std::string, std::string> tmp_map;
    for (const auto& pair : *params) {
--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@ -370,7 +370,18 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
  GetInt(params, "gpu_platform_id", &gpu_platform_id);
  GetInt(params, "gpu_device_id", &gpu_device_id);
  GetBool(params, "gpu_use_dp", &gpu_use_dp);
+  GetInt(params, "max_cat_group", &max_cat_group);
  GetInt(params, "max_cat_threshold", &max_cat_threshold);
+  GetDouble(params, "cat_smooth_ratio", &cat_smooth_ratio);
+  GetDouble(params, "min_cat_smooth", &min_cat_smooth);
+  GetDouble(params, "max_cat_smooth", &max_cat_smooth);
+  GetInt(params, "min_data_per_group", &min_data_per_group);
+  CHECK(max_cat_group > 1);
+  CHECK(max_cat_threshold > 0);
+  CHECK(cat_smooth_ratio >= 0);
+  CHECK(min_cat_smooth >= 1);
+  CHECK(max_cat_smooth > min_cat_smooth);
+  CHECK(min_data_per_group > 0);
 }

 void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& params) {
--- a/src/treelearner/feature_histogram.hpp
+++ b/src/treelearner/feature_histogram.hpp
@ -106,58 +106,90 @@ public:
    output->default_left = false;
    double best_gain = kMinScore;
    data_size_t best_left_count = 0;
-    double best_sum_left_gradient = 0.0f;
-    double best_sum_left_hessian = 0.0f;
-    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
-                                         meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+    double best_sum_left_gradient = 0;
+    double best_sum_left_hessian = 0;
+    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);

    double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
-    is_splittable_ = false;
-
-    uint32_t best_threshold = 0;
    bool is_full_categorical = meta_->missing_type == MissingType::None;
+    int used_bin = meta_->num_bin - 1;

-    int used_bin = meta_->num_bin - 1 + is_full_categorical;
+    if (is_full_categorical) ++used_bin;

-    // from right to left, and we don't need data in bin0
-    for (int t = 0; t < used_bin; ++t) {
-      // if data not enough, or sum hessian too small
-      if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
-          || data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
-      data_size_t other_count = num_data - data_[t].cnt;
-      // if data not enough
-      if (other_count < meta_->tree_config->min_data_in_leaf) continue;
+    std::vector<int> sorted_idx(used_bin);
+    for (int i = 0; i < used_bin; ++i) sorted_idx[i] = i;

-      double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon;
-      // if sum hessian too small
-      if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+    double smooth_hess = meta_->tree_config->cat_smooth_ratio * num_data;
+    smooth_hess = std::min(meta_->tree_config->max_cat_smooth, std::max(smooth_hess, meta_->tree_config->min_cat_smooth));
+    const double smooth_grad = smooth_hess * sum_gradient / sum_hessian;

-      double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
-      // current split gain
-      double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian,
-                                             meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
-        + GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon,
-                           meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
-      // gain with split is worse than without split
-      if (current_gain <= min_gain_shift) continue;
+    auto ctr_fun = [&smooth_hess, &smooth_grad](double sum_grad, double sum_hess) {
+      return (sum_grad + smooth_grad) / (sum_hess + smooth_hess);
+    };
+    std::sort(sorted_idx.begin(), sorted_idx.end(),
+              [this, &ctr_fun](int i, int j) {
+      return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians);
+    });

-      // mark to is splittable
-      is_splittable_ = true;
-      // better split point
-      if (current_gain > best_gain) {
-        best_threshold = static_cast<uint32_t>(t);
-        best_sum_left_gradient = data_[t].sum_gradients;
-        best_sum_left_hessian = data_[t].sum_hessians + kEpsilon;
-        best_left_count = data_[t].cnt;
-        best_gain = current_gain;
+    std::vector<int> find_direction(1, 1);
+    std::vector<int> start_position(1, 0);
+    if (!is_full_categorical
+        || meta_->tree_config->max_cat_threshold * 2 < meta_->num_bin) {
+      find_direction.push_back(-1);
+      start_position.push_back(used_bin - 1);
+    }
+
+    is_splittable_ = false;
+    int best_threshold = -1;
+    int best_dir = 1;
+    for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) {
+      auto dir = find_direction[out_i];
+      auto start_pos = start_position[out_i];
+      data_size_t rest_group = meta_->tree_config->max_cat_group;
+      data_size_t min_data_per_group = std::max(meta_->tree_config->min_data_per_group, num_data / rest_group);
+      data_size_t cnt_cur_group = 0;
+      double sum_left_gradient = 0.0f;
+      double sum_left_hessian = kEpsilon;
+      data_size_t left_count = 0;
+      for (int i = 0; i < used_bin && i < meta_->tree_config->max_cat_threshold; ++i) {
+        auto t = sorted_idx[start_pos];
+        start_pos += dir;
+
+        sum_left_gradient += data_[t].sum_gradients;
+        sum_left_hessian += data_[t].sum_hessians;
+        left_count += data_[t].cnt;
+        cnt_cur_group += data_[t].cnt;
+
+        if (left_count < meta_->tree_config->min_data_in_leaf
+            || sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+        data_size_t right_count = num_data - left_count;
+        if (right_count < meta_->tree_config->min_data_in_leaf || right_count < min_data_per_group) break;
+
+        double sum_right_hessian = sum_hessian - sum_left_hessian;
+        if (sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
+
+        if (cnt_cur_group < min_data_per_group) continue;
+
+        cnt_cur_group = 0;
+        if (--rest_group > 0) min_data_per_group = std::max(meta_->tree_config->min_data_per_group, right_count / rest_group);
+
+        double sum_right_gradient = sum_gradient - sum_left_gradient;
+        double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+          + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+        if (current_gain <= min_gain_shift) continue;
+        is_splittable_ = true;
+        if (current_gain > best_gain) {
+          best_left_count = left_count;
+          best_sum_left_gradient = sum_left_gradient;
+          best_sum_left_hessian = sum_left_hessian;
+          best_threshold = i;
+          best_gain = current_gain;
+          best_dir = dir;
+        }
      }
    }

    if (is_splittable_) {
-      // update split information
-      output->num_cat_threshold = 1;
-      output->cat_threshold.resize(output->num_cat_threshold);
-      output->cat_threshold[0] = best_threshold;
      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
                                                        meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
      output->left_count = best_left_count;
@ -170,6 +202,17 @@ public:
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
      output->gain = best_gain - min_gain_shift;
+      output->num_cat_threshold = best_threshold + 1;
+      output->cat_threshold = std::vector<uint32_t>(output->num_cat_threshold);
+      if (best_dir == 1) {
+        for (int i = 0; i < output->num_cat_threshold; ++i) {
+          output->cat_threshold[i] = sorted_idx[i];
+        }
+      } else {
+        for (int i = 0; i < output->num_cat_threshold; ++i) {
+          output->cat_threshold[i] = sorted_idx[used_bin - 1 - i];
+        }
+      }
    }
  }

@ -287,7 +330,7 @@ private:
          best_gain = current_gain;
        }
      }
-    } else{
+    } else {
      double sum_left_gradient = 0.0f;
      double sum_left_hessian = kEpsilon;
      data_size_t left_count = 0;
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@ -132,7 +132,7 @@ class TestEngine(unittest.TestCase):
        lgb_eval = lgb.Dataset(X_train, y_train)

        params = {
-            'objective': 'binary',
+            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
@ -149,7 +149,7 @@ class TestEngine(unittest.TestCase):
                        verbose_eval=True,
                        evals_result=evals_result)
        pred = gbm.predict(X_train)
-        self.assertAlmostEqual(pred[-1], pred[0], places=5)
+        np.testing.assert_almost_equal(pred, y)

    def test_missing_value_handle_zero(self):
        x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
@ -161,7 +161,7 @@ class TestEngine(unittest.TestCase):
        lgb_eval = lgb.Dataset(X_train, y_train)

        params = {
-            'objective': 'binary',
+            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
@ -178,8 +178,7 @@ class TestEngine(unittest.TestCase):
                        verbose_eval=True,
                        evals_result=evals_result)
        pred = gbm.predict(X_train)
-        self.assertAlmostEqual(pred[-1], pred[-2], places=5)
-        self.assertAlmostEqual(pred[-1], pred[0], places=5)
+        np.testing.assert_almost_equal(pred, y)

    def test_missing_value_handle_none(self):
        x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
@ -191,7 +190,7 @@ class TestEngine(unittest.TestCase):
        lgb_eval = lgb.Dataset(X_train, y_train)

        params = {
-            'objective': 'binary',
+            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
@ -211,6 +210,37 @@ class TestEngine(unittest.TestCase):
        self.assertAlmostEqual(pred[0], pred[1], places=5)
        self.assertAlmostEqual(pred[-1], pred[0], places=5)

+    def test_categorical_handle(self):  # one categorical feature, one tree: predictions must reproduce an alternating target
+        x = [0, 1, 2, 3, 4, 5, 6, 7]  # eight distinct category codes, one row per code
+        y = [0, 1, 0, 1, 0, 1, 0, 1]  # target alternates with the code, so no contiguous range of codes isolates one target value

+        X_train = np.array(x).reshape(len(x), 1)  # column vector: len(x) rows, a single feature
+        y_train = np.array(y)
+        lgb_train = lgb.Dataset(X_train, y_train)
+        lgb_eval = lgb.Dataset(X_train, y_train)  # validation set is built from the same rows as the training set

+        params = {
+            'objective': 'regression',
+            'metric': 'auc',  # NOTE(review): the recorded metric is never inspected below; confirm 'auc' is intended with a regression objective
+            'verbose': -1,
+            'boost_from_average': False,
+            'min_data': 1,
+            'num_leaves': 2,  # at most two leaves, hence at most one split
+            'learning_rate': 1,
+            'min_data_in_bin': 1,
+            'min_data_per_group': 1,  # parameter introduced by this change (config default is 10); presumably lowered because each category has a single row
+            'zero_as_missing': True,
+            'categorical_column': 0  # the only column is declared categorical
+        }
+        evals_result = {}  # handed to lgb.train; not asserted on afterwards
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=1,  # a single boosting round, so the prediction comes from one tree
+                        valid_sets=lgb_eval,
+                        verbose_eval=True,
+                        evals_result=evals_result)
+        pred = gbm.predict(X_train)
+        np.testing.assert_almost_equal(pred, y)  # with one two-leaf tree this can hold only if codes {0,2,4,6} and {1,3,5,7} land in different leaves
+
    def test_multiclass(self):
        X, y = load_digits(10, True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)