зеркало из https://github.com/microsoft/LightGBM.git
Родитель
b1b24ee2f6
Коммит
cc11525d26
|
@ -224,7 +224,12 @@ public:
|
|||
int gpu_device_id = -1;
|
||||
/*! \brief Set to true to use double precision math on GPU (default using single precision) */
|
||||
bool gpu_use_dp = false;
|
||||
int max_cat_group = 64;
|
||||
int min_data_per_group = 10;
|
||||
int max_cat_threshold = 256;
|
||||
double cat_smooth_ratio = 0.01;
|
||||
double min_cat_smooth = 5;
|
||||
double max_cat_smooth = 100;
|
||||
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
|
||||
};
|
||||
|
||||
|
@ -463,8 +468,8 @@ struct ParameterAlias {
|
|||
"snapshot_freq", "verbosity", "sparse_threshold", "enable_load_from_binary_file",
|
||||
"max_conflict_rate", "poisson_max_delta_step", "gaussian_eta",
|
||||
"histogram_pool_size", "output_freq", "is_provide_training_metric", "machine_list_filename",
|
||||
"zero_as_missing", "max_cat_threshold",
|
||||
"init_score_file", "valid_init_score_file", "is_predict_contrib"
|
||||
"zero_as_missing", "init_score_file", "valid_init_score_file", "is_predict_contrib",
|
||||
"max_cat_threshold", "max_cat_group", "cat_smooth_ratio", "min_cat_smooth", "max_cat_smooth", "min_data_per_group"
|
||||
});
|
||||
std::unordered_map<std::string, std::string> tmp_map;
|
||||
for (const auto& pair : *params) {
|
||||
|
|
|
@ -370,7 +370,18 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
|
|||
GetInt(params, "gpu_platform_id", &gpu_platform_id);
|
||||
GetInt(params, "gpu_device_id", &gpu_device_id);
|
||||
GetBool(params, "gpu_use_dp", &gpu_use_dp);
|
||||
GetInt(params, "max_cat_group", &max_cat_group);
|
||||
GetInt(params, "max_cat_threshold", &max_cat_threshold);
|
||||
GetDouble(params, "cat_smooth_ratio", &cat_smooth_ratio);
|
||||
GetDouble(params, "min_cat_smooth", &min_cat_smooth);
|
||||
GetDouble(params, "max_cat_smooth", &max_cat_smooth);
|
||||
GetInt(params, "min_data_per_group", &min_data_per_group);
|
||||
CHECK(max_cat_group > 1);
|
||||
CHECK(max_cat_threshold > 0);
|
||||
CHECK(cat_smooth_ratio >= 0);
|
||||
CHECK(min_cat_smooth >= 1);
|
||||
CHECK(max_cat_smooth > min_cat_smooth);
|
||||
CHECK(min_data_per_group > 0);
|
||||
}
|
||||
|
||||
void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& params) {
|
||||
|
|
|
@ -106,58 +106,90 @@ public:
|
|||
output->default_left = false;
|
||||
double best_gain = kMinScore;
|
||||
data_size_t best_left_count = 0;
|
||||
double best_sum_left_gradient = 0.0f;
|
||||
double best_sum_left_hessian = 0.0f;
|
||||
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
|
||||
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
|
||||
double best_sum_left_gradient = 0;
|
||||
double best_sum_left_hessian = 0;
|
||||
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
|
||||
|
||||
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
|
||||
is_splittable_ = false;
|
||||
|
||||
uint32_t best_threshold = 0;
|
||||
bool is_full_categorical = meta_->missing_type == MissingType::None;
|
||||
int used_bin = meta_->num_bin - 1;
|
||||
|
||||
int used_bin = meta_->num_bin - 1 + is_full_categorical;
|
||||
if (is_full_categorical) ++used_bin;
|
||||
|
||||
// from right to left, and we don't need data in bin0
|
||||
for (int t = 0; t < used_bin; ++t) {
|
||||
// if data not enough, or sum hessian too small
|
||||
if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
|
||||
|| data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
|
||||
data_size_t other_count = num_data - data_[t].cnt;
|
||||
// if data not enough
|
||||
if (other_count < meta_->tree_config->min_data_in_leaf) continue;
|
||||
std::vector<int> sorted_idx(used_bin);
|
||||
for (int i = 0; i < used_bin; ++i) sorted_idx[i] = i;
|
||||
|
||||
double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon;
|
||||
// if sum hessian too small
|
||||
if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
|
||||
double smooth_hess = meta_->tree_config->cat_smooth_ratio * num_data;
|
||||
smooth_hess = std::min(meta_->tree_config->max_cat_smooth, std::max(smooth_hess, meta_->tree_config->min_cat_smooth));
|
||||
const double smooth_grad = smooth_hess * sum_gradient / sum_hessian;
|
||||
|
||||
double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
|
||||
// current split gain
|
||||
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian,
|
||||
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
|
||||
+ GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon,
|
||||
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
|
||||
// gain with split is worse than without split
|
||||
if (current_gain <= min_gain_shift) continue;
|
||||
auto ctr_fun = [&smooth_hess, &smooth_grad](double sum_grad, double sum_hess) {
|
||||
return (sum_grad + smooth_grad) / (sum_hess + smooth_hess);
|
||||
};
|
||||
std::sort(sorted_idx.begin(), sorted_idx.end(),
|
||||
[this, &ctr_fun](int i, int j) {
|
||||
return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians);
|
||||
});
|
||||
|
||||
// mark to is splittable
|
||||
is_splittable_ = true;
|
||||
// better split point
|
||||
if (current_gain > best_gain) {
|
||||
best_threshold = static_cast<uint32_t>(t);
|
||||
best_sum_left_gradient = data_[t].sum_gradients;
|
||||
best_sum_left_hessian = data_[t].sum_hessians + kEpsilon;
|
||||
best_left_count = data_[t].cnt;
|
||||
best_gain = current_gain;
|
||||
std::vector<int> find_direction(1, 1);
|
||||
std::vector<int> start_position(1, 0);
|
||||
if (!is_full_categorical
|
||||
|| meta_->tree_config->max_cat_threshold * 2 < meta_->num_bin) {
|
||||
find_direction.push_back(-1);
|
||||
start_position.push_back(used_bin - 1);
|
||||
}
|
||||
|
||||
is_splittable_ = false;
|
||||
int best_threshold = -1;
|
||||
int best_dir = 1;
|
||||
for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) {
|
||||
auto dir = find_direction[out_i];
|
||||
auto start_pos = start_position[out_i];
|
||||
data_size_t rest_group = meta_->tree_config->max_cat_group;
|
||||
data_size_t min_data_per_group = std::max(meta_->tree_config->min_data_per_group, num_data / rest_group);
|
||||
data_size_t cnt_cur_group = 0;
|
||||
double sum_left_gradient = 0.0f;
|
||||
double sum_left_hessian = kEpsilon;
|
||||
data_size_t left_count = 0;
|
||||
for (int i = 0; i < used_bin && i < meta_->tree_config->max_cat_threshold; ++i) {
|
||||
auto t = sorted_idx[start_pos];
|
||||
start_pos += dir;
|
||||
|
||||
sum_left_gradient += data_[t].sum_gradients;
|
||||
sum_left_hessian += data_[t].sum_hessians;
|
||||
left_count += data_[t].cnt;
|
||||
cnt_cur_group += data_[t].cnt;
|
||||
|
||||
if (left_count < meta_->tree_config->min_data_in_leaf
|
||||
|| sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
|
||||
data_size_t right_count = num_data - left_count;
|
||||
if (right_count < meta_->tree_config->min_data_in_leaf || right_count < min_data_per_group) break;
|
||||
|
||||
double sum_right_hessian = sum_hessian - sum_left_hessian;
|
||||
if (sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
|
||||
|
||||
if (cnt_cur_group < min_data_per_group) continue;
|
||||
|
||||
cnt_cur_group = 0;
|
||||
if (--rest_group > 0) min_data_per_group = std::max(meta_->tree_config->min_data_per_group, right_count / rest_group);
|
||||
|
||||
double sum_right_gradient = sum_gradient - sum_left_gradient;
|
||||
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
|
||||
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
|
||||
if (current_gain <= min_gain_shift) continue;
|
||||
is_splittable_ = true;
|
||||
if (current_gain > best_gain) {
|
||||
best_left_count = left_count;
|
||||
best_sum_left_gradient = sum_left_gradient;
|
||||
best_sum_left_hessian = sum_left_hessian;
|
||||
best_threshold = i;
|
||||
best_gain = current_gain;
|
||||
best_dir = dir;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_splittable_) {
|
||||
// update split information
|
||||
output->num_cat_threshold = 1;
|
||||
output->cat_threshold.resize(output->num_cat_threshold);
|
||||
output->cat_threshold[0] = best_threshold;
|
||||
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
|
||||
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
|
||||
output->left_count = best_left_count;
|
||||
|
@ -170,6 +202,17 @@ public:
|
|||
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
|
||||
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
|
||||
output->gain = best_gain - min_gain_shift;
|
||||
output->num_cat_threshold = best_threshold + 1;
|
||||
output->cat_threshold = std::vector<uint32_t>(output->num_cat_threshold);
|
||||
if (best_dir == 1) {
|
||||
for (int i = 0; i < output->num_cat_threshold; ++i) {
|
||||
output->cat_threshold[i] = sorted_idx[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < output->num_cat_threshold; ++i) {
|
||||
output->cat_threshold[i] = sorted_idx[used_bin - 1 - i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -287,7 +330,7 @@ private:
|
|||
best_gain = current_gain;
|
||||
}
|
||||
}
|
||||
} else{
|
||||
} else {
|
||||
double sum_left_gradient = 0.0f;
|
||||
double sum_left_hessian = kEpsilon;
|
||||
data_size_t left_count = 0;
|
||||
|
|
|
@ -132,7 +132,7 @@ class TestEngine(unittest.TestCase):
|
|||
lgb_eval = lgb.Dataset(X_train, y_train)
|
||||
|
||||
params = {
|
||||
'objective': 'binary',
|
||||
'objective': 'regression',
|
||||
'metric': 'auc',
|
||||
'verbose': -1,
|
||||
'boost_from_average': False,
|
||||
|
@ -149,7 +149,7 @@ class TestEngine(unittest.TestCase):
|
|||
verbose_eval=True,
|
||||
evals_result=evals_result)
|
||||
pred = gbm.predict(X_train)
|
||||
self.assertAlmostEqual(pred[-1], pred[0], places=5)
|
||||
np.testing.assert_almost_equal(pred, y)
|
||||
|
||||
def test_missing_value_handle_zero(self):
|
||||
x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
|
||||
|
@ -161,7 +161,7 @@ class TestEngine(unittest.TestCase):
|
|||
lgb_eval = lgb.Dataset(X_train, y_train)
|
||||
|
||||
params = {
|
||||
'objective': 'binary',
|
||||
'objective': 'regression',
|
||||
'metric': 'auc',
|
||||
'verbose': -1,
|
||||
'boost_from_average': False,
|
||||
|
@ -178,8 +178,7 @@ class TestEngine(unittest.TestCase):
|
|||
verbose_eval=True,
|
||||
evals_result=evals_result)
|
||||
pred = gbm.predict(X_train)
|
||||
self.assertAlmostEqual(pred[-1], pred[-2], places=5)
|
||||
self.assertAlmostEqual(pred[-1], pred[0], places=5)
|
||||
np.testing.assert_almost_equal(pred, y)
|
||||
|
||||
def test_missing_value_handle_none(self):
|
||||
x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
|
||||
|
@ -191,7 +190,7 @@ class TestEngine(unittest.TestCase):
|
|||
lgb_eval = lgb.Dataset(X_train, y_train)
|
||||
|
||||
params = {
|
||||
'objective': 'binary',
|
||||
'objective': 'regression',
|
||||
'metric': 'auc',
|
||||
'verbose': -1,
|
||||
'boost_from_average': False,
|
||||
|
@ -211,6 +210,37 @@ class TestEngine(unittest.TestCase):
|
|||
self.assertAlmostEqual(pred[0], pred[1], places=5)
|
||||
self.assertAlmostEqual(pred[-1], pred[0], places=5)
|
||||
|
||||
def test_categorical_handle(self):
|
||||
x = [0, 1, 2, 3, 4, 5, 6, 7]
|
||||
y = [0, 1, 0, 1, 0, 1, 0, 1]
|
||||
|
||||
X_train = np.array(x).reshape(len(x), 1)
|
||||
y_train = np.array(y)
|
||||
lgb_train = lgb.Dataset(X_train, y_train)
|
||||
lgb_eval = lgb.Dataset(X_train, y_train)
|
||||
|
||||
params = {
|
||||
'objective': 'regression',
|
||||
'metric': 'auc',
|
||||
'verbose': -1,
|
||||
'boost_from_average': False,
|
||||
'min_data': 1,
|
||||
'num_leaves': 2,
|
||||
'learning_rate': 1,
|
||||
'min_data_in_bin': 1,
|
||||
'min_data_per_group': 1,
|
||||
'zero_as_missing': True,
|
||||
'categorical_column': 0
|
||||
}
|
||||
evals_result = {}
|
||||
gbm = lgb.train(params, lgb_train,
|
||||
num_boost_round=1,
|
||||
valid_sets=lgb_eval,
|
||||
verbose_eval=True,
|
||||
evals_result=evals_result)
|
||||
pred = gbm.predict(X_train)
|
||||
np.testing.assert_almost_equal(pred, y)
|
||||
|
||||
def test_multiclass(self):
|
||||
X, y = load_digits(10, True)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
|
||||
|
|
Загрузка…
Ссылка в новой задаче